aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r--youtube_dl/extractor/generic.py235
1 files changed, 188 insertions, 47 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 9b6498894..7a5bf9392 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -7,11 +7,12 @@ import re
from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
compat_xml_parse_error,
-
+)
+from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
@@ -22,6 +23,7 @@ from ..utils import (
unescapeHTML,
unified_strdate,
unsmuggle_url,
+ UnsupportedError,
url_basename,
)
from .brightcove import BrightcoveIE
@@ -99,6 +101,22 @@ class GenericIE(InfoExtractor):
'uploader': 'Championat',
},
},
+ {
+ # https://github.com/rg3/youtube-dl/issues/3541
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
+ 'info_dict': {
+ 'id': '3866516442001',
+ 'ext': 'mp4',
+ 'title': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'description': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'uploader': 'SBS Broadcasting',
+ },
+ 'skip': 'Restricted to Netherlands',
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
+ },
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
@@ -113,12 +131,13 @@ class GenericIE(InfoExtractor):
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
+ 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
'ext': 'mp4',
'title': '2cc213299525360.mov', # that's what we get
},
+ 'add_ie': ['Ooyala'],
},
# google redirect
{
@@ -128,7 +147,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20130224',
'uploader_id': 'TheVerge',
- 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+ 'description': 're:^Chris Ziegler takes a look at the\.*',
'uploader': 'The Verge',
'title': 'First Firefox OS phones side-by-side',
},
@@ -163,6 +182,14 @@ class GenericIE(InfoExtractor):
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
},
},
+ # BBC iPlayer embeds
+ {
+ 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
+ 'info_dict': {
+ 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
+ },
+ 'playlist_mincount': 18,
+ },
# RUTV embed
{
'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
@@ -325,7 +352,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'age_limit': 18,
'uploader': 'www.handjobhub.com',
- 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub',
+ 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
}
},
# RSS feed
@@ -389,7 +416,77 @@ class GenericIE(InfoExtractor):
'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
'duration': 1715.0,
'uploader': 'thoughtworks.wistia.com',
- },
+ },
+ },
+ # Direct download with broken HEAD
+ {
+ 'url': 'http://ai-radio.org:8000/radio.opus',
+ 'info_dict': {
+ 'id': 'radio',
+ 'ext': 'opus',
+ 'title': 'radio',
+ },
+ 'params': {
+ 'skip_download': True, # infinite live stream
+ },
+ 'expected_warnings': [
+ r'501.*Not Implemented'
+ ],
+ },
+ # Soundcloud embed
+ {
+ 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
+ 'info_dict': {
+ 'id': '174391317',
+ 'ext': 'mp3',
+ 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
+ 'uploader': 'Sophos Security',
+ 'title': 'Chet Chat 171 - Oct 29, 2014',
+ 'upload_date': '20141029',
+ }
+ },
+ # Livestream embed
+ {
+ 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
+ 'info_dict': {
+ 'id': '67864563',
+ 'ext': 'flv',
+ 'upload_date': '20141112',
+ 'title': 'Rosetta #CometLanding webcast HL 10',
+ }
+ },
+ # LazyYT
+ {
+ 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
+ 'info_dict': {
+ 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
+ },
+ 'playlist_mincount': 2,
+ },
+ # Direct link with incorrect MIME type
+ {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'md5': '4ccbebe5f36706d85221f204d7eb5913',
+ 'info_dict': {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'id': '5_Lennart_Poettering_-_Systemd',
+ 'ext': 'webm',
+ 'title': '5_Lennart_Poettering_-_Systemd',
+ 'upload_date': '20141120',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # Cinchcast embed
+ {
+ 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
+ 'info_dict': {
+ 'id': '7141703',
+ 'ext': 'mp3',
+ 'upload_date': '20141126',
+ 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+ }
},
]
@@ -483,9 +580,9 @@ class GenericIE(InfoExtractor):
if default_search in ('error', 'fixup_error'):
raise ExtractorError(
- ('%r is not a valid URL. '
- 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
- ) % (url, url), expected=True)
+ '%r is not a valid URL. '
+ 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
+ % (url, url), expected=True)
else:
if ':' not in default_search:
default_search += ':'
@@ -503,14 +600,14 @@ class GenericIE(InfoExtractor):
self.to_screen('%s: Requesting header' % video_id)
head_req = HEADRequest(url)
- response = self._request_webpage(
+ head_response = self._request_webpage(
head_req, video_id,
note=False, errnote='Could not send HEAD request to %s' % url,
fatal=False)
- if response is not False:
+ if head_response is not False:
# Check for redirect
- new_url = response.geturl()
+ new_url = head_response.geturl()
if url != new_url:
self.report_following_redirect(new_url)
if force_videoid:
@@ -518,33 +615,53 @@ class GenericIE(InfoExtractor):
new_url, {'force_videoid': force_videoid})
return self.url_result(new_url)
- # Check for direct link to a video
- content_type = response.headers.get('Content-Type', '')
- m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
- if m:
- upload_date = response.headers.get('Last-Modified')
- if upload_date:
- upload_date = unified_strdate(upload_date)
- return {
- 'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
- 'formats': [{
- 'format_id': m.group('format_id'),
- 'url': url,
- 'vcodec': 'none' if m.group('type') == 'audio' else None
- }],
- 'upload_date': upload_date,
- }
+ full_response = None
+ if head_response is False:
+ full_response = self._request_webpage(url, video_id)
+ head_response = full_response
+
+ # Check for direct link to a video
+ content_type = head_response.headers.get('Content-Type', '')
+ m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+ if m:
+ upload_date = unified_strdate(
+ head_response.headers.get('Last-Modified'))
+ return {
+ 'id': video_id,
+ 'title': os.path.splitext(url_basename(url))[0],
+ 'direct': True,
+ 'formats': [{
+ 'format_id': m.group('format_id'),
+ 'url': url,
+ 'vcodec': 'none' if m.group('type') == 'audio' else None
+ }],
+ 'upload_date': upload_date,
+ }
if not self._downloader.params.get('test', False) and not is_intentional:
self._downloader.report_warning('Falling back on generic information extractor.')
- try:
- webpage = self._download_webpage(url, video_id)
- except ValueError:
- # since this is the last-resort InfoExtractor, if
- # this error is thrown, it'll be thrown here
- raise ExtractorError('Failed to download URL: %s' % url)
+ if not full_response:
+ full_response = self._request_webpage(url, video_id)
+
+ # Maybe it's a direct link to a video?
+ # Be careful not to download the whole thing!
+ first_bytes = full_response.read(512)
+ if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+ self._downloader.report_warning(
+ 'URL could be a direct video link, returning it as such.')
+ upload_date = unified_strdate(
+ head_response.headers.get('Last-Modified'))
+ return {
+ 'id': video_id,
+ 'title': os.path.splitext(url_basename(url))[0],
+ 'direct': True,
+ 'url': url,
+ 'upload_date': upload_date,
+ }
+
+ webpage = self._webpage_read_content(
+ full_response, url, video_id, prefix=first_bytes)
self.report_extraction(video_id)
@@ -591,9 +708,9 @@ class GenericIE(InfoExtractor):
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
# Helper method
- def _playlist_from_matches(matches, getter, ie=None):
+ def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
- self.url_result(self._proto_relative_url(getter(m)), ie)
+ self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
@@ -635,7 +752,8 @@ class GenericIE(InfoExtractor):
<iframe[^>]+?src=|
data-video-url=|
<embed[^>]+?src=|
- embedSWF\(?:\s*
+ embedSWF\(?:\s*|
+ new\s+SWFObject\(
)
(["\'])
(?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
@@ -645,6 +763,12 @@ class GenericIE(InfoExtractor):
return _playlist_from_matches(
matches, lambda m: unescapeHTML(m[1]))
+ # Look for lazyYT YouTube embed
+ matches = re.findall(
+ r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
+ if matches:
+ return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
+
# Look for embedded Dailymotion player
matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
@@ -676,7 +800,7 @@ class GenericIE(InfoExtractor):
'title': video_title,
'id': video_id,
}
-
+
match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
if match:
return {
@@ -691,7 +815,7 @@ class GenericIE(InfoExtractor):
# Look for embedded blip.tv player
mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
if mobj:
- return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')
+ return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
if mobj:
return self.url_result(mobj.group(1), 'BlipTV')
@@ -727,7 +851,7 @@ class GenericIE(InfoExtractor):
# Look for Ooyala videos
mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
- re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
+ re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
return OoyalaIE._build_url_result(mobj.group('ec'))
@@ -790,6 +914,11 @@ class GenericIE(InfoExtractor):
return _playlist_from_matches(
matches, getter=unescapeHTML, ie='FunnyOrDie')
+ # Look for BBC iPlayer embed
+ matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
+ if matches:
+ return _playlist_from_matches(matches, ie='BBCCoUk')
+
# Look for embedded RUTV player
rutv_url = RUTVIE._extract_url(webpage)
if rutv_url:
@@ -797,7 +926,7 @@ class GenericIE(InfoExtractor):
# Look for embedded TED player
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'TED')
@@ -821,7 +950,7 @@ class GenericIE(InfoExtractor):
# Look for embeded soundcloud player
mobj = re.search(
- r'<iframe src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
+ r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
webpage)
if mobj is not None:
url = unescapeHTML(mobj.group('url'))
@@ -857,8 +986,15 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'SBS')
+ # Look for embedded Cinchcast player
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Cinchcast')
+
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
@@ -869,6 +1005,12 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Livestream')
+
def check_video(vurl):
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)
@@ -916,7 +1058,7 @@ class GenericIE(InfoExtractor):
found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
if not found:
# HTML5 video
- found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
+ found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
if not found:
found = re.search(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
@@ -930,7 +1072,7 @@ class GenericIE(InfoExtractor):
'url': new_url,
}
if not found:
- raise ExtractorError('Unsupported URL: %s' % url)
+ raise UnsupportedError(url)
entries = []
for video_url in found:
@@ -962,4 +1104,3 @@ class GenericIE(InfoExtractor):
'_type': 'playlist',
'entries': entries,
}
-