about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- youtube_dl/extractor/generic.py 109
1 files changed, 74 insertions, 35 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index dfc2ef4e7..35a7664b2 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -28,6 +28,7 @@ from .brightcove import BrightcoveIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
from .smotri import SmotriIE
+from .condenast import CondeNastIE
class GenericIE(InfoExtractor):
@@ -324,7 +325,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'age_limit': 18,
'uploader': 'www.handjobhub.com',
- 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub',
+ 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
}
},
# RSS feed
@@ -379,6 +380,32 @@ class GenericIE(InfoExtractor):
'uploader': 'education-portal.com',
},
},
+ {
+ 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
+ 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
+ 'info_dict': {
+ 'id': 'uxjb0lwrcz',
+ 'ext': 'mp4',
+ 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+ 'duration': 1715.0,
+ 'uploader': 'thoughtworks.wistia.com',
+ },
+ },
+ # Direct download with broken HEAD
+ {
+ 'url': 'http://ai-radio.org:8000/radio.opus',
+ 'info_dict': {
+ 'id': 'radio',
+ 'ext': 'opus',
+ 'title': 'radio',
+ },
+ 'params': {
+ 'skip_download': True, # infinite live stream
+ },
+ 'expected_warnings': [
+ r'501.*Not Implemented'
+ ],
+ }
]
def report_following_redirect(self, new_url):
@@ -475,7 +502,8 @@ class GenericIE(InfoExtractor):
'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
) % (url, url), expected=True)
else:
- assert ':' in default_search
+ if ':' not in default_search:
+ default_search += ':'
return self.url_result(default_search + url)
url, smuggled_data = unsmuggle_url(url)
@@ -490,14 +518,14 @@ class GenericIE(InfoExtractor):
self.to_screen('%s: Requesting header' % video_id)
head_req = HEADRequest(url)
- response = self._request_webpage(
+ head_response = self._request_webpage(
head_req, video_id,
note=False, errnote='Could not send HEAD request to %s' % url,
fatal=False)
- if response is not False:
+ if head_response is not False:
# Check for redirect
- new_url = response.geturl()
+ new_url = head_response.geturl()
if url != new_url:
self.report_following_redirect(new_url)
if force_videoid:
@@ -505,34 +533,35 @@ class GenericIE(InfoExtractor):
new_url, {'force_videoid': force_videoid})
return self.url_result(new_url)
- # Check for direct link to a video
- content_type = response.headers.get('Content-Type', '')
- m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
- if m:
- upload_date = response.headers.get('Last-Modified')
- if upload_date:
- upload_date = unified_strdate(upload_date)
- return {
- 'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
- 'formats': [{
- 'format_id': m.group('format_id'),
- 'url': url,
- 'vcodec': 'none' if m.group('type') == 'audio' else None
- }],
- 'upload_date': upload_date,
- }
+ full_response = None
+ if head_response is False:
+ full_response = self._request_webpage(url, video_id)
+ head_response = full_response
+
+ # Check for direct link to a video
+ content_type = head_response.headers.get('Content-Type', '')
+ m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+ if m:
+ upload_date = unified_strdate(
+ head_response.headers.get('Last-Modified'))
+ return {
+ 'id': video_id,
+ 'title': os.path.splitext(url_basename(url))[0],
+ 'formats': [{
+ 'format_id': m.group('format_id'),
+ 'url': url,
+ 'vcodec': 'none' if m.group('type') == 'audio' else None
+ }],
+ 'upload_date': upload_date,
+ }
if not self._downloader.params.get('test', False) and not is_intentional:
self._downloader.report_warning('Falling back on generic information extractor.')
- try:
+ if full_response:
+ webpage = self._webpage_read_content(full_response, url, video_id)
+ else:
webpage = self._download_webpage(url, video_id)
- except ValueError:
- # since this is the last-resort InfoExtractor, if
- # this error is thrown, it'll be thrown here
- raise ExtractorError('Failed to download URL: %s' % url)
-
self.report_extraction(video_id)
# Is it an RSS feed?
@@ -608,13 +637,13 @@ class GenericIE(InfoExtractor):
if mobj:
player_url = unescapeHTML(mobj.group('url'))
surl = smuggle_url(player_url, {'Referer': url})
- return self.url_result(surl, 'Vimeo')
+ return self.url_result(surl)
# Look for embedded (swf embed) Vimeo player
mobj = re.search(
- r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+ r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
if mobj:
- return self.url_result(mobj.group(1), 'Vimeo')
+ return self.url_result(mobj.group(1))
# Look for embedded YouTube player
matches = re.findall(r'''(?x)
@@ -622,7 +651,8 @@ class GenericIE(InfoExtractor):
<iframe[^>]+?src=|
data-video-url=|
<embed[^>]+?src=|
- embedSWF\(?:\s*
+ embedSWF\(?:\s*|
+ new\s+SWFObject\(
)
(["\'])
(?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
@@ -651,17 +681,20 @@ class GenericIE(InfoExtractor):
# Look for embedded Wistia player
match = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
if match:
+ embed_url = self._proto_relative_url(
+ unescapeHTML(match.group('url')))
return {
'_type': 'url_transparent',
- 'url': unescapeHTML(match.group('url')),
+ 'url': embed_url,
'ie_key': 'Wistia',
'uploader': video_uploader,
'title': video_title,
'id': video_id,
}
- match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
+
+ match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
if match:
return {
'_type': 'url_transparent',
@@ -847,6 +880,12 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
+ webpage)
+ if mobj is not None:
+ return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
+
def check_video(vurl):
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)