diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 73 |
1 files changed, 44 insertions, 29 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9057a6beb..51dbbc8db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -380,6 +380,17 @@ class GenericIE(InfoExtractor): 'uploader': 'education-portal.com', }, }, + { + 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', + 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', + 'info_dict': { + 'id': 'uxjb0lwrcz', + 'ext': 'mp4', + 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'duration': 1715.0, + 'uploader': 'thoughtworks.wistia.com', + }, + }, ] def report_following_redirect(self, new_url): @@ -476,7 +487,8 @@ class GenericIE(InfoExtractor): 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: - assert ':' in default_search + if ':' not in default_search: + default_search += ':' return self.url_result(default_search + url) url, smuggled_data = unsmuggle_url(url) @@ -491,14 +503,14 @@ class GenericIE(InfoExtractor): self.to_screen('%s: Requesting header' % video_id) head_req = HEADRequest(url) - response = self._request_webpage( + head_response = self._request_webpage( head_req, video_id, note=False, errnote='Could not send HEAD request to %s' % url, fatal=False) - if response is not False: + if head_response is not False: # Check for redirect - new_url = response.geturl() + new_url = head_response.geturl() if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -506,34 +518,35 @@ class GenericIE(InfoExtractor): new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) - # Check for direct link to a video - content_type = response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) - if m: - upload_date = response.headers.get('Last-Modified') - if upload_date: - upload_date = unified_strdate(upload_date) - return { - 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], - 'formats': [{ - 'format_id': m.group('format_id'), - 'url': url, - 'vcodec': 'none' if m.group('type') == 'audio' else None - }], - 'upload_date': upload_date, - } + full_response = None + if head_response is False: + full_response = self._request_webpage(url, video_id) + head_response = full_response + + # Check for direct link to a video + content_type = head_response.headers.get('Content-Type', '') + m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) + if m: + upload_date = unified_strdate( + head_response.headers.get('Last-Modified')) + return { + 'id': video_id, + 'title': os.path.splitext(url_basename(url))[0], + 'formats': [{ + 'format_id': m.group('format_id'), + 'url': url, + 'vcodec': 'none' if m.group('type') == 'audio' else None + }], + 'upload_date': upload_date, + } if not self._downloader.params.get('test', False) and not is_intentional: self._downloader.report_warning('Falling back on generic information extractor.') - try: + if full_response: + webpage = _webpage_read_content(url, video_id) + else: webpage = self._download_webpage(url, video_id) - except ValueError: - # since this is the last-resort InfoExtractor, if - # this error is thrown, it'll be thrown here - raise ExtractorError('Failed to download URL: %s' % url) - self.report_extraction(video_id) # Is it an RSS feed? @@ -623,7 +636,8 @@ class GenericIE(InfoExtractor): <iframe[^>]+?src=| data-video-url=| <embed[^>]+?src=| - embedSWF\(?:\s* + embedSWF\(?:\s*| + new\s+SWFObject\( ) (["\']) (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ @@ -652,7 +666,7 @@ class GenericIE(InfoExtractor): # Look for embedded Wistia player match = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) if match: embed_url = self._proto_relative_url( unescapeHTML(match.group('url'))) @@ -664,6 +678,7 @@ class GenericIE(InfoExtractor): 'title': video_title, 'id': video_id, } + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) if match: return { |
