diff options
-rw-r--r-- | youtube_dl/extractor/common.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 53 |
2 files changed, 32 insertions, 28 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cf3781cd6..e1bd6bb49 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -242,7 +242,6 @@ class InfoExtractor(object): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ - # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] @@ -251,6 +250,10 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + return (content, urlh) + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -309,7 +312,7 @@ class InfoExtractor(object): msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) - return (content, urlh) + return content def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the data of the page as a string """ diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 524215408..51dbbc8db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -503,14 +503,14 @@ class GenericIE(InfoExtractor): self.to_screen('%s: Requesting header' % video_id) head_req = HEADRequest(url) - response = self._request_webpage( + head_response = self._request_webpage( head_req, video_id, note=False, errnote='Could not send HEAD request to %s' % url, fatal=False) - if response is not False: + if head_response is not False: # Check for redirect - new_url = response.geturl() + new_url = head_response.geturl() if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -518,34 +518,35 @@ class GenericIE(InfoExtractor): new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) - # Check for direct link to a video - content_type = response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) - if m: - upload_date = response.headers.get('Last-Modified') - if upload_date: - upload_date = unified_strdate(upload_date) - return { - 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], - 'formats': [{ - 'format_id': m.group('format_id'), - 'url': url, - 'vcodec': 'none' if m.group('type') == 'audio' else None - }], - 'upload_date': upload_date, - } + full_response = None + if head_response is False: + full_response = self._request_webpage(url, video_id) + head_response = full_response + + # Check for direct link to a video + content_type = head_response.headers.get('Content-Type', '') + m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) + if m: + upload_date = unified_strdate( + head_response.headers.get('Last-Modified')) + return { + 'id': video_id, + 'title': os.path.splitext(url_basename(url))[0], + 'formats': [{ + 'format_id': m.group('format_id'), + 'url': url, + 'vcodec': 'none' if m.group('type') == 'audio' else None + }], + 'upload_date': upload_date, + } if not self._downloader.params.get('test', False) and not is_intentional: self._downloader.report_warning('Falling back on generic information extractor.') - try: + if full_response: + webpage = _webpage_read_content(url, video_id) + else: webpage = self._download_webpage(url, video_id) - except ValueError: - # since this is the last-resort InfoExtractor, if - # this error is thrown, it'll be thrown here - raise ExtractorError('Failed to download URL: %s' % url) - self.report_extraction(video_id) # Is it an RSS feed? |