diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-10-26 17:05:44 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-10-26 17:05:44 +0100 | 
| commit | 23be51d8ce132dbb967f460e1225fdaaa43dff39 (patch) | |
| tree | 1a4e5404654e8f63d80bb662836ab05bb80cb340 | |
| parent | 488447455d3d90e1d83a7ebc2f9ce552e031e0d8 (diff) | |
[generic] Handle audio streams that do not implement HEAD (Fixes #4032)
| -rw-r--r-- | youtube_dl/extractor/common.py | 7 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 53 | 
2 files changed, 32 insertions, 28 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cf3781cd6..e1bd6bb49 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -242,7 +242,6 @@ class InfoExtractor(object):      def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):          """ Returns a tuple (page content as string, URL handle) """ -          # Strip hashes from the URL (#1038)          if isinstance(url_or_request, (compat_str, str)):              url_or_request = url_or_request.partition('#')[0] @@ -251,6 +250,10 @@ class InfoExtractor(object):          if urlh is False:              assert not fatal              return False +        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) +        return (content, urlh) + +    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):          content_type = urlh.headers.get('Content-Type', '')          webpage_bytes = urlh.read()          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -309,7 +312,7 @@ class InfoExtractor(object):                  msg += ' Visit %s for more details' % blocked_iframe              raise ExtractorError(msg, expected=True) -        return (content, urlh) +        return content      def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):          """ Returns the data of the page as a string """ diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 524215408..51dbbc8db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -503,14 +503,14 @@ class GenericIE(InfoExtractor):          self.to_screen('%s: Requesting header' % video_id)          head_req = HEADRequest(url) -        response = self._request_webpage( +        head_response = self._request_webpage(              head_req, video_id,              note=False, errnote='Could not send HEAD request to %s' % url,              fatal=False) -        if response is not False: +        if head_response is not False:              # Check for redirect -            new_url = response.geturl() +            new_url = head_response.geturl()              if url != new_url:                  self.report_following_redirect(new_url)                  if force_videoid: @@ -518,34 +518,35 @@ class GenericIE(InfoExtractor):                          new_url, {'force_videoid': force_videoid})                  return self.url_result(new_url) -            # Check for direct link to a video -            content_type = response.headers.get('Content-Type', '') -            m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) -            if m: -                upload_date = response.headers.get('Last-Modified') -                if upload_date: -                    upload_date = unified_strdate(upload_date) -                return { -                    'id': video_id, -                    'title': os.path.splitext(url_basename(url))[0], -                    'formats': [{ -                        'format_id': m.group('format_id'), -                        'url': url, -                        'vcodec': 'none' if m.group('type') == 'audio' else None -                    }], -                    'upload_date': upload_date, -                } +        full_response = None +        if head_response is False: +            full_response = self._request_webpage(url, video_id) +            head_response = full_response + +        # Check for direct link to a video +        content_type = head_response.headers.get('Content-Type', '') +        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) +        if m: +            upload_date = unified_strdate( +                head_response.headers.get('Last-Modified')) +            return { +                'id': video_id, +                'title': os.path.splitext(url_basename(url))[0], +                'formats': [{ +                    'format_id': m.group('format_id'), +                    'url': url, +                    'vcodec': 'none' if m.group('type') == 'audio' else None +                }], +                'upload_date': upload_date, +            }          if not self._downloader.params.get('test', False) and not is_intentional:              self._downloader.report_warning('Falling back on generic information extractor.') -        try: +        if full_response: +            webpage = _webpage_read_content(url, video_id) +        else:              webpage = self._download_webpage(url, video_id) -        except ValueError: -            # since this is the last-resort InfoExtractor, if -            # this error is thrown, it'll be thrown here -            raise ExtractorError('Failed to download URL: %s' % url) -          self.report_extraction(video_id)          # Is it an RSS feed?  | 
