diff options
| -rw-r--r-- | youtube_dl/extractor/common.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 42 | 
2 files changed, 41 insertions, 5 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3c2d46dd5..3e84d0e62 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -296,9 +296,11 @@ class InfoExtractor(object):          content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)          return (content, urlh) -    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): +    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):          content_type = urlh.headers.get('Content-Type', '')          webpage_bytes = urlh.read() +        if prefix is not None: +            webpage_bytes = prefix + webpage_bytes          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)          if m:              encoding = m.group(1) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6a95e2952..328301de3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -452,7 +452,23 @@ class GenericIE(InfoExtractor):                  'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',              },              'playlist_mincount': 2, +        }, +        # Direct link with incorrect MIME type +        { +            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', +            'md5': '4ccbebe5f36706d85221f204d7eb5913', +            'info_dict': { +                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', +                'id': '5_Lennart_Poettering_-_Systemd', +                'ext': 'webm', +                'title': '5_Lennart_Poettering_-_Systemd', +                'upload_date': '20141120', +            }, +            'expected_warnings': [ +                'URL could be a direct video link, returning it as such.' +            ]          } +      ]      def report_following_redirect(self, new_url): @@ -606,10 +622,28 @@ class GenericIE(InfoExtractor):          if not self._downloader.params.get('test', False) and not is_intentional:              self._downloader.report_warning('Falling back on generic information extractor.') -        if full_response: -            webpage = self._webpage_read_content(full_response, url, video_id) -        else: -            webpage = self._download_webpage(url, video_id) +        if not full_response: +            full_response = self._request_webpage(url, video_id) + +        # Maybe it's a direct link to a video? +        # Be careful not to download the whole thing! +        first_bytes = full_response.read(512) +        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): +            self._downloader.report_warning( +                'URL could be a direct video link, returning it as such.') +            upload_date = unified_strdate( +                head_response.headers.get('Last-Modified')) +            return { +                'id': video_id, +                'title': os.path.splitext(url_basename(url))[0], +                'direct': True, +                'url': url, +                'upload_date': upload_date, +            } + +        webpage = self._webpage_read_content( +            full_response, url, video_id, prefix=first_bytes) +          self.report_extraction(video_id)          # Is it an RSS feed? | 
