diff options
| -rw-r--r-- | test/test_utils.py | 17 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 3 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 20 | 
3 files changed, 39 insertions, 1 deletion
| diff --git a/test/test_utils.py b/test/test_utils.py index 206760d99..bdd7f268a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,6 +28,7 @@ from youtube_dl.utils import (      fix_xml_ampersands,      InAdvancePagedList,      intlist_to_bytes, +    is_html,      js_to_json,      limit_length,      OnDemandPagedList, @@ -417,5 +418,21 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')          self.assertTrue(age_restricted(18, 14))          self.assertFalse(age_restricted(18, 18)) +    def test_is_html(self): +        self.assertFalse(is_html(b'\x49\x44\x43<html')) +        self.assertTrue(is_html(b'<!DOCTYPE foo>\xaaa')) +        self.assertTrue(is_html(  # UTF-8 with BOM +            b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa')) +        self.assertTrue(is_html(  # UTF-16-LE +            b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00' +        )) +        self.assertTrue(is_html(  # UTF-16-BE +            b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4' +        )) +        self.assertTrue(is_html(  # UTF-32-BE +            b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4')) +        self.assertTrue(is_html(  # UTF-32-LE +            b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00')) +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7a5bf9392..b893d8149 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -17,6 +17,7 @@ from ..utils import (      ExtractorError,      float_or_none,      HEADRequest, +    is_html,      orderedSet,      parse_xml,      smuggle_url, @@ -647,7 +648,7 @@ class GenericIE(InfoExtractor):          # Maybe it's a direct link to a video?          # Be careful not to download the whole thing!          
first_bytes = full_response.read(512) -        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): +        if not is_html(first_bytes):              self._downloader.report_warning(                  'URL could be a direct video link, returning it as such.')              upload_date = unified_strdate( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 764474c33..b433b591b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):      if content_limit is None:          return False  # Content available for everyone      return age_limit < content_limit + + +def is_html(first_bytes): +    """ Detect whether a file contains HTML by examining its first bytes. """ + +    BOMS = [ +        (b'\xef\xbb\xbf', 'utf-8'), +        (b'\x00\x00\xfe\xff', 'utf-32-be'), +        (b'\xff\xfe\x00\x00', 'utf-32-le'), +        (b'\xff\xfe', 'utf-16-le'), +        (b'\xfe\xff', 'utf-16-be'), +    ] +    for bom, enc in BOMS: +        if first_bytes.startswith(bom): +            s = first_bytes[len(bom):].decode(enc, 'replace') +            break +    else: +        s = first_bytes.decode('utf-8', 'replace') + +    return re.match(r'^\s*<', s) | 
