diff options
Diffstat (limited to 'youtube_dl')
| -rw-r--r-- | youtube_dl/extractor/generic.py | 3 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 20 | 
2 files changed, 22 insertions, 1 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a5bf9392..b893d8149 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -17,6 +17,7 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     HEADRequest,
+    is_html,
     orderedSet,
     parse_xml,
     smuggle_url,
@@ -647,7 +648,7 @@ class GenericIE(InfoExtractor):
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
         first_bytes = full_response.read(512)
-        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+        if not is_html(first_bytes):
             self._downloader.report_warning(
                 'URL could be a direct video link, returning it as such.')
             upload_date = unified_strdate(
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 764474c33..b433b591b 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
     if content_limit is None:
         return False  # Content available for everyone
     return age_limit < content_limit
+
+
+def is_html(first_bytes):
+    """ Detect whether a file contains HTML by examining its first bytes. """
+
+    BOMS = [
+        (b'\xef\xbb\xbf', 'utf-8'),
+        (b'\x00\x00\xfe\xff', 'utf-32-be'),
+        (b'\xff\xfe\x00\x00', 'utf-32-le'),
+        (b'\xff\xfe', 'utf-16-le'),
+        (b'\xfe\xff', 'utf-16-be'),
+    ]
+    for bom, enc in BOMS:
+        if first_bytes.startswith(bom):
+            s = first_bytes[len(bom):].decode(enc, 'replace')
+            break
+    else:
+        s = first_bytes.decode('utf-8', 'replace')
+
+    return re.match(r'^\s*<', s)
