diff options
author | Philipp Hagemeister <phihag@phihag.de> | 2015-01-23 01:21:30 +0100 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2015-01-23 01:21:30 +0100 |
commit | 61ca9a80b34b14fa0e5e19e35ee76e0086d49edf (patch) | |
tree | 74122c9f0b93c68fb3dbf7bd582b5b9346750b66 /youtube_dl | |
parent | 317239b097a913824e20d436cdb8c74161564268 (diff) |
[generic] Add support for BOMs (Fixes #4753)
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/extractor/generic.py | 3 | ||||
-rw-r--r-- | youtube_dl/utils.py | 20 |
2 files changed, 22 insertions, 1 deletion
```diff
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a5bf9392..b893d8149 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -17,6 +17,7 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     HEADRequest,
+    is_html,
     orderedSet,
     parse_xml,
     smuggle_url,
@@ -647,7 +648,7 @@ class GenericIE(InfoExtractor):
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
         first_bytes = full_response.read(512)
-        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+        if not is_html(first_bytes):
             self._downloader.report_warning(
                 'URL could be a direct video link, returning it as such.')
             upload_date = unified_strdate(
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 764474c33..b433b591b 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
     if content_limit is None:
         return False  # Content available for everyone
     return age_limit < content_limit
+
+
+def is_html(first_bytes):
+    """ Detect whether a file contains HTML by examining its first bytes. """
+
+    BOMS = [
+        (b'\xef\xbb\xbf', 'utf-8'),
+        (b'\x00\x00\xfe\xff', 'utf-32-be'),
+        (b'\xff\xfe\x00\x00', 'utf-32-le'),
+        (b'\xff\xfe', 'utf-16-le'),
+        (b'\xfe\xff', 'utf-16-be'),
+    ]
+    for bom, enc in BOMS:
+        if first_bytes.startswith(bom):
+            s = first_bytes[len(bom):].decode(enc, 'replace')
+            break
+    else:
+        s = first_bytes.decode('utf-8', 'replace')
+
+    return re.match(r'^\s*<', s)
```