aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/utils.py
diff options
context:
space:
mode:
authorPhilipp Hagemeister <phihag@phihag.de>2015-01-23 01:21:30 +0100
committerPhilipp Hagemeister <phihag@phihag.de>2015-01-23 01:21:30 +0100
commit61ca9a80b34b14fa0e5e19e35ee76e0086d49edf (patch)
tree74122c9f0b93c68fb3dbf7bd582b5b9346750b66 /youtube_dl/utils.py
parent317239b097a913824e20d436cdb8c74161564268 (diff)
[generic] Add support for BOMs (Fixes #4753)
Diffstat (limited to 'youtube_dl/utils.py')
-rw-r--r--youtube_dl/utils.py20
1 files changed, 20 insertions, 0 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 764474c33..b433b591b 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
if content_limit is None:
return False # Content available for everyone
return age_limit < content_limit
+
+
+def is_html(first_bytes):
+ """ Detect whether a file contains HTML by examining its first bytes. """
+
+ BOMS = [
+ (b'\xef\xbb\xbf', 'utf-8'),
+ (b'\x00\x00\xfe\xff', 'utf-32-be'),
+ (b'\xff\xfe\x00\x00', 'utf-32-le'),
+ (b'\xff\xfe', 'utf-16-le'),
+ (b'\xfe\xff', 'utf-16-be'),
+ ]
+ for bom, enc in BOMS:
+ if first_bytes.startswith(bom):
+ s = first_bytes[len(bom):].decode(enc, 'replace')
+ break
+ else:
+ s = first_bytes.decode('utf-8', 'replace')
+
+ return re.match(r'^\s*<', s)