[generic] Add support for BOMs (Fixes #4753)

author: Philipp Hagemeister <phihag@phihag.de> 2015-01-23 01:21:30 +0100
committer: Philipp Hagemeister <phihag@phihag.de> 2015-01-23 01:21:30 +0100
commit: 61ca9a80b34b14fa0e5e19e35ee76e0086d49edf (patch)
tree: 74122c9f0b93c68fb3dbf7bd582b5b9346750b66 /youtube_dl
parent: 317239b097a913824e20d436cdb8c74161564268 (diff)
2 files changed, 22 insertions, 1 deletion
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a5bf9392..b893d8149 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -17,6 +17,7 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     HEADRequest,
+    is_html,
     orderedSet,
     parse_xml,
     smuggle_url,
@@ -647,7 +648,7 @@ class GenericIE(InfoExtractor):
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
         first_bytes = full_response.read(512)
-        if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')):
+        if not is_html(first_bytes):
             self._downloader.report_warning(
                 'URL could be a direct video link, returning it as such.')
             upload_date = unified_strdate(
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 764474c33..b433b591b 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
     if content_limit is None:
         return False  # Content available for everyone
     return age_limit < content_limit
+
+
+def is_html(first_bytes):
+    """ Detect whether a file contains HTML by examining its first bytes. """
+
+    BOMS = [
+        (b'\xef\xbb\xbf', 'utf-8'),
+        (b'\x00\x00\xfe\xff', 'utf-32-be'),
+        (b'\xff\xfe\x00\x00', 'utf-32-le'),
+        (b'\xff\xfe', 'utf-16-le'),
+        (b'\xfe\xff', 'utf-16-be'),
+    ]
+    for bom, enc in BOMS:
+        if first_bytes.startswith(bom):
+            s = first_bytes[len(bom):].decode(enc, 'replace')
+            break
+    else:
+        s = first_bytes.decode('utf-8', 'replace')
+
+    return re.match(r'^\s*<', s)
author	Philipp Hagemeister <phihag@phihag.de>	2015-01-23 01:21:30 +0100
committer	Philipp Hagemeister <phihag@phihag.de>	2015-01-23 01:21:30 +0100
commit	61ca9a80b34b14fa0e5e19e35ee76e0086d49edf (patch)
tree	74122c9f0b93c68fb3dbf7bd582b5b9346750b66 /youtube_dl
parent	317239b097a913824e20d436cdb8c74161564268 (diff)