[sohu] Handle encoding, and fix tests

author: Philipp Hagemeister <phihag@phihag.de> 2013-08-28 13:59:08 +0200
committer: Philipp Hagemeister <phihag@phihag.de> 2013-08-28 14:00:05 +0200
commit: f143d86ad2fc0633d8e2da598cf21e73ff0f2872 (patch)
tree: e04c6cee3d5698e790c6fabd1885853bcc94c6f6 /youtube_dl/extractor/common.py
parent: f8b362739e4f469b501aa804beb95cf1cfb1c916 (diff)
1 file changed, 7 insertions, 2 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 77a13aea5..a2986cebe 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -145,12 +145,17 @@ class InfoExtractor(object):
 
         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
         content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
         else:
-            encoding = 'utf-8'
-        webpage_bytes = urlh.read()
+            m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]',
+                          webpage_bytes[:1024])
+            if m:
+                encoding = m.group(1).decode('ascii')
+            else:
+                encoding = 'utf-8'
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
author	Philipp Hagemeister <phihag@phihag.de>	2013-08-28 13:59:08 +0200
committer	Philipp Hagemeister <phihag@phihag.de>	2013-08-28 14:00:05 +0200
commit	f143d86ad2fc0633d8e2da598cf21e73ff0f2872 (patch)
tree	e04c6cee3d5698e790c6fabd1885853bcc94c6f6 /youtube_dl/extractor/common.py
parent	f8b362739e4f469b501aa804beb95cf1cfb1c916 (diff)