about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor/common.py
diff options
context:
space:
mode:
author Philipp Hagemeister <phihag@phihag.de> 2013-08-28 13:59:08 +0200
committer Philipp Hagemeister <phihag@phihag.de> 2013-08-28 14:00:05 +0200
commit f143d86ad2fc0633d8e2da598cf21e73ff0f2872 (patch)
tree e04c6cee3d5698e790c6fabd1885853bcc94c6f6 /youtube_dl/extractor/common.py
parent f8b362739e4f469b501aa804beb95cf1cfb1c916 (diff)
download youtube-dl-f143d86ad2fc0633d8e2da598cf21e73ff0f2872.tar.xz
[sohu] Handle encoding, and fix tests
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r-- youtube_dl/extractor/common.py | 9
1 file changed, 7 insertions, 2 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 77a13aea5..a2986cebe 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -145,12 +145,17 @@ class InfoExtractor(object):
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
content_type = urlh.headers.get('Content-Type', '')
+ webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m:
encoding = m.group(1)
else:
- encoding = 'utf-8'
- webpage_bytes = urlh.read()
+ m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]',
+ webpage_bytes[:1024])
+ if m:
+ encoding = m.group(1).decode('ascii')
+ else:
+ encoding = 'utf-8'
if self._downloader.params.get('dump_intermediate_pages', False):
try:
url = url_or_request.get_full_url()