diff options
author | Philipp Hagemeister <phihag@phihag.de> | 2013-08-28 13:59:08 +0200 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2013-08-28 14:00:05 +0200 |
commit | f143d86ad2fc0633d8e2da598cf21e73ff0f2872 (patch) | |
tree | e04c6cee3d5698e790c6fabd1885853bcc94c6f6 /youtube_dl/extractor/common.py | |
parent | f8b362739e4f469b501aa804beb95cf1cfb1c916 (diff) |
[sohu] Handle encoding, and fix tests
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r-- | youtube_dl/extractor/common.py | 9 |
1 files changed, 7 insertions, 2 deletions
```diff
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 77a13aea5..a2986cebe 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -145,12 +145,17 @@ class InfoExtractor(object):
 
         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
         content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
         else:
-            encoding = 'utf-8'
-        webpage_bytes = urlh.read()
+            m = re.search(br'<meta[^>]+charset="?([^"]+)[ /">]',
+                          webpage_bytes[:1024])
+            if m:
+                encoding = m.group(1).decode('ascii')
+            else:
+                encoding = 'utf-8'
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
```