| author | Ismael Mejia <iemejia@gmail.com> | 2013-09-06 23:23:23 +0200 | 
|---|---|---|
| committer | Ismael Mejia <iemejia@gmail.com> | 2013-09-06 23:24:41 +0200 | 
| commit | 72836fcee453386f4f16325c5b8fa4c1ba1bb442 (patch) | |
| tree | 58efd36f4a56269a07774969e2ac385aacf8eae6 /youtube_dl/extractor/common.py | |
| parent | d6e203b3dcef8f291b57021903e629d3e30e1f0b (diff) | |
| parent | a7130543fa0368175740f5fa173ef920671db866 (diff) | |
Merge branch 'master' into subtitles_rework
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 16 | 
1 files changed, 13 insertions, 3 deletions
```diff
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 52c4483c9..77726ee24 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -114,6 +114,11 @@ class InfoExtractor(object):
         """Real extraction process. Redefine in subclasses."""
         pass
 
+    @classmethod
+    def ie_key(cls):
+        """A string for getting the InfoExtractor with get_info_extractor"""
+        return cls.__name__[:-2]
+
     @property
     def IE_NAME(self):
         return type(self).__name__[:-2]
@@ -129,7 +134,7 @@ class InfoExtractor(object):
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             if errnote is None:
                 errnote = u'Unable to download webpage'
-            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
 
     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
         """ Returns a tuple (page content as string, URL handle) """
@@ -140,12 +145,17 @@ class InfoExtractor(object):
 
         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
         content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
         else:
-            encoding = 'utf-8'
-        webpage_bytes = urlh.read()
+            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
+                          webpage_bytes[:1024])
+            if m:
+                encoding = m.group(1).decode('ascii')
+            else:
+                encoding = 'utf-8'
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
```
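Taken together, the incoming changes do two things: the response body is now read up front so that, when the Content-Type header carries no charset, the first 1024 bytes can be scanned for a `<meta charset=...>` declaration before falling back to UTF-8; and a new `ie_key()` classmethod strips the trailing "IE" from the extractor class name. Below is a minimal standalone sketch of that behaviour; the helper name `detect_encoding` and the `ExampleIE` class are hypothetical and not part of the patch.

```python
import re


def detect_encoding(content_type, webpage_bytes):
    """Hypothetical standalone helper mirroring the charset logic above."""
    # 1. Prefer an explicit charset in the HTTP Content-Type header.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        return m.group(1)
    # 2. Otherwise look for a <meta ... charset=...> tag in the first 1024 bytes.
    m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                  webpage_bytes[:1024])
    if m:
        return m.group(1).decode('ascii')
    # 3. Fall back to UTF-8.
    return 'utf-8'


class InfoExtractor(object):
    # Toy copy of the new classmethod: extractor classes are named <Name>IE,
    # so stripping the trailing "IE" gives the key used with get_info_extractor.
    @classmethod
    def ie_key(cls):
        return cls.__name__[:-2]


class ExampleIE(InfoExtractor):
    pass


# The header has no charset, so the <meta> fallback wins over the UTF-8 default.
print(detect_encoding('text/html',
                      b'<html><head><meta charset="iso-8859-1"></head>'))  # iso-8859-1
print(ExampleIE.ie_key())                                                  # Example
```

Moving `webpage_bytes = urlh.read()` above the charset check is what makes the meta-tag fallback possible, since the sniffing has to inspect the body rather than only the headers.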
