From 56c7366547462ecec0536df58971249a8a870ddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 8 Jul 2013 15:14:27 +0200 Subject: YoutubeIE: reuse instances of InfoExtractors (closes #998) When a IE is added to the list, it's also added to a dictionary. When a IE is requested it first looks in the dictionary and if there's no instance it will create a new one. That way _real_initialize is only called once for each IE, saving time if it needs to login for example. --- youtube_dl/extractor/common.py | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1d98222ce..236c7b12c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -106,6 +106,11 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + @property def IE_NAME(self): return type(self).__name__[:-2] -- cgit v1.2.3 From f143d86ad2fc0633d8e2da598cf21e73ff0f2872 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 28 Aug 2013 13:59:08 +0200 Subject: [sohu] Handle encoding, and fix tests --- youtube_dl/extractor/common.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 77a13aea5..a2986cebe 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -145,12 +145,17 @@ class InfoExtractor(object): urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) else: - encoding = 'utf-8' - webpage_bytes = urlh.read() + m = re.search(br']+charset="?([^"]+)[ /">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + else: + encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() -- cgit v1.2.3 From 0d75ae2ce313c5738b2bdd9602ab3cc15e78810d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 29 Aug 2013 11:35:15 +0200 Subject: Fix detection of the webpage charset if it's declared using ' instead of " Like in "" --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor/common.py') diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a2986cebe..77726ee24 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -150,7 +150,7 @@ class InfoExtractor(object): if m: encoding = m.group(1) else: - m = re.search(br']+charset="?([^"]+)[ /">]', + m = re.search(br']+charset=[\'"]?([^\'")]+)[ /\'">]', webpage_bytes[:1024]) if m: encoding = m.group(1).decode('ascii') -- cgit v1.2.3