diff options
| author | huohuarong <huohuarong@gmail.com> | 2013-08-03 10:29:58 +0800 | 
|---|---|---|
| committer | huohuarong <huohuarong@gmail.com> | 2013-08-03 10:29:58 +0800 | 
| commit | 4ec929dc9b55a2588b4a27e64871c5bfa900bf37 (patch) | |
| tree | bdee9a28442f183a2a1c5c8c6ff561d536033afa | |
| parent | 6624a2b07dafad4de895b4e84f4595214817518d (diff) | |
use ..utils/clean_html()
| -rw-r--r-- | youtube_dl/extractor/sohu.py | 19 | 
1 file changed, 6 insertions, 13 deletions
```diff
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index 830814221..cf0ab5478 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -7,7 +7,7 @@ import logging
 import urllib2
 
 from .common import InfoExtractor
-from ..utils import compat_urllib_request
+from ..utils import compat_urllib_request, clean_html
 
 
 class SohuIE(InfoExtractor):
@@ -22,16 +22,6 @@ class SohuIE(InfoExtractor):
         },
     }
 
-    def _clearn_html(self, string):
-        tags = re.findall(r'<.+?>', string)
-        for t in tags:
-            string = string.replace(t, ' ')
-        for i in range(2):
-            spaces = re.findall(r'\s+', string)
-            for s in spaces:
-                string = string.replace(s, ' ')
-        string = string.strip()
-        return string
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -40,7 +30,7 @@ class SohuIE(InfoExtractor):
         pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
         compiled = re.compile(pattern, re.DOTALL)
         title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
-        title = self._clearn_html(title)
+        title = clean_html(title)
         pattern = re.compile(r'var vid="(\d+)"')
         result = re.search(pattern, webpage)
         if not result:
@@ -93,5 +83,8 @@ class SohuIE(InfoExtractor):
             }
             files_info.append(info)
             time.sleep(1)
-
+        if num_of_parts == 1:
+            info =  files_info[0]
+            info['id'] = video_id
+            return info
         return files_info
```
