diff options
| -rw-r--r-- | youtube_dl/extractor/common.py | 34 | ||||
| -rw-r--r-- | youtube_dl/extractor/qqmusic.py | 4 | 
2 files changed, 23 insertions, 15 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8ed97f8dd..28f672e42 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -324,7 +324,7 @@ class InfoExtractor(object):                  self._downloader.report_warning(errmsg)                  return False -    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): +    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):          """ Returns a tuple (page content as string, URL handle) """          # Strip hashes from the URL (#1038)          if isinstance(url_or_request, (compat_str, str)): @@ -334,14 +334,11 @@ class InfoExtractor(object):          if urlh is False:              assert not fatal              return False -        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) +        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)          return (content, urlh) -    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): -        content_type = urlh.headers.get('Content-Type', '') -        webpage_bytes = urlh.read() -        if prefix is not None: -            webpage_bytes = prefix + webpage_bytes +    @staticmethod +    def _guess_encoding_from_content(content_type, webpage_bytes):          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)          if m:              encoding = m.group(1) @@ -354,6 +351,16 @@ class InfoExtractor(object):                  encoding = 'utf-16'              else:                  encoding = 'utf-8' + +        return encoding + +    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): +        content_type = urlh.headers.get('Content-Type', '') +        webpage_bytes = urlh.read() +        if prefix is not None: +            webpage_bytes = prefix + webpage_bytes +        if not encoding: +            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)          if self._downloader.params.get('dump_intermediate_pages', False):              try:                  url = url_or_request.get_full_url() @@ -410,13 +417,13 @@ class InfoExtractor(object):          return content -    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5): +    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):          """ Returns the data of the page as a string """          success = False          try_count = 0          while success is False:              try: -                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) +                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)                  success = True              except compat_http_client.IncompleteRead as e:                  try_count += 1 @@ -431,10 +438,10 @@ class InfoExtractor(object):      def _download_xml(self, url_or_request, video_id,                        note='Downloading XML', errnote='Unable to download XML', -                      transform_source=None, fatal=True): +                      transform_source=None, fatal=True, encoding=None):          """Return the xml as an xml.etree.ElementTree.Element"""          xml_string = self._download_webpage( -            url_or_request, video_id, note, errnote, fatal=fatal) +            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)          if xml_string is False:              return xml_string          if transform_source: @@ -445,9 +452,10 @@ class InfoExtractor(object):                         note='Downloading JSON metadata',                         errnote='Unable to download JSON metadata',                         transform_source=None, -                       fatal=True): +                       fatal=True, encoding=None):          json_string = self._download_webpage( -            url_or_request, video_id, note, errnote, fatal=fatal) +            url_or_request, video_id, note, errnote, fatal=fatal, +            encoding=encoding)          if (not fatal) and json_string is False:              return None          return self._parse_json( diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index e8aacbc3d..174c8e0ae 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -24,7 +24,7 @@ class QQMusicIE(InfoExtractor):              'title': '可惜没如果',              'upload_date': '20141227',              'creator': '林俊杰', -            'description': 'md5:242c97c2847e0495583b7b13764f7106', +            'description': 'md5:4348ff1dd24036906baa7b6f973f8d30',          }      }] @@ -41,7 +41,7 @@ class QQMusicIE(InfoExtractor):          detail_info_page = self._download_webpage(              'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,              mid, note='Download song detail info', -            errnote='Unable to get song detail info') +            errnote='Unable to get song detail info', encoding='gbk')          song_name = self._html_search_regex(              r"songname:\s*'([^']+)'", detail_info_page, 'song name') | 
