Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 127 | 
1 file changed, 86 insertions, 41 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 8ed97f8dd..b9014fc23 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -22,17 +22,20 @@ from ..compat import (
     compat_str,
 )
 from ..utils import (
+    NO_DEFAULT,
     age_restricted,
+    bug_reports_message,
     clean_html,
     compiled_regex_type,
+    determine_ext,
     ExtractorError,
+    fix_xml_ampersands,
     float_or_none,
     int_or_none,
     RegexNotFoundError,
     sanitize_filename,
     unescapeHTML,
 )
-_NO_DEFAULT = object()
 
 
 class InfoExtractor(object):
@@ -46,7 +49,7 @@ class InfoExtractor(object):
     information possibly downloading the video to the file system, among
     other possible outcomes.
 
-    The type field determines the the type of the result.
+    The type field determines the type of the result.
 
     By far the most common value (and the default if _type is missing) is
     "video", which indicates a single video.
@@ -110,11 +113,8 @@ class InfoExtractor(object):
                                   (quality takes higher priority)
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
-                    * http_method  HTTP method to use for the download.
                     * http_headers  A dictionary of additional HTTP headers
                                  to add to the request.
-                    * http_post_data  Additional data to send with a POST
-                                 request.
                     * stretched_ratio  If given and not 1, indicates that the
                                  video's pixels are not square.
                                  width : height ratio as float.
@@ -324,7 +324,7 @@ class InfoExtractor(object):
                 self._downloader.report_warning(errmsg)
                 return False
 
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
         """ Returns a tuple (page content as string, URL handle) """
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
@@ -334,14 +334,11 @@
         if urlh is False:
             assert not fatal
             return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
         return (content, urlh)
 
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
-        content_type = urlh.headers.get('Content-Type', '')
-        webpage_bytes = urlh.read()
-        if prefix is not None:
-            webpage_bytes = prefix + webpage_bytes
+    @staticmethod
+    def _guess_encoding_from_content(content_type, webpage_bytes):
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
@@ -354,6 +351,16 @@
                 encoding = 'utf-16'
             else:
                 encoding = 'utf-8'
+
+        return encoding
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+        content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
+        if not encoding:
+            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
@@ -410,13 +417,13 @@
 
         return content
 
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
         """ Returns the data of the page as a string """
         success = False
         try_count = 0
         while success is False:
             try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                 success = True
             except compat_http_client.IncompleteRead as e:
                 try_count += 1
@@ -431,10 +438,10 @@
 
     def _download_xml(self, url_or_request, video_id,
                       note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True):
+                      transform_source=None, fatal=True, encoding=None):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
         if xml_string is False:
             return xml_string
         if transform_source:
@@ -445,9 +452,10 @@
                        note='Downloading JSON metadata',
                        errnote='Unable to download JSON metadata',
                        transform_source=None,
-                       fatal=True):
+                       fatal=True, encoding=None):
         json_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding)
         if (not fatal) and json_string is False:
             return None
         return self._parse_json(
@@ -517,7 +525,7 @@ class InfoExtractor(object):
             video_info['description'] = playlist_description
         return video_info
 
-    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Perform a regex search on the given string, using a single or a list of
         patterns returning the first matching group.
@@ -543,16 +551,15 @@
                 return next(g for g in mobj.groups() if g is not None)
             else:
                 return mobj.group(group)
-        elif default is not _NO_DEFAULT:
+        elif default is not NO_DEFAULT:
             return default
         elif fatal:
             raise RegexNotFoundError('Unable to extract %s' % _name)
         else:
-            self._downloader.report_warning('unable to extract %s; '
-                                            'please report this issue on http://yt-dl.org/bug' % _name)
+            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
 
-    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Like _search_regex, but strips HTML tags and unescapes entities.
         """
@@ -564,7 +571,7 @@
 
     def _get_login_info(self):
         """
-        Get the the login info as (username, password)
+        Get the login info as (username, password)
         It will look in the netrc file using the _NETRC_MACHINE value
         If there's no info available, return (None, None)
         """
@@ -700,7 +707,26 @@ class InfoExtractor(object):
         return self._html_search_meta('twitter:player', html,
                                       'twitter card player')
 
-    def _sort_formats(self, formats):
+    @staticmethod
+    def _hidden_inputs(html):
+        return dict([
+            (input.group('name'), input.group('value')) for input in re.finditer(
+                r'''(?x)
+                    <input\s+
+                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
+                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
+                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
+                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
+                ''', html)
+        ])
+
+    def _form_hidden_inputs(self, form_id, html):
+        form = self._search_regex(
+            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            html, '%s form' % form_id, group='form')
+        return self._hidden_inputs(form)
+
+    def _sort_formats(self, formats, field_preference=None):
         if not formats:
             raise ExtractorError('No video formats found')
 
@@ -710,6 +736,9 @@
             if not f.get('ext') and 'url' in f:
                 f['ext'] = determine_ext(f['url'])
 
+            if isinstance(field_preference, (list, tuple)):
+                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+
             preference = f.get('preference')
             if preference is None:
                 proto = f.get('protocol')
@@ -756,7 +785,7 @@
                 f.get('fps') if f.get('fps') is not None else -1,
                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                 f.get('source_preference') if f.get('source_preference') is not None else -1,
-                f.get('format_id'),
+                f.get('format_id') if f.get('format_id') is not None else '',
             )
 
         formats.sort(key=_formats_key)
@@ -778,8 +807,8 @@
             return True
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError):
-                self.report_warning(
-                    '%s URL is invalid, skipping' % item, video_id)
+                self.to_screen(
+                    '%s: %s URL is invalid, skipping' % (video_id, item))
                 return False
             raise
 
@@ -807,10 +836,14 @@
         self.to_screen(msg)
         time.sleep(timeout)
 
-    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
-            'Unable to download f4m manifest')
+            'Unable to download f4m manifest',
+            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+            transform_source=transform_source)
 
         formats = []
         manifest_version = '1.0'
@@ -820,8 +853,19 @@
             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
         for i, media_el in enumerate(media_nodes):
             if manifest_version == '2.0':
-                manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' +
-                                (media_el.attrib.get('href') or media_el.attrib.get('url')))
+                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+                if not media_url:
+                    continue
+                manifest_url = (
+                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
+                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                # If media_url is itself a f4m manifest do the recursive extraction
+                # since bitrates in parent manifest (this one) and media_url manifest
+                # may differ leading to inability to resolve the format by requested
+                # bitrate in f4m downloader
+                if determine_ext(manifest_url) == 'f4m':
+                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    continue
             tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
                 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
@@ -838,7 +882,8 @@
 
     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                               entry_protocol='m3u8', preference=None,
-                              m3u8_id=None):
+                              m3u8_id=None, note=None, errnote=None,
+                              fatal=True):
 
         formats = [{
             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
@@ -857,8 +902,11 @@
 
         m3u8_doc = self._download_webpage(
             m3u8_url, video_id,
-            note='Downloading m3u8 information',
-            errnote='Failed to download m3u8 information')
+            note=note or 'Downloading m3u8 information',
+            errnote=errnote or 'Failed to download m3u8 information',
+            fatal=fatal)
+        if m3u8_doc is False:
+            return m3u8_doc
         last_info = None
         last_media = None
         kv_rex = re.compile(
@@ -888,7 +936,7 @@
                 format_id = []
                 if m3u8_id:
                     format_id.append(m3u8_id)
-                last_media_name = last_media.get('NAME') if last_media else None
+                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                 format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                 f = {
                     'format_id': '-'.join(format_id),
@@ -948,7 +996,7 @@
     def _parse_smil_video(self, video, video_id, base, rtmp_count):
         src = video.get('src')
         if not src:
-            return ([], rtmp_count)
+            return [], rtmp_count
         bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
         width = int_or_none(video.get('width'))
         height = int_or_none(video.get('height'))
@@ -961,7 +1009,7 @@
                     proto = 'http'
         ext = video.get('ext')
         if proto == 'm3u8':
-            return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count)
+            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
         elif proto == 'rtmp':
             rtmp_count += 1
             streamer = video.get('streamer') or base
@@ -1064,9 +1112,6 @@
     def _get_automatic_captions(self, *args, **kwargs):
         raise NotImplementedError("This method must be implemented by subclasses")
 
-    def _subtitles_timecode(self, seconds):
-        return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
-
 
 class SearchInfoExtractor(InfoExtractor):
     """
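
The new _hidden_inputs/_form_hidden_inputs helpers collect the name/value pairs of hidden <input> fields, which extractors typically need when replaying a login or confirmation form. A minimal usage sketch follows; the extractor name, URL, form id and field names are hypothetical and not part of this patch.

from .common import InfoExtractor
from ..compat import compat_urllib_request
from ..utils import urlencode_postdata


class ExampleLoginIE(InfoExtractor):
    # Hypothetical extractor; the URL, form id and field names are illustrative only.
    _LOGIN_URL = 'https://example.com/login'

    def _login(self, username, password):
        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')
        # Grab the hidden <input> name/value pairs of <form id="login-form">
        # (CSRF tokens and the like) so they can be echoed back on submit.
        form = self._form_hidden_inputs('login-form', login_page)
        form.update({
            'username': username,
            'password': password,
        })
        request = compat_urllib_request.Request(
            self._LOGIN_URL, urlencode_postdata(form))
        self._download_webpage(request, None, 'Logging in')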
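
The encoding argument now threaded through _download_webpage, _download_webpage_handle, _webpage_read_content, _download_xml and _download_json lets an extractor override the charset that _guess_encoding_from_content would otherwise derive from the Content-Type header or the page bytes. A sketch of how an extractor might force a charset, assuming an invented site, URL pattern and regexes:

from .common import InfoExtractor


class ExampleLegacyCharsetIE(InfoExtractor):
    # Hypothetical extractor; the site, charset and regexes are illustrative only.
    _VALID_URL = r'https?://(?:www\.)?example\.com/video/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        # The page declares the wrong charset, so bypass the
        # _guess_encoding_from_content heuristics and decode as windows-1251.
        webpage = self._download_webpage(
            url, video_id, encoding='windows-1251')
        title = self._html_search_regex(
            r'<h1[^>]*>([^<]+)</h1>', webpage, 'title')
        return {
            'id': video_id,
            'title': title,
            'url': self._search_regex(
                r'file\s*:\s*["\']([^"\']+)', webpage, 'video URL'),
        }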
