Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r--  youtube_dl/extractor/common.py  429
1 file changed, 336 insertions, 93 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index b9014fc23..828f58f12 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,16 +10,18 @@ import re
 import socket
 import sys
 import time
-import xml.etree.ElementTree
 
 from ..compat import (
     compat_cookiejar,
-    compat_HTTPError,
+    compat_cookies,
+    compat_getpass,
     compat_http_client,
     compat_urllib_error,
+    compat_urllib_parse,
     compat_urllib_parse_urlparse,
     compat_urlparse,
     compat_str,
+    compat_etree_fromstring,
 )
 from ..utils import (
     NO_DEFAULT,
@@ -28,13 +30,19 @@ from ..utils import (
     clean_html,
     compiled_regex_type,
     determine_ext,
+    error_to_compat_str,
     ExtractorError,
     fix_xml_ampersands,
     float_or_none,
     int_or_none,
     RegexNotFoundError,
     sanitize_filename,
+    sanitized_Request,
     unescapeHTML,
+    unified_strdate,
+    url_basename,
+    xpath_text,
+    xpath_with_ns,
 )
 
@@ -65,7 +73,7 @@ class InfoExtractor(object):
                     Potential fields:
                     * url        Mandatory. The URL of the video file
-                    * ext        Will be calculated from url if missing
+                    * ext        Will be calculated from URL if missing
                     * format     A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height.
@@ -145,6 +153,7 @@ class InfoExtractor(object):
     description:    Full video description.
     uploader:       Full name of the video uploader.
     creator:        The main artist who created the video.
+    release_date:   The date (YYYYMMDD) when the video was released.
     timestamp:      UNIX timestamp of the moment the video became available.
     upload_date:    Video upload date (YYYYMMDD).
                     If not explicitly set, calculated from timestamp.
@@ -155,13 +164,15 @@
                     lower to higher preference, each element is a dictionary
                     with the "ext" entry and one of:
                         * "data": The subtitles file contents
-                        * "url": A url pointing to the subtitles file
+                        * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
     automatic_captions: Like 'subtitles', used by the YoutubeIE for
                     automatically generated captions
-    duration:       Length of the video in seconds, as an integer.
+    duration:       Length of the video in seconds, as an integer or float.
     view_count:     How many users have watched the video on the platform.
     like_count:     Number of positive ratings of the video
     dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
     average_rating: Average rating give by users, the scale used depends on the webpage
     comment_count:  Number of comments on the video
     comments:       A list of comments, each with one or more of the following
@@ -176,13 +187,18 @@
                                      Set to "root" to indicate that this is a
                                      comment to the original video.
     age_limit:      Age restriction for the video, as an integer (years)
-    webpage_url:    The url to the video webpage, if given to youtube-dl it
+    webpage_url:    The URL to the video webpage, if given to youtube-dl it
                     should allow to get the same result again. (It will be set
                     by YoutubeDL if it's missing)
     categories:     A list of categories that the video falls in, for example
                     ["Sports", "Berlin"]
+    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
     is_live:        True, False, or None (=unknown). Whether this video is a
                     live stream that goes on instead of a fixed-length video.
+    start_time:     Time in seconds where the reproduction should start, as
+                    specified in the URL.
+    end_time:       Time in seconds where the reproduction should end, as
+                    specified in the URL.
 
     Unless mentioned otherwise, the fields should be Unicode strings.
@@ -193,8 +209,8 @@
     There must be a key "entries", which is a list, an iterable, or a PagedList
     object, each element of which is a valid dictionary by this specification.
 
-    Additionally, playlists can have "title" and "id" attributes with the same
-    semantics as videos (see above).
+    Additionally, playlists can have "title", "description" and "id" attributes
+    with the same semantics as videos (see above).
 
 
     _type "multi_video" indicates that there are multiple videos that
@@ -295,11 +311,11 @@
     @classmethod
     def ie_key(cls):
         """A string for getting the InfoExtractor with get_info_extractor"""
-        return cls.__name__[:-2]
+        return compat_str(cls.__name__[:-2])
 
     @property
     def IE_NAME(self):
-        return type(self).__name__[:-2]
+        return compat_str(type(self).__name__[:-2])
 
     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns the response handle """
@@ -317,7 +333,8 @@
                 return False
             if errnote is None:
                 errnote = 'Unable to download webpage'
-            errmsg = '%s: %s' % (errnote, compat_str(err))
+
+            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
             if fatal:
                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
             else:
@@ -446,7 +463,7 @@
             return xml_string
         if transform_source:
             xml_string = transform_source(xml_string)
-        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+        return compat_etree_fromstring(xml_string.encode('utf-8'))
 
     def _download_json(self, url_or_request, video_id,
                        note='Downloading JSON metadata',
@@ -498,10 +515,22 @@
         """Report attempt to log in."""
         self.to_screen('Logging in')
 
+    @staticmethod
+    def raise_login_required(msg='This video is only available for registered users'):
+        raise ExtractorError(
+            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
+            expected=True)
+
+    @staticmethod
+    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+        raise ExtractorError(
+            '%s. You might want to use --proxy to workaround.' % msg,
+            expected=True)
+
     # Methods for following #608
     @staticmethod
     def url_result(url, ie=None, video_id=None, video_title=None):
-        """Returns a url that points to a page that should be processed"""
+        """Returns a URL that points to a page that should be processed"""
         # TODO: ie should be the class used for getting the info
         video_info = {'_type': 'url',
                       'url': url,
@@ -595,11 +624,11 @@
                 else:
                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
             except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
+                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
 
         return (username, password)
 
-    def _get_tfa_info(self):
+    def _get_tfa_info(self, note='two-factor verification code'):
         """
         Get the two-factor authentication info
         TODO - asking the user will be required for sms/phone verify
@@ -613,19 +642,26 @@
         if downloader_params.get('twofactor', None) is not None:
             return downloader_params['twofactor']
 
-        return None
+        return compat_getpass('Type %s and press [Return]: ' % note)
 
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
-        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+                       % {'prop': re.escape(prop)})
         template = r'<meta[^>]+?%s[^>]+?%s'
         return [
             template % (property_re, content_re),
             template % (content_re, property_re),
         ]
 
+    @staticmethod
+    def _meta_regex(prop):
+        return r'''(?isx)<meta
+                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
+                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
     def _og_search_property(self, prop, html, name=None, **kargs):
         if name is None:
             name = 'OpenGraph %s' % prop
@@ -635,7 +671,7 @@
         return unescapeHTML(escaped)
 
     def _og_search_thumbnail(self, html, **kargs):
-        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 
     def _og_search_description(self, html, **kargs):
         return self._og_search_property('description', html, fatal=False, **kargs)
@@ -656,9 +692,7 @@
         if display_name is None:
             display_name = name
         return self._html_search_regex(
-            r'''(?isx)<meta
-                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
-                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+            self._meta_regex(name),
             html, display_name, fatal=fatal, group='content', **kwargs)
 
     def _dc_search_uploader(self, html):
@@ -709,20 +743,23 @@
 
     @staticmethod
     def _hidden_inputs(html):
-        return dict([
-            (input.group('name'), input.group('value')) for input in re.finditer(
-                r'''(?x)
-                    <input\s+
-                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
-                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
-                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
-                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
-                ''', html)
-        ])
+        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
+        hidden_inputs = {}
+        for input in re.findall(r'(?i)<input([^>]+)>', html):
+            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
+                continue
+            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            if not name:
+                continue
+            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+            if not value:
+                continue
+            hidden_inputs[name.group('value')] = value.group('value')
+        return hidden_inputs
 
     def _form_hidden_inputs(self, form_id, html):
         form = self._search_regex(
-            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
             html, '%s form' % form_id, group='form')
         return self._hidden_inputs(form)
@@ -806,7 +843,7 @@
             self._request_webpage(url, video_id, 'Checking %s URL' % item)
             return True
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError):
+            if isinstance(e.cause, compat_urllib_error.URLError):
                 self.to_screen(
                     '%s: %s URL is invalid, skipping' % (video_id, item))
                 return False
@@ -837,13 +874,18 @@
         time.sleep(timeout)
 
     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
-                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
             'Unable to download f4m manifest',
             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
-            transform_source=transform_source)
+            transform_source=transform_source,
+            fatal=fatal)
+
+        if manifest is False:
+            return manifest
 
         formats = []
         manifest_version = '1.0'
@@ -851,6 +893,11 @@
         if not media_nodes:
             manifest_version = '2.0'
             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        base_url = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+            'base URL', default=None)
+        if base_url:
+            base_url = base_url.strip()
         for i, media_el in enumerate(media_nodes):
             if manifest_version == '2.0':
                 media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -858,13 +905,16 @@
                     continue
                 manifest_url = (
                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                 # If media_url is itself a f4m manifest do the recursive extraction
                 # since bitrates in parent manifest (this one) and media_url manifest
                 # may differ leading to inability to resolve the format by requested
                 # bitrate in f4m downloader
                 if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    f4m_formats = self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal)
+                    if f4m_formats:
+                        formats.extend(f4m_formats)
                     continue
             tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
@@ -900,13 +950,15 @@
             if re.match(r'^https?://', u)
             else compat_urlparse.urljoin(m3u8_url, u))
 
-        m3u8_doc = self._download_webpage(
+        res = self._download_webpage_handle(
             m3u8_url, video_id,
             note=note or 'Downloading m3u8 information',
             errnote=errnote or 'Failed to download m3u8 information',
             fatal=fatal)
-        if m3u8_doc is False:
-            return m3u8_doc
+        if res is False:
+            return res
+        m3u8_doc, urlh = res
+        m3u8_url = urlh.geturl()
         last_info = None
         last_media = None
         kv_rex = re.compile(
@@ -967,69 +1019,237 @@
         self._sort_formats(formats)
         return formats
 
-    # TODO: improve extraction
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-        smil = self._download_xml(
-            smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+    @staticmethod
+    def _xpath_ns(path, namespace=None):
+        if not namespace:
+            return path
+        out = []
+        for c in path.split('/'):
+            if not c or c == '.':
+                out.append(c)
+            else:
+                out.append('{%s}%s' % (namespace, c))
+        return '/'.join(out)
+
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
         if smil is False:
             assert not fatal
             return []
 
-        base = smil.find('./head/meta').get('base')
+        namespace = self._parse_smil_namespace(smil)
+
+        return self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+        if smil is False:
+            return {}
+        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+    def _download_smil(self, smil_url, video_id, fatal=True):
+        return self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file', fatal=fatal)
+
+    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+        namespace = self._parse_smil_namespace(smil)
+
+        formats = self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+        video_id = os.path.splitext(url_basename(smil_url))[0]
+        title = None
+        description = None
+        upload_date = None
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            name = meta.attrib.get('name')
+            content = meta.attrib.get('content')
+            if not name or not content:
+                continue
+            if not title and name == 'title':
+                title = content
+            elif not description and name in ('description', 'abstract'):
+                description = content
+            elif not upload_date and name == 'date':
+                upload_date = unified_strdate(content)
+
+        thumbnails = [{
+            'id': image.get('type'),
+            'url': image.get('src'),
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
+
+        return {
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _parse_smil_namespace(self, smil):
+        return self._search_regex(
+            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+        base = smil_url
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            b = meta.get('base') or meta.get('httpBase')
+            if b:
+                base = b
+                break
 
         formats = []
         rtmp_count = 0
-        if smil.findall('./body/seq/video'):
-            video = smil.findall('./body/seq/video')[0]
-            fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-            formats.extend(fmts)
-        else:
-            for video in smil.findall('./body/switch/video'):
-                fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-                formats.extend(fmts)
+        http_count = 0
+
+        videos = smil.findall(self._xpath_ns('.//video', namespace))
+        for video in videos:
+            src = video.get('src')
+            if not src:
+                continue
+
+            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            filesize = int_or_none(video.get('size') or video.get('fileSize'))
+            width = int_or_none(video.get('width'))
+            height = int_or_none(video.get('height'))
+            proto = video.get('proto')
+            ext = video.get('ext')
+            src_ext = determine_ext(src)
+            streamer = video.get('streamer') or base
+
+            if proto == 'rtmp' or streamer.startswith('rtmp'):
+                rtmp_count += 1
+                formats.append({
+                    'url': streamer,
+                    'play_path': src,
+                    'ext': 'flv',
+                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                if transform_rtmp_url:
+                    streamer, src = transform_rtmp_url(streamer, src)
+                    formats[-1].update({
+                        'url': streamer,
+                        'play_path': src,
+                    })
+                continue
+
+            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+            if proto == 'm3u8' or src_ext == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
+                continue
+
+            if src_ext == 'f4m':
+                f4m_url = src_url
+                if not f4m_params:
+                    f4m_params = {
+                        'hdcore': '3.2.0',
+                        'plugin': 'flowplayer-3.2.0.1',
+                    }
+                f4m_url += '&' if '?' in f4m_url else '?'
+                f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
+                continue
+
+            if src_url.startswith('http') and self._is_valid_url(src, video_id):
+                http_count += 1
+                formats.append({
+                    'url': src_url,
+                    'ext': ext or src_ext or 'flv',
+                    'format_id': 'http-%d' % (bitrate or http_count),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                continue
 
         self._sort_formats(formats)
         return formats
 
-    def _parse_smil_video(self, video, video_id, base, rtmp_count):
-        src = video.get('src')
-        if not src:
-            return [], rtmp_count
-        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
-        width = int_or_none(video.get('width'))
-        height = int_or_none(video.get('height'))
-        proto = video.get('proto')
-        if not proto:
-            if base:
-                if base.startswith('rtmp'):
-                    proto = 'rtmp'
-                elif base.startswith('http'):
-                    proto = 'http'
-        ext = video.get('ext')
-        if proto == 'm3u8':
-            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
-        elif proto == 'rtmp':
-            rtmp_count += 1
-            streamer = video.get('streamer') or base
-            return ([{
-                'url': streamer,
-                'play_path': src,
-                'ext': 'flv',
-                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
-        elif proto.startswith('http'):
-            return ([{
-                'url': base + src,
-                'ext': ext or 'flv',
-                'tbr': bitrate,
-                'width': width,
-                'height': height,
-            }], rtmp_count)
+    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        subtitles = {}
+        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+            src = textstream.get('src')
+            if not src:
+                continue
+            ext = textstream.get('ext') or determine_ext(src)
+            if not ext:
+                type_ = textstream.get('type')
+                SUBTITLES_TYPES = {
+                    'text/vtt': 'vtt',
+                    'text/srt': 'srt',
+                    'application/smptett+xml': 'tt',
+                }
+                if type_ in SUBTITLES_TYPES:
+                    ext = SUBTITLES_TYPES[type_]
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
+            subtitles.setdefault(lang, []).append({
+                'url': src,
+                'ext': ext,
+            })
+        return subtitles
+
+    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+        xspf = self._download_xml(
+            playlist_url, playlist_id, 'Downloading xpsf playlist',
+            'Unable to download xspf manifest', fatal=fatal)
+        if xspf is False:
+            return []
+        return self._parse_xspf(xspf, playlist_id)
+
+    def _parse_xspf(self, playlist, playlist_id):
+        NS_MAP = {
+            'xspf': 'http://xspf.org/ns/0/',
+            's1': 'http://static.streamone.nl/player/ns/0',
+        }
+
+        entries = []
+        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+            title = xpath_text(
+                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+            description = xpath_text(
+                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+            thumbnail = xpath_text(
+                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+            duration = float_or_none(
+                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+            formats = [{
+                'url': location.text,
+                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': playlist_id,
+                'title': title,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'formats': formats,
+            })
+        return entries
 
     def _live_title(self, name):
         """ Generate the title for a live video """
@@ -1065,6 +1285,12 @@
             None, '/', True, False, expire_time, '', None, None, None)
         self._downloader.cookiejar.set_cookie(cookie)
 
+    def _get_cookies(self, url):
+        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+        req = sanitized_Request(url)
+        self._downloader.cookiejar.add_cookie_header(req)
+        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
     def get_testcases(self, include_onlymatching=False):
         t = getattr(self, '_TEST', None)
         if t:
@@ -1103,6 +1329,23 @@
     def _get_subtitles(self, *args, **kwargs):
         raise NotImplementedError("This method must be implemented by subclasses")
 
+    @staticmethod
+    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+        """ Merge subtitle items for one language. Items with duplicated URLs
+        will be dropped. """
+        list1_urls = set([item['url'] for item in subtitle_list1])
+        ret = list(subtitle_list1)
+        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+        return ret
+
+    @classmethod
+    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+        """ Merge two subtitle dictionaries, language by language. """
+        ret = dict(subtitle_dict1)
+        for lang in subtitle_dict2:
+            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+        return ret
+
     def extract_automatic_captions(self, *args, **kwargs):
         if (self._downloader.params.get('writeautomaticsub', False) or
                 self._downloader.params.get('listsubtitles')):
@@ -1116,7 +1359,7 @@
 
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
-    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
+    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
     Instances should define _SEARCH_KEY and _MAX_RESULTS.
     """
