diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 326 | 
1 files changed, 199 insertions, 127 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d33557135..ed55d3e07 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2,6 +2,7 @@  from __future__ import unicode_literals  import base64 +import collections  import datetime  import functools  import hashlib @@ -58,6 +59,7 @@ from ..utils import (      GeoRestrictedError,      GeoUtils,      int_or_none, +    join_nonempty,      js_to_json,      JSON_LD_RE,      mimetype2ext, @@ -74,6 +76,7 @@ from ..utils import (      str_or_none,      str_to_int,      strip_or_none, +    T,      traverse_obj,      try_get,      unescapeHTML, @@ -1751,6 +1754,12 @@ class InfoExtractor(object):              'format_note': 'Quality selection URL',          } +    def _report_ignoring_subs(self, name): +        self.report_warning(bug_reports_message( +            'Ignoring subtitle tracks found in the {0} manifest; ' +            'if any subtitle tracks are missing,'.format(name) +        ), only_once=True) +      def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,                                entry_protocol='m3u8', preference=None,                                m3u8_id=None, note=None, errnote=None, @@ -2191,23 +2200,46 @@ class InfoExtractor(object):              })          return entries -    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): +    def _extract_mpd_formats(self, *args, **kwargs): +        fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs) +        if subs: +            self._report_ignoring_subs('DASH') +        return fmts + +    def _extract_mpd_formats_and_subtitles( +            self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, +            fatal=True, data=None, headers=None, query=None): + +        # TODO: or not? param not yet implemented +        if self.get_param('ignore_no_formats_error'): +            fatal = False +          res = self._download_xml_handle(              mpd_url, video_id, -            note=note or 'Downloading MPD manifest', -            errnote=errnote or 'Failed to download MPD manifest', -            fatal=fatal, data=data, headers=headers, query=query) +            note='Downloading MPD manifest' if note is None else note, +            errnote='Failed to download MPD manifest' if errnote is None else errnote, +            fatal=fatal, data=data, headers=headers or {}, query=query or {})          if res is False: -            return [] +            return [], {}          mpd_doc, urlh = res          if mpd_doc is None: -            return [] -        mpd_base_url = base_url(urlh.geturl()) +            return [], {} + +        # We could have been redirected to a new url when we retrieved our mpd file. +        mpd_url = urlh.geturl() +        mpd_base_url = base_url(mpd_url) -        return self._parse_mpd_formats( +        return self._parse_mpd_formats_and_subtitles(              mpd_doc, mpd_id, mpd_base_url, mpd_url) -    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None): +    def _parse_mpd_formats(self, *args, **kwargs): +        fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs) +        if subs: +            self._report_ignoring_subs('DASH') +        return fmts + +    def _parse_mpd_formats_and_subtitles( +            self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):          """          Parse formats from MPD manifest.          References: @@ -2215,8 +2247,10 @@ class InfoExtractor(object):              http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip           2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP          """ -        if mpd_doc.get('type') == 'dynamic': -            return [] +        # TODO: param not yet implemented: default like previous yt-dl logic +        if not self.get_param('dynamic_mpd', False): +            if mpd_doc.get('type') == 'dynamic': +                return [], {}          namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) @@ -2228,6 +2262,7 @@ class InfoExtractor(object):          def extract_multisegment_info(element, ms_parent_info):              ms_info = ms_parent_info.copy() +            base_url = ms_info.get('base_url')              # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some              # common attributes and elements.  We will only extract relevant @@ -2285,7 +2320,8 @@ class InfoExtractor(object):              return ms_info          mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) -        formats = [] +        formats, subtitles = [], {} +        stream_numbers = collections.defaultdict(int)          for period in mpd_doc.findall(_add_ns('Period')):              period_duration = parse_duration(period.get('duration')) or mpd_duration              period_ms_info = extract_multisegment_info(period, { @@ -2295,7 +2331,7 @@ class InfoExtractor(object):              for adaptation_set in period.findall(_add_ns('AdaptationSet')):                  if is_drm_protected(adaptation_set):                      continue -                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) +                adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)                  for representation in adaptation_set.findall(_add_ns('Representation')):                      if is_drm_protected(representation):                          continue @@ -2303,27 +2339,35 @@ class InfoExtractor(object):                      representation_attrib.update(representation.attrib)                      # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory                      mime_type = representation_attrib['mimeType'] -                    content_type = mime_type.split('/')[0] -                    if content_type == 'text': -                        # TODO implement WebVTT downloading -                        pass -                    elif content_type in ('video', 'audio'): -                        base_url = '' -                        for element in (representation, adaptation_set, period, mpd_doc): -                            base_url_e = element.find(_add_ns('BaseURL')) -                            if base_url_e is not None: -                                base_url = base_url_e.text + base_url -                                if re.match(r'^https?://', base_url): -                                    break -                        if mpd_base_url and not re.match(r'^https?://', base_url): -                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'): -                                mpd_base_url += '/' -                            base_url = mpd_base_url + base_url -                        representation_id = representation_attrib.get('id') -                        lang = representation_attrib.get('lang') -                        url_el = representation.find(_add_ns('BaseURL')) -                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) -                        bandwidth = int_or_none(representation_attrib.get('bandwidth')) +                    content_type = representation_attrib.get('contentType') or mime_type.split('/')[0] +                    codec_str = representation_attrib.get('codecs', '') +                    # Some kind of binary subtitle found in some youtube livestreams +                    if mime_type == 'application/x-rawcc': +                        codecs = {'scodec': codec_str} +                    else: +                        codecs = parse_codecs(codec_str) +                    if content_type not in ('video', 'audio', 'text'): +                        if mime_type == 'image/jpeg': +                            content_type = mime_type +                        elif codecs.get('vcodec', 'none') != 'none': +                            content_type = 'video' +                        elif codecs.get('acodec', 'none') != 'none': +                            content_type = 'audio' +                        elif codecs.get('scodec', 'none') != 'none': +                            content_type = 'text' +                        elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): +                            content_type = 'text' +                        else: +                            self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) +                            continue + +                    representation_id = representation_attrib.get('id') +                    lang = representation_attrib.get('lang') +                    url_el = representation.find(_add_ns('BaseURL')) +                    filesize = int_or_none(url_el.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) +                    bandwidth = int_or_none(representation_attrib.get('bandwidth')) +                    format_id = join_nonempty(representation_id or content_type, mpd_id) +                    if content_type in ('video', 'audio'):                          f = {                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,                              'manifest_url': mpd_url, @@ -2338,104 +2382,125 @@ class InfoExtractor(object):                              'filesize': filesize,                              'container': mimetype2ext(mime_type) + '_dash',                          } -                        f.update(parse_codecs(representation_attrib.get('codecs'))) -                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) - -                        def prepare_template(template_name, identifiers): -                            tmpl = representation_ms_info[template_name] -                            # First of, % characters outside $...$ templates -                            # must be escaped by doubling for proper processing -                            # by % operator string formatting used further (see -                            # https://github.com/ytdl-org/youtube-dl/issues/16867). -                            t = '' -                            in_template = False -                            for c in tmpl: +                        f.update(codecs) +                    elif content_type == 'text': +                        f = { +                            'ext': mimetype2ext(mime_type), +                            'manifest_url': mpd_url, +                            'filesize': filesize, +                        } +                    elif content_type == 'image/jpeg': +                        # See test case in VikiIE +                        # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1 +                        f = { +                            'format_id': format_id, +                            'ext': 'mhtml', +                            'manifest_url': mpd_url, +                            'format_note': 'DASH storyboards (jpeg)', +                            'acodec': 'none', +                            'vcodec': 'none', +                        } +                    if is_drm_protected(adaptation_set) or is_drm_protected(representation): +                        f['has_drm'] = True +                    representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info) + +                    def prepare_template(template_name, identifiers): +                        tmpl = representation_ms_info[template_name] +                        # First of, % characters outside $...$ templates +                        # must be escaped by doubling for proper processing +                        # by % operator string formatting used further (see +                        # https://github.com/ytdl-org/youtube-dl/issues/16867). +                        t = '' +                        in_template = False +                        for c in tmpl: +                            t += c +                            if c == '$': +                                in_template = not in_template +                            elif c == '%' and not in_template:                                  t += c -                                if c == '$': -                                    in_template = not in_template -                                elif c == '%' and not in_template: -                                    t += c -                            # Next, $...$ templates are translated to their -                            # %(...) counterparts to be used with % operator -                            t = t.replace('$RepresentationID$', representation_id) -                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) -                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) -                            t.replace('$$', '$') -                            return t - -                        # @initialization is a regular template like @media one -                        # so it should be handled just the same way (see -                        # https://github.com/ytdl-org/youtube-dl/issues/11605) -                        if 'initialization' in representation_ms_info: -                            initialization_template = prepare_template( -                                'initialization', -                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and -                                # $Time$ shall not be included for @initialization thus -                                # only $Bandwidth$ remains -                                ('Bandwidth', )) -                            representation_ms_info['initialization_url'] = initialization_template % { -                                'Bandwidth': bandwidth, -                            } - -                        def location_key(location): -                            return 'url' if re.match(r'^https?://', location) else 'path' - -                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: - -                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) -                            media_location_key = location_key(media_template) - -                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ -                            # can't be used at the same time -                            if '%(Number' in media_template and 's' not in representation_ms_info: -                                segment_duration = None -                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: -                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) -                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) -                                representation_ms_info['fragments'] = [{ -                                    media_location_key: media_template % { -                                        'Number': segment_number, -                                        'Bandwidth': bandwidth, -                                    }, -                                    'duration': segment_duration, -                                } for segment_number in range( -                                    representation_ms_info['start_number'], -                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])] -                            else: -                                # $Number*$ or $Time$ in media template with S list available -                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg -                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 -                                representation_ms_info['fragments'] = [] -                                segment_time = 0 -                                segment_d = None -                                segment_number = representation_ms_info['start_number'] - -                                def add_segment_url(): -                                    segment_url = media_template % { -                                        'Time': segment_time, -                                        'Bandwidth': bandwidth, -                                        'Number': segment_number, -                                    } -                                    representation_ms_info['fragments'].append({ -                                        media_location_key: segment_url, -                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']), -                                    }) +                        # Next, $...$ templates are translated to their +                        # %(...) counterparts to be used with % operator +                        t = t.replace('$RepresentationID$', representation_id) +                        t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) +                        t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) +                        t.replace('$$', '$') +                        return t + +                    # @initialization is a regular template like @media one +                    # so it should be handled just the same way (see +                    # https://github.com/ytdl-org/youtube-dl/issues/11605) +                    if 'initialization' in representation_ms_info: +                        initialization_template = prepare_template( +                            'initialization', +                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and +                            # $Time$ shall not be included for @initialization thus +                            # only $Bandwidth$ remains +                            ('Bandwidth', )) +                        representation_ms_info['initialization_url'] = initialization_template % { +                            'Bandwidth': bandwidth, +                        } -                                for num, s in enumerate(representation_ms_info['s']): -                                    segment_time = s.get('t') or segment_time -                                    segment_d = s['d'] +                    def location_key(location): +                        return 'url' if re.match(r'^https?://', location) else 'path' + +                    if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: + +                        media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) +                        media_location_key = location_key(media_template) + +                        # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ +                        # can't be used at the same time +                        if '%(Number' in media_template and 's' not in representation_ms_info: +                            segment_duration = None +                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: +                                segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) +                                representation_ms_info['total_number'] = int(math.ceil( +                                    float_or_none(period_duration, segment_duration, default=0))) +                            representation_ms_info['fragments'] = [{ +                                media_location_key: media_template % { +                                    'Number': segment_number, +                                    'Bandwidth': bandwidth, +                                }, +                                'duration': segment_duration, +                            } for segment_number in range( +                                representation_ms_info['start_number'], +                                representation_ms_info['total_number'] + representation_ms_info['start_number'])] +                        else: +                            # $Number*$ or $Time$ in media template with S list available +                            # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg +                            # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 +                            representation_ms_info['fragments'] = [] +                            segment_time = 0 +                            segment_d = None +                            segment_number = representation_ms_info['start_number'] + +                            def add_segment_url(): +                                segment_url = media_template % { +                                    'Time': segment_time, +                                    'Bandwidth': bandwidth, +                                    'Number': segment_number, +                                } +                                representation_ms_info['fragments'].append({ +                                    media_location_key: segment_url, +                                    'duration': float_or_none(segment_d, representation_ms_info['timescale']), +                                }) + +                            for num, s in enumerate(representation_ms_info['s']): +                                segment_time = s.get('t') or segment_time +                                segment_d = s['d'] +                                add_segment_url() +                                segment_number += 1 +                                for r in range(s.get('r', 0)): +                                    segment_time += segment_d                                      add_segment_url()                                      segment_number += 1 -                                    for r in range(s.get('r', 0)): -                                        segment_time += segment_d -                                        add_segment_url() -                                        segment_number += 1 -                                    segment_time += segment_d -                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: +                                segment_time += segment_d +                    elif 'segment_urls' in representation_ms_info: +                        fragments = [] +                        if 's' in representation_ms_info:                              # No media template                              # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI                              # or any YouTube dashsegments video -                            fragments = []                              segment_index = 0                              timescale = representation_ms_info['timescale']                              for s in representation_ms_info['s']: @@ -2487,8 +2552,15 @@ class InfoExtractor(object):                              f['url'] = base_url                          formats.append(f)                      else: -                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) -        return formats +                        # Assuming direct URL to unfragmented media. +                        f['url'] = representation_ms_info['base_url'] +                    if content_type in ('video', 'audio', 'image/jpeg'): +                        f['manifest_stream_number'] = stream_numbers[f['url']] +                        stream_numbers[f['url']] += 1 +                        formats.append(f) +                    elif content_type == 'text': +                        subtitles.setdefault(lang or 'und', []).append(f) +        return formats, subtitles      def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):          res = self._download_xml_handle( | 
