diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 114 | 
1 files changed, 78 insertions, 36 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ed55d3e07..76414554a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -183,6 +183,8 @@ class InfoExtractor(object):                                              fragment_base_url                                   * "duration" (optional, int or float)                                   * "filesize" (optional, int) +                                 * "range" (optional, str of the form "start-end" +                                            to use in HTTP Range header)                      * preference Order number of this format. If this field is                                   present and not None, the formats get sorted                                   by this field, regardless of all other values. @@ -2296,15 +2298,27 @@ class InfoExtractor(object):              def extract_Initialization(source):                  initialization = source.find(_add_ns('Initialization'))                  if initialization is not None: -                    ms_info['initialization_url'] = initialization.attrib['sourceURL'] +                    ms_info['initialization_url'] = initialization.get('sourceURL') or base_url +                    initialization_url_range = initialization.get('range') +                    if initialization_url_range: +                        ms_info['initialization_url_range'] = initialization_url_range              segment_list = element.find(_add_ns('SegmentList'))              if segment_list is not None:                  extract_common(segment_list)                  extract_Initialization(segment_list)                  segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) -                if segment_urls_e: -                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] +                segment_urls = traverse_obj(segment_urls_e, ( +                    Ellipsis, T(lambda e: e.attrib), 'media')) +                if segment_urls: +                    ms_info['segment_urls'] = segment_urls +                segment_urls_range = traverse_obj(segment_urls_e, ( +                    Ellipsis, T(lambda e: e.attrib), 'mediaRange', +                    T(lambda r: re.findall(r'^\d+-\d+$', r)), 0)) +                if segment_urls_range: +                    ms_info['segment_urls_range'] = segment_urls_range +                    if not segment_urls: +                        ms_info['segment_urls'] = [base_url for _ in segment_urls_range]              else:                  segment_template = element.find(_add_ns('SegmentTemplate'))                  if segment_template is not None: @@ -2443,6 +2457,11 @@ class InfoExtractor(object):                      def location_key(location):                          return 'url' if re.match(r'^https?://', location) else 'path' +                    def calc_segment_duration(): +                        return float_or_none( +                            representation_ms_info['segment_duration'], +                            representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None +                      if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:                          media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) @@ -2512,45 +2531,68 @@ class InfoExtractor(object):                                          'duration': duration,                                      })                                      segment_index += 1 -                            representation_ms_info['fragments'] = fragments -                        elif 'segment_urls' in representation_ms_info: +                        elif 'segment_urls_range' in representation_ms_info: +                            # Segment URLs with mediaRange +                            # Example: https://kinescope.io/200615537/master.mpd +                            # https://github.com/ytdl-org/youtube-dl/issues/30235 +                            # or any mpd generated with Bento4 `mp4dash --no-split --use-segment-list` +                            segment_duration = calc_segment_duration() +                            for segment_url, segment_url_range in zip( +                                    representation_ms_info['segment_urls'], representation_ms_info['segment_urls_range']): +                                fragments.append({ +                                    location_key(segment_url): segment_url, +                                    'range': segment_url_range, +                                    'duration': segment_duration, +                                }) +                        else:                              # Segment URLs with no SegmentTimeline                              # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091                              # https://github.com/ytdl-org/youtube-dl/pull/14844 -                            fragments = [] -                            segment_duration = float_or_none( -                                representation_ms_info['segment_duration'], -                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None +                            segment_duration = calc_segment_duration()                              for segment_url in representation_ms_info['segment_urls']: -                                fragment = { +                                fragments.append({                                      location_key(segment_url): segment_url, -                                } -                                if segment_duration: -                                    fragment['duration'] = segment_duration -                                fragments.append(fragment) -                            representation_ms_info['fragments'] = fragments -                        # If there is a fragments key available then we correctly recognized fragmented media. -                        # Otherwise we will assume unfragmented media with direct access. Technically, such -                        # assumption is not necessarily correct since we may simply have no support for -                        # some forms of fragmented media renditions yet, but for now we'll use this fallback. -                        if 'fragments' in representation_ms_info: -                            f.update({ -                                # NB: mpd_url may be empty when MPD manifest is parsed from a string -                                'url': mpd_url or base_url, -                                'fragment_base_url': base_url, -                                'fragments': [], -                                'protocol': 'http_dash_segments', +                                    'duration': segment_duration, +                                }) +                        representation_ms_info['fragments'] = fragments + +                    # If there is a fragments key available then we correctly recognized fragmented media. +                    # Otherwise we will assume unfragmented media with direct access. Technically, such +                    # assumption is not necessarily correct since we may simply have no support for +                    # some forms of fragmented media renditions yet, but for now we'll use this fallback. +                    if 'fragments' in representation_ms_info: +                        base_url = representation_ms_info['base_url']  +                        f.update({ +                            # NB: mpd_url may be empty when MPD manifest is parsed from a string +                            'url': mpd_url or base_url, +                            'fragment_base_url': base_url, +                            'fragments': [], +                            'protocol': 'http_dash_segments', +                        }) +                        if 'initialization_url' in representation_ms_info and 'initialization_url_range' in representation_ms_info: +                            # Initialization URL with range (accompanied by Segment URLs with mediaRange above) +                            # https://github.com/ytdl-org/youtube-dl/issues/30235 +                            initialization_url = representation_ms_info['initialization_url'] +                            f['fragments'].append({ +                                location_key(initialization_url): initialization_url, +                                'range': representation_ms_info['initialization_url_range'],                              }) -                            if 'initialization_url' in representation_ms_info: -                                initialization_url = representation_ms_info['initialization_url'] -                                if not f.get('url'): -                                    f['url'] = initialization_url -                                f['fragments'].append({location_key(initialization_url): initialization_url}) -                            f['fragments'].extend(representation_ms_info['fragments']) -                        else: -                            # Assuming direct URL to unfragmented media. -                            f['url'] = base_url -                        formats.append(f) +                        elif 'initialization_url' in representation_ms_info: +                            initialization_url = representation_ms_info['initialization_url'] +                            if not f.get('url'): +                                f['url'] = initialization_url +                            f['fragments'].append({location_key(initialization_url): initialization_url}) +                        elif 'initialization_url_range' in representation_ms_info: +                            # no Initialization URL but range (accompanied by no Segment URLs but mediaRange above) +                            # https://github.com/ytdl-org/youtube-dl/issues/27575 +                            f['fragments'].append({ +                                location_key(base_url): base_url, +                                'range': representation_ms_info['initialization_url_range'], +                            }) +                        f['fragments'].extend(representation_ms_info['fragments']) +                        if not period_duration: +                            period_duration = sum(traverse_obj(representation_ms_info, ( +                                'fragments', Ellipsis, 'duration', T(float_or_none))))                      else:                          # Assuming direct URL to unfragmented media.                          f['url'] = representation_ms_info['base_url'] | 
