diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 142 | 
1 files changed, 99 insertions, 43 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 566ed7a4d..e637b33d5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1551,42 +1551,52 @@ class InfoExtractor(object):          def extract_multisegment_info(element, ms_parent_info):              ms_info = ms_parent_info.copy() + +            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some +            # common attributes and elements.  We will only extract relevant +            # for us. +            def extract_common(source): +                segment_timeline = source.find(_add_ns('SegmentTimeline')) +                if segment_timeline is not None: +                    s_e = segment_timeline.findall(_add_ns('S')) +                    if s_e: +                        ms_info['total_number'] = 0 +                        ms_info['s'] = [] +                        for s in s_e: +                            r = int(s.get('r', 0)) +                            ms_info['total_number'] += 1 + r +                            ms_info['s'].append({ +                                't': int(s.get('t', 0)), +                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) +                                'd': int(s.attrib['d']), +                                'r': r, +                            }) +                start_number = source.get('startNumber') +                if start_number: +                    ms_info['start_number'] = int(start_number) +                timescale = source.get('timescale') +                if timescale: +                    ms_info['timescale'] = int(timescale) +                segment_duration = source.get('duration') +                if segment_duration: +                    ms_info['segment_duration'] = int(segment_duration) + +            def extract_Initialization(source): +                initialization = source.find(_add_ns('Initialization')) +                if initialization is not None: +                    ms_info['initialization_url'] = initialization.attrib['sourceURL'] +              segment_list = element.find(_add_ns('SegmentList'))              if segment_list is not None: +                extract_common(segment_list) +                extract_Initialization(segment_list)                  segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))                  if segment_urls_e:                      ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] -                initialization = segment_list.find(_add_ns('Initialization')) -                if initialization is not None: -                    ms_info['initialization_url'] = initialization.attrib['sourceURL']              else:                  segment_template = element.find(_add_ns('SegmentTemplate'))                  if segment_template is not None: -                    start_number = segment_template.get('startNumber') -                    if start_number: -                        ms_info['start_number'] = int(start_number) -                    segment_timeline = segment_template.find(_add_ns('SegmentTimeline')) -                    if segment_timeline is not None: -                        s_e = segment_timeline.findall(_add_ns('S')) -                        if s_e: -                            ms_info['total_number'] = 0 -                            ms_info['s'] = [] -                            for s in s_e: -                                r = int(s.get('r', 0)) -                                ms_info['total_number'] += 1 + r -                                ms_info['s'].append({ -                                    't': int(s.get('t', 0)), -                                    # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) -                                    'd': int(s.attrib['d']), -                                    'r': r, -                                }) -                    else: -                        timescale = segment_template.get('timescale') -                        if timescale: -                            ms_info['timescale'] = int(timescale) -                        segment_duration = segment_template.get('duration') -                        if segment_duration: -                            ms_info['segment_duration'] = int(segment_duration) +                    extract_common(segment_template)                      media_template = segment_template.get('media')                      if media_template:                          ms_info['media_template'] = media_template @@ -1594,11 +1604,14 @@ class InfoExtractor(object):                      if initialization:                          ms_info['initialization_url'] = initialization                      else: -                        initialization = segment_template.find(_add_ns('Initialization')) -                        if initialization is not None: -                            ms_info['initialization_url'] = initialization.attrib['sourceURL'] +                        extract_Initialization(segment_template)              return ms_info +        def combine_url(base_url, target_url): +            if re.match(r'^https?://', target_url): +                return target_url +            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) +          mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))          formats = []          for period in mpd_doc.findall(_add_ns('Period')): @@ -1655,9 +1668,7 @@ class InfoExtractor(object):                          }                          representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)                          if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: -                            if 'total_number' not in representation_ms_info and 'segment_duration': -                                segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale']) -                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) +                              media_template = representation_ms_info['media_template']                              media_template = media_template.replace('$RepresentationID$', representation_id)                              media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) @@ -1666,7 +1677,11 @@ class InfoExtractor(object):                              # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$                              # can't be used at the same time -                            if '%(Number' in media_template: +                            if '%(Number' in media_template and 's' not in representation_ms_info: +                                segment_duration = None +                                if 'total_number' not in representation_ms_info and 'segment_duration': +                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) +                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))                                  representation_ms_info['segment_urls'] = [                                      media_template % {                                          'Number': segment_number, @@ -1675,28 +1690,65 @@ class InfoExtractor(object):                                      for segment_number in range(                                          representation_ms_info['start_number'],                                          representation_ms_info['total_number'] + representation_ms_info['start_number'])] +                                representation_ms_info['fragments'] = [{ +                                    'url': media_template % { +                                        'Number': segment_number, +                                        'Bandwidth': representation_attrib.get('bandwidth'), +                                    }, +                                    'duration': segment_duration, +                                } for segment_number in range( +                                    representation_ms_info['start_number'], +                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]                              else: +                                # $Number*$ or $Time$ in media template with S list available +                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg +                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411                                  representation_ms_info['segment_urls'] = [] +                                representation_ms_info['fragments'] = []                                  segment_time = 0 +                                segment_d = None +                                segment_number = representation_ms_info['start_number']                                  def add_segment_url(): -                                    representation_ms_info['segment_urls'].append( -                                        media_template % { -                                            'Time': segment_time, -                                            'Bandwidth': representation_attrib.get('bandwidth'), -                                        } -                                    ) +                                    segment_url = media_template % { +                                        'Time': segment_time, +                                        'Bandwidth': representation_attrib.get('bandwidth'), +                                        'Number': segment_number, +                                    } +                                    representation_ms_info['segment_urls'].append(segment_url) +                                    representation_ms_info['fragments'].append({ +                                        'url': segment_url, +                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']), +                                    })                                  for num, s in enumerate(representation_ms_info['s']):                                      segment_time = s.get('t') or segment_time +                                    segment_d = s['d']                                      add_segment_url() +                                    segment_number += 1                                      for r in range(s.get('r', 0)): -                                        segment_time += s['d'] +                                        segment_time += segment_d                                          add_segment_url() -                                    segment_time += s['d'] +                                        segment_number += 1 +                                    segment_time += segment_d +                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: +                            # No media template +                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI +                            # or any YouTube dashsegments video +                            fragments = [] +                            s_num = 0 +                            for segment_url in representation_ms_info['segment_urls']: +                                s = representation_ms_info['s'][s_num] +                                for r in range(s.get('r', 0) + 1): +                                    fragments.append({ +                                        'url': segment_url, +                                        'duration': float_or_none(s['d'], representation_ms_info['timescale']), +                                    }) +                            representation_ms_info['fragments'] = fragments                          if 'segment_urls' in representation_ms_info:                              f.update({                                  'segment_urls': representation_ms_info['segment_urls'], +                                'fragments': [],                                  'protocol': 'http_dash_segments',                              })                              if 'initialization_url' in representation_ms_info: @@ -1706,6 +1758,10 @@ class InfoExtractor(object):                                  })                                  if not f.get('url'):                                      f['url'] = initialization_url +                                f['fragments'].append({'url': initialization_url}) +                            f['fragments'].extend(representation_ms_info['fragments']) +                            for fragment in f['fragments']: +                                fragment['url'] = combine_url(base_url, fragment['url'])                          try:                              existing_format = next(                                  fo for fo in formats  | 
