diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 106 | 
1 file changed, 74 insertions, 32 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0843d89af..bfd432160 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -45,6 +45,7 @@ from ..utils import (      unescapeHTML,      unified_strdate,      url_basename, +    xpath_element,      xpath_text,      xpath_with_ns,      determine_protocol, @@ -987,7 +988,7 @@ class InfoExtractor(object):      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,                               transform_source=lambda s: fix_xml_ampersands(s).strip(), -                             fatal=True): +                             fatal=True, m3u8_id=None):          manifest = self._download_xml(              manifest_url, video_id, 'Downloading f4m manifest',              'Unable to download f4m manifest', @@ -1001,11 +1002,11 @@ class InfoExtractor(object):          return self._parse_f4m_formats(              manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, -            transform_source=transform_source, fatal=fatal) +            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)      def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,                             transform_source=lambda s: fix_xml_ampersands(s).strip(), -                           fatal=True): +                           fatal=True, m3u8_id=None):          # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy          akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')          if akamai_pv is not None and ';' in akamai_pv.text: @@ -1029,9 +1030,26 @@ class InfoExtractor(object):              'base URL', default=None)          if base_url:              base_url = base_url.strip() + +        bootstrap_info = xpath_element( +            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', 
'{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], +            'bootstrap info', default=None) +          for i, media_el in enumerate(media_nodes): -            if manifest_version == '2.0': -                media_url = media_el.attrib.get('href') or media_el.attrib.get('url') +            tbr = int_or_none(media_el.attrib.get('bitrate')) +            width = int_or_none(media_el.attrib.get('width')) +            height = int_or_none(media_el.attrib.get('height')) +            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) +            # If <bootstrapInfo> is present, the specified f4m is a +            # stream-level manifest, and only set-level manifests may refer to +            # external resources.  See section 11.4 and section 4 of F4M spec +            if bootstrap_info is None: +                media_url = None +                # @href is introduced in 2.0, see section 11.6 of F4M spec +                if manifest_version == '2.0': +                    media_url = media_el.attrib.get('href') +                if media_url is None: +                    media_url = media_el.attrib.get('url')                  if not media_url:                      continue                  manifest_url = ( @@ -1041,29 +1059,43 @@ class InfoExtractor(object):                  # since bitrates in parent manifest (this one) and media_url manifest                  # may differ leading to inability to resolve the format by requested                  # bitrate in f4m downloader -                if determine_ext(manifest_url) == 'f4m': -                    formats.extend(self._extract_f4m_formats( +                ext = determine_ext(manifest_url) +                if ext == 'f4m': +                    f4m_formats = self._extract_f4m_formats(                          manifest_url, video_id, preference=preference, f4m_id=f4m_id, -                        transform_source=transform_source, fatal=fatal)) +                        
transform_source=transform_source, fatal=fatal) +                    # Sometimes stream-level manifest contains single media entry that +                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). +                    # At the same time parent's media entry in set-level manifest may +                    # contain it. We will copy it from parent in such cases. +                    if len(f4m_formats) == 1: +                        f = f4m_formats[0] +                        f.update({ +                            'tbr': f.get('tbr') or tbr, +                            'width': f.get('width') or width, +                            'height': f.get('height') or height, +                            'format_id': f.get('format_id') if not tbr else format_id, +                        }) +                    formats.extend(f4m_formats) +                    continue +                elif ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        manifest_url, video_id, 'mp4', preference=preference, +                        m3u8_id=m3u8_id, fatal=fatal))                      continue -            tbr = int_or_none(media_el.attrib.get('bitrate'))              formats.append({ -                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), +                'format_id': format_id,                  'url': manifest_url, -                'ext': 'flv', +                'ext': 'flv' if bootstrap_info is not None else None,                  'tbr': tbr, -                'width': int_or_none(media_el.attrib.get('width')), -                'height': int_or_none(media_el.attrib.get('height')), +                'width': width, +                'height': height,                  'preference': preference,              })          return formats -    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, -                              entry_protocol='m3u8', 
preference=None, -                              m3u8_id=None, note=None, errnote=None, -                              fatal=True, live=False): - -        formats = [{ +    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): +        return {              'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),              'url': m3u8_url,              'ext': ext, @@ -1071,7 +1103,14 @@ class InfoExtractor(object):              'preference': preference - 1 if preference else -1,              'resolution': 'multiple',              'format_note': 'Quality selection URL', -        }] +        } + +    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, +                              entry_protocol='m3u8', preference=None, +                              m3u8_id=None, note=None, errnote=None, +                              fatal=True, live=False): + +        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]          format_url = lambda u: (              u @@ -1138,12 +1177,15 @@ class InfoExtractor(object):                  format_id = []                  if m3u8_id:                      format_id.append(m3u8_id) -                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None +                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None +                # Despite specification does not mention NAME attribute for +                # EXT-X-STREAM-INF it still sometimes may be present +                stream_name = last_info.get('NAME') or last_media_name                  # Bandwidth of live streams may differ over time thus making                  # format_id unpredictable. So it's better to keep provided                  # format_id intact.                  
if not live: -                    format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) +                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))                  f = {                      'format_id': '-'.join(format_id),                      'url': format_url(line.strip()), @@ -1275,21 +1317,21 @@ class InfoExtractor(object):          m3u8_count = 0          srcs = [] -        videos = smil.findall(self._xpath_ns('.//video', namespace)) -        for video in videos: -            src = video.get('src') +        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) +        for medium in media: +            src = medium.get('src')              if not src or src in srcs:                  continue              srcs.append(src) -            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) -            filesize = int_or_none(video.get('size') or video.get('fileSize')) -            width = int_or_none(video.get('width')) -            height = int_or_none(video.get('height')) -            proto = video.get('proto') -            ext = video.get('ext') +            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) +            filesize = int_or_none(medium.get('size') or medium.get('fileSize')) +            width = int_or_none(medium.get('width')) +            height = int_or_none(medium.get('height')) +            proto = medium.get('proto') +            ext = medium.get('ext')              src_ext = determine_ext(src) -            streamer = video.get('streamer') or base +            streamer = medium.get('streamer') or base              if proto == 'rtmp' or streamer.startswith('rtmp'):                  rtmp_count += 1 | 
