diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 117 | 
1 file changed, 80 insertions, 37 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 76b5378e9..317a9a76f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,6 +27,7 @@ from ..compat import (      compat_urllib_parse_urlencode,      compat_urllib_request,      compat_urlparse, +    compat_xml_parse_error,  )  from ..downloader.f4m import remove_encrypted_media  from ..utils import ( @@ -376,7 +377,7 @@ class InfoExtractor(object):              cls._VALID_URL_RE = re.compile(cls._VALID_URL)          m = cls._VALID_URL_RE.match(url)          assert m -        return m.group('id') +        return compat_str(m.group('id'))      @classmethod      def working(cls): @@ -420,7 +421,7 @@ class InfoExtractor(object):              if country_code:                  self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)                  if self._downloader.params.get('verbose', False): -                    self._downloader.to_stdout( +                    self._downloader.to_screen(                          '[debug] Using fake IP %s (%s) as X-Forwarded-For.'                          
% (self._x_forwarded_for_ip, country_code.upper())) @@ -646,15 +647,29 @@ class InfoExtractor(object):      def _download_xml(self, url_or_request, video_id,                        note='Downloading XML', errnote='Unable to download XML', -                      transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): +                      transform_source=None, fatal=True, encoding=None, +                      data=None, headers={}, query={}):          """Return the xml as an xml.etree.ElementTree.Element"""          xml_string = self._download_webpage( -            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) +            url_or_request, video_id, note, errnote, fatal=fatal, +            encoding=encoding, data=data, headers=headers, query=query)          if xml_string is False:              return xml_string +        return self._parse_xml( +            xml_string, video_id, transform_source=transform_source, +            fatal=fatal) + +    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):          if transform_source:              xml_string = transform_source(xml_string) -        return compat_etree_fromstring(xml_string.encode('utf-8')) +        try: +            return compat_etree_fromstring(xml_string.encode('utf-8')) +        except compat_xml_parse_error as ve: +            errmsg = '%s: Failed to parse XML ' % video_id +            if fatal: +                raise ExtractorError(errmsg, cause=ve) +            else: +                self.report_warning(errmsg + str(ve))      def _download_json(self, url_or_request, video_id,                         note='Downloading JSON metadata', @@ -730,12 +745,12 @@ class InfoExtractor(object):              video_info['title'] = video_title          return video_info -    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): -        urlrs = orderedSet( +    
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): +        urls = orderedSet(              self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)              for m in matches)          return self.playlist_result( -            urlrs, playlist_id=video_id, playlist_title=video_title) +            urls, playlist_id=playlist_id, playlist_title=playlist_title)      @staticmethod      def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): @@ -940,7 +955,8 @@ class InfoExtractor(object):      def _family_friendly_search(self, html):          # See http://schema.org/VideoObject -        family_friendly = self._html_search_meta('isFamilyFriendly', html) +        family_friendly = self._html_search_meta( +            'isFamilyFriendly', html, default=None)          if not family_friendly:              return None @@ -1002,17 +1018,17 @@ class InfoExtractor(object):                  item_type = e.get('@type')                  if expected_type is not None and expected_type != item_type:                      return info -                if item_type == 'TVEpisode': +                if item_type in ('TVEpisode', 'Episode'):                      info.update({                          'episode': unescapeHTML(e.get('name')),                          'episode_number': int_or_none(e.get('episodeNumber')),                          'description': unescapeHTML(e.get('description')),                      })                      part_of_season = e.get('partOfSeason') -                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': +                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):                          info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))                      part_of_series = e.get('partOfSeries') or 
e.get('partOfTVSeries') -                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': +                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):                          info['series'] = unescapeHTML(part_of_series.get('name'))                  elif item_type == 'Article':                      info.update({ @@ -1022,10 +1038,10 @@ class InfoExtractor(object):                      })                  elif item_type == 'VideoObject':                      extract_video_object(e) -                elif item_type == 'WebPage': -                    video = e.get('video') -                    if isinstance(video, dict) and video.get('@type') == 'VideoObject': -                        extract_video_object(video) +                    continue +                video = e.get('video') +                if isinstance(video, dict) and video.get('@type') == 'VideoObject': +                    extract_video_object(video)                  break          return dict((k, v) for k, v in info.items() if v is not None) @@ -1785,7 +1801,7 @@ class InfoExtractor(object):                      ms_info['timescale'] = int(timescale)                  segment_duration = source.get('duration')                  if segment_duration: -                    ms_info['segment_duration'] = int(segment_duration) +                    ms_info['segment_duration'] = float(segment_duration)              def extract_Initialization(source):                  initialization = source.find(_add_ns('Initialization')) @@ -1892,9 +1908,13 @@ class InfoExtractor(object):                                  'Bandwidth': bandwidth,                              } +                        def location_key(location): +                            return 'url' if re.match(r'^https?://', location) else 'path' +                          if 'segment_urls' not in representation_ms_info and 'media' in 
representation_ms_info:                              media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) +                            media_location_key = location_key(media_template)                              # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$                              # can't be used at the same time @@ -1904,7 +1924,7 @@ class InfoExtractor(object):                                      segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])                                      representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))                                  representation_ms_info['fragments'] = [{ -                                    'url': media_template % { +                                    media_location_key: media_template % {                                          'Number': segment_number,                                          'Bandwidth': bandwidth,                                      }, @@ -1928,7 +1948,7 @@ class InfoExtractor(object):                                          'Number': segment_number,                                      }                                      representation_ms_info['fragments'].append({ -                                        'url': segment_url, +                                        media_location_key: segment_url,                                          'duration': float_or_none(segment_d, representation_ms_info['timescale']),                                      }) @@ -1952,8 +1972,9 @@ class InfoExtractor(object):                              for s in representation_ms_info['s']:                                  duration = float_or_none(s['d'], timescale)                                  for r in range(s.get('r', 0) + 1): +                                    segment_uri = representation_ms_info['segment_urls'][segment_index]                  
                    fragments.append({ -                                        'url': representation_ms_info['segment_urls'][segment_index], +                                        location_key(segment_uri): segment_uri,                                          'duration': duration,                                      })                                      segment_index += 1 @@ -1962,6 +1983,7 @@ class InfoExtractor(object):                          # No fragments key is present in this case.                          if 'fragments' in representation_ms_info:                              f.update({ +                                'fragment_base_url': base_url,                                  'fragments': [],                                  'protocol': 'http_dash_segments',                              }) @@ -1969,10 +1991,8 @@ class InfoExtractor(object):                                  initialization_url = representation_ms_info['initialization_url']                                  if not f.get('url'):                                      f['url'] = initialization_url -                                f['fragments'].append({'url': initialization_url}) +                                f['fragments'].append({location_key(initialization_url): initialization_url})                              f['fragments'].extend(representation_ms_info['fragments']) -                            for fragment in f['fragments']: -                                fragment['url'] = urljoin(base_url, fragment['url'])                          try:                              existing_format = next(                                  fo for fo in formats @@ -2001,6 +2021,12 @@ class InfoExtractor(object):              compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)      def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): +        """ +        Parse formats from ISM manifest. +        References: +         1. 
[MS-SSTR]: Smooth Streaming Protocol, +            https://msdn.microsoft.com/en-us/library/ff469518.aspx +        """          if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:              return [] @@ -2022,8 +2048,11 @@ class InfoExtractor(object):                      self.report_warning('%s is not a supported codec' % fourcc)                      continue                  tbr = int(track.attrib['Bitrate']) // 1000 -                width = int_or_none(track.get('MaxWidth')) -                height = int_or_none(track.get('MaxHeight')) +                # [1] does not mention Width and Height attributes. However, +                # they're often present while MaxWidth and MaxHeight are +                # missing, so should be used as fallbacks +                width = int_or_none(track.get('MaxWidth') or track.get('Width')) +                height = int_or_none(track.get('MaxHeight') or track.get('Height'))                  sampling_rate = int_or_none(track.get('SamplingRate'))                  track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) @@ -2101,19 +2130,19 @@ class InfoExtractor(object):                  return f              return {} -        def _media_formats(src, cur_media_type): +        def _media_formats(src, cur_media_type, type_info={}):              full_url = absolute_url(src) -            ext = determine_ext(full_url) +            ext = type_info.get('ext') or determine_ext(full_url)              if ext == 'm3u8':                  is_plain_url = False                  formats = self._extract_m3u8_formats(                      full_url, video_id, ext='mp4',                      entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, -                    preference=preference) +                    preference=preference, fatal=False)              elif ext == 'mpd':                  is_plain_url = False                  formats = self._extract_mpd_formats( -                    full_url, 
video_id, mpd_id=mpd_id) +                    full_url, video_id, mpd_id=mpd_id, fatal=False)              else:                  is_plain_url = True                  formats = [{ @@ -2123,15 +2152,18 @@              return is_plain_url, formats          entries = [] +        # amp-video and amp-audio are very similar to their HTML5 counterparts +        # so we will include them right here (see +        # https://www.ampproject.org/docs/reference/components/amp-video)          media_tags = [(media_tag, media_type, '')                        for media_tag, media_type -                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] +                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]          media_tags.extend(re.findall(              # We only allow video|audio followed by a whitespace or '>'.              # Allowing more characters may end up in significant slow down (see              # https://github.com/rg3/youtube-dl/issues/11979, example URL:              # http://www.porntrex.com/maps/videositemap.xml). 
-            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) +            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))          for media_tag, media_type, media_content in media_tags:              media_info = {                  'formats': [], @@ -2149,9 +2181,15 @@ class InfoExtractor(object):                      src = source_attributes.get('src')                      if not src:                          continue -                    is_plain_url, formats = _media_formats(src, media_type) +                    f = parse_content_type(source_attributes.get('type')) +                    is_plain_url, formats = _media_formats(src, media_type, f)                      if is_plain_url: -                        f = parse_content_type(source_attributes.get('type')) +                        # res attribute is not standard but seen several times +                        # in the wild +                        f.update({ +                            'height': int_or_none(source_attributes.get('res')), +                            'format_id': source_attributes.get('label'), +                        })                          f.update(formats[0])                          media_info['formats'].append(f)                      else: @@ -2174,7 +2212,7 @@ class InfoExtractor(object):      def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):          formats = []          hdcore_sign = 'hdcore=3.7.0' -        f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') +        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')          hds_host = hosts.get('hds')          if hds_host:              f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) @@ -2196,8 +2234,9 @@ class InfoExtractor(object):      def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', 
skip_protocols=[]):          url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) -        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') -        http_base_url = 'http' + url_base +        url_base = self._search_regex( +            r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') +        http_base_url = '%s:%s' % ('http', url_base)          formats = []          if 'm3u8' not in skip_protocols:              formats.extend(self._extract_m3u8_formats( @@ -2231,7 +2270,7 @@ class InfoExtractor(object):              for protocol in ('rtmp', 'rtsp'):                  if protocol not in skip_protocols:                      formats.append({ -                        'url': protocol + url_base, +                        'url': '%s:%s' % (protocol, url_base),                          'format_id': protocol,                          'protocol': protocol,                      }) @@ -2289,6 +2328,8 @@ class InfoExtractor(object):              tracks = video_data.get('tracks')              if tracks and isinstance(tracks, list):                  for track in tracks: +                    if not isinstance(track, dict): +                        continue                      if track.get('kind') != 'captions':                          continue                      track_url = urljoin(base_url, track.get('file')) @@ -2318,6 +2359,8 @@ class InfoExtractor(object):          urls = []          formats = []          for source in jwplayer_sources_data: +            if not isinstance(source, dict): +                continue              source_url = self._proto_relative_url(source.get('file'))              if not source_url:                  continue | 
