diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 135 | 
1 files changed, 95 insertions, 40 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3da56c14..3c629d38a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,9 +29,11 @@ from ..utils import (      get_element_by_id,      int_or_none,      orderedSet, +    str_to_int,      unescapeHTML,      unified_strdate,      uppercase_escape, +    ISO3166Utils,  ) @@ -518,6 +520,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'skip_download': 'requires avconv',              }          }, +        # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) +        { +            'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', +            'info_dict': { +                'id': 'FIl7x6_3R5Y', +                'ext': 'mp4', +                'title': 'md5:7b81415841e02ecd4313668cde88737a', +                'description': 'md5:116377fd2963b81ec4ce64b542173306', +                'upload_date': '20150625', +                'uploader_id': 'dorappi2000', +                'uploader': 'dorappi2000', +                'formats': 'mincount:33', +            }, +        }      ]      def __init__(self, *args, **kwargs): @@ -782,7 +798,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')      def _parse_dash_manifest( -            self, video_id, dash_manifest_url, player_url, age_gate): +            self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):          def decrypt_sig(mobj):              s = mobj.group(1)              dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) @@ -791,7 +807,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          dash_doc = self._download_xml(              dash_manifest_url, video_id,              note='Downloading DASH manifest', -            errnote='Could not download DASH manifest') +            errnote='Could not download DASH manifest', +            fatal=fatal) + +        if dash_doc is False: +            return []          formats = []          for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'): @@ -824,6 +844,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      except StopIteration:                          full_info = self._formats.get(format_id, {}).copy()                          full_info.update(f) +                        codecs = r.attrib.get('codecs') +                        if codecs: +                            if full_info.get('acodec') == 'none' and 'vcodec' not in full_info: +                                full_info['vcodec'] = codecs +                            elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info: +                                full_info['acodec'] = codecs                          formats.append(full_info)                      else:                          existing_format.update(f) @@ -853,6 +879,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          else:              player_url = None +        dash_mpds = [] + +        def add_dash_mpd(video_info): +            dash_mpd = video_info.get('dashmpd') +            if dash_mpd and dash_mpd[0] not in dash_mpds: +                dash_mpds.append(dash_mpd[0]) +          # Get video info          embed_webpage = None          if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -873,24 +906,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  note='Refetching age-gated info webpage',                  errnote='unable to download video info webpage')              video_info = compat_parse_qs(video_info_webpage) +            add_dash_mpd(video_info)          else:              age_gate = False -            try: -                # Try looking directly into the video webpage -                mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) -                if not mobj: -                    raise ValueError('Could not find ytplayer.config')  # caught below +            video_info = None +            # Try looking directly into the video webpage +            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) +            if mobj:                  json_code = uppercase_escape(mobj.group(1))                  ytplayer_config = json.loads(json_code)                  args = ytplayer_config['args'] -                # Convert to the same format returned by compat_parse_qs -                video_info = dict((k, [v]) for k, v in args.items()) -                if not args.get('url_encoded_fmt_stream_map'): -                    raise ValueError('No stream_map present')  # caught below -            except ValueError: -                # We fallback to the get_video_info pages (used by the embed page) +                if args.get('url_encoded_fmt_stream_map'): +                    # Convert to the same format returned by compat_parse_qs +                    video_info = dict((k, [v]) for k, v in args.items()) +                    add_dash_mpd(video_info) +            if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): +                # We also try looking in get_video_info since it may contain different dashmpd +                # URL that points to a DASH manifest with possibly different itag set (some itags +                # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH +                # manifest pointed by get_video_info's dashmpd). +                # The general idea is to take a union of itags of both DASH manifests (for example +                # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)                  self.report_video_info_webpage_download(video_id) -                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: +                for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:                      video_info_url = (                          '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'                          % (proto, video_id, el_type)) @@ -898,11 +936,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                          video_info_url,                          video_id, note=False,                          errnote='unable to download video info webpage') -                    video_info = compat_parse_qs(video_info_webpage) -                    if 'token' in video_info: +                    get_video_info = compat_parse_qs(video_info_webpage) +                    add_dash_mpd(get_video_info) +                    if not video_info: +                        video_info = get_video_info +                    if 'token' in get_video_info:                          break          if 'token' not in video_info:              if 'reason' in video_info: +                if 'The uploader has not made this video available in your country.' in video_info['reason']: +                    regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) +                    if regions_allowed is not None: +                        raise ExtractorError('YouTube said: This video is available in %s only' % ( +                            ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), +                            expected=True)                  raise ExtractorError(                      'YouTube said: %s' % video_info['reason'][0],                      expected=True, video_id=video_id) @@ -956,15 +1003,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])          # upload date -        upload_date = None -        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage) -        if mobj is None: -            mobj = re.search( -                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>', -                video_webpage) -        if mobj is not None: -            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) -            upload_date = unified_strdate(upload_date) +        upload_date = self._html_search_meta( +            'datePublished', video_webpage, 'upload date', default=None) +        if not upload_date: +            upload_date = self._search_regex( +                [r'(?s)id="eow-date.*?>(.*?)</span>', +                 r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'], +                video_webpage, 'upload date', default=None) +            if upload_date: +                upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) +        upload_date = unified_strdate(upload_date)          m_cat_container = self._search_regex(              r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', @@ -998,12 +1046,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  video_description = ''          def _extract_count(count_name): -            count = self._search_regex( -                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name), -                video_webpage, count_name, default=None) -            if count is not None: -                return int(count.replace(',', '')) -            return None +            return str_to_int(self._search_regex( +                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' +                % re.escape(count_name), +                video_webpage, count_name, default=None)) +          like_count = _extract_count('like')          dislike_count = _extract_count('dislike') @@ -1118,24 +1165,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          # Look for the DASH manifest          if self._downloader.params.get('youtube_include_dash_manifest', True): -            dash_mpd = video_info.get('dashmpd') -            if dash_mpd: -                dash_manifest_url = dash_mpd[0] +            dash_mpd_fatal = True +            for dash_manifest_url in dash_mpds: +                dash_formats = {}                  try: -                    dash_formats = self._parse_dash_manifest( -                        video_id, dash_manifest_url, player_url, age_gate) +                    for df in self._parse_dash_manifest( +                            video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal): +                        # Do not overwrite DASH format found in some previous DASH manifest +                        if df['format_id'] not in dash_formats: +                            dash_formats[df['format_id']] = df +                        # Additional DASH manifests may end up in HTTP Error 403 therefore +                        # allow them to fail without bug report message if we already have +                        # some DASH manifest succeeded. This is temporary workaround to reduce +                        # burst of bug reports until we figure out the reason and whether it +                        # can be fixed at all. +                        dash_mpd_fatal = False                  except (ExtractorError, KeyError) as e:                      self.report_warning(                          'Skipping DASH manifest: %r' % e, video_id) -                else: +                if dash_formats:                      # Remove the formats we found through non-DASH, they                      # contain less info and it can be wrong, because we use                      # fixed values (for example the resolution). See                      # https://github.com/rg3/youtube-dl/issues/5774 for an                      # example. -                    dash_keys = set(df['format_id'] for df in dash_formats) -                    formats = [f for f in formats if f['format_id'] not in dash_keys] -                    formats.extend(dash_formats) +                    formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] +                    formats.extend(dash_formats.values())          # Check for malformed aspect ratio          stretched_m = re.search(  | 
