diff options
| author | Sergey M․ <dstftw@gmail.com> | 2018-06-03 02:08:38 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2018-06-03 02:08:38 +0700 | 
| commit | 7e72694b5e0691adfd90f5d5ecd47647625511e9 (patch) | |
| tree | 592d01301e884048340e964e1d8495d505f5ef55 | |
| parent | 936784b272db3f85f5ff5bdd2d5a71e0397ee7bd (diff) | |
[youtube] Move metadata extraction after video availability check
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 259 | 
1 files changed, 128 insertions, 131 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 677907aba..b8cea1191 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -530,7 +530,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',                  'license': 'Standard YouTube License',                  'creator': 'Justin Timberlake', -                'track': 'Tunnel Vision`', +                'track': 'Tunnel Vision',                  'artist': 'Justin Timberlake',                  'age_limit': 18,              } @@ -1698,136 +1698,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:              raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) -        # Start extracting information -        self.report_information_extraction(video_id) - -        # uploader -        video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) -        if video_uploader: -            video_uploader = compat_urllib_parse_unquote_plus(video_uploader) -        else: -            self._downloader.report_warning('unable to extract uploader name') - -        # uploader_id -        video_uploader_id = None -        video_uploader_url = None -        mobj = re.search( -            r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', -            video_webpage) -        if mobj is not None: -            video_uploader_id = mobj.group('uploader_id') -            video_uploader_url = mobj.group('uploader_url') -        else: -            self._downloader.report_warning('unable to extract uploader nickname') - -        # thumbnail image -        # We try first to get a high quality image: -        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', -                            video_webpage, re.DOTALL) -        if m_thumb is not None: -            video_thumbnail = m_thumb.group(1) -        elif 'thumbnail_url' not in video_info: -            self._downloader.report_warning('unable to extract video thumbnail') -            video_thumbnail = None -        else:   # don't panic if we can't find it -            video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) - -        # upload date -        upload_date = self._html_search_meta( -            'datePublished', video_webpage, 'upload date', default=None) -        if not upload_date: -            upload_date = self._search_regex( -                [r'(?s)id="eow-date.*?>(.*?)</span>', -                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], -                video_webpage, 'upload date', default=None) -        upload_date = unified_strdate(upload_date) - -        video_license = self._html_search_regex( -            r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', -            video_webpage, 'license', default=None) - -        m_music = re.search( -            r'''(?x) -                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* -                <ul[^>]*>\s* -                <li>(?P<title>.+?) -                by (?P<creator>.+?) -                (?: -                    \(.+?\)| -                    <a[^>]* -                        (?: -                            \bhref=["\']/red[^>]*>|             # drop possible -                            >\s*Listen ad-free with YouTube Red # YouTube Red ad -                        ) -                    .*? -                )?</li -            ''', -            video_webpage) -        if m_music: -            video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) -            video_creator = clean_html(m_music.group('creator')) -        else: -            video_alt_title = video_creator = None - -        def extract_meta(field): -            return self._html_search_regex( -                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, -                video_webpage, field, default=None) - -        track = extract_meta('Song') -        artist = extract_meta('Artist') - -        m_episode = re.search( -            r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', -            video_webpage) -        if m_episode: -            series = m_episode.group('series') -            season_number = int(m_episode.group('season')) -            episode_number = int(m_episode.group('episode')) -        else: -            series = season_number = episode_number = None - -        m_cat_container = self._search_regex( -            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', -            video_webpage, 'categories', default=None) -        if m_cat_container: -            category = self._html_search_regex( -                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', -                default=None) -            video_categories = None if category is None else [category] -        else: -            video_categories = None - -        video_tags = [ -            unescapeHTML(m.group('content')) -            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] - -        def _extract_count(count_name): -            return str_to_int(self._search_regex( -                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' -                % re.escape(count_name), -                video_webpage, count_name, default=None)) - -        like_count = _extract_count('like') -        dislike_count = _extract_count('dislike') - -        # subtitles -        video_subtitles = self.extract_subtitles(video_id, video_webpage) -        automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - -        video_duration = try_get( -            video_info, lambda x: int_or_none(x['length_seconds'][0])) -        if not video_duration: -            video_duration = parse_duration(self._html_search_meta( -                'duration', video_webpage, 'video duration')) - -        # annotations -        video_annotations = None -        if self._downloader.params.get('writeannotations', False): -            video_annotations = self._extract_annotations(video_id) - -        chapters = self._extract_chapters(description_original, video_duration) -          def _extract_filesize(media_url):              return int_or_none(self._search_regex(                  r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) @@ -2002,6 +1872,133 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  raise ExtractorError(error_message, expected=True)              raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') +        # uploader +        video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) +        if video_uploader: +            video_uploader = compat_urllib_parse_unquote_plus(video_uploader) +        else: +            self._downloader.report_warning('unable to extract uploader name') + +        # uploader_id +        video_uploader_id = None +        video_uploader_url = None +        mobj = re.search( +            r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', +            video_webpage) +        if mobj is not None: +            video_uploader_id = mobj.group('uploader_id') +            video_uploader_url = mobj.group('uploader_url') +        else: +            self._downloader.report_warning('unable to extract uploader nickname') + +        # thumbnail image +        # We try first to get a high quality image: +        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', +                            video_webpage, re.DOTALL) +        if m_thumb is not None: +            video_thumbnail = m_thumb.group(1) +        elif 'thumbnail_url' not in video_info: +            self._downloader.report_warning('unable to extract video thumbnail') +            video_thumbnail = None +        else:   # don't panic if we can't find it +            video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + +        # upload date +        upload_date = self._html_search_meta( +            'datePublished', video_webpage, 'upload date', default=None) +        if not upload_date: +            upload_date = self._search_regex( +                [r'(?s)id="eow-date.*?>(.*?)</span>', +                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], +                video_webpage, 'upload date', default=None) +        upload_date = unified_strdate(upload_date) + +        video_license = self._html_search_regex( +            r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', +            video_webpage, 'license', default=None) + +        m_music = re.search( +            r'''(?x) +                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* +                <ul[^>]*>\s* +                <li>(?P<title>.+?) +                by (?P<creator>.+?) +                (?: +                    \(.+?\)| +                    <a[^>]* +                        (?: +                            \bhref=["\']/red[^>]*>|             # drop possible +                            >\s*Listen ad-free with YouTube Red # YouTube Red ad +                        ) +                    .*? +                )?</li +            ''', +            video_webpage) +        if m_music: +            video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) +            video_creator = clean_html(m_music.group('creator')) +        else: +            video_alt_title = video_creator = None + +        def extract_meta(field): +            return self._html_search_regex( +                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field, +                video_webpage, field, default=None) + +        track = extract_meta('Song') +        artist = extract_meta('Artist') + +        m_episode = re.search( +            r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', +            video_webpage) +        if m_episode: +            series = m_episode.group('series') +            season_number = int(m_episode.group('season')) +            episode_number = int(m_episode.group('episode')) +        else: +            series = season_number = episode_number = None + +        m_cat_container = self._search_regex( +            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', +            video_webpage, 'categories', default=None) +        if m_cat_container: +            category = self._html_search_regex( +                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', +                default=None) +            video_categories = None if category is None else [category] +        else: +            video_categories = None + +        video_tags = [ +            unescapeHTML(m.group('content')) +            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + +        def _extract_count(count_name): +            return str_to_int(self._search_regex( +                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' +                % re.escape(count_name), +                video_webpage, count_name, default=None)) + +        like_count = _extract_count('like') +        dislike_count = _extract_count('dislike') + +        # subtitles +        video_subtitles = self.extract_subtitles(video_id, video_webpage) +        automatic_captions = self.extract_automatic_captions(video_id, video_webpage) + +        video_duration = try_get( +            video_info, lambda x: int_or_none(x['length_seconds'][0])) +        if not video_duration: +            video_duration = parse_duration(self._html_search_meta( +                'duration', video_webpage, 'video duration')) + +        # annotations +        video_annotations = None +        if self._downloader.params.get('writeannotations', False): +            video_annotations = self._extract_annotations(video_id) + +        chapters = self._extract_chapters(description_original, video_duration) +          # Look for the DASH manifest          if self._downloader.params.get('youtube_include_dash_manifest', True):              dash_mpd_fatal = True | 
