diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 188 | 
1 files changed, 147 insertions, 41 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 323681960..67a1df9a0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -19,6 +19,7 @@ from ..compat import (      compat_urllib_parse,      compat_urllib_parse_unquote,      compat_urllib_parse_unquote_plus, +    compat_urllib_parse_urlparse,      compat_urllib_request,      compat_urlparse,      compat_str, @@ -31,9 +32,12 @@ from ..utils import (      get_element_by_id,      int_or_none,      orderedSet, +    parse_duration, +    smuggle_url,      str_to_int,      unescapeHTML,      unified_strdate, +    unsmuggle_url,      uppercase_escape,      ISO3166Utils,  ) @@ -279,13 +283,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},          # Dash webm -        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, -        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, +        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, +        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},          '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -295,11 +299,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, -        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, -        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, -        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, -        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, -        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, +        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, +        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, +        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, +        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'}, +        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},          # Dash webm audio          '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -317,7 +321,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      IE_NAME = 'youtube'      _TESTS = [          { -            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc', +            'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',              'info_dict': {                  'id': 'BaW_jenozKc',                  'ext': 'mp4', @@ -327,8 +331,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'upload_date': '20121002',                  'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',                  'categories': ['Science & Technology'], +                'tags': ['youtube-dl'],                  'like_count': int,                  'dislike_count': int, +                'start_time': 1, +                'end_time': 9,              }          },          { @@ -339,7 +346,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20120506',                  'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', -                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', +                'description': 'md5:782e8651347686cba06e58f71ab51773', +                'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', +                         'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', +                         'iconic ep', 'iconic', 'love', 'it'],                  'uploader': 'Icona Pop',                  'uploader_id': 'IconaPop',              } @@ -554,6 +564,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'format': '135',  # bestvideo              }          }, +        { +            # Multifeed videos (multiple cameras), URL is for Main Camera +            'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', +            'info_dict': { +                'id': 'jqWvoWXjCVs', +                'title': 'teamPGP: Rocket League Noob Stream', +                'description': 'md5:dc7872fb300e143831327f1bae3af010', +            }, +            'playlist': [{ +                'info_dict': { +                    'id': 'jqWvoWXjCVs', +                    'ext': 'mp4', +                    'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', +                    'description': 'md5:dc7872fb300e143831327f1bae3af010', +                    'upload_date': '20150721', +                    'uploader': 'Beer Games Beer', +                    'uploader_id': 'beergamesbeer', +                }, +            }, { +                'info_dict': { +                    'id': '6h8e8xoXJzg', +                    'ext': 'mp4', +                    'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', +                    'description': 'md5:dc7872fb300e143831327f1bae3af010', +                    'upload_date': '20150721', +                    'uploader': 'Beer Games Beer', +                    'uploader_id': 'beergamesbeer', +                }, +            }, { +                'info_dict': { +                    'id': 'PUOgX5z9xZw', +                    'ext': 'mp4', +                    'title': 'teamPGP: Rocket League Noob Stream (grizzle)', +                    'description': 'md5:dc7872fb300e143831327f1bae3af010', +                    'upload_date': '20150721', +                    'uploader': 'Beer Games Beer', +                    'uploader_id': 'beergamesbeer', +                }, +            }, { +                'info_dict': { +                    'id': 'teuwxikvS5k', +                    'ext': 'mp4', +                    'title': 'teamPGP: Rocket League Noob Stream (zim)', +                    'description': 'md5:dc7872fb300e143831327f1bae3af010', +                    'upload_date': '20150721', +                    'uploader': 'Beer Games Beer', +                    'uploader_id': 'beergamesbeer', +                }, +            }], +            'params': { +                'skip_download': True, +            }, +        }      ]      def __init__(self, *args, **kwargs): @@ -885,10 +948,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          return formats      def _real_extract(self, url): +        url, smuggled_data = unsmuggle_url(url, {}) +          proto = (              'http' if self._downloader.params.get('prefer_insecure', False)              else 'https') +        start_time = None +        end_time = None +        parsed_url = compat_urllib_parse_urlparse(url) +        for component in [parsed_url.fragment, parsed_url.query]: +            query = compat_parse_qs(component) +            if start_time is None and 't' in query: +                start_time = parse_duration(query['t'][0]) +            if start_time is None and 'start' in query: +                start_time = parse_duration(query['start'][0]) +            if end_time is None and 'end' in query: +                end_time = parse_duration(query['end'][0]) +          # Extract original video URL from URL with redirection, like age verification, using next_url parameter          mobj = re.search(self._NEXT_URL_RE, url)          if mobj: @@ -977,7 +1054,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              if 'reason' in video_info:                  if 'The uploader has not made this video available in your country.' in video_info['reason']:                      regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) -                    if regions_allowed is not None: +                    if regions_allowed:                          raise ExtractorError('YouTube said: This video is available in %s only' % (                              ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),                              expected=True) @@ -989,6 +1066,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      '"token" parameter not in video info for unknown reason',                      video_id=video_id) +        # title +        if 'title' in video_info: +            video_title = video_info['title'][0] +        else: +            self._downloader.report_warning('Unable to extract video title') +            video_title = '_' + +        # description +        video_description = get_element_by_id("eow-description", video_webpage) +        if video_description: +            video_description = re.sub(r'''(?x) +                <a\s+ +                    (?:[a-zA-Z-]+="[^"]+"\s+)*? +                    title="([^"]+)"\s+ +                    (?:[a-zA-Z-]+="[^"]+"\s+)*? +                    class="yt-uix-redirect-link"\s*> +                [^<]+ +                </a> +            ''', r'\1', video_description) +            video_description = clean_html(video_description) +        else: +            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) +            if fd_mobj: +                video_description = unescapeHTML(fd_mobj.group(1)) +            else: +                video_description = '' + +        if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False): +            if not self._downloader.params.get('noplaylist'): +                entries = [] +                feed_ids = [] +                multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0]) +                for feed in multifeed_metadata_list.split(','): +                    feed_data = compat_parse_qs(feed) +                    entries.append({ +                        '_type': 'url_transparent', +                        'ie_key': 'Youtube', +                        'url': smuggle_url( +                            '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), +                            {'force_singlefeed': True}), +                        'title': '%s (%s)' % (video_title, feed_data['title'][0]), +                    }) +                    feed_ids.append(feed_data['id'][0]) +                self.to_screen( +                    'Downloading multifeed video (%s) - add --no-playlist to just download video %s' +                    % (', '.join(feed_ids), video_id)) +                return self.playlist_result(entries, video_id, video_title, video_description) +            self.to_screen('Downloading just video %s because of --no-playlist' % video_id) +          if 'view_count' in video_info:              view_count = int(video_info['view_count'][0])          else: @@ -1014,13 +1140,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          else:              self._downloader.report_warning('unable to extract uploader nickname') -        # title -        if 'title' in video_info: -            video_title = video_info['title'][0] -        else: -            self._downloader.report_warning('Unable to extract video title') -            video_title = '_' -          # thumbnail image          # We try first to get a high quality image:          m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">', @@ -1056,25 +1175,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          else:              video_categories = None -        # description -        video_description = get_element_by_id("eow-description", video_webpage) -        if video_description: -            video_description = re.sub(r'''(?x) -                <a\s+ -                    (?:[a-zA-Z-]+="[^"]+"\s+)*? -                    title="([^"]+)"\s+ -                    (?:[a-zA-Z-]+="[^"]+"\s+)*? -                    class="yt-uix-redirect-link"\s*> -                [^<]+ -                </a> -            ''', r'\1', video_description) -            video_description = clean_html(video_description) -        else: -            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) -            if fd_mobj: -                video_description = unescapeHTML(fd_mobj.group(1)) -            else: -                video_description = '' +        video_tags = [ +            unescapeHTML(m.group('content')) +            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]          def _extract_count(count_name):              return str_to_int(self._search_regex( @@ -1244,6 +1347,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'thumbnail': video_thumbnail,              'description': video_description,              'categories': video_categories, +            'tags': video_tags,              'subtitles': video_subtitles,              'automatic_captions': automatic_captions,              'duration': video_duration, @@ -1256,6 +1360,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),              'formats': formats,              'is_live': is_live, +            'start_time': start_time, +            'end_time': end_time,          } | 
