diff options
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 55 | 
1 files changed, 48 insertions, 7 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6c70a98d1..ba0f5c8b6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,8 @@ from ..utils import (      dict_get,      error_to_compat_str,      float_or_none, +    extract_attributes, +    get_element_by_attribute,      int_or_none,      js_to_json,      mimetype2ext, @@ -38,6 +40,7 @@ from ..utils import (      smuggle_url,      str_or_none,      str_to_int, +    traverse_obj,      try_get,      unescapeHTML,      unified_strdate, @@ -656,6 +659,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 'md5:bf77e03fcae5529475e500129b05668a',                  'duration': 177,                  'uploader': 'FlyingKitty', +                'uploader_id': 'FlyingKitty900',                  'upload_date': '20200408',                  'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg',                  'age_limit': 18, @@ -678,6 +682,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 'md5:17eccca93a786d51bc67646756894066',                  'duration': 106,                  'uploader': 'Projekt Melody', +                'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',                  'upload_date': '20191227',                  'age_limit': 18,                  'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', @@ -929,16 +934,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'id': 'lsguqyKfVQg',                  'ext': 'mp4',                  'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', -                'alt_title': 'Dark Walk - Position Music', +                'alt_title': 'Dark Walk',                  'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',                  'duration': 133,                  'upload_date': '20151119',                  'uploader_id': 'IronSoulElf',                  'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',                  'uploader': 'IronSoulElf', -                'creator': 'Todd Haberman,  Daniel Law Heath and Aaron Kaplan', -                'track': 'Dark Walk - Position Music', -                'artist': 'Todd Haberman,  Daniel Law Heath and Aaron Kaplan', +                'creator': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', +                'track': 'Dark Walk', +                'artist': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan',                  'album': 'Position Music - Production Music Vol. 143 - Dark Walk',              },              'params': { @@ -2091,7 +2096,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              or microformat.get('lengthSeconds')) \              or parse_duration(search_meta('duration'))          is_live = video_details.get('isLive') -        owner_profile_url = microformat.get('ownerProfileUrl') + +        def gen_owner_profile_url(): +            yield microformat.get('ownerProfileUrl') +            yield extract_attributes(self._search_regex( +                r'''(?s)(<link\b[^>]+\bitemprop\s*=\s*("|')url\2[^>]*>)''', +                get_element_by_attribute('itemprop', 'author', webpage), +                'owner_profile_url', default='')).get('href') + +        owner_profile_url = next( +            (x for x in map(url_or_none, gen_owner_profile_url()) if x), +            None)          if not player_url:              player_url = self._extract_player_url(webpage) @@ -2176,6 +2191,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                          info[d_k] = parse_duration(query[k][0])          if video_description: +            # Youtube Music Auto-generated description              mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)              if mobj:                  release_year = mobj.group('release_year') @@ -2250,7 +2266,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                                  lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':                              info['location'] = stl                          else: -                            mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) +                            # •? doesn't match, but [•]? does; \xa0 = non-breaking space +                            mobj = re.search(r'([^\xa0\s].*?)[\xa0\s]*S(\d+)[\xa0\s]*[•]?[\xa0\s]*E(\d+)', stl)                              if mobj:                                  info.update({                                      'series': mobj.group(1), @@ -2261,7 +2278,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                              vpir,                              lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],                              list) or []): -                        tbr = tlb.get('toggleButtonRenderer') or {} +                        tbr = traverse_obj(tlb, ('segmentedLikeDislikeButtonRenderer', 'likeButton', 'toggleButtonRenderer'), 'toggleButtonRenderer') or {}                          for getter, regex in [(                                  lambda x: x['defaultText']['accessibility']['accessibilityData'],                                  r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ @@ -2315,6 +2332,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                              elif mrr_title == 'Song':                                  info['track'] = mrr_contents_text +            # this is not extraction but spelunking! +            carousel_lockups = traverse_obj( +                initial_data, +                ('engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer', +                 'content', 'structuredDescriptionContentRenderer', 'items', Ellipsis, +                 'videoDescriptionMusicSectionRenderer', 'carouselLockups', Ellipsis), +                expected_type=dict) or [] +            # try to reproduce logic from metadataRowContainerRenderer above (if it still is) +            fields = (('ALBUM', 'album'), ('ARTIST', 'artist'), ('SONG', 'track'), ('LICENSES', 'license')) +            # multiple_songs ? +            if len(carousel_lockups) > 1: +                fields = fields[-1:] +            for info_row in traverse_obj( +                    carousel_lockups, +                    (0, 'carouselLockupRenderer', 'infoRows', Ellipsis, 'infoRowRenderer'), +                    expected_type=dict): +                row_title = traverse_obj(info_row, ('title', 'simpleText')) +                row_text = traverse_obj(info_row, 'defaultMetadata', 'expandedMetadata', expected_type=get_text) +                if not row_text: +                    continue +                for name, field in fields: +                    if name == row_title and not info.get(field): +                        info[field] = row_text +          for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:              v = info.get(s_k)              if v:  | 
