diff options
| author | Remita Amine <remitamine@gmail.com> | 2019-04-27 09:16:17 +0100 | 
|---|---|---|
| committer | Remita Amine <remitamine@gmail.com> | 2019-04-27 09:16:17 +0100 | 
| commit | 822b9d9cb09429645582791dba31f4cbed7583cf (patch) | |
| tree | e2fcf4121007b62fd44f1ee85caecef06d3befc2 | |
| parent | 5caabd3c701a484271d197f7006ecf831e38136b (diff) | |
[youtube] improve Youtube Music Auto-generated description parsing (closes #20742)
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 57 | 
1 files changed, 23 insertions, 34 deletions
| diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 438eb5aa7..55eafb866 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1088,7 +1088,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },          },          { -            # artist and track fields should return non-null, per issue #20599 +            # Youtube Music Auto-generated description              'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',              'info_dict': {                  'id': 'MgNrAu2pzNs', @@ -1109,11 +1109,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },          },          { +            # Youtube Music Auto-generated description              # Retrieve 'artist' field from 'Artist:' in video description              # when it is present on youtube music video -            # Some videos have release_date and no release_year - -            # (release_year should be extracted from release_date) -            # https://github.com/ytdl-org/youtube-dl/pull/20742#issuecomment-485740932              'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',              'info_dict': {                  'id': 'k0jLE7tTwjY', @@ -1134,6 +1132,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },          },          { +            # Youtube Music Auto-generated description              # handle multiple artists on youtube music video              'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',              'info_dict': { @@ -1155,6 +1154,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },          },          { +            # Youtube Music Auto-generated description              # handle youtube music video with release_year and no release_date              'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',              'info_dict': { @@ -2161,36 +2161,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          track = extract_meta('Song')          artist = 
extract_meta('Artist') -        album = None -        release_date = None -        release_year = None - -        description_info = video_description.split('\n\n') -        # If the description of the video has the youtube music auto-generated format, extract additional info -        if len(description_info) >= 5 and description_info[-1] == 'Auto-generated by YouTube.': -            track_artist = description_info[1].split(' · ') -            if len(track_artist) >= 2: -                if track is None: -                    track = track_artist[0] -                if artist is None: -                    artist = re.search(r'Artist: ([^\n]+)', description_info[-2]) -                    if artist: -                        artist = artist.group(1) -                    if artist is None: -                        artist = track_artist[1] -                        # handle multiple artists -                        if len(track_artist) > 2: -                            for i in range(2, len(track_artist)): -                                artist += ', %s' % track_artist[i] -            release_year = re.search(r'℗ ([0-9]+)', video_description) -            if release_year: -                release_year = int_or_none(release_year.group(1)) -            album = description_info[2] -            if description_info[4].startswith('Released on: '): -                release_date = description_info[4].split(': ')[1].replace('-', '') -                # extract release_year from release_date if necessary -                if release_year is None: -                    release_year = int_or_none(release_date[0:4]) + +        # Youtube Music Auto-generated description +        album = release_date = release_year = None +        if video_description: +            mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released 
on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) +            if mobj: +                if not track: +                    track = mobj.group('track').strip() +                if not artist: +                    artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) +                album = mobj.group('album'.strip()) +                release_year = mobj.group('release_year') +                release_date = mobj.group('release_date') +                if release_date: +                    release_date = release_date.replace('-', '') +                    if not release_year: +                        release_year = int(release_date[:4]) +                if release_year: +                    release_year = int(release_year)          m_episode = re.search(              r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', | 
