diff options
author | Sergey M․ <dstftw@gmail.com> | 2020-12-29 02:29:34 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2020-12-29 02:29:34 +0700 |
commit | 1a95953867412bc7a785f21f6bff5145b2b13fd0 (patch) | |
tree | 27d8b6e7e9740492ccb4d5e3dd0e53ade16a81f0 | |
parent | 71febd1c52d6de89ff571d4c212846aaaafb33ac (diff) |
[youtube] Improve yt initial data extraction (closes #27524)
-rw-r--r-- | youtube_dl/extractor/youtube.py | 10 |
1 files changed, 8 insertions, 2 deletions
```diff
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 0044ed909..87bdc1677 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -280,6 +280,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 
     _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
     _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
+    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 
     def _call_api(self, ep, query, video_id):
         data = self._DEFAULT_API_DATA.copy()
@@ -297,7 +298,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     def _extract_yt_initial_data(self, video_id, webpage):
         return self._parse_json(
             self._search_regex(
-                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
+                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                  self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
             video_id)
 
@@ -1104,6 +1105,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             },
         },
         {
+            # another example of '};' in ytInitialData
+            'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
+            'only_matching': True,
+        },
+        {
             'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
             'only_matching': True,
         },
@@ -1706,7 +1712,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if not video_info and not player_response:
             player_response = extract_player_response(
                 self._search_regex(
-                    (r'%s\s*(?:var\s+meta|</script|\n)' % self._YT_INITIAL_PLAYER_RESPONSE_RE,
+                    (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE),
                      self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage,
                     'initial player response', default='{}'),
                 video_id)
```