[youtube] Improve yt initial data extraction (closes #27093)

author: Sergey M․ <dstftw@gmail.com> 2020-11-20 23:21:52 +0700
committer: Sergey M․ <dstftw@gmail.com> 2020-11-20 23:21:52 +0700
commit: b31b5f4434b52816f3a5a1ae2cbe1d162be0fbd0 (patch)
tree: 34ca3998ccf5d8b12a35b728106234833e8ce7cb /youtube_dl
parent: 86f2fa1590991fffae7b1daacae9164771312c0b (diff)
1 file changed, 20 insertions, 2 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 79f87aa85..a85aede8e 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -283,6 +283,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         },
     }
 
+    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
+
     def _call_api(self, ep, query, video_id):
         data = self._DEFAULT_API_DATA.copy()
         data.update(query)
@@ -299,8 +301,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     def _extract_yt_initial_data(self, video_id, webpage):
         return self._parse_json(
             self._search_regex(
-                r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;',
-                webpage, 'yt initial data'),
+                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
+                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
             video_id)
 
 
@@ -1066,6 +1068,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
+            'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
+            'info_dict': {
+                'id': 'CHqg6qOn4no',
+                'ext': 'mp4',
+                'title': 'Part 77   Sort a list of simple types in c#',
+                'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
+                'upload_date': '20130831',
+                'uploader_id': 'kudvenkat',
+                'uploader': 'kudvenkat',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
     ]
 
     def __init__(self, *args, **kwargs):
author	Sergey M․ <dstftw@gmail.com>	2020-11-20 23:21:52 +0700
committer	Sergey M․ <dstftw@gmail.com>	2020-11-20 23:21:52 +0700
commit	b31b5f4434b52816f3a5a1ae2cbe1d162be0fbd0 (patch)
tree	34ca3998ccf5d8b12a35b728106234833e8ce7cb /youtube_dl
parent	86f2fa1590991fffae7b1daacae9164771312c0b (diff)