about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: bashonly <88596187+bashonly@users.noreply.github.com> 2023-12-24 10:41:28 -0600
committer: GitHub <noreply@github.com> 2023-12-24 16:41:28 +0000
commit: 116c268438ea4d3738f6fa502c169081ca8f0ee7 (patch)
tree: 51c8ef8f14a6a457e41072b5af3bf1f3837bf95c
parent: e7d22348e77367740da78a3db27167ecf894b7c9 (diff)
[ie/twitter] Work around API rate-limit (#8825)
Closes #8762
Authored by: bashonly
-rw-r--r-- yt_dlp/extractor/twitter.py 75
1 file changed, 43 insertions, 32 deletions
diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py
index 5e8504aa1..c3a6e406c 100644
--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@@ -10,6 +10,7 @@ from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
+from ..networking.exceptions import HTTPError
from ..utils import (
ExtractorError,
dict_get,
@@ -1317,41 +1318,51 @@ class TwitterIE(TwitterBaseIE):
}
}
- def _extract_status(self, twid):
- if self.is_logged_in or self._selected_api == 'graphql':
- status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
-
- elif self._selected_api == 'legacy':
- status = self._call_api(f'statuses/show/{twid}.json', twid, {
- 'cards_platform': 'Web-12',
- 'include_cards': 1,
- 'include_reply_count': 1,
- 'include_user_entities': 0,
- 'tweet_mode': 'extended',
+ def _call_syndication_api(self, twid):
+ self.report_warning(
+ 'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
+ status = self._download_json(
+ 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
+ headers={'User-Agent': 'Googlebot'}, query={
+ 'id': twid,
+ # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
+ 'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
})
+ if not status:
+ raise ExtractorError('Syndication endpoint returned empty JSON response')
+ # Transform the result so its structure matches that of legacy/graphql
+ media = []
+ for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
+ detail['id_str'] = traverse_obj(detail, (
+ 'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
+ media.append(detail)
+ status['extended_entities'] = {'media': media}
- elif self._selected_api == 'syndication':
- self.report_warning(
- 'Not all metadata or media is available via syndication endpoint', twid, only_once=True)
- status = self._download_json(
- 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON',
- headers={'User-Agent': 'Googlebot'}, query={
- 'id': twid,
- # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
- 'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)),
- })
- if not status:
- raise ExtractorError('Syndication endpoint returned empty JSON response')
- # Transform the result so its structure matches that of legacy/graphql
- media = []
- for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})):
- detail['id_str'] = traverse_obj(detail, (
- 'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid
- media.append(detail)
- status['extended_entities'] = {'media': media}
+ return status
- else:
- raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True)
+ def _extract_status(self, twid):
+ if self._selected_api not in ('graphql', 'legacy', 'syndication'):
+ raise ExtractorError(f'{self._selected_api!r} is not a valid API selection', expected=True)
+
+ try:
+ if self.is_logged_in or self._selected_api == 'graphql':
+ status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid)
+ elif self._selected_api == 'legacy':
+ status = self._call_api(f'statuses/show/{twid}.json', twid, {
+ 'cards_platform': 'Web-12',
+ 'include_cards': 1,
+ 'include_reply_count': 1,
+ 'include_user_entities': 0,
+ 'tweet_mode': 'extended',
+ })
+ except ExtractorError as e:
+ if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
+ raise
+ self.report_warning('Rate-limit exceeded; falling back to syndication endpoint')
+ status = self._call_syndication_api(twid)
+
+ if self._selected_api == 'syndication':
+ status = self._call_syndication_api(twid)
return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {}