about summary refs log tree commit diff
diff options
context:
space:
mode:
author Mozi <29089388+pzhlkj6612@users.noreply.github.com> 2024-11-04 07:02:48 +0800
committer GitHub <noreply@github.com> 2024-11-03 23:02:48 +0000
commit a403dcf9be20b49cbb3017328f4aaa352fb6d685 (patch)
tree a31ad4aa9c77012f90aa8f56325e4aba1b32ee88
parent 754940e9a558565d6bd3c0c529802569b1d0ae4e (diff)
[ie/Dailymotion] Improve embed extraction (#10843)
Closes #8848, Closes #9432
Authored by: pzhlkj6612, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
-rw-r--r-- yt_dlp/extractor/dailymotion.py 115
1 files changed, 101 insertions, 14 deletions
diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py
index 632335e5b..4e99fdda7 100644
--- a/yt_dlp/extractor/dailymotion.py
+++ b/yt_dlp/extractor/dailymotion.py
@@ -10,11 +10,14 @@ from ..utils import (
OnDemandPagedList,
age_restricted,
clean_html,
+ extract_attributes,
int_or_none,
traverse_obj,
try_get,
unescapeHTML,
unsmuggle_url,
+ update_url,
+ url_or_none,
urlencode_postdata,
)
@@ -99,11 +102,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'''(?ix)
https?://
(?:
- (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)|
- (?:www\.)?lequipe\.fr/video
+ (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}|
+ (?:www\.)?lequipe\.fr
+ )/
+ (?:
+ swf/(?!video)|
+ (?:(?:crawler|embed|swf)/)?video/|
+ player(?:/[\da-z]+)?\.html\?(?:video|(?P<is_playlist>playlist))=
)
- [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
- '''
+ (?P<id>[^/?_&#]+)(?:[\w-]*\?playlist=(?P<playlist_id>x[0-9a-z]+))?
+ '''
IE_NAME = 'dailymotion'
_EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1']
_TESTS = [{
@@ -217,6 +225,63 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
}, {
'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video',
'only_matching': True,
+ }, { # playlist-only
+ 'url': 'https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://geo.dailymotion.com/player/xmyye.html?video=x93blhi',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.dailymotion.com/crawler/video/x8u4owg',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.dailymotion.com/embed/video/x8u4owg',
+ 'only_matching': True,
+ }]
+ _WEBPAGE_TESTS = [{
+ # https://geo.dailymotion.com/player/xmyye.html?video=x93blhi
+ 'url': 'https://www.financialounge.com/video/2024/08/01/borse-europee-in-rosso-dopo-la-fed-a-milano-volano-mediobanca-e-tim-edizione-del-1-agosto/',
+ 'info_dict': {
+ 'id': 'x93blhi',
+ 'ext': 'mp4',
+ 'title': 'OnAir - 01/08/24',
+ 'description': '',
+ 'duration': 217,
+ 'timestamp': 1722505658,
+ 'upload_date': '20240801',
+ 'uploader': 'Financialounge',
+ 'uploader_id': 'x2vtgmm',
+ 'age_limit': 0,
+ 'tags': [],
+ 'view_count': int,
+ 'like_count': int,
+ },
+ }, {
+ # https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj
+ 'url': 'https://www.cycleworld.com/blogs/ask-kevin/ducati-continues-to-evolve-with-v4/',
+ 'info_dict': {
+ 'id': 'x7wdsj',
+ },
+ 'playlist_mincount': 50,
+ }, {
+ # https://www.dailymotion.com/crawler/video/x8u4owg
+ 'url': 'https://www.leparisien.fr/environnement/video-le-veloto-la-voiture-a-pedales-qui-aimerait-se-faire-une-place-sur-les-routes-09-03-2024-KCYMCPM4WFHJXMSKBUI66UNFPU.php',
+ 'info_dict': {
+ 'id': 'x8u4owg',
+ 'ext': 'mp4',
+ 'like_count': int,
+ 'uploader': 'Le Parisien',
+ 'thumbnail': 'https://www.leparisien.fr/resizer/ho_GwveeYftNkLwg_cEta--5Bv4=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/BFXJNEBN75EUNHGYJLORUC3TX4.jpg',
+ 'upload_date': '20240309',
+ 'view_count': int,
+ 'timestamp': 1709997866,
+ 'age_limit': 0,
+ 'uploader_id': 'x32f7b',
+ 'title': 'VIDÉO. Le «\xa0véloto\xa0», la voiture à pédales qui aimerait se faire une place sur les routes',
+ 'duration': 428.0,
+ 'description': 'À bord du « véloto », l’alternative à la voiture pour la campagne',
+ 'tags': ['biclou', 'vélo', 'véloto', 'campagne', 'voiture', 'environnement', 'véhicules intermédiaires'],
+ },
}]
_GEO_BYPASS = False
_COMMON_MEDIA_FIELDS = '''description
@@ -232,16 +297,35 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
for mobj in re.finditer(
r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage):
yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id')
+ for mobj in re.finditer(
+ r'(?s)<script [^>]*\bsrc=(["\'])(?:https?:)?//[\w-]+\.dailymotion\.com/player/(?:(?!\1).)+\1[^>]*>', webpage):
+ attrs = extract_attributes(mobj.group(0))
+ player_url = url_or_none(attrs.get('src'))
+ if not player_url:
+ continue
+ player_url = player_url.replace('.js', '.html')
+ if player_url.startswith('//'):
+ player_url = f'https:{player_url}'
+ if video_id := attrs.get('data-video'):
+ query_string = f'video={video_id}'
+ elif playlist_id := attrs.get('data-playlist'):
+ query_string = f'playlist={playlist_id}'
+ else:
+ continue
+ yield update_url(player_url, query=query_string)
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url)
- video_id, playlist_id = self._match_valid_url(url).groups()
+ video_id, is_playlist, playlist_id = self._match_valid_url(url).group('id', 'is_playlist', 'playlist_id')
+
+ if is_playlist: # We matched the playlist query param as video_id
+ playlist_id = video_id
+ video_id = None
- if playlist_id:
- if self._yes_playlist(playlist_id, video_id):
- return self.url_result(
- 'http://www.dailymotion.com/playlist/' + playlist_id,
- 'DailymotionPlaylist', playlist_id)
+ if self._yes_playlist(playlist_id, video_id):
+ return self.url_result(
+ f'http://www.dailymotion.com/playlist/{playlist_id}',
+ 'DailymotionPlaylist', playlist_id)
password = self.get_param('videopassword')
media = self._call_api(
@@ -282,6 +366,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
title = metadata['title']
is_live = media.get('isOnAir')
formats = []
+ subtitles = {}
+
for quality, media_list in metadata['qualities'].items():
for m in media_list:
media_url = m.get('url')
@@ -289,8 +375,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if not media_url or media_type == 'application/vnd.lumberjack.manifest':
continue
if media_type == 'application/x-mpegURL':
- formats.extend(self._extract_m3u8_formats(
- media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False))
+ fmt, subs = self._extract_m3u8_formats_and_subtitles(
+ media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)
+ formats.extend(fmt)
+ self._merge_subtitles(subs, target=subtitles)
else:
f = {
'url': media_url,
@@ -310,7 +398,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
if not f.get('fps') and f['format_id'].endswith('@60'):
f['fps'] = 60
- subtitles = {}
subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {}
for subtitle_lang, subtitle in subtitles_data.items():
subtitles[subtitle_lang] = [{
@@ -447,7 +534,7 @@ class DailymotionSearchIE(DailymotionPlaylistBaseIE):
class DailymotionUserIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:user'
- _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist|search|crawler)/)(?:(?:old/)?user/)?(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
'info_dict': {