aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r--youtube_dl/extractor/generic.py1551
1 files changed, 1245 insertions, 306 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index b22c3122a..b01900afa 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -10,6 +10,7 @@ from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
compat_etree_fromstring,
+ compat_str,
compat_urllib_parse_unquote,
compat_urlparse,
compat_xml_parse_error,
@@ -19,28 +20,41 @@ from ..utils import (
ExtractorError,
float_or_none,
HEADRequest,
+ int_or_none,
is_html,
js_to_json,
+ KNOWN_EXTENSIONS,
+ merge_dicts,
+ mimetype2ext,
orderedSet,
+ parse_duration,
+ parse_resolution,
sanitized_Request,
smuggle_url,
unescapeHTML,
- unified_strdate,
+ unified_timestamp,
unsmuggle_url,
UnsupportedError,
+ url_or_none,
+ urljoin,
+ xpath_attr,
xpath_text,
+ xpath_with_ns,
)
from .commonprotocols import RtmpIE
from .brightcove import (
BrightcoveLegacyIE,
BrightcoveNewIE,
)
+from .nexx import (
+ NexxIE,
+ NexxEmbedIE,
+)
from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
from .tvc import TVCIE
-from .sportbox import SportBoxEmbedIE
-from .smotri import SmotriIE
+from .sportbox import SportBoxIE
from .myvi import MyviIE
from .condenast import CondeNastIE
from .udn import UDNEmbedIE
@@ -51,11 +65,16 @@ from .xhamster import XHamsterEmbedIE
from .tnaflix import TNAFlixNetworkEmbedIE
from .drtuber import DrTuberIE
from .redtube import RedTubeIE
-from .vimeo import VimeoIE
-from .dailymotion import (
- DailymotionIE,
- DailymotionCloudIE,
+from .tube8 import Tube8IE
+from .mofosex import MofosexEmbedIE
+from .spankwire import SpankwireIE
+from .youporn import YouPornIE
+from .vimeo import (
+ VimeoIE,
+ VHXEmbedIE,
)
+from .dailymotion import DailymotionIE
+from .dailymail import DailyMailIE
from .onionstudios import OnionStudiosIE
from .viewlift import ViewLiftEmbedIE
from .mtv import MTVServicesEmbeddedIE
@@ -67,14 +86,12 @@ from .jwplatform import JWPlatformIE
from .digiteka import DigitekaIE
from .arkena import ArkenaIE
from .instagram import InstagramIE
-from .liveleak import LiveLeakIE
from .threeqsdn import ThreeQSDNIE
from .theplatform import ThePlatformIE
-from .vessel import VesselIE
from .kaltura import KalturaIE
from .eagleplatform import EaglePlatformIE
from .facebook import FacebookIE
-from .soundcloud import SoundcloudIE
+from .soundcloud import SoundcloudEmbedIE
from .tunein import TuneInBaseIE
from .vbox7 import Vbox7IE
from .dbtv import DBTVIE
@@ -82,10 +99,39 @@ from .piksel import PikselIE
from .videa import VideaIE
from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
-from .openload import OpenloadIE
+from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
+from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
+from .wistia import WistiaIE
+from .mediaset import MediasetIE
+from .joj import JojIE
+from .megaphone import MegaphoneIE
+from .vzaar import VzaarIE
+from .channel9 import Channel9IE
+from .vshare import VShareIE
+from .mediasite import MediasiteIE
+from .springboardplatform import SpringboardPlatformIE
+from .yapfiles import YapFilesIE
+from .vice import ViceIE
+from .xfileshare import XFileShareIE
+from .cloudflarestream import CloudflareStreamIE
+from .peertube import PeerTubeIE
+from .teachable import TeachableIE
+from .indavideo import IndavideoEmbedIE
+from .apa import APAIE
+from .foxnews import FoxNewsIE
+from .viqeo import ViqeoIE
+from .expressen import ExpressenIE
+from .zype import ZypeIE
+from .odnoklassniki import OdnoklassnikiIE
+from .vk import VKIE
+from .kinja import KinjaEmbedIE
+from .arcpublishing import ArcPublishingIE
+from .medialaan import MedialaanIE
+from .simplecast import SimplecastIE
class GenericIE(InfoExtractor):
@@ -164,11 +210,58 @@ class GenericIE(InfoExtractor):
{
'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
'info_dict': {
- 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- 'ext': 'm4v',
- 'upload_date': '20150228',
- 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- }
+ 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+ 'title': 'MSNBC Rachel Maddow (video)',
+ 'description': 're:.*her unique approach to storytelling.*',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'ext': 'mov',
+ 'id': 'pdv_maddow_netcast_mov-12-04-2020-224335',
+ 'title': 're:MSNBC Rachel Maddow',
+ 'description': 're:.*her unique approach to storytelling.*',
+ 'timestamp': int,
+ 'upload_date': compat_str,
+ 'duration': float,
+ },
+ }],
+ },
+ # RSS feed with item with description and thumbnails
+ {
+ 'url': 'https://anchor.fm/s/dd00e14/podcast/rss',
+ 'info_dict': {
+ 'id': 'https://anchor.fm/s/dd00e14/podcast/rss',
+ 'title': 're:.*100% Hydrogen.*',
+ 'description': 're:.*In this episode.*',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'ext': 'm4a',
+ 'id': 'c1c879525ce2cb640b344507e682c36d',
+ 'title': 're:Hydrogen!',
+ 'description': 're:.*In this episode we are going.*',
+ 'timestamp': 1567977776,
+ 'upload_date': '20190908',
+ 'duration': 459,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 1,
+ 'season_number': 1,
+ 'age_limit': 0,
+ },
+ }],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # RSS feed with enclosures and unsupported link URLs
+ {
+ 'url': 'http://www.hellointernet.fm/podcast?format=rss',
+ 'info_dict': {
+ 'id': 'http://www.hellointernet.fm/podcast?format=rss',
+ 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.',
+ 'title': 'Hello Internet',
+ },
+ 'playlist_mincount': 100,
},
# SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
{
@@ -390,7 +483,7 @@ class GenericIE(InfoExtractor):
},
},
{
- # https://github.com/rg3/youtube-dl/issues/2253
+ # https://github.com/ytdl-org/youtube-dl/issues/2253
'url': 'http://bcove.me/i6nfkrc3',
'md5': '0ba9446db037002366bab3b3eb30c88c',
'info_dict': {
@@ -415,7 +508,7 @@ class GenericIE(InfoExtractor):
},
},
{
- # https://github.com/rg3/youtube-dl/issues/3541
+ # https://github.com/ytdl-org/youtube-dl/issues/3541
'add_ie': ['BrightcoveLegacy'],
'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
'info_dict': {
@@ -431,6 +524,22 @@ class GenericIE(InfoExtractor):
},
},
{
+ # Brightcove video in <iframe>
+ 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724',
+ 'md5': '36d74ef5e37c8b4a2ce92880d208b968',
+ 'info_dict': {
+ 'id': '5360463607001',
+ 'ext': 'mp4',
+ 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活',
+ 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。',
+ 'uploader': 'United Nations',
+ 'uploader_id': '1362235914001',
+ 'timestamp': 1489593889,
+ 'upload_date': '20170315',
+ },
+ 'add_ie': ['BrightcoveLegacy'],
+ },
+ {
# Brightcove with alternative playerID key
'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html',
'info_dict': {
@@ -547,6 +656,19 @@ class GenericIE(InfoExtractor):
},
'skip': 'movie expired',
},
+ # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
+ {
+ 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
+ 'info_dict': {
+ 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
+ 'ext': 'mp4',
+ 'title': 'Steampunk Fest Comes to Honesdale',
+ 'duration': 43.276,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -738,6 +860,20 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Dailymotion'],
},
+ # DailyMail embed
+ {
+ 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot',
+ 'info_dict': {
+ 'id': '1495629',
+ 'ext': 'mp4',
+ 'title': 'Care worker punches elderly dementia patient in head 11 times',
+ 'description': 'md5:3a743dee84e57e48ec68bf67113199a5',
+ },
+ 'add_ie': ['DailyMail'],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# YouTube embed
{
'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
@@ -755,7 +891,7 @@ class GenericIE(InfoExtractor):
'skip_download': True,
}
},
- # MTVSercices embed
+ # MTVServices embed
{
'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html',
'md5': 'ca1aef97695ef2c1d6973256a57e5252',
@@ -836,7 +972,7 @@ class GenericIE(InfoExtractor):
}
},
# Multiple brightcove videos
- # https://github.com/rg3/youtube-dl/issues/2283
+ # https://github.com/ytdl-org/youtube-dl/issues/2283
{
'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
'info_dict': {
@@ -1032,23 +1168,24 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20150212',
'uploader': 'The National Archives UK',
- 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
+ 'description': 'md5:8078af856dca76edc42910b61273dbbf',
'uploader_id': 'NationalArchives08',
'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
},
},
# jwplayer rtmp
{
- 'url': 'http://www.suffolk.edu/sjc/',
+ 'url': 'http://www.suffolk.edu/sjc/live.php',
'info_dict': {
- 'id': 'sjclive',
+ 'id': 'live',
'ext': 'flv',
'title': 'Massachusetts Supreme Judicial Court Oral Arguments',
'uploader': 'www.suffolk.edu',
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/',
},
# Complex jwplayer
{
@@ -1057,6 +1194,7 @@ class GenericIE(InfoExtractor):
'id': 'videos',
'ext': 'mp4',
'title': 'king machine trailer 1',
+ 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.',
'thumbnail': r're:^https?://.*\.jpg$',
},
},
@@ -1074,13 +1212,55 @@ class GenericIE(InfoExtractor):
'skip_download': True,
}
},
+ {
+ # JWPlatform iframe
+ 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/',
+ 'md5': 'ca00a040364b5b439230e7ebfd02c4e9',
+ 'info_dict': {
+ 'id': 'O0c5JcKT',
+ 'ext': 'mp4',
+ 'upload_date': '20171122',
+ 'timestamp': 1511366290,
+ 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone',
+ },
+ 'add_ie': [JWPlatformIE.ie_key()],
+ },
+ {
+ # Video.js embed, multiple formats
+ 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
+ 'info_dict': {
+ 'id': 'yygqldloqIk',
+ 'ext': 'mp4',
+ 'title': 'SolidWorks. Урок 6 Настройка чертежа',
+ 'description': 'md5:baf95267792646afdbf030e4d06b2ab3',
+ 'upload_date': '20130314',
+ 'uploader': 'PROстое3D',
+ 'uploader_id': 'PROstoe3D',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Video.js embed, single format
+ 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=',
+ 'info_dict': {
+ 'id': 'watch',
+ 'ext': 'mp4',
+ 'title': 'Step 1 - Good Foundation',
+ 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# rtl.nl embed
{
'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
'playlist_mincount': 5,
'info_dict': {
'id': 'aanslagen-kopenhagen',
- 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
+ 'title': 'Aanslagen Kopenhagen',
}
},
# Zapiks embed
@@ -1113,7 +1293,7 @@ class GenericIE(InfoExtractor):
'title': '35871',
'timestamp': 1355743100,
'upload_date': '20121217',
- 'uploader_id': 'batchUser',
+ 'uploader_id': 'cplapp@learn360.com',
},
'add_ie': ['Kaltura'],
},
@@ -1164,22 +1344,55 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Kaltura'],
},
- # Eagle.Platform embed (generic URL)
{
- 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
- # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
+ # Kaltura iframe embed, more sophisticated
+ 'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html',
'info_dict': {
- 'id': '227304',
+ 'id': '1_9gzouybz',
'ext': 'mp4',
- 'title': 'Навальный вышел на свободу',
- 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
+ 'title': 'lecture-05sep2017',
+ 'description': 'md5:40f347d91fd4ba047e511c5321064b49',
+ 'upload_date': '20170913',
+ 'uploader_id': 'eps2',
+ 'timestamp': 1505340777,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['Kaltura'],
+ },
+ {
+ # meta twitter:player
+ 'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/',
+ 'info_dict': {
+ 'id': '0_01b42zps',
+ 'ext': 'mp4',
+ 'title': 'Main Twerk (Video)',
+ 'upload_date': '20171208',
+ 'uploader_id': 'sebastian.salinas@thechive.com',
+ 'timestamp': 1512713057,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['Kaltura'],
+ },
+ # referrer protected EaglePlatform embed
+ {
+ 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
+ 'info_dict': {
+ 'id': '582306',
+ 'ext': 'mp4',
+ 'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 87,
+ 'duration': 3382,
'view_count': int,
- 'age_limit': 0,
+ },
+ 'params': {
+ 'skip_download': True,
},
},
- # ClipYou (Eagle.Platform) embed (custom URL)
+ # ClipYou (EaglePlatform) embed (custom URL)
{
'url': 'http://muz-tv.ru/play/7129/',
# Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1191,6 +1404,10 @@ class GenericIE(InfoExtractor):
'duration': 216,
'view_count': int,
},
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'This video is unavailable.',
},
# Pladform embed
{
@@ -1204,6 +1421,7 @@ class GenericIE(InfoExtractor):
'duration': 694,
'age_limit': 0,
},
+ 'skip': 'HTTP Error 404: Not Found',
},
# Playwire embed
{
@@ -1224,17 +1442,14 @@ class GenericIE(InfoExtractor):
'id': '518726732',
'ext': 'mp4',
'title': 'Facebook Creates "On This Day" | Crunch Report',
+ 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild',
+ 'timestamp': 1427237531,
+ 'uploader': 'Crunch Report',
+ 'upload_date': '20150324',
},
- },
- # SVT embed
- {
- 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
- 'info_dict': {
- 'id': '2900353',
- 'ext': 'flv',
- 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
- 'duration': 27,
- 'age_limit': 0,
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
},
# Crooks and Liars embed
@@ -1275,16 +1490,20 @@ class GenericIE(InfoExtractor):
'upload_date': '20140107',
'timestamp': 1389118457,
},
+ 'skip': 'Invalid Page URL',
},
# NBC News embed
{
'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html',
'md5': '1aa589c675898ae6d37a17913cf68d66',
'info_dict': {
- 'id': '701714499682',
+ 'id': 'x_dtl_oa_LettermanliftPR_160608',
'ext': 'mp4',
- 'title': 'PREVIEW: On Assignment: David Letterman',
+ 'title': 'David Letterman: A Preview',
'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.',
+ 'upload_date': '20160609',
+ 'timestamp': 1465431544,
+ 'uploader': 'NBCU-NEWS',
},
},
# UDN embed
@@ -1301,21 +1520,7 @@ class GenericIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
- },
- # Ooyala embed
- {
- 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
- 'info_dict': {
- 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
- 'ext': 'mp4',
- 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.',
- 'title': 'This is what separates the Excel masters from the wannabes',
- 'duration': 191.933,
- },
- 'params': {
- # m3u8 downloads
- 'skip_download': True,
- }
+ 'expected_warnings': ['Failed to parse JSON Expecting value'],
},
# Brightcove URL in single quotes
{
@@ -1332,32 +1537,18 @@ class GenericIE(InfoExtractor):
'timestamp': 1432570283,
},
},
- # Dailymotion Cloud video
- {
- 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
- 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38',
- 'info_dict': {
- 'id': 'x2uy8t3',
- 'ext': 'mp4',
- 'title': 'Sauvons les abeilles ! - Le débat',
- 'description': 'md5:d9082128b1c5277987825d684939ca26',
- 'thumbnail': r're:^https?://.*\.jpe?g$',
- 'timestamp': 1434970506,
- 'upload_date': '20150622',
- 'uploader': 'Public Sénat',
- 'uploader_id': 'xa9gza',
- }
- },
- # OnionStudios embed
+ # Kinja embed
{
'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
'info_dict': {
- 'id': '2855',
+ 'id': '106351',
'ext': 'mp4',
'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
+ 'description': 'Migrated from OnionStudios',
'thumbnail': r're:^https?://.*\.jpe?g$',
- 'uploader': 'ClickHole',
- 'uploader_id': 'clickhole',
+ 'uploader': 'clickhole',
+ 'upload_date': '20150527',
+ 'timestamp': 1432744860,
}
},
# SnagFilms embed
@@ -1411,6 +1602,22 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ # Brightcove embed with whitespace around attribute names
+ 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
+ 'info_dict': {
+ 'id': '3167554373001',
+ 'ext': 'mp4',
+ 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
+ 'description': 'md5:57bacb0e0f29349de4972bfda3191713',
+ 'uploader_id': '1079349493',
+ 'upload_date': '20140207',
+ 'timestamp': 1391810548,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Another form of arte.tv embed
{
'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
@@ -1423,18 +1630,6 @@ class GenericIE(InfoExtractor):
'upload_date': '20160409',
},
},
- # LiveLeak embed
- {
- 'url': 'http://www.wykop.pl/link/3088787/',
- 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
- 'info_dict': {
- 'id': '874_1459135191',
- 'ext': 'mp4',
- 'title': 'Man shows poor quality of new apartment building',
- 'description': 'The wall is like a sand pile.',
- 'uploader': 'Lake8737',
- }
- },
# Duplicated embedded video URLs
{
'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
@@ -1485,6 +1680,21 @@ class GenericIE(InfoExtractor):
'title': 'Facebook video #599637780109885',
},
},
+ # Facebook <iframe> embed, plugin video
+ {
+ 'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/',
+ 'info_dict': {
+ 'id': '1754168231264132',
+ 'ext': 'mp4',
+ 'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...',
+ 'uploader': 'Tariq Ramadan (official)',
+ 'timestamp': 1496758379,
+ 'upload_date': '20170606',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Facebook API embed
{
'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
@@ -1524,6 +1734,15 @@ class GenericIE(InfoExtractor):
'add_ie': ['Kaltura'],
},
{
+ # multiple kaltura embeds, nsfw
+ 'url': 'https://www.quartier-rouge.be/prive/femmes/kamila-avec-video-jaime-sadomie.html',
+ 'info_dict': {
+ 'id': 'kamila-avec-video-jaime-sadomie',
+ 'title': "Kamila avec vídeo “J'aime sadomie”",
+ },
+ 'playlist_count': 8,
+ },
+ {
# Non-standard Vimeo embed
'url': 'https://openclassrooms.com/courses/understanding-the-web',
'md5': '64d86f1c7d369afd9a78b38cbb88d80a',
@@ -1661,6 +1880,286 @@ class GenericIE(InfoExtractor):
},
'playlist_mincount': 5,
},
+ {
+ # Limelight embed (LimelightPlayerUtil.embed)
+ 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri',
+ 'info_dict': {
+ 'id': '95d035dc5c8a401588e9c0e6bd1e9c92',
+ 'ext': 'mp4',
+ 'title': '07448641',
+ 'timestamp': 1499890639,
+ 'upload_date': '20170712',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['LimelightMedia'],
+ },
+ {
+ 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
+ 'info_dict': {
+ 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
+ 'title': 'Standoff with Walnut Creek murder suspect ends',
+ 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
+ },
+ 'playlist_mincount': 4,
+ },
+ {
+ # WashingtonPost embed
+ 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+ 'info_dict': {
+ 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+ 'ext': 'mp4',
+ 'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+ 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+ 'timestamp': 1455216756,
+ 'uploader': 'The Washington Post',
+ 'upload_date': '20160211',
+ },
+ 'add_ie': [WashingtonPostIE.ie_key()],
+ },
+ {
+ # Mediaset embed
+ 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+ 'info_dict': {
+ 'id': '720642',
+ 'ext': 'mp4',
+ 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [MediasetIE.ie_key()],
+ },
+ {
+ # JOJ.sk embeds
+ 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+ 'info_dict': {
+ 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+ 'title': 'Slovenskom sa prehnala vlna silných búrok',
+ },
+ 'playlist_mincount': 5,
+ 'add_ie': [JojIE.ie_key()],
+ },
+ {
+ # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
+ 'url': 'https://tvrain.ru/amp/418921/',
+ 'md5': 'cc00413936695987e8de148b67d14f1d',
+ 'info_dict': {
+ 'id': '418921',
+ 'ext': 'mp4',
+ 'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+ },
+ },
+ {
+ # vzaar embed
+ 'url': 'http://help.vzaar.com/article/165-embedding-video',
+ 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
+ 'info_dict': {
+ 'id': '8707641',
+ 'ext': 'mp4',
+ 'title': 'Building A Business Online: Principal Chairs Q & A',
+ },
+ },
+ {
+ # multiple HTML5 videos on one page
+ 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html',
+ 'info_dict': {
+ 'id': 'keyscenarios',
+ 'title': 'Rescue Kit 14 Free Edition - Getting started',
+ },
+ 'playlist_count': 4,
+ },
+ {
+ # vshare embed
+ 'url': 'https://youtube-dl-demo.neocities.org/vshare.html',
+ 'md5': '17b39f55b5497ae8b59f5fbce8e35886',
+ 'info_dict': {
+ 'id': '0f64ce6',
+ 'title': 'vl14062007715967',
+ 'ext': 'mp4',
+ }
+ },
+ {
+ 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/',
+ 'md5': 'aecd089f55b1cb5a59032cb049d3a356',
+ 'info_dict': {
+ 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d',
+ 'ext': 'mp4',
+ 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare',
+ 'description': 'md5:5a51db84a62def7b7054df2ade403c6c',
+ 'timestamp': 1474354800,
+ 'upload_date': '20160920',
+ }
+ },
+ {
+ 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton',
+ 'info_dict': {
+ 'id': '1731611',
+ 'ext': 'mp4',
+ 'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!',
+ 'description': 'md5:eb5f23826a027ba95277d105f248b825',
+ 'timestamp': 1516100691,
+ 'upload_date': '20180116',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [SpringboardPlatformIE.ie_key()],
+ },
+ {
+ 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html',
+ 'info_dict': {
+ 'id': 'vMDE4NzI1Mjgt690b',
+ 'ext': 'mp4',
+ 'title': 'Котята',
+ },
+ 'add_ie': [YapFilesIE.ie_key()],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # CloudflareStream embed
+ 'url': 'https://www.cloudflare.com/products/cloudflare-stream/',
+ 'info_dict': {
+ 'id': '31c9291ab41fac05471db4e73aa11717',
+ 'ext': 'mp4',
+ 'title': '31c9291ab41fac05471db4e73aa11717',
+ },
+ 'add_ie': [CloudflareStreamIE.ie_key()],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # PeerTube embed
+ 'url': 'https://joinpeertube.org/fr/home/',
+ 'info_dict': {
+ 'id': 'home',
+ 'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube',
+ },
+ 'playlist_count': 2,
+ },
+ {
+ # Indavideo embed
+ 'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/',
+ 'info_dict': {
+ 'id': '1693903',
+ 'ext': 'mp4',
+ 'title': 'Így kell otthon hamburgert sütni',
+ 'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7',
+ 'timestamp': 1426330212,
+ 'upload_date': '20150314',
+ 'uploader': 'StreetKitchen',
+ 'uploader_id': '546363',
+ },
+ 'add_ie': [IndavideoEmbedIE.ie_key()],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # APA embed via JWPlatform embed
+ 'url': 'http://www.vol.at/blue-man-group/5593454',
+ 'info_dict': {
+ 'id': 'jjv85FdZ',
+ 'ext': 'mp4',
+ 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 254,
+ 'timestamp': 1519211149,
+ 'upload_date': '20180221',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://share-videos.se/auto/video/83645793?uid=13',
+ 'md5': 'b68d276de422ab07ee1d49388103f457',
+ 'info_dict': {
+ 'id': '83645793',
+ 'title': 'Lock up and get excited',
+ 'ext': 'mp4'
+ },
+ 'skip': 'TODO: fix nested playlists processing in tests',
+ },
+ {
+ # Viqeo embeds
+ 'url': 'https://viqeo.tv/',
+ 'info_dict': {
+ 'id': 'viqeo',
+ 'title': 'All-new video platform',
+ },
+ 'playlist_count': 6,
+ },
+ {
+ # Squarespace video embed, 2019-08-28
+ 'url': 'http://ootboxford.com',
+ 'info_dict': {
+ 'id': 'Tc7b_JGdZfw',
+ 'title': 'Out of the Blue, at Childish Things 10',
+ 'ext': 'mp4',
+ 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f',
+ 'uploader_id': 'helendouglashouse',
+ 'uploader': 'Helen & Douglas House',
+ 'upload_date': '20140328',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # {
+ # # Zype embed
+ # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
+ # 'info_dict': {
+ # 'id': '5b400b834b32992a310622b9',
+ # 'ext': 'mp4',
+ # 'title': 'Smoky Barbecue Favorites',
+ # 'thumbnail': r're:^https?://.*\.jpe?g',
+ # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
+ # 'upload_date': '20170909',
+ # 'timestamp': 1504915200,
+ # },
+ # 'add_ie': [ZypeIE.ie_key()],
+ # 'params': {
+ # 'skip_download': True,
+ # },
+ # },
+ {
+ # videojs embed
+ 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904',
+ 'info_dict': {
+ 'id': 'shell',
+ 'ext': 'mp4',
+ 'title': 'Доставщик пиццы спросил разрешения сыграть на фортепиано',
+ 'description': 'md5:89209cdc587dab1e4a090453dbaa2cb1',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to download MPD manifest'],
+ },
+ {
+ # DailyMotion embed with DM.player
+ 'url': 'https://www.beinsports.com/us/copa-del-rey/video/the-locker-room-valencia-beat-barca-in-copa/1203804',
+ 'info_dict': {
+ 'id': 'k6aKkGHd9FJs4mtJN39',
+ 'ext': 'mp4',
+ 'title': 'The Locker Room: Valencia Beat Barca In Copa del Rey Final',
+ 'description': 'This video is private.',
+ 'uploader_id': 'x1jf30l',
+ 'uploader': 'beIN SPORTS USA',
+ 'upload_date': '20190528',
+ 'timestamp': 1559062971,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -1677,7 +2176,170 @@ class GenericIE(InfoExtractor):
# 'params': {
# 'force_generic_extractor': True,
# },
- # }
+ # },
+ {
+ # VHX Embed
+ 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy',
+ 'info_dict': {
+ 'id': '858208',
+ 'ext': 'mp4',
+ 'title': 'Untitled',
+ 'uploader_id': 'user80538407',
+ 'uploader': 'OTT Videos',
+ },
+ },
+ {
+ # ArcPublishing PoWa video player
+ 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/',
+ 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3',
+ 'info_dict': {
+ 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
+ 'ext': 'mp4',
+ 'title': 'Senate candidates wave to voters on Anchorage streets',
+ 'description': 'md5:91f51a6511f090617353dc720318b20e',
+ 'timestamp': 1604378735,
+ 'upload_date': '20201103',
+ 'duration': 1581,
+ },
+ },
+ {
+ # MyChannels SDK embed
+ # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen
+ 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/',
+ 'md5': '90c0699c37006ef18e198c032d81739c',
+ 'info_dict': {
+ 'id': '194165',
+ 'ext': 'mp4',
+ 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe',
+ 'timestamp': 1611740340,
+ 'upload_date': '20210127',
+ 'duration': 159,
+ },
+ },
+ {
+ # Simplecast player embed
+ 'url': 'https://www.bio.org/podcast',
+ 'info_dict': {
+ 'id': 'podcast',
+ 'title': 'I AM BIO Podcast | BIO',
+ },
+ 'playlist_mincount': 52,
+ },
+ {
+ # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
+ 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
+ 'only_matching': True,
+ }, {
+ # KVS Player
+ 'url': 'https://www.kvs-demo.com/videos/105/kelis-4th-of-july/',
+ 'info_dict': {
+ 'id': '105',
+ 'display_id': 'kelis-4th-of-july',
+ 'ext': 'mp4',
+ 'title': 'Kelis - 4th Of July',
+ 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ },
+ }, {
+ # KVS Player
+ 'url': 'https://www.kvs-demo.com/embed/105/',
+ 'info_dict': {
+ 'id': '105',
+ 'display_id': 'kelis-4th-of-july',
+ 'ext': 'mp4',
+ 'title': 'Kelis - 4th Of July / Embed Player',
+ 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # KVS Player (tested also in thisvid.py)
+ 'url': 'https://youix.com/video/leningrad-zoj/',
+ 'md5': '94f96ba95706dc3880812b27b7d8a2b8',
+ 'info_dict': {
+ 'id': '18485',
+ 'display_id': 'leningrad-zoj',
+ 'ext': 'mp4',
+ 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
+ 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+ },
+ }, {
+ # KVS Player
+ 'url': 'https://youix.com/embed/18485',
+ 'md5': '94f96ba95706dc3880812b27b7d8a2b8',
+ 'info_dict': {
+ 'id': '18485',
+ 'display_id': 'leningrad-zoj',
+ 'ext': 'mp4',
+ 'title': 'Ленинград - ЗОЖ',
+ 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+ },
+ }, {
+ # KVS Player
+ 'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
+ 'md5': '94166bdb26b4cb1fb9214319a629fc51',
+ 'info_dict': {
+ 'id': '21217',
+ 'display_id': '40-nochey-2016',
+ 'ext': 'mp4',
+ 'title': '40 ночей (2016) - BogMedia.org',
+ 'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
+ 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
+ },
+ }, {
+ # KVS Player (for sites that serve kt_player.js via non-https urls)
+ 'url': 'http://www.camhub.world/embed/389508',
+ 'md5': 'fbe89af4cfb59c8fd9f34a202bb03e32',
+ 'info_dict': {
+ 'id': '389508',
+ 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
+ 'ext': 'mp4',
+ 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
+ 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
+ },
+ }, {
+ 'url': 'https://mrdeepfakes.com/video/5/selena-gomez-pov-deep-fakes',
+ 'md5': 'fec4ad5ec150f655e0c74c696a4a2ff4',
+ 'info_dict': {
+ 'id': '5',
+ 'display_id': 'selena-gomez-pov-deep-fakes',
+ 'ext': 'mp4',
+ 'title': 'Selena Gomez POV (Deep Fakes) DeepFake Porn - MrDeepFakes',
+ 'description': 'md5:17d1f84b578c9c26875ac5ef9a932354',
+ 'height': 720,
+ 'age_limit': 18,
+ },
+ }, {
+ 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
+ 'md5': 'e2f0a4c329f7986280b7328e24036d60',
+ 'info_dict': {
+ 'id': '284002',
+ 'display_id': 'just-out-of-the-shower-joi',
+ 'ext': 'mp4',
+ 'title': 'Just Out Of The Shower JOI - Shooshtime',
+ 'height': 720,
+ 'age_limit': 18,
+ },
+ }, {
+ # would like to use the yt-dl test video but searching for
+ # '"\'/\\ä↭𝕐' fails, so using an old vid from YouTube Korea
+ 'note': 'Test default search',
+ 'url': 'Shorts로 허락 필요없이 놀자! (BTS편)',
+ 'info_dict': {
+ 'id': 'usDGO4Zb-dc',
+ 'ext': 'mp4',
+ 'title': 'YouTube Shorts로 허락 필요없이 놀자! (BTS편)',
+ 'description': 'md5:96e31607eba81ab441567b5e289f4716',
+ 'upload_date': '20211107',
+ 'uploader': 'YouTube Korea',
+ 'location': '대한민국',
+ },
+ 'params': {
+ 'default_search': 'ytsearch',
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['uploader id'],
+ },
]
def report_following_redirect(self, new_url):
@@ -1689,23 +2351,52 @@ class GenericIE(InfoExtractor):
playlist_desc_el = doc.find('./channel/description')
playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
+ NS_MAP = {
+ 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+ }
+
entries = []
for it in doc.findall('./channel/item'):
- next_url = xpath_text(it, 'link', fatal=False)
+ next_url = None
+ enclosure_nodes = it.findall('./enclosure')
+ for e in enclosure_nodes:
+ next_url = e.attrib.get('url')
+ if next_url:
+ break
+
if not next_url:
- enclosure_nodes = it.findall('./enclosure')
- for e in enclosure_nodes:
- next_url = e.attrib.get('url')
- if next_url:
- break
+ next_url = xpath_text(it, 'link', fatal=False)
if not next_url:
continue
+ def itunes(key):
+ return xpath_text(
+ it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
+ default=None)
+
+ duration = itunes('duration')
+ explicit = (itunes('explicit') or '').lower()
+ if explicit in ('true', 'yes'):
+ age_limit = 18
+ elif explicit in ('false', 'no'):
+ age_limit = 0
+ else:
+ age_limit = None
+
entries.append({
'_type': 'url_transparent',
'url': next_url,
'title': it.find('title').text,
+ 'description': xpath_text(it, 'description', default=None),
+ 'timestamp': unified_timestamp(
+ xpath_text(it, 'pubDate', default=None)),
+ 'duration': int_or_none(duration) or parse_duration(duration),
+ 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
+ 'episode': itunes('title'),
+ 'episode_number': int_or_none(itunes('episode')),
+ 'season_number': int_or_none(itunes('season')),
+ 'age_limit': age_limit,
})
return {
@@ -1753,12 +2444,91 @@ class GenericIE(InfoExtractor):
'title': title,
}
+ def _extract_kvs(self, url, webpage, video_id):
+
+ def getlicensetoken(license):
+ modlicense = license.replace('$', '').replace('0', '1')
+ center = int(len(modlicense) / 2)
+ fronthalf = int(modlicense[:center + 1])
+ backhalf = int(modlicense[center:])
+
+ modlicense = compat_str(4 * abs(fronthalf - backhalf))
+
+ def parts():
+ for o in range(0, center + 1):
+ for i in range(1, 5):
+ yield compat_str((int(license[o + i]) + int(modlicense[o])) % 10)
+
+ return ''.join(parts())
+
+ def getrealurl(video_url, license_code):
+ if not video_url.startswith('function/0/'):
+ return video_url # not obfuscated
+
+ url_path, _, url_query = video_url.partition('?')
+ urlparts = url_path.split('/')[2:]
+ license = getlicensetoken(license_code)
+ newmagic = urlparts[5][:32]
+
+ def spells(x, o):
+ l = (o + sum(int(n) for n in license[o:])) % 32
+ for i in range(0, len(x)):
+ yield {l: x[o], o: x[l]}.get(i, x[i])
+
+ for o in range(len(newmagic) - 1, -1, -1):
+ newmagic = ''.join(spells(newmagic, o))
+
+ urlparts[5] = newmagic + urlparts[5][32:]
+ return '/'.join(urlparts) + '?' + url_query
+
+ flashvars = self._search_regex(
+ r'(?s)<script\b[^>]*>.*?var\s+flashvars\s*=\s*(\{.+?\});.*?</script>',
+ webpage, 'flashvars')
+ flashvars = self._parse_json(flashvars, video_id, transform_source=js_to_json)
+
+ # extract the part after the last / as the display_id from the
+ # canonical URL.
+ display_id = self._search_regex(
+ r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
+ r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
+ webpage, 'display_id', fatal=False
+ )
+ title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
+
+ thumbnail = flashvars['preview_url']
+ if thumbnail.startswith('//'):
+ protocol, _, _ = url.partition('/')
+ thumbnail = protocol + thumbnail
+
+ url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
+ formats = []
+ for key in url_keys:
+ if '/get_file/' not in flashvars[key]:
+ continue
+ format_id = flashvars.get(key + '_text', key)
+ formats.append(merge_dicts(
+ parse_resolution(format_id) or parse_resolution(flashvars[key]), {
+ 'url': urljoin(url, getrealurl(flashvars[key], flashvars['license_code'])),
+ 'format_id': format_id,
+ 'ext': 'mp4',
+ 'http_headers': {'Referer': url},
+ }))
+ if not formats[-1].get('height'):
+ formats[-1]['quality'] = 1
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': flashvars['video_id'],
+ 'display_id': display_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
+
def _real_extract(self, url):
if url.startswith('//'):
- return {
- '_type': 'url',
- 'url': self.http_scheme() + url,
- }
+ return self.url_result(self.http_scheme() + url)
parsed_url = compat_urlparse.urlparse(url)
if not parsed_url.scheme:
@@ -1767,7 +2537,7 @@ class GenericIE(InfoExtractor):
default_search = 'fixup_error'
if default_search in ('auto', 'auto_warning', 'fixup_error'):
- if '/' in url:
+ if re.match(r'^[^\s/]+\.[^\s/]+/', url):
self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url)
elif default_search != 'fixup_error':
@@ -1828,21 +2598,21 @@ class GenericIE(InfoExtractor):
info_dict = {
'id': video_id,
'title': self._generic_title(url),
- 'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+ 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
}
# Check for direct link to a video
content_type = head_response.headers.get('Content-Type', '').lower()
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m:
- format_id = m.group('format_id')
+ format_id = compat_str(m.group('format_id'))
if format_id.endswith('mpegurl'):
formats = self._extract_m3u8_formats(url, video_id, 'mp4')
elif format_id == 'f4m':
formats = self._extract_f4m_formats(url, video_id)
else:
formats = [{
- 'format_id': m.group('format_id'),
+ 'format_id': format_id,
'url': url,
'vcodec': 'none' if m.group('type') == 'audio' else None
}]
@@ -1891,6 +2661,9 @@ class GenericIE(InfoExtractor):
webpage = self._webpage_read_content(
full_response, url, video_id, prefix=first_bytes)
+ if '<title>DPG Media Privacy Gate</title>' in webpage:
+ webpage = self._download_webpage(url, video_id)
+
self.report_extraction(video_id)
# Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest?
@@ -1907,10 +2680,14 @@ class GenericIE(InfoExtractor):
self._sort_formats(smil['formats'])
return smil
elif doc.tag == '{http://xspf.org/ns/0/}playlist':
- return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
+ return self.playlist_result(
+ self._parse_xspf(
+ doc, video_id, xspf_url=url,
+ xspf_base_url=full_response.geturl()),
+ video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'] = self._parse_mpd_formats(
- doc, video_id,
+ doc,
mpd_base_url=full_response.geturl().rpartition('/')[0],
mpd_url=url)
self._sort_formats(info_dict['formats'])
@@ -1928,9 +2705,17 @@ class GenericIE(InfoExtractor):
return camtasia_res
# Sometimes embedded video player is hidden behind percent encoding
- # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
+ # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
# Unescaping the whole page allows to handle those cases in a generic way
- webpage = compat_urllib_parse_unquote(webpage)
+ # FIXME: unescaping the whole page may break URLs, commenting out for now.
+ # There probably should be a second run of generic extractor on unescaped webpage.
+ # webpage = compat_urllib_parse_unquote(webpage)
+
+ # Unescape squarespace embeds to be detected by generic extractor,
+ # see https://github.com/ytdl-org/youtube-dl/issues/21294
+ webpage = re.sub(
+ r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
+ lambda x: unescapeHTML(x.group(0)), webpage)
# it's tempting to parse this further, but you would
# have to take into account all the variations like
@@ -1948,10 +2733,17 @@ class GenericIE(InfoExtractor):
# And then there are the jokers who advertise that they use RTA,
# but actually don't.
AGE_LIMIT_MARKERS = [
- r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
+ r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+ r'>[^<]*you acknowledge you are at least (\d+) years old',
+ r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
]
- if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
- age_limit = 18
+ for marker in AGE_LIMIT_MARKERS:
+ m = re.search(marker, webpage)
+ if not m:
+ continue
+ age_limit = max(
+ age_limit or 0,
+ int_or_none(m.groups() and m.group(1), default=18))
# video uploader is domain name
video_uploader = self._search_regex(
@@ -1960,6 +2752,13 @@ class GenericIE(InfoExtractor):
video_description = self._og_search_description(webpage, default=None)
video_thumbnail = self._og_search_thumbnail(webpage, default=None)
+ info_dict.update({
+ 'title': video_title,
+ 'description': video_description,
+ 'thumbnail': video_thumbnail,
+ 'age_limit': age_limit,
+ })
+
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
@@ -1979,21 +2778,38 @@ class GenericIE(InfoExtractor):
# Look for Brightcove New Studio embeds
bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
if bc_urls:
- return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
+ return self.playlist_from_matches(
+ bc_urls, video_id, video_title,
+ getter=lambda x: smuggle_url(x, {'referrer': url}),
+ ie='BrightcoveNew')
+
+ # Look for Nexx embeds
+ nexx_urls = NexxIE._extract_urls(webpage)
+ if nexx_urls:
+ return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key())
+
+ # Look for Nexx iFrame embeds
+ nexx_embed_urls = NexxEmbedIE._extract_urls(webpage)
+ if nexx_embed_urls:
+ return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key())
# Look for ThePlatform embeds
tp_urls = ThePlatformIE._extract_urls(webpage)
if tp_urls:
return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
- # Look for Vessel embeds
- vessel_urls = VesselIE._extract_urls(webpage)
- if vessel_urls:
- return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())
+ arc_urls = ArcPublishingIE._extract_urls(webpage)
+ if arc_urls:
+ return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key())
+
+ mychannels_urls = MedialaanIE._extract_urls(webpage)
+ if mychannels_urls:
+ return self.playlist_from_matches(
+ mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key())
# Look for embedded rtl.nl player
matches = re.findall(
- r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
+ r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
webpage)
if matches:
return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
@@ -2002,42 +2818,21 @@ class GenericIE(InfoExtractor):
if vimeo_urls:
return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
+ vhx_url = VHXEmbedIE._extract_url(webpage)
+ if vhx_url:
+ return self.url_result(vhx_url, VHXEmbedIE.ie_key())
+
vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
webpage, 'vid.me embed', default=None)
if vid_me_embed_url is not None:
return self.url_result(vid_me_embed_url, 'Vidme')
- # Look for embedded YouTube player
- matches = re.findall(r'''(?x)
- (?:
- <iframe[^>]+?src=|
- data-video-url=|
- <embed[^>]+?src=|
- embedSWF\(?:\s*|
- <object[^>]+data=|
- new\s+SWFObject\(
- )
- (["\'])
- (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
- (?:embed|v|p)/.+?)
- \1''', webpage)
- if matches:
+ # Look for YouTube embeds
+ youtube_urls = YoutubeIE._extract_urls(webpage)
+ if youtube_urls:
return self.playlist_from_matches(
- matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
-
- # Look for lazyYT YouTube embed
- matches = re.findall(
- r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
- if matches:
- return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
-
- # Look for Wordpress "YouTube Video Importer" plugin
- matches = re.findall(r'''(?x)<div[^>]+
- class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
- data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
- if matches:
- return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
+ youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
matches = DailymotionIE._extract_urls(webpage)
if matches:
@@ -2053,58 +2848,33 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
- # Look for embedded Wistia player
- match = re.search(
- r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
- if match:
- embed_url = self._proto_relative_url(
- unescapeHTML(match.group('url')))
- return {
- '_type': 'url_transparent',
- 'url': embed_url,
- 'ie_key': 'Wistia',
- 'uploader': video_uploader,
- }
+ # Look for DailyMail embeds
+ dailymail_urls = DailyMailIE._extract_urls(webpage)
+ if dailymail_urls:
+ return self.playlist_from_matches(
+ dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
- match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
- if match:
- return {
- '_type': 'url_transparent',
- 'url': 'wistia:%s' % match.group('id'),
- 'ie_key': 'Wistia',
- 'uploader': video_uploader,
- }
+ # Look for Teachable embeds, must be before Wistia
+ teachable_url = TeachableIE._extract_url(webpage, url)
+ if teachable_url:
+ return self.url_result(teachable_url)
- match = re.search(
- r'''(?sx)
- <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
- <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
- ''', webpage)
- if match:
- return self.url_result(self._proto_relative_url(
- 'wistia:%s' % match.group('id')), 'Wistia')
+ # Look for embedded Wistia player
+ wistia_urls = WistiaIE._extract_urls(webpage)
+ if wistia_urls:
+ playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
+ for entry in playlist['entries']:
+ entry.update({
+ '_type': 'url_transparent',
+ 'uploader': video_uploader,
+ })
+ return playlist
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
return self.url_result(svt_url, 'SVT')
- # Look for embedded condenast player
- matches = re.findall(
- r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
- webpage)
- if matches:
- return {
- '_type': 'playlist',
- 'entries': [{
- '_type': 'url',
- 'ie_key': 'CondeNast',
- 'url': ma,
- } for ma in matches],
- 'title': video_title,
- 'id': video_id,
- }
-
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
@@ -2139,10 +2909,11 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
# Look for Ooyala videos
- mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
- re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
- re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
- re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
+ mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage)
+ or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)
+ or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage)
+ or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage)
+ or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
embed_token = self._search_regex(
r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)',
@@ -2172,23 +2943,10 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group(1), 'Mpora')
- # Look for embedded NovaMov-based player
- mobj = re.search(
- r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
- (?P<url>http://(?:(?:embed|www)\.)?
- (?:novamov\.com|
- nowvideo\.(?:ch|sx|eu|at|ag|co)|
- videoweed\.(?:es|com)|
- movshare\.(?:net|sx|ag)|
- divxstage\.(?:eu|net|ch|co|at|ag))
- /embed\.php.+?)\1''', webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'))
-
# Look for embedded Facebook player
- facebook_url = FacebookIE._extract_url(webpage)
- if facebook_url is not None:
- return self.url_result(facebook_url, 'Facebook')
+ facebook_urls = FacebookIE._extract_urls(webpage)
+ if facebook_urls:
+ return self.playlist_from_matches(facebook_urls, video_id, video_title)
# Look for embedded VK player
mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
@@ -2196,9 +2954,14 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'VK')
# Look for embedded Odnoklassniki player
- mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'Odnoklassniki')
+ odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage)
+ if odnoklassniki_url:
+ return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
+
+ # Look for sibnet embedded player
+ sibnet_urls = VKIE._extract_sibnet_urls(webpage)
+ if sibnet_urls:
+ return self.playlist_from_matches(sibnet_urls, video_id, video_title)
# Look for embedded ivi player
mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
@@ -2225,6 +2988,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
+ # Look for Simplecast embeds
+ simplecast_urls = SimplecastIE._extract_urls(webpage)
+ if simplecast_urls:
+ return self.playlist_from_matches(
+ simplecast_urls, video_id, video_title)
+
# Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches:
@@ -2241,9 +3010,9 @@ class GenericIE(InfoExtractor):
return self.url_result(tvc_url, 'TVC')
# Look for embedded SportBox player
- sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
+ sportbox_urls = SportBoxIE._extract_urls(webpage)
if sportbox_urls:
- return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')
+ return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
# Look for embedded XHamster player
xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
@@ -2270,6 +3039,26 @@ class GenericIE(InfoExtractor):
if redtube_urls:
return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
+ # Look for embedded Tube8 player
+ tube8_urls = Tube8IE._extract_urls(webpage)
+ if tube8_urls:
+ return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key())
+
+ # Look for embedded Mofosex player
+ mofosex_urls = MofosexEmbedIE._extract_urls(webpage)
+ if mofosex_urls:
+ return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key())
+
+ # Look for embedded Spankwire player
+ spankwire_urls = SpankwireIE._extract_urls(webpage)
+ if spankwire_urls:
+ return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key())
+
+ # Look for embedded YouPorn player
+ youporn_urls = YouPornIE._extract_urls(webpage)
+ if youporn_urls:
+ return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key())
+
# Look for embedded Tvigle player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -2288,11 +3077,9 @@ class GenericIE(InfoExtractor):
return self.url_result(ustream_url, UstreamIE.ie_key())
# Look for embedded arte.tv player
- mobj = re.search(
- r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
- webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+ arte_urls = ArteTVEmbedIE._extract_urls(webpage)
+ if arte_urls:
+ return self.playlist_from_matches(arte_urls, video_id, video_title)
# Look for embedded francetv player
mobj = re.search(
@@ -2301,20 +3088,15 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'))
- # Look for embedded smotri.com player
- smotri_url = SmotriIE._extract_url(webpage)
- if smotri_url:
- return self.url_result(smotri_url, 'Smotri')
-
# Look for embedded Myvi.ru player
myvi_url = MyviIE._extract_url(webpage)
if myvi_url:
return self.url_result(myvi_url)
# Look for embedded soundcloud player
- soundcloud_urls = SoundcloudIE._extract_urls(webpage)
+ soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage)
if soundcloud_urls:
- return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
+ return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML)
# Look for tunein player
tunein_urls = TuneInBaseIE._extract_urls(webpage)
@@ -2357,7 +3139,7 @@ class GenericIE(InfoExtractor):
webpage)
if not mobj:
mobj = re.search(
- r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
+ r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
@@ -2381,16 +3163,19 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- kaltura_url = KalturaIE._extract_url(webpage)
- if kaltura_url:
- return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
+ kaltura_urls = KalturaIE._extract_urls(webpage)
+ if kaltura_urls:
+ return self.playlist_from_matches(
+ kaltura_urls, video_id, video_title,
+ getter=lambda x: smuggle_url(x, {'source_url': url}),
+ ie=KalturaIE.ie_key())
- # Look for Eagle.Platform embeds
+ # Look for EaglePlatform embeds
eagleplatform_url = EaglePlatformIE._extract_url(webpage)
if eagleplatform_url:
- return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
+ return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
- # Look for ClipYou (uses Eagle.Platform) embeds
+ # Look for ClipYou (uses EaglePlatform) embeds
mobj = re.search(
r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
if mobj is not None:
@@ -2447,7 +3232,7 @@ class GenericIE(InfoExtractor):
# Look for UDN embeds
mobj = re.search(
- r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
+ r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
if mobj is not None:
return self.url_result(
compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
@@ -2457,10 +3242,11 @@ class GenericIE(InfoExtractor):
if senate_isvp_url:
return self.url_result(senate_isvp_url, 'SenateISVP')
- # Look for Dailymotion Cloud videos
- dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
- if dmcloud_url:
- return self.url_result(dmcloud_url, 'DailymotionCloud')
+ # Look for Kinja embeds
+ kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url)
+ if kinja_embed_urls:
+ return self.playlist_from_matches(
+ kinja_embed_urls, video_id, video_title)
# Look for OnionStudios embeds
onionstudios_url = OnionStudiosIE._extract_url(webpage)
@@ -2473,9 +3259,9 @@ class GenericIE(InfoExtractor):
return self.url_result(viewlift_url)
# Look for JWPlatform embeds
- jwplatform_url = JWPlatformIE._extract_url(webpage)
- if jwplatform_url:
- return self.url_result(jwplatform_url, 'JWPlatform')
+ jwplatform_urls = JWPlatformIE._extract_urls(webpage)
+ if jwplatform_urls:
+ return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key())
# Look for Digiteka embeds
digiteka_url = DigitekaIE._extract_url(webpage)
@@ -2498,28 +3284,11 @@ class GenericIE(InfoExtractor):
return self.playlist_result(
limelight_urls, video_id, video_title, video_description)
- mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
- if mobj:
- lm = {
- 'Media': 'media',
- 'Channel': 'channel',
- 'ChannelList': 'channel_list',
- }
- return self.url_result(smuggle_url('limelight:%s:%s' % (
- lm[mobj.group(1)], mobj.group(2)), {'source_url': url}),
- 'Limelight%s' % mobj.group(1), mobj.group(2))
-
- mobj = re.search(
- r'''(?sx)
- <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*?
- <param[^>]+
- name=(["\'])flashVars\2[^>]+
- value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
- ''', webpage)
- if mobj:
- return self.url_result(smuggle_url(
- 'limelight:media:%s' % mobj.group('id'),
- {'source_url': url}), 'LimelightMedia', mobj.group('id'))
+ # Look for Anvato embeds
+ anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
+ if anvato_urls:
+ return self.playlist_result(
+ anvato_urls, video_id, video_title, video_description)
# Look for AdobeTVVideo embeds
mobj = re.search(
@@ -2540,7 +3309,7 @@ class GenericIE(InfoExtractor):
# Look for VODPlatform embeds
mobj = re.search(
- r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1',
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1',
webpage)
if mobj is not None:
return self.url_result(
@@ -2548,10 +3317,14 @@ class GenericIE(InfoExtractor):
# Look for Mangomolo embeds
mobj = re.search(
- r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/
+ r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//
+ (?:
+ admin\.mangomolo\.com/analytics/index\.php/customers/embed|
+ player\.mangomolo\.com/v1
+ )/
(?:
video\?.*?\bid=(?P<video_id>\d+)|
- index\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)
+ (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)
).+?)\1''', webpage)
if mobj is not None:
info = {
@@ -2581,11 +3354,6 @@ class GenericIE(InfoExtractor):
return self.url_result(
self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
- # Look for LiveLeak embeds
- liveleak_url = LiveLeakIE._extract_url(webpage)
- if liveleak_url:
- return self.url_result(liveleak_url, 'LiveLeak')
-
# Look for 3Q SDN embeds
threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
if threeqsdn_url:
@@ -2620,12 +3388,6 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
- # Look for Openload embeds
- openload_urls = OpenloadIE._extract_urls(webpage)
- if openload_urls:
- return self.playlist_from_matches(
- openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
-
# Look for VideoPress embeds
videopress_urls = VideoPressIE._extract_urls(webpage)
if videopress_urls:
@@ -2636,40 +3398,217 @@ class GenericIE(InfoExtractor):
rutube_urls = RutubeIE._extract_urls(webpage)
if rutube_urls:
return self.playlist_from_matches(
- rutube_urls, ie=RutubeIE.ie_key())
+ rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
- # Looking for http://schema.org/VideoObject
- json_ld = self._search_json_ld(
- webpage, video_id, default={}, expected_type='VideoObject')
- if json_ld.get('url'):
- info_dict.update({
- 'title': video_title or info_dict['title'],
- 'description': video_description,
- 'thumbnail': video_thumbnail,
- 'age_limit': age_limit
- })
- info_dict.update(json_ld)
- return info_dict
+ # Look for WashingtonPost embeds
+ wapo_urls = WashingtonPostIE._extract_urls(webpage)
+ if wapo_urls:
+ return self.playlist_from_matches(
+ wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
+ # Look for Mediaset embeds
+ mediaset_urls = MediasetIE._extract_urls(self, webpage)
+ if mediaset_urls:
+ return self.playlist_from_matches(
+ mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
+
+ # Look for JOJ.sk embeds
+ joj_urls = JojIE._extract_urls(webpage)
+ if joj_urls:
+ return self.playlist_from_matches(
+ joj_urls, video_id, video_title, ie=JojIE.ie_key())
+
+ # Look for megaphone.fm embeds
+ mpfn_urls = MegaphoneIE._extract_urls(webpage)
+ if mpfn_urls:
+ return self.playlist_from_matches(
+ mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key())
+
+ # Look for vzaar embeds
+ vzaar_urls = VzaarIE._extract_urls(webpage)
+ if vzaar_urls:
+ return self.playlist_from_matches(
+ vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
+
+ channel9_urls = Channel9IE._extract_urls(webpage)
+ if channel9_urls:
+ return self.playlist_from_matches(
+ channel9_urls, video_id, video_title, ie=Channel9IE.ie_key())
+
+ vshare_urls = VShareIE._extract_urls(webpage)
+ if vshare_urls:
+ return self.playlist_from_matches(
+ vshare_urls, video_id, video_title, ie=VShareIE.ie_key())
+
+ # Look for Mediasite embeds
+ mediasite_urls = MediasiteIE._extract_urls(webpage)
+ if mediasite_urls:
+ entries = [
+ self.url_result(smuggle_url(
+ compat_urlparse.urljoin(url, mediasite_url),
+ {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
+ for mediasite_url in mediasite_urls]
+ return self.playlist_result(entries, video_id, video_title)
+
+ springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage)
+ if springboardplatform_urls:
+ return self.playlist_from_matches(
+ springboardplatform_urls, video_id, video_title,
+ ie=SpringboardPlatformIE.ie_key())
+
+ yapfiles_urls = YapFilesIE._extract_urls(webpage)
+ if yapfiles_urls:
+ return self.playlist_from_matches(
+ yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key())
+
+ vice_urls = ViceIE._extract_urls(webpage)
+ if vice_urls:
+ return self.playlist_from_matches(
+ vice_urls, video_id, video_title, ie=ViceIE.ie_key())
+
+ xfileshare_urls = XFileShareIE._extract_urls(webpage)
+ if xfileshare_urls:
+ return self.playlist_from_matches(
+ xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key())
+
+ cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage)
+ if cloudflarestream_urls:
+ return self.playlist_from_matches(
+ cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key())
+
+ peertube_urls = PeerTubeIE._extract_urls(webpage, url)
+ if peertube_urls:
+ return self.playlist_from_matches(
+ peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
+
+ indavideo_urls = IndavideoEmbedIE._extract_urls(webpage)
+ if indavideo_urls:
+ return self.playlist_from_matches(
+ indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key())
+
+ apa_urls = APAIE._extract_urls(webpage)
+ if apa_urls:
+ return self.playlist_from_matches(
+ apa_urls, video_id, video_title, ie=APAIE.ie_key())
+
+ foxnews_urls = FoxNewsIE._extract_urls(webpage)
+ if foxnews_urls:
+ return self.playlist_from_matches(
+ foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key())
+
+ sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer(
+ r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1',
+ webpage)]
+ if sharevideos_urls:
+ return self.playlist_from_matches(
+ sharevideos_urls, video_id, video_title)
+
+ viqeo_urls = ViqeoIE._extract_urls(webpage)
+ if viqeo_urls:
+ return self.playlist_from_matches(
+ viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key())
+
+ expressen_urls = ExpressenIE._extract_urls(webpage)
+ if expressen_urls:
+ return self.playlist_from_matches(
+ expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key())
+
+ zype_urls = ZypeIE._extract_urls(webpage)
+ if zype_urls:
+ return self.playlist_from_matches(
+ zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
- for entry in entries:
- entry.update({
+ if len(entries) == 1:
+ entries[0].update({
'id': video_id,
'title': video_title,
})
+ else:
+ for num, entry in enumerate(entries, start=1):
+ entry.update({
+ 'id': '%s-%s' % (video_id, num),
+ 'title': '%s (%d)' % (video_title, num),
+ })
+ for entry in entries:
self._sort_formats(entry['formats'])
- return self.playlist_result(entries)
+ return self.playlist_result(entries, video_id, video_title)
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
- info = self._parse_jwplayer_data(
- jwplayer_data, video_id, require_title=False, base_url=url)
- if not info.get('title'):
- info['title'] = video_title
- return info
+ try:
+ info = self._parse_jwplayer_data(
+ jwplayer_data, video_id, require_title=False, base_url=url)
+ return merge_dicts(info, info_dict)
+ except ExtractorError:
+ # See https://github.com/ytdl-org/youtube-dl/pull/16735
+ pass
+
+ # Video.js embed
+ mobj = re.search(
+ r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
+ webpage)
+ if mobj is not None:
+ sources = self._parse_json(
+ mobj.group(1), video_id, transform_source=js_to_json,
+ fatal=False) or []
+ if not isinstance(sources, list):
+ sources = [sources]
+ formats = []
+ for source in sources:
+ src = source.get('src')
+ if not src or not isinstance(src, compat_str):
+ continue
+ src = compat_urlparse.urljoin(url, src)
+ src_type = source.get('type')
+ if isinstance(src_type, compat_str):
+ src_type = src_type.lower()
+ ext = determine_ext(src).lower()
+ if src_type == 'video/youtube':
+ return self.url_result(src, YoutubeIE.ie_key())
+ if src_type == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False))
+ elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ 'ext': (mimetype2ext(src_type)
+ or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
+ 'http_headers': {
+ 'Referer': full_response.geturl(),
+ },
+ })
+ if formats:
+ self._sort_formats(formats)
+ info_dict['formats'] = formats
+ return info_dict
+
+ # Look for generic KVS player (before ld+json for tests)
+ found = self._search_regex(
+ (r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
+ # kt_player('kt_player', 'https://i.shoosh.co/player/kt_player.swf?v=5.5.1', ...
+ r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
+ ), webpage, 'KVS player', group='ver', default=False)
+ if found:
+ self.report_extraction('%s: KVS Player' % (video_id, ))
+ if found.split('.')[0] not in ('4', '5', '6'):
+ self.report_warning('Untested major version (%s) in player engine - download may fail.' % (found, ))
+ return merge_dicts(
+ self._extract_kvs(url, webpage, video_id),
+ info_dict)
+
+ # Looking for http://schema.org/VideoObject
+ json_ld = self._search_json_ld(
+ webpage, video_id, default={}, expected_type='VideoObject')
+ if json_ld.get('url'):
+ return merge_dicts(json_ld, info_dict)
def check_video(vurl):
if YoutubeIE.suitable(vurl):
@@ -2727,7 +3666,7 @@ class GenericIE(InfoExtractor):
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
- found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
+ found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage))
if not found:
REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
found = re.search(
@@ -2758,7 +3697,7 @@ class GenericIE(InfoExtractor):
# be supported by youtube-dl thus this is checked the very last (see
# https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
embed_url = self._html_search_meta('twitter:player', webpage, default=None)
- if embed_url:
+ if embed_url and embed_url != url:
return self.url_result(embed_url)
if not found: