diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- | youtube_dl/extractor/generic.py | 382 |
1 files changed, 256 insertions, 126 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b06f43446..b83c18380 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -10,6 +10,7 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( compat_etree_fromstring, + compat_str, compat_urllib_parse_unquote, compat_urlparse, compat_xml_parse_error, @@ -35,6 +36,10 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -56,6 +61,7 @@ from .dailymotion import ( DailymotionIE, DailymotionCloudIE, ) +from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE from .mtv import MTVServicesEmbeddedIE @@ -88,6 +94,11 @@ from .rutube import RutubeIE from .limelight import LimelightBaseIE from .anvato import AnvatoIE from .washingtonpost import WashingtonPostIE +from .wistia import WistiaIE +from .mediaset import MediasetIE +from .joj import JojIE +from .megaphone import MegaphoneIE +from .vzaar import VzaarIE class GenericIE(InfoExtractor): @@ -565,6 +576,19 @@ class GenericIE(InfoExtractor): }, 'skip': 'movie expired', }, + # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js + { + 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', + 'info_dict': { + 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', + 'ext': 'mp4', + 'title': 'Steampunk Fest Comes to Honesdale', + 'duration': 43.276, + }, + 'params': { + 'skip_download': True, + } + }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -756,6 +780,20 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Dailymotion'], }, + # DailyMail embed + { + 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', + 'info_dict': { + 'id': '1495629', + 'ext': 'mp4', + 'title': 'Care worker punches elderly dementia patient in head 11 times', + 'description': 'md5:3a743dee84e57e48ec68bf67113199a5', + }, + 'add_ie': ['DailyMail'], + 'params': { + 'skip_download': True, + }, + }, # YouTube embed { 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', @@ -1182,7 +1220,7 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, - # Eagle.Platform embed (generic URL) + # EaglePlatform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -1196,8 +1234,26 @@ class GenericIE(InfoExtractor): 'view_count': int, 'age_limit': 0, }, + 'params': { + 'skip_download': True, + }, + }, + # referrer protected EaglePlatform embed + { + 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', + 'info_dict': { + 'id': '582306', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3382, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, }, - # ClipYou (Eagle.Platform) embed (custom URL) + # ClipYou (EaglePlatform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -1209,6 +1265,9 @@ class GenericIE(InfoExtractor): 'duration': 216, 'view_count': int, }, + 'params': { + 'skip_download': True, + }, }, # Pladform embed { @@ -1460,14 +1519,27 @@ class GenericIE(InfoExtractor): # LiveLeak embed { 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'md5': '7619da8c820e835bef21a1efa2a0fc71', 'info_dict': { 'id': '874_1459135191', 'ext': 'mp4', 'title': 'Man shows poor quality of new apartment building', 'description': 'The wall is like a sand pile.', 'uploader': 'Lake8737', - } + }, + 'add_ie': [LiveLeakIE.ie_key()], + }, + # Another LiveLeak embed pattern (#13336) + { + 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', + 'info_dict': { + 'id': '2eb_1496309988', + 'ext': 'mp4', + 'title': 'Thief robs place where everyone was armed', + 'description': 'md5:694d73ee79e535953cf2488562288eee', + 'uploader': 'brazilwtf', + }, + 'add_ie': [LiveLeakIE.ie_key()], }, # Duplicated embedded video URLs { @@ -1509,6 +1581,22 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['BrightcoveLegacy'], }, + # Nexx embed + { + 'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503', + 'info_dict': { + 'id': '247746', + 'ext': 'mp4', + 'title': "Yesterday's Jam (OV)", + 'description': 'md5:09bc0984723fed34e2581624a84e05f0', + 'timestamp': 1492594816, + 'upload_date': '20170419', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, # Facebook <iframe> embed { 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', @@ -1519,6 +1607,21 @@ class GenericIE(InfoExtractor): 'title': 'Facebook video #599637780109885', }, }, + # Facebook <iframe> embed, plugin video + { + 'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/', + 'info_dict': { + 'id': '1754168231264132', + 'ext': 'mp4', + 'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...', + 'uploader': 'Tariq Ramadan (official)', + 'timestamp': 1496758379, + 'upload_date': '20170606', + }, + 'params': { + 'skip_download': True, + }, + }, # Facebook API embed { 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', @@ -1696,6 +1799,21 @@ class GenericIE(InfoExtractor): 'playlist_mincount': 5, }, { + # Limelight embed (LimelightPlayerUtil.embed) + 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', + 'info_dict': { + 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', + 'ext': 'mp4', + 'title': '07448641', + 'timestamp': 1499890639, + 'upload_date': '20170712', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['LimelightMedia'], + }, + { 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', 'info_dict': { 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', @@ -1718,6 +1836,49 @@ class GenericIE(InfoExtractor): }, 'add_ie': [WashingtonPostIE.ie_key()], }, + { + # Mediaset embed + 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', + 'info_dict': { + 'id': '720642', + 'ext': 'mp4', + 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [MediasetIE.ie_key()], + }, + { + # JOJ.sk embeds + 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok', + 'info_dict': { + 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok', + 'title': 'Slovenskom sa prehnala vlna silných búrok', + }, + 'playlist_mincount': 5, + 'add_ie': [JojIE.ie_key()], + }, + { + # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) + 'url': 'https://tvrain.ru/amp/418921/', + 'md5': 'cc00413936695987e8de148b67d14f1d', + 'info_dict': { + 'id': '418921', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + }, + }, + { + # vzaar embed + 'url': 'http://help.vzaar.com/article/165-embedding-video', + 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', + 'info_dict': { + 'id': '8707641', + 'ext': 'mp4', + 'title': 'Building A Business Online: Principal Chairs Q & A', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1867,7 +2028,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = head_response.geturl() + new_url = compat_str(head_response.geturl()) if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -1892,14 +2053,14 @@ class GenericIE(InfoExtractor): content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: - format_id = m.group('format_id') + format_id = compat_str(m.group('format_id')) if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id) else: formats = [{ - 'format_id': m.group('format_id'), + 'format_id': format_id, 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] @@ -1968,7 +2129,7 @@ class GenericIE(InfoExtractor): elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( doc, video_id, - mpd_base_url=full_response.geturl().rpartition('/')[0], + mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict @@ -2017,6 +2178,13 @@ class GenericIE(InfoExtractor): video_description = self._og_search_description(webpage, default=None) video_thumbnail = self._og_search_thumbnail(webpage, default=None) + info_dict.update({ + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'age_limit': age_limit, + }) + # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -2038,6 +2206,16 @@ class GenericIE(InfoExtractor): if bc_urls: return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') + # Look for Nexx embeds + nexx_urls = NexxIE._extract_urls(webpage) + if nexx_urls: + return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) + + # Look for Nexx iFrame embeds + nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) + if nexx_embed_urls: + return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) + # Look for ThePlatform embeds tp_urls = ThePlatformIE._extract_urls(webpage) if tp_urls: @@ -2065,36 +2243,11 @@ class GenericIE(InfoExtractor): if vid_me_embed_url is not None: return self.url_result(vid_me_embed_url, 'Vidme') - # Look for embedded YouTube player - matches = re.findall(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/.+?) - \1''', webpage) - if matches: + # Look for YouTube embeds + youtube_urls = YoutubeIE._extract_urls(webpage) + if youtube_urls: return self.playlist_from_matches( - matches, video_id, video_title, lambda m: unescapeHTML(m[1])) - - # Look for lazyYT YouTube embed - matches = re.findall( - r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) - - # Look for Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) + youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) matches = DailymotionIE._extract_urls(webpage) if matches: @@ -2110,58 +2263,27 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - # Look for embedded Wistia player - match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) - if match: - embed_url = self._proto_relative_url( - unescapeHTML(match.group('url'))) - return { - '_type': 'url_transparent', - 'url': embed_url, - 'ie_key': 'Wistia', - 'uploader': video_uploader, - } + # Look for DailyMail embeds + dailymail_urls = DailyMailIE._extract_urls(webpage) + if dailymail_urls: + return self.playlist_from_matches( + dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) - if match: + # Look for embedded Wistia player + wistia_url = WistiaIE._extract_url(webpage) + if wistia_url: return { '_type': 'url_transparent', - 'url': 'wistia:%s' % match.group('id'), - 'ie_key': 'Wistia', + 'url': self._proto_relative_url(wistia_url), + 'ie_key': WistiaIE.ie_key(), 'uploader': video_uploader, } - match = re.search( - r'''(?sx) - <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? - <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 - ''', webpage) - if match: - return self.url_result(self._proto_relative_url( - 'wistia:%s' % match.group('id')), 'Wistia') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: return self.url_result(svt_url, 'SVT') - # Look for embedded condenast player - matches = re.findall( - r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")', - webpage) - if matches: - return { - '_type': 'playlist', - 'entries': [{ - '_type': 'url', - 'ie_key': 'CondeNast', - 'url': ma, - } for ma in matches], - 'title': video_title, - 'id': video_id, - } - # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -2198,6 +2320,7 @@ class GenericIE(InfoExtractor): # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or + re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: @@ -2243,9 +2366,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for embedded Facebook player - facebook_url = FacebookIE._extract_url(webpage) - if facebook_url is not None: - return self.url_result(facebook_url, 'Facebook') + facebook_urls = FacebookIE._extract_urls(webpage) + if facebook_urls: + return self.playlist_from_matches(facebook_urls, video_id, video_title) # Look for embedded VK player mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) @@ -2442,12 +2565,12 @@ class GenericIE(InfoExtractor): if kaltura_url: return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) - # Look for Eagle.Platform embeds + # Look for EaglePlatform embeds eagleplatform_url = EaglePlatformIE._extract_url(webpage) if eagleplatform_url: - return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) + return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - # Look for ClipYou (uses Eagle.Platform) embeds + # Look for ClipYou (uses EaglePlatform) embeds mobj = re.search( r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) if mobj is not None: @@ -2555,29 +2678,6 @@ class GenericIE(InfoExtractor): return self.playlist_result( limelight_urls, video_id, video_title, video_description) - mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) - if mobj: - lm = { - 'Media': 'media', - 'Channel': 'channel', - 'ChannelList': 'channel_list', - } - return self.url_result(smuggle_url('limelight:%s:%s' % ( - lm[mobj.group(1)], mobj.group(2)), {'source_url': url}), - 'Limelight%s' % mobj.group(1), mobj.group(2)) - - mobj = re.search( - r'''(?sx) - <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*? - <param[^>]+ - name=(["\'])flashVars\2[^>]+ - value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) - ''', webpage) - if mobj: - return self.url_result(smuggle_url( - 'limelight:media:%s' % mobj.group('id'), - {'source_url': url}), 'LimelightMedia', mobj.group('id')) - # Look for Anvato embeds anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) if anvato_urls: @@ -2645,9 +2745,9 @@ class GenericIE(InfoExtractor): self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) # Look for LiveLeak embeds - liveleak_url = LiveLeakIE._extract_url(webpage) - if liveleak_url: - return self.url_result(liveleak_url, 'LiveLeak') + liveleak_urls = LiveLeakIE._extract_urls(webpage) + if liveleak_urls: + return self.playlist_from_matches(liveleak_urls, video_id, video_title) # Look for 3Q SDN embeds threeqsdn_url = ThreeQSDNIE._extract_url(webpage) @@ -2699,7 +2799,7 @@ class GenericIE(InfoExtractor): rutube_urls = RutubeIE._extract_urls(webpage) if rutube_urls: return self.playlist_from_matches( - rutube_urls, ie=RutubeIE.ie_key()) + rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) # Look for WashingtonPost embeds wapo_urls = WashingtonPostIE._extract_urls(webpage) @@ -2707,18 +2807,44 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - info_dict.update({ - 'title': video_title or info_dict['title'], - 'description': video_description, - 'thumbnail': video_thumbnail, - 'age_limit': age_limit - }) - info_dict.update(json_ld) - return info_dict + # Look for Mediaset embeds + mediaset_urls = MediasetIE._extract_urls(webpage) + if mediaset_urls: + return self.playlist_from_matches( + mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) + + # Look for JOJ.sk embeds + joj_urls = JojIE._extract_urls(webpage) + if joj_urls: + return self.playlist_from_matches( + joj_urls, video_id, video_title, ie=JojIE.ie_key()) + + # Look for megaphone.fm embeds + mpfn_urls = MegaphoneIE._extract_urls(webpage) + if mpfn_urls: + return self.playlist_from_matches( + mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) + + # Look for vzaar embeds + vzaar_urls = VzaarIE._extract_urls(webpage) + if vzaar_urls: + return self.playlist_from_matches( + vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) + + def merge_dicts(dict1, dict2): + merged = {} + for k, v in dict1.items(): + if v is not None: + merged[k] = v + for k, v in dict2.items(): + if v is None: + continue + if (k not in merged or + (isinstance(v, compat_str) and v and + isinstance(merged[k], compat_str) and + not merged[k])): + merged[k] = v + return merged # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') @@ -2736,9 +2862,13 @@ class GenericIE(InfoExtractor): if jwplayer_data: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) - if not info.get('title'): - info['title'] = video_title - return info + return merge_dicts(info, info_dict) + + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + return merge_dicts(json_ld, info_dict) def check_video(vurl): if YoutubeIE.suitable(vurl): |