about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- youtube_dl/extractor/generic.py 155
1 files changed, 149 insertions, 6 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 27e2bc300..2ff002643 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -26,8 +26,10 @@ from ..utils import (
unsmuggle_url,
UnsupportedError,
url_basename,
+ xpath_text,
)
from .brightcove import BrightcoveIE
+from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
from .smotri import SmotriIE
@@ -526,6 +528,17 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Viddler'],
},
+ # Libsyn embed
+ {
+ 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
+ 'info_dict': {
+ 'id': '3377616',
+ 'ext': 'mp3',
+ 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
+ 'description': 'md5:601cb790edd05908957dae8aaa866465',
+ 'upload_date': '20150220',
+ },
+ },
# jwplayer YouTube
{
'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
@@ -569,6 +582,75 @@ class GenericIE(InfoExtractor):
'title': 'John Carlson Postgame 2/25/15',
},
},
+ # Eagle.Platform embed (generic URL)
+ {
+ 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
+ 'info_dict': {
+ 'id': '227304',
+ 'ext': 'mp4',
+ 'title': 'Навальный вышел на свободу',
+ 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 87,
+ 'view_count': int,
+ 'age_limit': 0,
+ },
+ },
+ # ClipYou (Eagle.Platform) embed (custom URL)
+ {
+ 'url': 'http://muz-tv.ru/play/7129/',
+ 'info_dict': {
+ 'id': '12820',
+ 'ext': 'mp4',
+ 'title': "'O Sole Mio",
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 216,
+ 'view_count': int,
+ },
+ },
+ # Pladform embed
+ {
+ 'url': 'http://muz-tv.ru/kinozal/view/7400/',
+ 'info_dict': {
+ 'id': '100183293',
+ 'ext': 'mp4',
+ 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
+ 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 694,
+ 'age_limit': 0,
+ },
+ },
+ # 5min embed
+ {
+ 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
+ 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
+ 'info_dict': {
+ 'id': '518726732',
+ 'ext': 'mp4',
+ 'title': 'Facebook Creates "On This Day" | Crunch Report',
+ },
+ },
+ # RSS feed with enclosure
+ {
+ 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+ 'info_dict': {
+ 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ 'ext': 'm4v',
+ 'upload_date': '20150228',
+ 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ }
+ },
+ # NBC Sports vplayer embed
+ {
+ 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
+ 'info_dict': {
+ 'id': 'ln7x1qSThw4k',
+ 'ext': 'flv',
+ 'title': "PFT Live: New leader in the 'new-look' defense",
+ 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+ },
+ }
]
def report_following_redirect(self, new_url):
@@ -580,11 +662,24 @@ class GenericIE(InfoExtractor):
playlist_desc_el = doc.find('./channel/description')
playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
- entries = [{
- '_type': 'url',
- 'url': e.find('link').text,
- 'title': e.find('title').text,
- } for e in doc.findall('./channel/item')]
+ entries = []
+ for it in doc.findall('./channel/item'):
+ next_url = xpath_text(it, 'link', fatal=False)
+ if not next_url:
+ enclosure_nodes = it.findall('./enclosure')
+ for e in enclosure_nodes:
+ next_url = e.attrib.get('url')
+ if next_url:
+ break
+
+ if not next_url:
+ continue
+
+ entries.append({
+ '_type': 'url',
+ 'url': next_url,
+ 'title': it.find('title').text,
+ })
return {
'_type': 'playlist',
@@ -943,6 +1038,19 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'))
+ # Look for NYTimes player
+ mobj = re.search(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
+ # Look for Libsyn player
+ mobj = re.search(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
# Look for Ooyala videos
mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
@@ -1131,6 +1239,35 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
+ # Look for Eagle.Platform embeds
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'EaglePlatform')
+
+ # Look for ClipYou (uses Eagle.Platform) embeds
+ mobj = re.search(
+ r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
+ if mobj is not None:
+ return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
+
+ # Look for Pladform embeds
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Pladform')
+
+ # Look for 5min embeds
+ mobj = re.search(
+ r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
+ if mobj is not None:
+ return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
+
+ # Look for NBC Sports VPlayer embeds
+ nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+ if nbc_sports_url:
+ return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
@@ -1187,10 +1324,16 @@ class GenericIE(InfoExtractor):
# HTML5 video
found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
if not found:
+ REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
found = re.search(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
- r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
+ r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
webpage)
+ if not found:
+ # Look also in Refresh HTTP header
+ refresh_header = head_response.headers.get('Refresh')
+ if refresh_header:
+ found = re.search(REDIRECT_REGEX, refresh_header)
if found:
new_url = found.group(1)
self.report_following_redirect(new_url)