aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r--youtube_dl/extractor/generic.py112
1 files changed, 104 insertions, 8 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 5c03fddc6..6d2efb22e 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -8,7 +8,6 @@ import re
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
- compat_urllib_parse,
compat_urllib_parse_unquote,
compat_urllib_request,
compat_urlparse,
@@ -37,6 +36,7 @@ from .rutv import RUTVIE
from .tvc import TVCIE
from .sportbox import SportBoxEmbedIE
from .smotri import SmotriIE
+from .myvi import MyviIE
from .condenast import CondeNastIE
from .udn import UDNEmbedIE
from .senateisvp import SenateISVPIE
@@ -46,6 +46,8 @@ from .pornhub import PornHubIE
from .xhamster import XHamsterEmbedIE
from .vimeo import VimeoIE
from .dailymotion import DailymotionCloudIE
+from .onionstudios import OnionStudiosIE
+from .snagfilms import SnagFilmsEmbedIE
class GenericIE(InfoExtractor):
@@ -336,6 +338,17 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ # Myvi.ru embed
+ {
+ 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1',
+ 'info_dict': {
+ 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e',
+ 'ext': 'mp4',
+ 'title': 'Ужастики, русский трейлер (2015)',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 153,
+ }
+ },
# XHamster embed
{
'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
@@ -394,6 +407,26 @@ class GenericIE(InfoExtractor):
'skip_download': 'Requires rtmpdump'
}
},
+ # francetv embed
+ {
+ 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero',
+ 'info_dict': {
+ 'id': 'EV_30231',
+ 'ext': 'mp4',
+ 'title': 'Alcaline, le concert avec Calogero',
+ 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+ 'upload_date': '20150226',
+ 'timestamp': 1424989860,
+ 'duration': 5400,
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
+ 'expected_warnings': [
+ 'Forbidden'
+ ]
+ },
# Condé Nast embed
{
'url': 'http://www.wired.com/2014/04/honda-asimo/',
@@ -667,6 +700,18 @@ class GenericIE(InfoExtractor):
'title': 'John Carlson Postgame 2/25/15',
},
},
+ # Kaltura embed (different embed code)
+ {
+ 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
+ 'info_dict': {
+ 'id': '1_a52wc67y',
+ 'ext': 'flv',
+ 'upload_date': '20150127',
+ 'uploader_id': 'PremierMedia',
+ 'timestamp': int,
+ 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
+ },
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -836,6 +881,27 @@ class GenericIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpe?g$',
}
},
+ # OnionStudios embed
+ {
+ 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
+ 'info_dict': {
+ 'id': '2855',
+ 'ext': 'mp4',
+ 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ 'uploader': 'ClickHole',
+ 'uploader_id': 'clickhole',
+ }
+ },
+ # SnagFilms embed
+ {
+ 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html',
+ 'info_dict': {
+ 'id': '74849a00-85a9-11e1-9660-123139220831',
+ 'ext': 'mp4',
+ 'title': '#whilewewatch',
+ }
+ },
# AdobeTVVideo embed
{
'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
@@ -1014,7 +1080,9 @@ class GenericIE(InfoExtractor):
}
if not self._downloader.params.get('test', False) and not is_intentional:
- self._downloader.report_warning('Falling back on generic information extractor.')
+ force = self._downloader.params.get('force_generic_extractor', False)
+ self._downloader.report_warning(
+ '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
if not full_response:
request = compat_urllib_request.Request(url)
@@ -1066,7 +1134,7 @@ class GenericIE(InfoExtractor):
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/rg3/youtube-dl/issues/2448)
# Unescaping the whole page allows to handle those cases in a generic way
- webpage = compat_urllib_parse.unquote(webpage)
+ webpage = compat_urllib_parse_unquote(webpage)
# it's tempting to parse this further, but you would
# have to take into account all the variations like
@@ -1128,6 +1196,12 @@ class GenericIE(InfoExtractor):
if vimeo_url is not None:
return self.url_result(vimeo_url)
+ vid_me_embed_url = self._search_regex(
+ r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+ webpage, 'vid.me embed', default=None)
+ if vid_me_embed_url is not None:
+ return self.url_result(vid_me_embed_url, 'Vidme')
+
# Look for embedded YouTube player
matches = re.findall(r'''(?x)
(?:
@@ -1320,7 +1394,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
if mobj is not None:
- return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
+ return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
# Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
@@ -1383,11 +1457,23 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+ # Look for embedded francetv player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
# Look for embedded smotri.com player
smotri_url = SmotriIE._extract_url(webpage)
if smotri_url:
return self.url_result(smotri_url, 'Smotri')
+ # Look for embedded Myvi.ru player
+ myvi_url = MyviIE._extract_url(webpage)
+ if myvi_url:
+ return self.url_result(myvi_url)
+
# Look for embeded soundcloud player
mobj = re.search(
r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
@@ -1467,8 +1553,8 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = re.search(
- r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ re.search(r'(?s)(["\'])(?:https?:)?//cdnapisec\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?\1.*?entry_id\s*:\s*(["\'])(?P<id>[^\2]+?)\2', webpage))
if mobj is not None:
return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
@@ -1530,6 +1616,16 @@ class GenericIE(InfoExtractor):
if dmcloud_url:
return self.url_result(dmcloud_url, 'DailymotionCloud')
+ # Look for OnionStudios embeds
+ onionstudios_url = OnionStudiosIE._extract_url(webpage)
+ if onionstudios_url:
+ return self.url_result(onionstudios_url)
+
+ # Look for SnagFilms embeds
+ snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
+ if snagfilms_url:
+ return self.url_result(snagfilms_url)
+
# Look for AdobeTVVideo embeds
mobj = re.search(
r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
@@ -1606,7 +1702,7 @@ class GenericIE(InfoExtractor):
if refresh_header:
found = re.search(REDIRECT_REGEX, refresh_header)
if found:
- new_url = compat_urlparse.urljoin(url, found.group(1))
+ new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
self.report_following_redirect(new_url)
return {
'_type': 'url',
@@ -1618,7 +1714,7 @@ class GenericIE(InfoExtractor):
entries = []
for video_url in found:
video_url = compat_urlparse.urljoin(url, video_url)
- video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+ video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
# Sometimes, jwplayer extraction will result in a YouTube URL
if YoutubeIE.suitable(video_url):