about | summary | refs | log | tree | commit | diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/brightcove.py | 67
-rw-r--r-- youtube_dl/extractor/extractors.py | 6
-rw-r--r-- youtube_dl/extractor/generic.py | 58
-rw-r--r-- youtube_dl/extractor/streamango.py | 64
-rw-r--r-- youtube_dl/extractor/wsj.py | 52
5 files changed, 212 insertions, 35 deletions
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 46ef8e605..124497e95 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -17,6 +17,7 @@ from ..compat import (
from ..utils import (
determine_ext,
ExtractorError,
+ extract_attributes,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
@@ -109,6 +110,7 @@ class BrightcoveLegacyIE(InfoExtractor):
'upload_date': '20140827',
'uploader_id': '710858724001',
},
+ 'skip': 'Video gone',
},
{
# playlist with 'videoList'
@@ -487,12 +489,13 @@ class BrightcoveNewIE(InfoExtractor):
return urls[0] if urls else None
@staticmethod
- def _extract_urls(webpage):
+ def _extract_urls(ie, webpage):
# Reference:
# 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
- # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
- # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html
- # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
+ # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
+ # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
+ # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
+ # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
entries = []
@@ -501,22 +504,48 @@ class BrightcoveNewIE(InfoExtractor):
r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
entries.append(url if url.startswith('http') else 'http:' + url)
- # Look for embed_in_page embeds [2]
- for video_id, account_id, player_id, embed in re.findall(
- # According to examples from [3] it's unclear whether video id
- # may be optional and what to do when it is
- # According to [4] data-video-id may be prefixed with ref:
- r'''(?sx)
- <video[^>]+
- data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*?
- </video>.*?
- <script[^>]+
- src=["\'](?:https?:)?//players\.brightcove\.net/
- (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+ # Look for <video> tags [2] and embed_in_page embeds [3]
+ # [2] looks like:
+ for video, script_tag, account_id, player_id, embed in re.findall(
+ r'''(?isx)
+ (<video\s+[^>]+>)
+ (?:.*?
+ (<script[^>]+
+ src=["\'](?:https?:)?//players\.brightcove\.net/
+ (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+ )
+ )?
''', webpage):
- entries.append(
- 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
- % (account_id, player_id, embed, video_id))
+ attrs = extract_attributes(video)
+
+ # According to examples from [4] it's unclear whether video id
+ # may be optional and what to do when it is
+ video_id = attrs.get('data-video-id')
+ if not video_id:
+ continue
+
+ account_id = account_id or attrs.get('data-account')
+ if not account_id:
+ continue
+
+ player_id = player_id or attrs.get('data-player') or 'default'
+ embed = embed or attrs.get('data-embed') or 'default'
+
+ bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
+ account_id, player_id, embed, video_id)
+
+ # Some brightcove videos may be embedded with video tag only and
+ # without script tag or any mentioning of brightcove at all. Such
+ # embeds are considered ambiguous since they are matched based only
+ # on data-video-id and data-account attributes and in the wild may
+ # not be brightcove embeds at all. Let's check reconstructed
+ # brightcove URLs in case of such embeds and only process valid
+ # ones. By this we ensure there is indeed a brightcove embed.
+ if not script_tag and not ie._is_valid_url(
+ bc_url, video_id, 'possible brightcove video'):
+ continue
+
+ entries.append(bc_url)
return entries
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 1671090f4..a92cbefed 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -939,6 +939,7 @@ from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamable import StreamableIE
+from .streamango import StreamangoIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
@@ -1233,7 +1234,10 @@ from .wrzuta import (
WrzutaIE,
WrzutaPlaylistIE,
)
-from .wsj import WSJIE
+from .wsj import (
+ WSJIE,
+ WSJArticleIE,
+)
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 36d23d2f3..6a34c2491 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -465,6 +465,59 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 download
},
+ 'skip': 'video rotates...weekly?',
+ },
+ {
+ # Brightcove:new type [2].
+ 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis',
+ 'md5': '2b35148fcf48da41c9fb4591650784f3',
+ 'info_dict': {
+ 'id': '5348741021001',
+ 'ext': 'mp4',
+ 'upload_date': '20170306',
+ 'uploader_id': '4191638492001',
+ 'timestamp': 1488769918,
+ 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis',
+
+ },
+ },
+ {
+ # Alternative brightcove <video> attributes
+ 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
+ 'info_dict': {
+ 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
+ },
+ 'playlist': [{
+ 'md5': '732d22ba3d33f2f3fc253c39f8f36523',
+ 'info_dict': {
+ 'id': '5311302538001',
+ 'ext': 'mp4',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
+ 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
+ 'timestamp': 1486321708,
+ 'upload_date': '20170205',
+ 'uploader_id': '800000640001',
+ },
+ 'only_matching': True,
+ }],
+ },
+ {
+ # Brightcove with UUID in videoPlayer
+ 'url': 'http://www8.hp.com/cn/zh/home.html',
+ 'info_dict': {
+ 'id': '5255815316001',
+ 'ext': 'mp4',
+ 'title': 'Sprocket Video - China',
+ 'description': 'Sprocket Video - China',
+ 'uploader': 'HP-Video Gallery',
+ 'timestamp': 1482263210,
+ 'upload_date': '20161220',
+ 'uploader_id': '1107601872001',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
},
# ooyala video
{
@@ -1640,7 +1693,7 @@ class GenericIE(InfoExtractor):
continue
entries.append({
- '_type': 'url',
+ '_type': 'url_transparent',
'url': next_url,
'title': it.find('title').text,
})
@@ -1900,7 +1953,6 @@ class GenericIE(InfoExtractor):
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
- self.to_screen('Brightcove video detected.')
entries = [{
'_type': 'url',
'url': smuggle_url(bc_url, {'Referer': url}),
@@ -1915,7 +1967,7 @@ class GenericIE(InfoExtractor):
}
# Look for Brightcove New Studio embeds
- bc_urls = BrightcoveNewIE._extract_urls(webpage)
+ bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
if bc_urls:
return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py
new file mode 100644
index 000000000..aa4fad162
--- /dev/null
+++ b/youtube_dl/extractor/streamango.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+)
+
+
+class StreamangoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
+ 'md5': 'e992787515a182f55e38fc97588d802a',
+ 'info_dict': {
+ 'id': 'clapasobsptpkdfe',
+ 'ext': 'mp4',
+ 'title': '20170315_150006.mp4',
+ }
+ }, {
+ 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage)
+
+ formats = []
+ for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
+ video = self._parse_json(
+ format_, video_id, transform_source=js_to_json, fatal=False)
+ if not video:
+ continue
+ src = video.get('src')
+ if not src:
+ continue
+ ext = determine_ext(src, default_ext=None)
+ if video.get('type') == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ 'ext': ext or 'mp4',
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ 'tbr': int_or_none(video.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'url': url,
+ 'title': title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index deb7483ae..45cfca7c5 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -10,12 +10,14 @@ from ..utils import (
class WSJIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://
- (?:
- video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
- (?:www\.)?wsj\.com/video/[^/]+/
- )
- (?P<id>[a-zA-Z0-9-]+)'''
+ _VALID_URL = r'''(?x)
+ (?:
+ https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+ https?://(?:www\.)?wsj\.com/video/[^/]+/|
+ wsj:
+ )
+ (?P<id>[a-fA-F0-9-]{36})
+ '''
IE_DESC = 'Wall Street Journal'
_TESTS = [{
'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -38,12 +40,17 @@ class WSJIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- api_url = (
- 'http://video-api.wsj.com/api-video/find_all_videos.asp?'
- 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
- 'thumbnailList,author,description,name,duration,videoURL,'
- 'titletag,formattedCreationDate,keywords,editor' % video_id)
- info = self._download_json(api_url, video_id)['items'][0]
+ info = self._download_json(
+ 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+ query={
+ 'type': 'guid',
+ 'count': 1,
+ 'query': video_id,
+ 'fields': ','.join((
+ 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+ 'description', 'name', 'duration', 'videoURL', 'titletag',
+ 'formattedCreationDate', 'keywords', 'editor')),
+ })['items'][0]
title = info.get('name', info.get('titletag'))
formats = []
@@ -87,3 +94,24 @@ class WSJIE(InfoExtractor):
'title': title,
'categories': info.get('keywords'),
}
+
+
+class WSJArticleIE(InfoExtractor):
+ _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
+ 'info_dict': {
+ 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
+ 'ext': 'mp4',
+ 'upload_date': '20170221',
+ 'uploader_id': 'ralcaraz',
+ 'title': 'Bao Bao the Panda Leaves for China',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ video_id = self._search_regex(
+ r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
+ return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)