aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.github/ISSUE_TEMPLATE.md6
-rw-r--r--ChangeLog16
-rw-r--r--docs/supportedsites.md2
-rw-r--r--test/test_YoutubeDL.py4
-rwxr-xr-xyoutube_dl/YoutubeDL.py12
-rw-r--r--youtube_dl/extractor/brightcove.py67
-rw-r--r--youtube_dl/extractor/extractors.py6
-rw-r--r--youtube_dl/extractor/generic.py58
-rw-r--r--youtube_dl/extractor/streamango.py64
-rw-r--r--youtube_dl/extractor/wsj.py52
-rw-r--r--youtube_dl/version.py2
11 files changed, 246 insertions, 43 deletions
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index 70f6b51ed..5d5adb199 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -6,8 +6,8 @@
---
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.15*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.15**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.16*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.16**
### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2017.04.15
+[debug] youtube-dl version 2017.04.16
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
diff --git a/ChangeLog b/ChangeLog
index cf5ee84a4..6be86a090 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+version 2017.04.16
+
+Core
+* [YoutubeDL] Apply expand_path after output template substitution
++ [YoutubeDL] Propagate overridden meta fields to extraction results of type
+ url (#11163)
+
+Extractors
++ [generic] Extract RSS entries as url_transparent (#11163)
++ [streamango] Add support for streamango.com (#12643)
++ [wsj:article] Add support for articles (#12558)
+* [brightcove] Relax video tag embeds extraction and validate ambiguous embeds'
+ URLs (#9163, #12005, #12178, #12480)
++ [udemy] Add support for react rendition (#12744)
+
+
version 2017.04.15
Extractors
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index b29b50c8d..afae82214 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -745,6 +745,7 @@
- **Steam**
- **Stitcher**
- **Streamable**
+ - **Streamango**
- **streamcloud.eu**
- **StreamCZ**
- **StreetVoice**
@@ -966,6 +967,7 @@
- **wrzuta.pl**
- **wrzuta.pl:playlist**
- **WSJ**: Wall Street Journal
+ - **WSJArticle**
- **XBef**
- **XboxClips**
- **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 8491a88bd..75945e38f 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -755,6 +755,7 @@ class TestYoutubeDL(unittest.TestCase):
'_type': 'url_transparent',
'url': 'foo2:',
'ie_key': 'Foo2',
+ 'title': 'foo1 title'
}
class Foo2IE(InfoExtractor):
@@ -771,7 +772,7 @@ class TestYoutubeDL(unittest.TestCase):
_VALID_URL = r'foo3:'
def _real_extract(self, url):
- return _make_result([{'url': TEST_URL}])
+ return _make_result([{'url': TEST_URL}], title='foo3 title')
ydl.add_info_extractor(Foo1IE(ydl))
ydl.add_info_extractor(Foo2IE(ydl))
@@ -779,6 +780,7 @@ class TestYoutubeDL(unittest.TestCase):
ydl.extract_info('foo1:')
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['url'], TEST_URL)
+ self.assertEqual(downloaded['title'], 'foo1 title')
if __name__ == '__main__':
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 7953670a7..819b374ef 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -672,8 +672,7 @@ class YoutubeDL(object):
FORMAT_RE.format(numeric_field),
r'%({0})s'.format(numeric_field), outtmpl)
- tmpl = expand_path(outtmpl)
- filename = tmpl % template_dict
+ filename = expand_path(outtmpl % template_dict)
# Temporary fix for #4787
# 'Treat' all problem characters by passing filename through preferredencoding
# to workaround encoding issues with subprocess on python2 @ Windows
@@ -851,7 +850,14 @@ class YoutubeDL(object):
new_result = info.copy()
new_result.update(force_properties)
- assert new_result.get('_type') != 'url_transparent'
+ # Extracted info may not be a video result (i.e.
+ # info.get('_type', 'video') != 'video') but rather a url or
+ # url_transparent. In such cases outer metadata (from ie_result)
+ # should be propagated to inner one (info). For this to happen
+ # _type of info should be overridden with url_transparent. This
+ # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
+ if new_result.get('_type') == 'url':
+ new_result['_type'] = 'url_transparent'
return self.process_ie_result(
new_result, download=download, extra_info=extra_info)
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 46ef8e605..124497e95 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -17,6 +17,7 @@ from ..compat import (
from ..utils import (
determine_ext,
ExtractorError,
+ extract_attributes,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
@@ -109,6 +110,7 @@ class BrightcoveLegacyIE(InfoExtractor):
'upload_date': '20140827',
'uploader_id': '710858724001',
},
+ 'skip': 'Video gone',
},
{
# playlist with 'videoList'
@@ -487,12 +489,13 @@ class BrightcoveNewIE(InfoExtractor):
return urls[0] if urls else None
@staticmethod
- def _extract_urls(webpage):
+ def _extract_urls(ie, webpage):
# Reference:
# 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
- # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
- # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html
- # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
+ # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
+ # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
+ # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
+ # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
entries = []
@@ -501,22 +504,48 @@ class BrightcoveNewIE(InfoExtractor):
r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
entries.append(url if url.startswith('http') else 'http:' + url)
- # Look for embed_in_page embeds [2]
- for video_id, account_id, player_id, embed in re.findall(
- # According to examples from [3] it's unclear whether video id
- # may be optional and what to do when it is
- # According to [4] data-video-id may be prefixed with ref:
- r'''(?sx)
- <video[^>]+
- data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*?
- </video>.*?
- <script[^>]+
- src=["\'](?:https?:)?//players\.brightcove\.net/
- (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+ # Look for <video> tags [2] and embed_in_page embeds [3]
+ # (a <video ...> tag, optionally followed by a players.brightcove.net <script> tag)
+ for video, script_tag, account_id, player_id, embed in re.findall(
+ r'''(?isx)
+ (<video\s+[^>]+>)
+ (?:.*?
+ (<script[^>]+
+ src=["\'](?:https?:)?//players\.brightcove\.net/
+ (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+ )
+ )?
''', webpage):
- entries.append(
- 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
- % (account_id, player_id, embed, video_id))
+ attrs = extract_attributes(video)
+
+ # According to examples from [4] it's unclear whether video id
+ # may be optional and what to do when it is
+ video_id = attrs.get('data-video-id')
+ if not video_id:
+ continue
+
+ account_id = account_id or attrs.get('data-account')
+ if not account_id:
+ continue
+
+ player_id = player_id or attrs.get('data-player') or 'default'
+ embed = embed or attrs.get('data-embed') or 'default'
+
+ bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
+ account_id, player_id, embed, video_id)
+
+ # Some brightcove videos may be embedded with video tag only and
+ # without script tag or any mention of brightcove at all. Such
+ # embeds are considered ambiguous since they are matched based only
+ # on data-video-id and data-account attributes and in the wild may
+ # not be brightcove embeds at all. Let's check reconstructed
+ # brightcove URLs in case of such embeds and only process valid
+ # ones. By this we ensure there is indeed a brightcove embed.
+ if not script_tag and not ie._is_valid_url(
+ bc_url, video_id, 'possible brightcove video'):
+ continue
+
+ entries.append(bc_url)
return entries
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 1671090f4..a92cbefed 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -939,6 +939,7 @@ from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamable import StreamableIE
+from .streamango import StreamangoIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
@@ -1233,7 +1234,10 @@ from .wrzuta import (
WrzutaIE,
WrzutaPlaylistIE,
)
-from .wsj import WSJIE
+from .wsj import (
+ WSJIE,
+ WSJArticleIE,
+)
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 36d23d2f3..6a34c2491 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -465,6 +465,59 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 download
},
+ 'skip': 'video rotates...weekly?',
+ },
+ {
+ # Brightcove:new type [2].
+ 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis',
+ 'md5': '2b35148fcf48da41c9fb4591650784f3',
+ 'info_dict': {
+ 'id': '5348741021001',
+ 'ext': 'mp4',
+ 'upload_date': '20170306',
+ 'uploader_id': '4191638492001',
+ 'timestamp': 1488769918,
+ 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis',
+
+ },
+ },
+ {
+ # Alternative brightcove <video> attributes
+ 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
+ 'info_dict': {
+ 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
+ },
+ 'playlist': [{
+ 'md5': '732d22ba3d33f2f3fc253c39f8f36523',
+ 'info_dict': {
+ 'id': '5311302538001',
+ 'ext': 'mp4',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
+ 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
+ 'timestamp': 1486321708,
+ 'upload_date': '20170205',
+ 'uploader_id': '800000640001',
+ },
+ 'only_matching': True,
+ }],
+ },
+ {
+ # Brightcove with UUID in videoPlayer
+ 'url': 'http://www8.hp.com/cn/zh/home.html',
+ 'info_dict': {
+ 'id': '5255815316001',
+ 'ext': 'mp4',
+ 'title': 'Sprocket Video - China',
+ 'description': 'Sprocket Video - China',
+ 'uploader': 'HP-Video Gallery',
+ 'timestamp': 1482263210,
+ 'upload_date': '20161220',
+ 'uploader_id': '1107601872001',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
},
# ooyala video
{
@@ -1640,7 +1693,7 @@ class GenericIE(InfoExtractor):
continue
entries.append({
- '_type': 'url',
+ '_type': 'url_transparent',
'url': next_url,
'title': it.find('title').text,
})
@@ -1900,7 +1953,6 @@ class GenericIE(InfoExtractor):
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
- self.to_screen('Brightcove video detected.')
entries = [{
'_type': 'url',
'url': smuggle_url(bc_url, {'Referer': url}),
@@ -1915,7 +1967,7 @@ class GenericIE(InfoExtractor):
}
# Look for Brightcove New Studio embeds
- bc_urls = BrightcoveNewIE._extract_urls(webpage)
+ bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
if bc_urls:
return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py
new file mode 100644
index 000000000..aa4fad162
--- /dev/null
+++ b/youtube_dl/extractor/streamango.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+)
+
+
+class StreamangoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
+ 'md5': 'e992787515a182f55e38fc97588d802a',
+ 'info_dict': {
+ 'id': 'clapasobsptpkdfe',
+ 'ext': 'mp4',
+ 'title': '20170315_150006.mp4',
+ }
+ }, {
+ 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage)
+
+ formats = []
+ for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
+ video = self._parse_json(
+ format_, video_id, transform_source=js_to_json, fatal=False)
+ if not video:
+ continue
+ src = video.get('src')
+ if not src:
+ continue
+ ext = determine_ext(src, default_ext=None)
+ if video.get('type') == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ 'ext': ext or 'mp4',
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ 'tbr': int_or_none(video.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'url': url,
+ 'title': title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index deb7483ae..45cfca7c5 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -10,12 +10,14 @@ from ..utils import (
class WSJIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://
- (?:
- video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
- (?:www\.)?wsj\.com/video/[^/]+/
- )
- (?P<id>[a-zA-Z0-9-]+)'''
+ _VALID_URL = r'''(?x)
+ (?:
+ https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+ https?://(?:www\.)?wsj\.com/video/[^/]+/|
+ wsj:
+ )
+ (?P<id>[a-fA-F0-9-]{36})
+ '''
IE_DESC = 'Wall Street Journal'
_TESTS = [{
'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -38,12 +40,17 @@ class WSJIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- api_url = (
- 'http://video-api.wsj.com/api-video/find_all_videos.asp?'
- 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
- 'thumbnailList,author,description,name,duration,videoURL,'
- 'titletag,formattedCreationDate,keywords,editor' % video_id)
- info = self._download_json(api_url, video_id)['items'][0]
+ info = self._download_json(
+ 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+ query={
+ 'type': 'guid',
+ 'count': 1,
+ 'query': video_id,
+ 'fields': ','.join((
+ 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+ 'description', 'name', 'duration', 'videoURL', 'titletag',
+ 'formattedCreationDate', 'keywords', 'editor')),
+ })['items'][0]
title = info.get('name', info.get('titletag'))
formats = []
@@ -87,3 +94,24 @@ class WSJIE(InfoExtractor):
'title': title,
'categories': info.get('keywords'),
}
+
+
+class WSJArticleIE(InfoExtractor):
+ _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
+ 'info_dict': {
+ 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
+ 'ext': 'mp4',
+ 'upload_date': '20170221',
+ 'uploader_id': 'ralcaraz',
+ 'title': 'Bao Bao the Panda Leaves for China',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ video_id = self._search_regex(
+ r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
+ return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 612b50f7b..8b01fbc0a 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2017.04.15'
+__version__ = '2017.04.16'