diff options
-rw-r--r-- | .github/ISSUE_TEMPLATE.md | 6 | ||||
-rw-r--r-- | ChangeLog | 16 | ||||
-rw-r--r-- | docs/supportedsites.md | 2 | ||||
-rw-r--r-- | test/test_YoutubeDL.py | 4 | ||||
-rwxr-xr-x | youtube_dl/YoutubeDL.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/brightcove.py | 67 | ||||
-rw-r--r-- | youtube_dl/extractor/extractors.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 58 | ||||
-rw-r--r-- | youtube_dl/extractor/streamango.py | 64 | ||||
-rw-r--r-- | youtube_dl/extractor/wsj.py | 52 | ||||
-rw-r--r-- | youtube_dl/version.py | 2 |
11 files changed, 246 insertions, 43 deletions
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 70f6b51ed..5d5adb199 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.15*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.15** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.16*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.16** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.04.15 +[debug] youtube-dl version 2017.04.16 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} @@ -1,3 +1,19 @@ +version 2017.04.16 + +Core +* [YoutubeDL] Apply expand_path after output template substitution ++ [YoutubeDL] Propagate overridden meta fields to extraction results of type + url (#11163) + +Extractors ++ [generic] Extract RSS entries as url_transparent (#11163) ++ [streamango] Add support for streamango.com (#12643) ++ [wsj:article] Add support for articles (#12558) +* [brightcove] Relax video tag embeds extraction and validate ambiguous embeds' + URLs (#9163, #12005, #12178, #12480) ++ [udemy] Add support for react rendition (#12744) + + version 2017.04.15 Extractors diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b29b50c8d..afae82214 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -745,6 +745,7 @@ - **Steam** - **Stitcher** - **Streamable** + - **Streamango** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** @@ -966,6 +967,7 @@ - **wrzuta.pl** - **wrzuta.pl:playlist** - **WSJ**: Wall Street Journal + - **WSJArticle** - **XBef** - **XboxClips** - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8491a88bd..75945e38f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -755,6 +755,7 @@ class TestYoutubeDL(unittest.TestCase): '_type': 'url_transparent', 'url': 'foo2:', 'ie_key': 'Foo2', + 'title': 'foo1 title' } class Foo2IE(InfoExtractor): @@ -771,7 +772,7 @@ class TestYoutubeDL(unittest.TestCase): _VALID_URL = r'foo3:' def _real_extract(self, url): - return _make_result([{'url': TEST_URL}]) + return _make_result([{'url': TEST_URL}], title='foo3 title') ydl.add_info_extractor(Foo1IE(ydl)) ydl.add_info_extractor(Foo2IE(ydl)) @@ -779,6 +780,7 @@ class TestYoutubeDL(unittest.TestCase): ydl.extract_info('foo1:') downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'foo1 title') if __name__ == '__main__': diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 7953670a7..819b374ef 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -672,8 +672,7 @@ class YoutubeDL(object): FORMAT_RE.format(numeric_field), r'%({0})s'.format(numeric_field), outtmpl) - tmpl = expand_path(outtmpl) - filename = tmpl % template_dict + filename = expand_path(outtmpl % template_dict) # Temporary fix for #4787 # 'Treat' all problem characters by passing filename through preferredencoding # to workaround encoding issues with subprocess on python2 @ Windows @@ -851,7 +850,14 @@ class YoutubeDL(object): new_result = info.copy() new_result.update(force_properties) - assert new_result.get('_type') != 'url_transparent' + # Extracted info may not be a video result (i.e. + # info.get('_type', 'video') != video) but rather an url or + # url_transparent. In such cases outer metadata (from ie_result) + # should be propagated to inner one (info). For this to happen + # _type of info should be overridden with url_transparent. This + # fixes issue from https://github.com/rg3/youtube-dl/pull/11163. + if new_result.get('_type') == 'url': + new_result['_type'] = 'url_transparent' return self.process_ie_result( new_result, download=download, extra_info=extra_info) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 46ef8e605..124497e95 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -17,6 +17,7 @@ from ..compat import ( from ..utils import ( determine_ext, ExtractorError, + extract_attributes, find_xpath_attr, fix_xml_ampersands, float_or_none, @@ -109,6 +110,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'upload_date': '20140827', 'uploader_id': '710858724001', }, + 'skip': 'Video gone', }, { # playlist with 'videoList' @@ -487,12 +489,13 @@ class BrightcoveNewIE(InfoExtractor): return urls[0] if urls else None @staticmethod - def _extract_urls(webpage): + def _extract_urls(ie, webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe - # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript - # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html - # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag + # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript + # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html + # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player entries = [] @@ -501,22 +504,48 @@ class BrightcoveNewIE(InfoExtractor): r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): entries.append(url if url.startswith('http') else 'http:' + url) - # Look for embed_in_page embeds [2] - for video_id, account_id, player_id, embed in re.findall( - # According to examples from [3] it's unclear whether video id - # may be optional and what to do when it is - # According to [4] data-video-id may be prefixed with ref: - r'''(?sx) - <video[^>]+ - data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*? - </video>.*? - <script[^>]+ - src=["\'](?:https?:)?//players\.brightcove\.net/ - (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js + # Look for <video> tags [2] and embed_in_page embeds [3] + # [2] looks like: + for video, script_tag, account_id, player_id, embed in re.findall( + r'''(?isx) + (<video\s+[^>]+>) + (?:.*? + (<script[^>]+ + src=["\'](?:https?:)?//players\.brightcove\.net/ + (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js + ) + )? ''', webpage): - entries.append( - 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' - % (account_id, player_id, embed, video_id)) + attrs = extract_attributes(video) + + # According to examples from [4] it's unclear whether video id + # may be optional and what to do when it is + video_id = attrs.get('data-video-id') + if not video_id: + continue + + account_id = account_id or attrs.get('data-account') + if not account_id: + continue + + player_id = player_id or attrs.get('data-player') or 'default' + embed = embed or attrs.get('data-embed') or 'default' + + bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( + account_id, player_id, embed, video_id) + + # Some brightcove videos may be embedded with video tag only and + # without script tag or any mentioning of brightcove at all. Such + # embeds are considered ambiguous since they are matched based only + # on data-video-id and data-account attributes and in the wild may + # not be brightcove embeds at all. Let's check reconstructed + # brightcove URLs in case of such embeds and only process valid + # ones. By this we ensure there is indeed a brightcove embed. + if not script_tag and not ie._is_valid_url( + bc_url, video_id, 'possible brightcove video'): + continue + + entries.append(bc_url) return entries diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1671090f4..a92cbefed 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -939,6 +939,7 @@ from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamable import StreamableIE +from .streamango import StreamangoIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE @@ -1233,7 +1234,10 @@ from .wrzuta import ( WrzutaIE, WrzutaPlaylistIE, ) -from .wsj import WSJIE +from .wsj import ( + WSJIE, + WSJArticleIE, +) from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xfileshare import XFileShareIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 36d23d2f3..6a34c2491 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -465,6 +465,59 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, # m3u8 download }, + 'skip': 'video rotates...weekly?', + }, + { + # Brightcove:new type [2]. + 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', + 'md5': '2b35148fcf48da41c9fb4591650784f3', + 'info_dict': { + 'id': '5348741021001', + 'ext': 'mp4', + 'upload_date': '20170306', + 'uploader_id': '4191638492001', + 'timestamp': 1488769918, + 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis', + + }, + }, + { + # Alternative brightcove <video> attributes + 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', + 'info_dict': { + 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', + 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", + }, + 'playlist': [{ + 'md5': '732d22ba3d33f2f3fc253c39f8f36523', + 'info_dict': { + 'id': '5311302538001', + 'ext': 'mp4', + 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", + 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", + 'timestamp': 1486321708, + 'upload_date': '20170205', + 'uploader_id': '800000640001', + }, + 'only_matching': True, + }], + }, + { + # Brightcove with UUID in videoPlayer + 'url': 'http://www8.hp.com/cn/zh/home.html', + 'info_dict': { + 'id': '5255815316001', + 'ext': 'mp4', + 'title': 'Sprocket Video - China', + 'description': 'Sprocket Video - China', + 'uploader': 'HP-Video Gallery', + 'timestamp': 1482263210, + 'upload_date': '20161220', + 'uploader_id': '1107601872001', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, }, # ooyala video { @@ -1640,7 +1693,7 @@ class GenericIE(InfoExtractor): continue entries.append({ - '_type': 'url', + '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, }) @@ -1900,7 +1953,6 @@ class GenericIE(InfoExtractor): # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: - self.to_screen('Brightcove video detected.') entries = [{ '_type': 'url', 'url': smuggle_url(bc_url, {'Referer': url}), @@ -1915,7 +1967,7 @@ class GenericIE(InfoExtractor): } # Look for Brightcove New Studio embeds - bc_urls = BrightcoveNewIE._extract_urls(webpage) + bc_urls = BrightcoveNewIE._extract_urls(self, webpage) if bc_urls: return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py new file mode 100644 index 000000000..aa4fad162 --- /dev/null +++ b/youtube_dl/extractor/streamango.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, +) + + +class StreamangoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', + 'md5': 'e992787515a182f55e38fc97588d802a', + 'info_dict': { + 'id': 'clapasobsptpkdfe', + 'ext': 'mp4', + 'title': '20170315_150006.mp4', + } + }, { + 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + + formats = [] + for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): + video = self._parse_json( + format_, video_id, transform_source=js_to_json, fatal=False) + if not video: + continue + src = video.get('src') + if not src: + continue + ext = determine_ext(src, default_ext=None) + if video.get('type') == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': src, + 'ext': ext or 'mp4', + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'tbr': int_or_none(video.get('bitrate')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'url': url, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index deb7483ae..45cfca7c5 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -10,12 +10,14 @@ from ..utils import ( class WSJIE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?: - video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| - (?:www\.)?wsj\.com/video/[^/]+/ - ) - (?P<id>[a-zA-Z0-9-]+)''' + _VALID_URL = r'''(?x) + (?: + https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| + https?://(?:www\.)?wsj\.com/video/[^/]+/| + wsj: + ) + (?P<id>[a-fA-F0-9-]{36}) + ''' IE_DESC = 'Wall Street Journal' _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', @@ -38,12 +40,17 @@ class WSJIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - api_url = ( - 'http://video-api.wsj.com/api-video/find_all_videos.asp?' - 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' - 'thumbnailList,author,description,name,duration,videoURL,' - 'titletag,formattedCreationDate,keywords,editor' % video_id) - info = self._download_json(api_url, video_id)['items'][0] + info = self._download_json( + 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id, + query={ + 'type': 'guid', + 'count': 1, + 'query': video_id, + 'fields': ','.join(( + 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author', + 'description', 'name', 'duration', 'videoURL', 'titletag', + 'formattedCreationDate', 'keywords', 'editor')), + })['items'][0] title = info.get('name', info.get('titletag')) formats = [] @@ -87,3 +94,24 @@ class WSJIE(InfoExtractor): 'title': title, 'categories': info.get('keywords'), } + + +class WSJArticleIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', + 'info_dict': { + 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', + 'ext': 'mp4', + 'upload_date': '20170221', + 'uploader_id': 'ralcaraz', + 'title': 'Bao Bao the Panda Leaves for China', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + video_id = self._search_regex( + r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id') + return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 612b50f7b..8b01fbc0a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.04.15' +__version__ = '2017.04.16' |