diff options
33 files changed, 587 insertions, 452 deletions
diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 3eed91279..a52d26cec 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -30,14 +30,14 @@ class AudiomackIE(InfoExtractor): # audiomack wrapper around soundcloud song { 'add_ie': ['Soundcloud'], - 'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare', + 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', 'info_dict': { - 'id': '172419696', + 'id': '258901379', 'ext': 'mp3', - 'description': 'md5:1fc3272ed7a635cce5be1568c2822997', - 'title': 'Young Thug ft Lil Wayne - Take Kare', - 'uploader': 'Young Thug World', - 'upload_date': '20141016', + 'description': 'mamba day freestyle for the legend Kobe Bryant ', + 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', + 'uploader': 'ILOVEMAKONNEN', + 'upload_date': '20160414', } }, ] diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 425f08f2b..74c4510f9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -671,6 +671,7 @@ class BBCIE(BBCCoUkIE): 'info_dict': { 'id': '34475836', 'title': 'Jurgen Klopp: Furious football from a witty and winning coach', + 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', }, 'playlist_count': 3, }, { diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index c621a08d5..051d783a2 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -5,7 +5,6 @@ from ..utils import ( xpath_text, xpath_element, int_or_none, - ExtractorError, find_xpath_attr, ) @@ -64,7 +63,7 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?manifest=m3u&mbr=true' + TP_RELEASE_URL_TEMPLATE = 
'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' def _real_extract(self, url): display_id = self._match_id(url) @@ -84,11 +83,11 @@ class CBSIE(CBSBaseIE): pid = xpath_text(item, 'pid') if not pid: continue - try: - tp_formats, tp_subtitles = self._extract_theplatform_smil( - self.TP_RELEASE_URL_TEMPLATE % pid, content_id, 'Downloading %s SMIL data' % pid) - except ExtractorError: - continue + tp_release_url = self.TP_RELEASE_URL_TEMPLATE % pid + if '.m3u8' in xpath_text(item, 'contentUrl', default=''): + tp_release_url += '&manifest=m3u' + tp_formats, tp_subtitles = self._extract_theplatform_smil( + tp_release_url, content_id, 'Downloading %s SMIL data' % pid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 7bbf617d4..fa3cb7023 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, @@ -55,8 +56,13 @@ class EaglePlatformIE(InfoExtractor): raise ExtractorError(' '.join(response['errors']), expected=True) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): - response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) - self._handle_error(response) + try: + response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError): + response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) + self._handle_error(response) + raise return response def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'): diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 
2ae9bc9a8..06b3d5e24 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -412,7 +412,12 @@ from .minoto import MinotoIE from .miomio import MioMioIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE -from .mixcloud import MixcloudIE +from .mixcloud import ( + MixcloudIE, + MixcloudUserIE, + MixcloudPlaylistIE, + MixcloudStreamIE, +) from .mlb import MLBIE from .mnet import MnetIE from .mpora import MporaIE @@ -420,7 +425,6 @@ from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE from .moniker import MonikerIE -from .mooshare import MooshareIE from .morningstar import MorningstarIE from .motherless import MotherlessIE from .motorsport import MotorsportIE @@ -465,7 +469,6 @@ from .ndr import ( from .ndtv import NDTVIE from .netzkino import NetzkinoIE from .nerdcubed import NerdCubedFeedIE -from .nerdist import NerdistIE from .neteasemusic import ( NetEaseMusicIE, NetEaseMusicAlbumIE, @@ -730,6 +733,7 @@ from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE from .tapely import TapelyIE from .tass import TassIE +from .tdslifeway import TDSLifewayIE from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, @@ -832,7 +836,6 @@ from .twitter import ( TwitterIE, TwitterAmplifyIE, ) -from .ubu import UbuIE from .udemy import ( UdemyIE, UdemyCourseIE @@ -917,7 +920,6 @@ from .vulture import VultureIE from .walla import WallaIE from .washingtonpost import WashingtonPostIE from .wat import WatIE -from .wayofthemaster import WayOfTheMasterIE from .wdr import ( WDRIE, WDRMobileIE, diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py index ea32b621c..ba1c15414 100644 --- a/youtube_dl/extractor/gazeta.py +++ b/youtube_dl/extractor/gazeta.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class GazetaIE(InfoExtractor): - _VALID_URL = 
r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)' _TESTS = [{ 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml', 'md5': 'd49c9bdc6e5a7888f27475dc215ee789', @@ -18,9 +18,22 @@ class GazetaIE(InfoExtractor): 'description': 'md5:38617526050bd17b234728e7f9620a71', 'thumbnail': 're:^https?://.*\.jpg', }, + 'skip': 'video not found', }, { 'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml', 'only_matching': True, + }, { + 'url': 'http://www.gazeta.ru/video/main/main/2015/06/22/platit_ili_ne_platit_po_isku_yukosa.shtml', + 'info_dict': { + 'id': '252048', + 'ext': 'mp4', + 'title': '"Если по иску ЮКОСа придется платить, это будет большой удар по бюджету"', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['EaglePlatform'], }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2aadd6a12..95d233259 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -60,6 +60,7 @@ from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE from .instagram import InstagramIE +from .liveleak import LiveLeakIE class GenericIE(InfoExtractor): @@ -104,7 +105,8 @@ class GenericIE(InfoExtractor): 'skip_download': True, # infinite live stream }, 'expected_warnings': [ - r'501.*Not Implemented' + r'501.*Not Implemented', + r'400.*Bad Request', ], }, # Direct link with incorrect MIME type @@ -1140,6 +1142,18 @@ class GenericIE(InfoExtractor): 'upload_date': '20160409', }, }, + # LiveLeak embed + { + 'url': 'http://www.wykop.pl/link/3088787/', + 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'info_dict': { + 'id': 
'874_1459135191', + 'ext': 'mp4', + 'title': 'Man shows poor quality of new apartment building', + 'description': 'The wall is like a sand pile.', + 'uploader': 'Lake8737', + } + }, ] def report_following_redirect(self, new_url): @@ -1942,7 +1956,13 @@ class GenericIE(InfoExtractor): # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: - return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + return self.url_result( + self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) + + # Look for LiveLeak embeds + liveleak_url = LiveLeakIE._extract_url(webpage) + if liveleak_url: + return self.url_result(liveleak_url, 'LiveLeak') def check_video(vurl): if YoutubeIE.suitable(vurl): diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index a38eae421..059073749 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, parse_duration, unified_strdate, ) @@ -29,7 +30,12 @@ class HuffPostIE(InfoExtractor): 'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. 
', 'duration': 1549, 'upload_date': '20140124', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['HTTP Error 404: Not Found'], } def _real_extract(self, url): @@ -45,7 +51,7 @@ class HuffPostIE(InfoExtractor): description = data.get('description') thumbnails = [] - for url in data['images'].values(): + for url in filter(None, data['images'].values()): m = re.match('.*-([0-9]+x[0-9]+)\.', url) if not m: continue @@ -54,13 +60,25 @@ class HuffPostIE(InfoExtractor): 'resolution': m.group(1), }) - formats = [{ - 'format': key, - 'format_id': key.replace('/', '.'), - 'ext': 'mp4', - 'url': url, - 'vcodec': 'none' if key.startswith('audio/') else None, - } for key, url in data.get('sources', {}).get('live', {}).items()] + formats = [] + sources = data.get('sources', {}) + live_sources = list(sources.get('live', {}).items()) + list(sources.get('live_again', {}).items()) + for key, url in live_sources: + ext = determine_ext(url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'format': key, + 'format_id': key.replace('/', '.'), + 'ext': 'mp4', + 'url': url, + 'vcodec': 'none' if key.startswith('audio/') else None, + }) if not formats and data.get('fivemin_id'): return self.url_result('5min:%s' % data['fivemin_id']) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 11bb58d8a..3cbe77ad8 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -12,7 +12,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+)' + _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 
'md5': '0d2da106a9d2631273e192b372806516', @@ -38,10 +38,19 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, + }, { + 'url': 'http://instagram.com/p/9o6LshA7zy/embed/', + 'only_matching': True, }] @staticmethod def _extract_embed_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', + webpage) + if mobj: + return mobj.group('url') + blockquote_el = get_element_by_attribute( 'class', 'instagram-media', webpage) if blockquote_el is None: @@ -53,7 +62,9 @@ class InstagramIE(InfoExtractor): return mobj.group('link') def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + url = mobj.group('url') webpage = self._download_webpage(url, video_id) uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"', diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 88570f261..ea8fbb329 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -165,7 +165,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html' _NETRC_MACHINE = 'iqiyi' @@ -273,6 +273,9 @@ class IqiyiIE(InfoExtractor): 'title': '灌篮高手 国语版', }, 'playlist_count': 101, + }, { + 'url': 'http://www.pps.tv/w_19rrbav0ph.html', + 'only_matching': True, }] _FORMATS_MAP = [ diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index b4c30b7f3..a6050c4de 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -2,39 +2,63 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( - js_to_json, -) class KaraoketvIE(InfoExtractor): - _VALID_URL = 
r'https?://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' + _VALID_URL = r'http://www.karaoketv.co.il/[^/]+/(?P<id>\d+)' _TEST = { - 'url': 'http://karaoketv.co.il/?container=songs&id=171568', + 'url': 'http://www.karaoketv.co.il/%D7%A9%D7%99%D7%A8%D7%99_%D7%A7%D7%A8%D7%99%D7%95%D7%A7%D7%99/58356/%D7%90%D7%99%D7%96%D7%95%D7%9F', 'info_dict': { - 'id': '171568', - 'ext': 'mp4', - 'title': 'אל העולם שלך - רותם כהן - שרים קריוקי', + 'id': '58356', + 'ext': 'flv', + 'title': 'קריוקי של איזון', + }, + 'params': { + # rtmp download + 'skip_download': True, } } def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + api_page_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>https?://www\.karaoke\.co\.il/api_play\.php\?.+?)\1', + webpage, 'API play URL', group='url') + + api_page = self._download_webpage(api_page_url, video_id) + video_cdn_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>https?://www\.video-cdn\.com/embed/iframe/.+?)\1', + api_page, 'video cdn URL', group='url') + + video_cdn = self._download_webpage(video_cdn_url, video_id) + play_path = self._parse_json( + self._search_regex( + r'var\s+options\s*=\s*({.+?});', video_cdn, 'options'), + video_id)['clip']['url'] - page_video_url = self._og_search_video_url(webpage, video_id) - config_json = compat_urllib_parse_unquote_plus(self._search_regex( - r'config=(.*)', page_video_url, 'configuration')) + settings = self._parse_json( + self._search_regex( + r'var\s+settings\s*=\s*({.+?});', video_cdn, 'servers', default='{}'), + video_id, fatal=False) or {} - urls_info_json = self._download_json( - config_json, video_id, 'Downloading configuration', - transform_source=js_to_json) + servers = settings.get('servers') + if not servers or not isinstance(servers, list): + servers = ('wowzail.video-cdn.com:80/vodcdn', ) - url = urls_info_json['playlist'][0]['url'] + formats = [{ + 'url': 'rtmp://%s' % server if not 
server.startswith('rtmp') else server, + 'play_path': play_path, + 'app': 'vodcdn', + 'page_url': video_cdn_url, + 'player_url': 'http://www.video-cdn.com/assets/flowplayer/flowplayer.commercial-3.2.18.swf', + 'rtmp_real_time': True, + 'ext': 'flv', + } for server in servers] return { 'id': video_id, 'title': self._og_search_title(webpage), - 'url': url, + 'formats': formats, } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 86c17c931..c0ece5113 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -268,7 +268,7 @@ class KuwoCategoryIE(InfoExtractor): 'title': '八十年代精选', 'description': '这些都是属于八十年代的回忆!', }, - 'playlist_count': 30, + 'playlist_count': 24, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 4684994e1..29fba5f30 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -53,6 +53,14 @@ class LiveLeakIE(InfoExtractor): } }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)', + webpage) + if mobj: + return 'http://www.liveleak.com/view?i=%s' % mobj.group('id') + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 2338e7f96..2100583df 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -49,8 +49,8 @@ class MDRIE(InfoExtractor): 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1419047100, - 'upload_date': '20141220', + 'timestamp': 1450950000, + 'upload_date': '20151224', 'duration': 4628, 'uploader': 'KIKA', }, @@ -71,8 +71,8 @@ class MDRIE(InfoExtractor): webpage = self._download_webpage(url, video_id) data_url = self._search_regex( - 
r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>\\?/.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1', - webpage, 'data url', default=None, group='url').replace('\/', '/') + r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1', + webpage, 'data url', group='url').replace('\/', '/') doc = self._download_xml( compat_urlparse.urljoin(url, data_url), video_id) diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py index 949ad11db..e48eba3fa 100644 --- a/youtube_dl/extractor/ministrygrid.py +++ b/youtube_dl/extractor/ministrygrid.py @@ -1,8 +1,5 @@ from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -20,21 +17,28 @@ class MinistryGridIE(InfoExtractor): 'id': '3453494717001', 'ext': 'mp4', 'title': 'The Gospel by Numbers', + 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20140410', 'description': 'Coming soon from T4G 2014!', - 'uploader': 'LifeWay Christian Resources (MG)', + 'uploader_id': '2034960640001', + 'timestamp': 1397145591, + }, + 'params': { + # m3u8 download + 'skip_download': True, }, + 'add_ie': ['TDSLifeway'], } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - portlets_json = self._search_regex( - r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list') - portlets = json.loads(portlets_json) + portlets = self._parse_json(self._search_regex( + r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list'), + video_id) pl_id = self._search_regex( - r'<!--\s*p_l_id - ([0-9]+)<br>', webpage, 'p_l_id') + r'getPlid:function\(\){return"(\d+)"}', webpage, 'p_l_id') for i, portlet in enumerate(portlets): portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet) @@ -46,12 +50,8 @@ class 
MinistryGridIE(InfoExtractor): r'<iframe.*?src="([^"]+)"', portlet_code, 'video iframe', default=None) if video_iframe_url: - surl = smuggle_url( - video_iframe_url, {'force_videoid': video_id}) - return { - '_type': 'url', - 'id': video_id, - 'url': surl, - } + return self.url_result( + smuggle_url(video_iframe_url, {'force_videoid': video_id}), + video_id=video_id) raise ExtractorError('Could not find video iframe in any portlets') diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 9638cc9e6..483f6925f 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,27 +1,35 @@ from __future__ import unicode_literals +import base64 +import functools +import itertools import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_chr, + compat_ord, + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import ( + clean_html, ExtractorError, - HEADRequest, - NO_DEFAULT, + OnDemandPagedList, parse_count, str_to_int, ) class MixcloudIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' IE_NAME = 'mixcloud' _TESTS = [{ 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', 'info_dict': { 'id': 'dholbach-cryptkeeper', - 'ext': 'mp3', + 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', 'uploader': 'Daniel Holbach', @@ -39,22 +47,22 @@ class MixcloudIE(InfoExtractor): 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', 'uploader': 'Gilles Peterson Worldwide', 'uploader_id': 'gillespeterson', - 'thumbnail': 're:https?://.*/images/', + 'thumbnail': 're:https?://.*', 'view_count': int, 'like_count': int, }, }] - def _check_url(self, url, track_id, ext): - 
try: - # We only want to know if the request succeed - # don't download the whole file - self._request_webpage( - HEADRequest(url), track_id, - 'Trying %s URL' % ext) - return True - except ExtractorError: - return False + # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js + @staticmethod + def _decrypt_play_info(play_info): + KEY = 'pleasedontdownloadourmusictheartistswontgetpaid' + + play_info = base64.b64decode(play_info.encode('ascii')) + + return ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)])) + for idx, ch in enumerate(play_info)]) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -68,19 +76,15 @@ class MixcloudIE(InfoExtractor): r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) - preview_url = self._search_regex( - r'\s(?:data-preview-url|m-preview)="([^"]+)"', - webpage, 'preview url', default=None if message else NO_DEFAULT) + encrypted_play_info = self._search_regex( + r'm-play-info="([^"]+)"', webpage, 'play info') + play_info = self._parse_json( + self._decrypt_play_info(encrypted_play_info), track_id) - if message: + if message and 'stream_url' not in play_info: raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - song_url = re.sub(r'audiocdn(\d+)', r'stream\1', preview_url) - song_url = song_url.replace('/previews/', '/c/originals/') - if not self._check_url(song_url, track_id, 'mp3'): - song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') - if not self._check_url(song_url, track_id, 'm4a'): - raise ExtractorError('Unable to extract track url') + song_url = play_info['stream_url'] PREFIX = ( r'm-play-on-spacebar[^>]+' @@ -115,3 +119,201 @@ class MixcloudIE(InfoExtractor): 'view_count': view_count, 'like_count': like_count, } + + +class MixcloudPlaylistBaseIE(InfoExtractor): + _PAGE_SIZE = 24 + + def _find_urls_in_page(self, page): + for url 
in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page): + yield self.url_result( + compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)), + MixcloudIE.ie_key()) + + def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None): + real_page_number = real_page_number or current_page + 1 + return self._download_webpage( + 'https://www.mixcloud.com/%s/' % path, video_id, + note='Download %s (page %d)' % (page_name, current_page + 1), + errnote='Unable to download %s' % page_name, + query={'page': real_page_number, 'list': 'main', '_ajax': '1'}, + headers={'X-Requested-With': 'XMLHttpRequest'}) + + def _tracks_page_func(self, page, video_id, page_name, current_page): + resp = self._fetch_tracks_page(page, video_id, page_name, current_page) + + for item in self._find_urls_in_page(resp): + yield item + + def _get_user_description(self, page_content): + return self._html_search_regex( + r'<div[^>]+class="description-text"[^>]*>(.+?)</div>', + page_content, 'user description', fatal=False) + + +class MixcloudUserIE(MixcloudPlaylistBaseIE): + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$' + IE_NAME = 'mixcloud:user' + + _TESTS = [{ + 'url': 'http://www.mixcloud.com/dholbach/', + 'info_dict': { + 'id': 'dholbach_uploads', + 'title': 'Daniel Holbach (uploads)', + 'description': 'md5:327af72d1efeb404a8216c27240d1370', + }, + 'playlist_mincount': 11, + }, { + 'url': 'http://www.mixcloud.com/dholbach/uploads/', + 'info_dict': { + 'id': 'dholbach_uploads', + 'title': 'Daniel Holbach (uploads)', + 'description': 'md5:327af72d1efeb404a8216c27240d1370', + }, + 'playlist_mincount': 11, + }, { + 'url': 'http://www.mixcloud.com/dholbach/favorites/', + 'info_dict': { + 'id': 'dholbach_favorites', + 'title': 'Daniel Holbach (favorites)', + 'description': 'md5:327af72d1efeb404a8216c27240d1370', + }, + 'params': { + 'playlist_items': '1-100', + }, + 'playlist_mincount': 100, + 
}, { + 'url': 'http://www.mixcloud.com/dholbach/listens/', + 'info_dict': { + 'id': 'dholbach_listens', + 'title': 'Daniel Holbach (listens)', + 'description': 'md5:327af72d1efeb404a8216c27240d1370', + }, + 'params': { + 'playlist_items': '1-100', + }, + 'playlist_mincount': 100, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user') + list_type = mobj.group('type') + + # if only a profile URL was supplied, default to download all uploads + if list_type is None: + list_type = 'uploads' + + video_id = '%s_%s' % (user_id, list_type) + + profile = self._download_webpage( + 'https://www.mixcloud.com/%s/' % user_id, video_id, + note='Downloading user profile', + errnote='Unable to download user profile') + + username = self._og_search_title(profile) + description = self._get_user_description(profile) + + entries = OnDemandPagedList( + functools.partial( + self._tracks_page_func, + '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type), + self._PAGE_SIZE, use_cache=True) + + return self.playlist_result( + entries, video_id, '%s (%s)' % (username, list_type), description) + + +class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$' + IE_NAME = 'mixcloud:playlist' + + _TESTS = [{ + 'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/', + 'info_dict': { + 'id': 'RedBullThre3style_tokyo-finalists-2015', + 'title': 'National Champions 2015', + 'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3', + }, + 'playlist_mincount': 16, + }, { + 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/', + 'info_dict': { + 'id': 'maxvibes_jazzcat-on-ness-radio', + 'title': 'Jazzcat on Ness Radio', + 'description': 'md5:7bbbf0d6359a0b8cda85224be0f8f263', + }, + 'playlist_mincount': 23 + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = 
mobj.group('user') + playlist_id = mobj.group('playlist') + video_id = '%s_%s' % (user_id, playlist_id) + + profile = self._download_webpage( + url, user_id, + note='Downloading playlist page', + errnote='Unable to download playlist page') + + description = self._get_user_description(profile) + playlist_title = self._html_search_regex( + r'<span[^>]+class="[^"]*list-playlist-title[^"]*"[^>]*>(.*?)</span>', + profile, 'playlist title') + + entries = OnDemandPagedList( + functools.partial( + self._tracks_page_func, + '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'), + self._PAGE_SIZE) + + return self.playlist_result(entries, video_id, playlist_title, description) + + +class MixcloudStreamIE(MixcloudPlaylistBaseIE): + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$' + IE_NAME = 'mixcloud:stream' + + _TEST = { + 'url': 'https://www.mixcloud.com/FirstEar/stream/', + 'info_dict': { + 'id': 'FirstEar', + 'title': 'First Ear', + 'description': 'Curators of good music\nfirstearmusic.com', + }, + 'playlist_mincount': 192, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + entries = [] + prev_page_url = None + + def _handle_page(page): + entries.extend(self._find_urls_in_page(page)) + return self._search_regex( + r'm-next-page-url="([^"]+)"', page, + 'next page URL', default=None) + + next_page_url = _handle_page(webpage) + + for idx in itertools.count(0): + if not next_page_url or prev_page_url == next_page_url: + break + + prev_page_url = next_page_url + current_page = int(self._search_regex( + r'\?page=(\d+)', next_page_url, 'next page number')) + + next_page_url = _handle_page(self._fetch_tracks_page( + '%s/stream' % user_id, user_id, 'stream', idx, + real_page_number=current_page)) + + username = self._og_search_title(webpage) + description = self._get_user_description(webpage) + + return self.playlist_result(entries, user_id, username, description) 
diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py deleted file mode 100644 index a85109a89..000000000 --- a/youtube_dl/extractor/mooshare.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) - - -class MooshareIE(InfoExtractor): - IE_NAME = 'mooshare' - IE_DESC = 'Mooshare.biz' - _VALID_URL = r'https?://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' - - _TESTS = [ - { - 'url': 'http://mooshare.biz/8dqtk4bjbp8g', - 'md5': '4e14f9562928aecd2e42c6f341c8feba', - 'info_dict': { - 'id': '8dqtk4bjbp8g', - 'ext': 'mp4', - 'title': 'Comedy Football 2011 - (part 1-2)', - 'duration': 893, - }, - }, - { - 'url': 'http://mooshare.biz/aipjtoc4g95j', - 'info_dict': { - 'id': 'aipjtoc4g95j', - 'ext': 'mp4', - 'title': 'Orange Caramel Dashing Through the Snow', - 'duration': 212, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - page = self._download_webpage(url, video_id, 'Downloading page') - - if re.search(r'>Video Not Found or Deleted<', page) is not None: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - hash_key = self._html_search_regex(r'<input type="hidden" name="hash" value="([^"]+)">', page, 'hash') - title = self._html_search_regex(r'(?m)<div class="blockTitle">\s*<h2>Watch ([^<]+)</h2>', page, 'title') - - download_form = { - 'op': 'download1', - 'id': video_id, - 'hash': hash_key, - } - - request = sanitized_Request( - 'http://mooshare.biz/%s' % video_id, urlencode_postdata(download_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - - self._sleep(5, video_id) - - video_page = self._download_webpage(request, video_id, 'Downloading video page') - - thumbnail = self._html_search_regex(r'image:\s*"([^"]+)",', video_page, 'thumbnail', 
fatal=False) - duration_str = self._html_search_regex(r'duration:\s*"(\d+)",', video_page, 'duration', fatal=False) - duration = int(duration_str) if duration_str is not None else None - - formats = [] - - # SD video - mobj = re.search(r'(?m)file:\s*"(?P<url>[^"]+)",\s*provider:', video_page) - if mobj is not None: - formats.append({ - 'url': mobj.group('url'), - 'format_id': 'sd', - 'format': 'SD', - }) - - # HD video - mobj = re.search(r'\'hd-2\': { file: \'(?P<url>[^\']+)\' },', video_page) - if mobj is not None: - formats.append({ - 'url': mobj.group('url'), - 'format_id': 'hd', - 'format': 'HD', - }) - - # rtmp video - mobj = re.search(r'(?m)file: "(?P<playpath>[^"]+)",\s*streamer: "(?P<rtmpurl>rtmp://[^"]+)",', video_page) - if mobj is not None: - formats.append({ - 'url': mobj.group('rtmpurl'), - 'play_path': mobj.group('playpath'), - 'rtmp_live': False, - 'ext': 'mp4', - 'format_id': 'rtmp', - 'format': 'HD', - }) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/nerdist.py b/youtube_dl/extractor/nerdist.py deleted file mode 100644 index c6dc34be4..000000000 --- a/youtube_dl/extractor/nerdist.py +++ /dev/null @@ -1,80 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - -from ..utils import ( - determine_ext, - parse_iso8601, - xpath_text, -) - - -class NerdistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nerdist\.com/vepisode/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.nerdist.com/vepisode/exclusive-which-dc-characters-w', - 'md5': '3698ed582931b90d9e81e02e26e89f23', - 'info_dict': { - 'display_id': 'exclusive-which-dc-characters-w', - 'id': 'RPHpvJyr', - 'ext': 'mp4', - 'title': 'Your TEEN TITANS Revealed! 
Who\'s on the show?', - 'thumbnail': 're:^https?://.*/thumbs/.*\.jpg$', - 'description': 'Exclusive: Find out which DC Comics superheroes will star in TEEN TITANS Live-Action TV Show on Nerdist News with Jessica Chobot!', - 'uploader': 'Eric Diaz', - 'upload_date': '20150202', - 'timestamp': 1422892808, - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'''(?x)<script\s+(?:type="text/javascript"\s+)? - src="https?://content\.nerdist\.com/players/([a-zA-Z0-9_]+)-''', - webpage, 'video ID') - timestamp = parse_iso8601(self._html_search_meta( - 'shareaholic:article_published_time', webpage, 'upload date')) - uploader = self._html_search_meta( - 'shareaholic:article_author_name', webpage, 'article author') - - doc = self._download_xml( - 'http://content.nerdist.com/jw6/%s.xml' % video_id, video_id) - video_info = doc.find('.//item') - title = xpath_text(video_info, './title', fatal=True) - description = xpath_text(video_info, './description') - thumbnail = xpath_text( - video_info, './{http://rss.jwpcdn.com/}image', 'thumbnail') - - formats = [] - for source in video_info.findall('./{http://rss.jwpcdn.com/}source'): - vurl = source.attrib['file'] - ext = determine_ext(vurl) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - vurl, video_id, entry_protocol='m3u8_native', ext='mp4', - preference=0)) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - vurl, video_id, fatal=False - )) - else: - formats.append({ - 'format_id': ext, - 'url': vurl, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'formats': formats, - 'uploader': uploader, - } diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index cd117b04e..705940323 100644 --- 
a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -7,8 +7,8 @@ from .common import InfoExtractor class NewgroundsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/audio/listen/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.newgrounds.com/audio/listen/549479', 'md5': 'fe6033d297591288fa1c1f780386f07a', 'info_dict': { @@ -17,7 +17,16 @@ class NewgroundsIE(InfoExtractor): 'title': 'B7 - BusMode', 'uploader': 'Burn7', } - } + }, { + 'url': 'http://www.newgrounds.com/portal/view/673111', + 'md5': '3394735822aab2478c31b1004fe5e5bc', + 'info_dict': { + 'id': '673111', + 'ext': 'mp4', + 'title': 'Dancin', + 'uploader': 'Squirrelman82', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -25,9 +34,11 @@ class NewgroundsIE(InfoExtractor): webpage = self._download_webpage(url, music_id) title = self._html_search_regex( - r',"name":"([^"]+)",', webpage, 'music title') + r'<title>([^>]+)</title>', webpage, 'title') + uploader = self._html_search_regex( - r',"artist":"([^"]+)",', webpage, 'music uploader') + [r',"artist":"([^"]+)",', r'[\'"]owner[\'"]\s*:\s*[\'"]([^\'"]+)[\'"],'], + webpage, 'uploader') music_url_json_string = self._html_search_regex( r'({"url":"[^"]+"),', webpage, 'music url') + '}' diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 0f1f448fe..6e843c327 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import ( + determine_ext, + int_or_none, +) class OnionStudiosIE(InfoExtractor): @@ -17,7 +20,7 @@ class OnionStudiosIE(InfoExtractor): 'id': '2937', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', - 'description': 
'md5:545299bda6abf87e5ec666548c6a9448', + 'description': 'md5:e786add7f280b7f0fe237b64cc73df76', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'The A.V. Club', 'uploader_id': 'TheAVClub', @@ -42,9 +45,19 @@ class OnionStudiosIE(InfoExtractor): formats = [] for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage): - if determine_ext(src) != 'm3u8': # m3u8 always results in 403 + ext = determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + height = int_or_none(self._search_regex( + r'/(\d+)\.%s' % ext, src, 'height', default=None)) formats.append({ + 'format_id': ext + ('-%sp' % height if height else ''), 'url': src, + 'height': height, + 'ext': ext, + 'preference': 1, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py index cce84b9e4..fca30e1aa 100644 --- a/youtube_dl/extractor/puls4.py +++ b/youtube_dl/extractor/puls4.py @@ -40,7 +40,7 @@ class Puls4IE(InfoExtractor): webpage = self._download_webpage(url, video_id) error_message = self._html_search_regex( - r'<div class="message-error">(.+?)</div>', + r'<div[^>]+class="message-error"[^>]*>(.+?)</div>', webpage, 'error message', default=None) if error_message: raise ExtractorError( diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 4f0c66213..e5c28ae89 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + js_to_json, unified_strdate, ) @@ -94,19 +95,32 @@ class SportBoxEmbedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - hls = self._search_regex( - r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]", - webpage, 'hls file') + formats = [] - formats = self._extract_m3u8_formats(hls, video_id, 'mp4') - 
self._sort_formats(formats) + def cleanup_js(code): + # desktop_advert_config contains complex Javascripts and we don't need it + return js_to_json(re.sub(r'desktop_advert_config.*', '', code)) + + jwplayer_data = self._parse_json(self._search_regex( + r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id, + transform_source=cleanup_js) + + hls_url = jwplayer_data.get('hls_url') + if hls_url: + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', m3u8_id='hls')) - title = self._search_regex( - r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') + rtsp_url = jwplayer_data.get('rtsp_url') + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) - thumbnail = self._search_regex( - r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"', - webpage, 'thumbnail', default=None) + title = jwplayer_data['node_title'] + thumbnail = jwplayer_data.get('image_url') return { 'id': video_id, diff --git a/youtube_dl/extractor/tdslifeway.py b/youtube_dl/extractor/tdslifeway.py new file mode 100644 index 000000000..4d1f5c801 --- /dev/null +++ b/youtube_dl/extractor/tdslifeway.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TDSLifewayIE(InfoExtractor): + _VALID_URL = r'https?://tds\.lifeway\.com/v1/trainingdeliverysystem/courses/(?P<id>\d+)/index\.html' + + _TEST = { + # From http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers + 'url': 
'http://tds.lifeway.com/v1/trainingdeliverysystem/courses/3453494717001/index.html?externalRegistration=AssetId%7C34F466F1-78F3-4619-B2AB-A8EFFA55E9E9%21InstanceId%7C0%21UserId%7Caaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa&grouping=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&activity_id=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&content_endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2Fcontent%2F&actor=%7B%22name%22%3A%5B%22Guest%20Guest%22%5D%2C%22account%22%3A%5B%7B%22accountServiceHomePage%22%3A%22http%3A%2F%2Fscorm.lifeway.com%2F%22%2C%22accountName%22%3A%22aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa%22%7D%5D%2C%22objectType%22%3A%22Agent%22%7D&content_token=462a50b2-b6f9-4970-99b1-930882c499fb&registration=93d6ec8e-7f7b-4ed3-bbc8-a857913c0b2a&externalConfiguration=access%7CFREE%21adLength%7C-1%21assignOrgId%7C4AE36F78-299A-425D-91EF-E14A899B725F%21assignOrgParentId%7C%21courseId%7C%21isAnonymous%7Cfalse%21previewAsset%7Cfalse%21previewLength%7C-1%21previewMode%7Cfalse%21royalty%7CFREE%21sessionId%7C671422F9-8E79-48D4-9C2C-4EE6111EA1CD%21trackId%7C&auth=Basic%20OjhmZjk5MDBmLTBlYTMtNDJhYS04YjFlLWE4MWQ3NGNkOGRjYw%3D%3D&endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2F', + 'info_dict': { + 'id': '3453494717001', + 'ext': 'mp4', + 'title': 'The Gospel by Numbers', + 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20140410', + 'description': 'Coming soon from T4G 2014!', + 'uploader_id': '2034960640001', + 'timestamp': 1397145591, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + } + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2034960640001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/theplatform.py 
b/youtube_dl/extractor/theplatform.py index 6da701a39..7a5a533b7 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -50,8 +50,6 @@ class ThePlatformBaseIE(OnceIE): else: formats.append(_format) - self._sort_formats(formats) - subtitles = self._parse_smil_subtitles(meta, default_ns) return formats, subtitles @@ -241,6 +239,7 @@ class ThePlatformIE(ThePlatformBaseIE): smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) + self._sort_formats(formats) ret = self.get_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 63b5d5924..bb8b8e234 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -65,6 +65,9 @@ class TudouIE(InfoExtractor): if quality: info_url += '&hd' + quality xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page') + error = xml_data.attrib.get('error') + if error is not None: + raise ExtractorError('Tudou said: %s' % error, expected=True) final_url = xml_data.text return final_url diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1f32ea2eb..ea673054f 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -260,6 +260,17 @@ class TwitterIE(InfoExtractor): 'upload_date': '20140615', }, 'add_ie': ['Vine'], + }, { + 'url': 'https://twitter.com/captainamerica/status/719944021058060289', + # md5 constantly changes + 'info_dict': { + 'id': '719944021058060289', + 'ext': 'mp4', + 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', + 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. 
https://t.co/GpgYi9xMJI"', + 'uploader_id': 'captainamerica', + 'uploader': 'Captain America', + }, }] def _real_extract(self, url): @@ -284,17 +295,6 @@ class TwitterIE(InfoExtractor): 'title': username + ' - ' + title, } - card_id = self._search_regex( - r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None) - if card_id: - card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id - info.update({ - '_type': 'url_transparent', - 'ie_key': 'TwitterCard', - 'url': card_url, - }) - return info - mobj = re.search(r'''(?x) <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s* <source[^>]+video-src="(?P<url>[^"]+)" diff --git a/youtube_dl/extractor/ubu.py b/youtube_dl/extractor/ubu.py deleted file mode 100644 index 1d52cbc98..000000000 --- a/youtube_dl/extractor/ubu.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - qualities, -) - - -class UbuIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' - _TEST = { - 'url': 'http://ubu.com/film/her_noise.html', - 'md5': '138d5652618bf0f03878978db9bef1ee', - 'info_dict': { - 'id': 'her_noise', - 'ext': 'm4v', - 'title': 'Her Noise - The Making Of (2007)', - 'duration': 3600, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r'<title>.+?Film & Video: ([^<]+)</title>', webpage, 'title') - - duration = int_or_none(self._html_search_regex( - r'Duration: (\d+) minutes', webpage, 'duration', fatal=False), - invscale=60) - - formats = [] - FORMAT_REGEXES = [ - ('sq', r"'flashvars'\s*,\s*'file=([^']+)'"), - ('hq', r'href="(http://ubumexico\.centro\.org\.mx/video/[^"]+)"'), - ] - preference = qualities([fid for fid, _ in FORMAT_REGEXES]) - for format_id, format_regex in FORMAT_REGEXES: - m = re.search(format_regex, webpage) - if m: - formats.append({ 
- 'url': m.group(1), - 'format_id': format_id, - 'preference': preference(format_id), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'formats': formats, - } diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index b5fe753d7..54605d863 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -41,6 +41,12 @@ class UstreamIE(InfoExtractor): 'uploader': 'sportscanadatv', }, 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.', + }, { + 'url': 'http://www.ustream.tv/embed/10299409', + 'info_dict': { + 'id': '10299409', + }, + 'playlist_count': 3, }] def _real_extract(self, url): @@ -55,10 +61,12 @@ class UstreamIE(InfoExtractor): if m.group('type') == 'embed': video_id = m.group('id') webpage = self._download_webpage(url, video_id) - desktop_video_id = self._html_search_regex( - r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id') - desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id - return self.url_result(desktop_url, 'Ustream') + content_video_ids = self._parse_json(self._search_regex( + r'ustream\.vars\.offAirContentVideoIds=([^;]+);', webpage, + 'content video IDs'), video_id) + return self.playlist_result( + map(lambda u: self.url_result('http://www.ustream.tv/recorded/' + u, 'Ustream'), content_video_ids), + video_id) params = self._download_json( 'https://api.ustream.tv/videos/%s.json' % video_id, video_id) diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py index 9369abaf8..84698371a 100644 --- a/youtube_dl/extractor/varzesh3.py +++ b/youtube_dl/extractor/varzesh3.py @@ -2,11 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + clean_html, + remove_start, +) class Varzesh3IE(InfoExtractor): _VALID_URL = 
r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' - _TEST = { + _TESTS = [{ 'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', 'md5': '2a933874cb7dce4366075281eb49e855', 'info_dict': { @@ -15,8 +23,19 @@ class Varzesh3IE(InfoExtractor): 'title': '۵ واکنش برتر دروازهبانان؛هفته ۲۶ بوندسلیگا', 'description': 'فصل ۲۰۱۵-۲۰۱۴', 'thumbnail': 're:^https?://.*\.jpg$', - } - } + }, + 'skip': 'HTTP 404 Error', + }, { + 'url': 'http://video.varzesh3.com/video/112785/%D8%AF%D9%84%D9%87-%D8%B9%D9%84%DB%8C%D8%9B-%D8%B3%D8%AA%D8%A7%D8%B1%D9%87-%D9%86%D9%88%D8%B8%D9%87%D9%88%D8%B1-%D9%84%DB%8C%DA%AF-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AC%D8%B2%DB%8C%D8%B1%D9%87', + 'md5': '841b7cd3afbc76e61708d94e53a4a4e7', + 'info_dict': { + 'id': '112785', + 'ext': 'mp4', + 'title': 'دله علی؛ ستاره نوظهور لیگ برتر جزیره', + 'description': 'فوتبال 120', + }, + 'expected_warnings': ['description'], + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -26,15 +45,30 @@ class Varzesh3IE(InfoExtractor): video_url = self._search_regex( r'<source[^>]+src="([^"]+)"', webpage, 'video url') - title = self._og_search_title(webpage) + title = remove_start(self._html_search_regex( + r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ') + description = self._html_search_regex( r'(?s)<div class="matn">(.+?)</div>', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) + webpage, 'description', default=None) + if description is None: + description = clean_html(self._html_search_meta('description', webpage)) + + thumbnail = self._og_search_thumbnail(webpage, default=None) + if thumbnail is None: + fb_sharer_url = self._search_regex( + r'<a[^>]+href="(https?://www\.facebook\.com/sharer/sharer\.php?[^"]+)"', + webpage, 'facebook sharer URL', 
fatal=False) + sharer_params = compat_parse_qs(compat_urllib_parse_urlparse(fb_sharer_url).query) + thumbnail = sharer_params.get('p[images][0]', [None])[0] video_id = self._search_regex( r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", - webpage, display_id, default=display_id) + webpage, display_id, default=None) + if video_id is None: + video_id = self._search_regex( + 'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id', + default=display_id) return { 'url': video_url, diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 46c785ae1..ec1245b0e 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -14,13 +14,21 @@ class ViceIE(InfoExtractor): 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', 'info_dict': { 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', 'duration': 725.983, }, - 'params': { - # Requires ffmpeg (m3u8 manifest) - 'skip_download': True, + }, { + 'url': 'http://www.vice.com/video/how-to-hack-a-car', + 'md5': '6fb2989a3fed069fb8eab3401fc2d3c9', + 'info_dict': { + 'id': '3jstaBeXgAs', + 'ext': 'mp4', + 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', + 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', + 'uploader_id': 'MotherboardTV', + 'uploader': 'Motherboard', + 'upload_date': '20140529', }, }, { 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', @@ -39,11 +47,15 @@ class ViceIE(InfoExtractor): try: embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, - 'ooyala embed code') - ooyala_url = OoyalaIE._url_for_embed_code(embed_code) + 'ooyala embed code', default=None) + if embed_code: + ooyala_url = OoyalaIE._url_for_embed_code(embed_code) + return self.url_result('ooyala:%s' % embed_code, 'Ooyala') + youtube_id = self._search_regex( + r'data-youtube-id="([^"]+)"', webpage, 'youtube id') + return self.url_result(youtube_id, 
'Youtube') except ExtractorError: raise ExtractorError('The page doesn\'t contain a video', expected=True) - return self.url_result(ooyala_url, ie='Ooyala') class ViceShowIE(InfoExtractor): diff --git a/youtube_dl/extractor/wayofthemaster.py b/youtube_dl/extractor/wayofthemaster.py deleted file mode 100644 index af7bb8b49..000000000 --- a/youtube_dl/extractor/wayofthemaster.py +++ /dev/null @@ -1,52 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class WayOfTheMasterIE(InfoExtractor): - _VALID_URL = r'https?://www\.wayofthemaster\.com/([^/?#]*/)*(?P<id>[^/?#]+)\.s?html(?:$|[?#])' - - _TEST = { - 'url': 'http://www.wayofthemaster.com/hbks.shtml', - 'md5': '5316b57487ada8480606a93cb3d18d24', - 'info_dict': { - 'id': 'hbks', - 'ext': 'mp4', - 'title': 'Intelligent Design vs. Evolution', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - title = self._search_regex( - r'<img src="images/title_[^"]+".*?alt="([^"]+)"', - webpage, 'title', default=None) - if title is None: - title = self._html_search_regex( - r'<title>(.*?)</title>', webpage, 'page title') - - url_base = self._search_regex( - r'<param\s+name="?movie"?\s+value=".*?/wotm_videoplayer_highlow[0-9]*\.swf\?vid=([^"]+)"', - webpage, 'URL base') - formats = [{ - 'format_id': 'low', - 'quality': 1, - 'url': url_base + '_low.mp4', - }, { - 'format_id': 'high', - 'quality': 2, - 'url': url_base + '_high.mp4', - }] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - } diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py index 236ff403b..b113ab1c4 100644 --- a/youtube_dl/extractor/xboxclips.py +++ b/youtube_dl/extractor/xboxclips.py @@ -12,7 +12,7 @@ from ..utils import ( class XboxClipsIE(InfoExtractor): _VALID_URL = 
r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})' _TEST = { - 'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', + 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5a102de51..44c1191bd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1987,8 +1987,8 @@ class YoutubeUserIE(YoutubeChannelIE): def suitable(cls, url): # Don't return True if the url can be extracted with other youtube # extractor, the regex would is too permissive and it would match. - other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) - if any(ie.suitable(url) for ie in other_ies): + other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_yt_ies): return False else: return super(YoutubeUserIE, cls).suitable(url) |