diff options
Diffstat (limited to 'youtube_dl/extractor')
24 files changed, 474 insertions, 109 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5e0d7d3dc..e61a88de7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -142,6 +142,8 @@ from .dailymotion import ( from .daum import ( DaumIE, DaumClipIE, + DaumPlaylistIE, + DaumUserIE, ) from .dbtv import DBTVIE from .dcn import ( @@ -372,6 +374,7 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .makertv import MakerTVIE from .malemotion import MalemotionIE +from .matchtv import MatchTVIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE @@ -482,6 +485,7 @@ from .npo import ( NPOLiveIE, NPORadioIE, NPORadioFragmentIE, + SchoolTVIE, VPROIE, WNLIE ) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index be7913bc7..92eee8119 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,11 +8,7 @@ from ..compat import compat_str from ..utils import int_or_none -class ACastBaseIE(InfoExtractor): - _API_BASE_URL = 'https://www.acast.com/api/' - - -class ACastIE(ACastBaseIE): +class ACastIE(InfoExtractor): IE_NAME = 'acast' _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<channel>[^/]+)/(?P<id>[^/#?]+)' _TEST = { @@ -23,14 +19,19 @@ class ACastIE(ACastBaseIE): 'ext': 'mp3', 'title': '"Where Are You?": Taipei 101, Taiwan', 'timestamp': 1196172000000, - 'description': 'md5:0c5d8201dfea2b93218ea986c91eee6e', + 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'duration': 211, } } def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - cast_data = self._download_json(self._API_BASE_URL + 'channels/%s/acasts/%s/playback' % (channel, display_id), display_id) + + embed_page = self._download_webpage( + re.sub('(?:www\.)?acast\.com', 'embedcdn.acast.com', url), display_id) + cast_data = self._parse_json(self._search_regex( + r'window\[\'acast/queries\'\]\s*=\s*([^;]+);', embed_page, 'acast data'), + display_id)['GetAcast/%s/%s' % (channel, display_id)] return { 'id': compat_str(cast_data['id']), @@ -44,7 +45,7 @@ class ACastIE(ACastBaseIE): } -class ACastChannelIE(ACastBaseIE): +class ACastChannelIE(InfoExtractor): IE_NAME = 'acast:channel' _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<id>[^/#?]+)' _TEST = { @@ -56,6 +57,7 @@ class ACastChannelIE(ACastBaseIE): }, 'playlist_mincount': 20, } + _API_BASE_URL = 'https://www.acast.com/api/' @classmethod def suitable(cls, url): diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 7d65b8193..190bc2cc8 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -8,6 +8,8 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( qualities, + unescapeHTML, + xpath_element, ) @@ -31,7 +33,7 @@ class AllocineIE(InfoExtractor): 'id': '19540403', 'ext': 'mp4', 'title': 'Planes 2 Bande-annonce VF', - 'description': 'md5:eeaffe7c2d634525e21159b93acf3b1e', + 'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway', 'thumbnail': 're:http://.*\.jpg', }, }, { @@ -41,7 +43,7 @@ class AllocineIE(InfoExtractor): 'id': '19544709', 'ext': 'mp4', 'title': 'Dragons 2 - Bande annonce finale VF', - 'description': 'md5:71742e3a74b0d692c7fce0dd2017a4ac', + 'description': 'md5:601d15393ac40f249648ef000720e7e3', 'thumbnail': 're:http://.*\.jpg', }, }, { @@ -59,14 +61,18 @@ class AllocineIE(InfoExtractor): if typ == 'film': video_id = self._search_regex(r'href="/video/player_gen_cmedia=([0-9]+).+"', webpage, 'video id') else: - player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player') - - player_data = json.loads(player) - video_id = compat_str(player_data['refMedia']) + player = self._search_regex(r'data-player=\'([^\']+)\'>', webpage, 'data player', default=None) + if player: + player_data = json.loads(player) + video_id = compat_str(player_data['refMedia']) + else: + model = self._search_regex(r'data-model="([^"]+)">', webpage, 'data model') + model_data = self._parse_json(unescapeHTML(model), display_id) + video_id = compat_str(model_data['id']) xml = self._download_xml('http://www.allocine.fr/ws/AcVisiondataV4.ashx?media=%s' % video_id, display_id) - video = xml.find('.//AcVisionVideo').attrib + video = xpath_element(xml, './/AcVisionVideo').attrib quality = qualities(['ld', 'md', 'hd']) formats = [] diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 510813f76..c28e72927 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -1,7 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + js_to_json, + determine_ext, +) class BpbIE(InfoExtractor): @@ -10,7 +16,8 @@ class BpbIE(InfoExtractor): _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', - 'md5': '0792086e8e2bfbac9cdf27835d5f2093', + # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 + 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', 'info_dict': { 'id': '297', 'ext': 'mp4', @@ -25,13 +32,26 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') - video_url = self._html_search_regex( - r'(http://film\.bpb\.de/player/dokument_[0-9]+\.mp4)', - webpage, 'video URL') + video_info_dicts = re.findall( + r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + + formats = [] + for video_info in video_info_dicts: + video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) + quality = video_info['quality'] + video_url = video_info['src'] + formats.append({ + 'url': video_url, + 'preference': 10 if quality == 'high' else 0, + 'format_note': quality, + 'format_id': '%s-%s' % (quality, determine_ext(video_url)), + }) + + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': title, 'description': self._og_search_description(webpage), } diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 9bc345f60..c84c51058 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -3,16 +3,20 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urllib_parse, compat_urllib_parse_unquote, + compat_urlparse, ) from ..utils import ( int_or_none, str_to_int, xpath_text, + unescapeHTML, ) @@ -38,7 +42,7 @@ class DaumIE(InfoExtractor): 'info_dict': { 'id': '65139429', 'ext': 'mp4', - 'title': 'md5:a100d65d09cec246d8aa9bde7de45aed', + 'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118', 'description': 'md5:79794514261164ff27e36a21ad229fc5', 'upload_date': '20150604', 'thumbnail': 're:^https?://.*\.(?:jpg|png)', @@ -115,6 +119,7 @@ class DaumIE(InfoExtractor): class DaumClipIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.(?:do|tv)|mypot/View.do)\?.*?clipid=(?P<id>\d+)' IE_NAME = 'daum.net:clip' + _URL_TEMPLATE = 'http://tvpot.daum.net/clip/ClipView.do?clipid=%s' _TESTS = [{ 'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690', @@ -133,6 +138,10 @@ class DaumClipIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if DaumPlaylistIE.suitable(url) or DaumUserIE.suitable(url) else super(DaumClipIE, cls).suitable(url) + def _real_extract(self, url): video_id = self._match_id(url) clip_info = self._download_json( @@ -143,7 +152,7 @@ class DaumClipIE(InfoExtractor): '_type': 'url_transparent', 'id': video_id, 'url': 'http://tvpot.daum.net/v/%s' % clip_info['vid'], - 'title': clip_info['title'], + 'title': unescapeHTML(clip_info['title']), 'thumbnail': clip_info.get('thumb_url'), 'description': clip_info.get('contents'), 'duration': int_or_none(clip_info.get('duration')), @@ -151,3 +160,139 @@ class DaumClipIE(InfoExtractor): 'view_count': int_or_none(clip_info.get('play_count')), 'ie_key': 'Daum', } + + +class DaumListIE(InfoExtractor): + def _get_entries(self, list_id, list_id_type): + name = None + entries = [] + for pagenum in itertools.count(1): + list_info = self._download_json( + 'http://tvpot.daum.net/mypot/json/GetClipInfo.do?size=48&init=true&order=date&page=%d&%s=%s' % ( + pagenum, list_id_type, list_id), list_id, 'Downloading list info - %s' % pagenum) + + entries.extend([ + self.url_result( + 'http://tvpot.daum.net/v/%s' % clip['vid']) + for clip in list_info['clip_list'] + ]) + + if not name: + name = list_info.get('playlist_bean', {}).get('name') or \ + list_info.get('potInfo', {}).get('name') + + if not list_info.get('has_more'): + break + + return name, entries + + def _check_clip(self, url, list_id): + query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query) + if 'clipid' in query_dict: + clip_id = query_dict['clipid'][0] + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % clip_id) + return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id) + + +class DaumPlaylistIE(DaumListIE): + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View\.do|Top\.tv)\?.*?playlistid=(?P<id>[0-9]+)' + IE_NAME = 'daum.net:playlist' + _URL_TEMPLATE = 'http://tvpot.daum.net/mypot/View.do?playlistid=%s' + + _TESTS = [{ + 'note': 'Playlist url with clipid', + 'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', + 'info_dict': { + 'id': '6213966', + 'title': 'Woorissica Official', + }, + 'playlist_mincount': 181 + }, { + 'note': 'Playlist url with clipid - noplaylist', + 'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844', + 'info_dict': { + 'id': '73806844', + 'ext': 'mp4', + 'title': '151017 Airport', + 'upload_date': '20160117', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + } + }] + + @classmethod + def suitable(cls, url): + return False if DaumUserIE.suitable(url) else super(DaumPlaylistIE, cls).suitable(url) + + def _real_extract(self, url): + list_id = self._match_id(url) + + clip_result = self._check_clip(url, list_id) + if clip_result: + return clip_result + + name, entries = self._get_entries(list_id, 'playlistid') + + return self.playlist_result(entries, list_id, name) + + +class DaumUserIE(DaumListIE): + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View|Top)\.(?:do|tv)\?.*?ownerid=(?P<id>[0-9a-zA-Z]+)' + IE_NAME = 'daum.net:user' + + _TESTS = [{ + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0', + 'info_dict': { + 'id': 'o2scDLIVbHc0', + 'title': '마이 리틀 텔레비전', + }, + 'playlist_mincount': 213 + }, { + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&clipid=73801156', + 'info_dict': { + 'id': '73801156', + 'ext': 'mp4', + 'title': '[미공개] 김구라, 오만석이 부릅니다 \'오케피\' - 마이 리틀 텔레비전 20160116', + 'upload_date': '20160117', + 'description': 'md5:5e91d2d6747f53575badd24bd62b9f36' + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + } + }, { + 'note': 'Playlist url has ownerid and playlistid, playlistid takes precedence', + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&playlistid=6196631', + 'info_dict': { + 'id': '6196631', + 'title': '마이 리틀 텔레비전 - 20160109', + }, + 'playlist_count': 11 + }, { + 'url': 'http://tvpot.daum.net/mypot/Top.do?ownerid=o2scDLIVbHc0', + 'only_matching': True, + }, { + 'url': 'http://m.tvpot.daum.net/mypot/Top.tv?ownerid=45x1okb1If50&playlistid=3569733', + 'only_matching': True, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + clip_result = self._check_clip(url, list_id) + if clip_result: + return clip_result + + query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query) + if 'playlistid' in query_dict: + playlist_id = query_dict['playlistid'][0] + return self.url_result(DaumPlaylistIE._URL_TEMPLATE % playlist_id, 'DaumPlaylist') + + name, entries = self._get_entries(list_id, 'ownerid') + + return self.playlist_result(entries, list_id, name) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index 027f55eb2..f6b9046f9 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -6,24 +6,29 @@ from ..utils import ( xpath_text, xpath_with_ns, ) +from .youtube import YoutubeIE class GamekingsIE(InfoExtractor): - _VALID_URL = r'http://www\.gamekings\.tv/(?:videos|nieuws)/(?P<id>[^/]+)' + _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' _TESTS = [{ - 'url': 'http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/', - # MD5 is flaky, seems to change regularly - # 'md5': '2f32b1f7b80fdc5cb616efb4f387f8a3', + # YouTube embed video + 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', + 'md5': '5208d3a17adeaef829a7861887cb9029', 'info_dict': { - 'id': 'phoenix-wright-ace-attorney-dual-destinies-review', + 'id': 'HkSQKetlGOU', 'ext': 'mp4', - 'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review', - 'description': 'md5:36fd701e57e8c15ac8682a2374c99731', + 'title': 'Phoenix Wright: Ace Attorney - Dual Destinies Review', + 'description': 'md5:db88c0e7f47e9ea50df3271b9dc72e1d', 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader_id': 'UCJugRGo4STYMeFr5RoOShtQ', + 'uploader': 'Gamekings Vault', + 'upload_date': '20151123', }, + 'add_ie': ['Youtube'], }, { # vimeo video - 'url': 'http://www.gamekings.tv/videos/the-legend-of-zelda-majoras-mask/', + 'url': 'http://www.gamekings.nl/videos/the-legend-of-zelda-majoras-mask/', 'md5': '12bf04dfd238e70058046937657ea68d', 'info_dict': { 'id': 'the-legend-of-zelda-majoras-mask', @@ -33,7 +38,7 @@ class GamekingsIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, }, { - 'url': 'http://www.gamekings.tv/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', + 'url': 'http://www.gamekings.nl/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', 'only_matching': True, }] @@ -43,7 +48,11 @@ class GamekingsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) playlist_id = self._search_regex( - r'gogoVideo\(\s*\d+\s*,\s*"([^"]+)', webpage, 'playlist id') + r'gogoVideo\([^,]+,\s*"([^"]+)', webpage, 'playlist id') + + # Check if a YouTube embed is used + if YoutubeIE.suitable(playlist_id): + return self.url_result(playlist_id, ie='Youtube') playlist = self._download_xml( 'http://www.gamekings.tv/wp-content/themes/gk2010/rss_playlist.php?id=%s' % playlist_id, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 26d3698c8..b18e734c4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1819,6 +1819,17 @@ class GenericIE(InfoExtractor): if digiteka_url: return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + # Look for Limelight embeds + mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) + if mobj: + lm = { + 'Media': 'media', + 'Channel': 'channel', + 'ChannelList': 'channel_list', + } + return self.url_result('limelight:%s:%s' % ( + lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2)) + # Look for AdobeTVVideo embeds mobj = re.search( r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 1d391e69f..9f1ade2e4 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -2,12 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import smuggle_url class KickStarterIE(InfoExtractor): _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*' _TESTS = [{ - 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location', + 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant/description', 'md5': 'c81addca81327ffa66c642b5d8b08cab', 'info_dict': { 'id': '1404461844', @@ -27,7 +28,8 @@ class KickStarterIE(InfoExtractor): 'uploader_id': 'pebble', 'uploader': 'Pebble Technology', 'title': 'Pebble iOS Notifications', - } + }, + 'add_ie': ['Vimeo'], }, { 'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html', 'info_dict': { @@ -43,7 +45,7 @@ class KickStarterIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex( - r'<title>\s*(.*?)(?:\s*— Kickstarter)?\s*</title>', + r'<title>\s*(.*?)(?:\s*—\s*Kickstarter)?\s*</title>', webpage, 'title') video_url = self._search_regex( r'data-video-url="(.*?)"', @@ -52,7 +54,7 @@ class KickStarterIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': 'Generic', - 'url': url, + 'url': smuggle_url(url, {'to_generic': True}), 'title': title, } diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 08bdae8a2..9665ece89 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -5,11 +5,13 @@ import datetime import re import time import base64 +import hashlib from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_ord, + compat_str, ) from ..utils import ( determine_ext, @@ -258,6 +260,7 @@ class LetvCloudIE(InfoExtractor): }, }, { 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360', + 'md5': 'e03d9cc8d9c13191e1caf277e42dbd31', 'info_dict': { 'id': 'p7jnfw5hw9_ec93197892', 'ext': 'mp4', @@ -265,6 +268,7 @@ class LetvCloudIE(InfoExtractor): }, }, { 'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd', + 'md5': 'cb988699a776b22d4a41b9d43acfb3ac', 'info_dict': { 'id': 'p7jnfw5hw9_187060b6fd', 'ext': 'mp4', @@ -272,21 +276,37 @@ class LetvCloudIE(InfoExtractor): }, }] - def _real_extract(self, url): - uu_mobj = re.search('uu=([\w]+)', url) - vu_mobj = re.search('vu=([\w]+)', url) - - if not uu_mobj or not vu_mobj: - raise ExtractorError('Invalid URL: %s' % url, expected=True) - - uu = uu_mobj.group(1) - vu = vu_mobj.group(1) - media_id = uu + '_' + vu - - play_json_req = sanitized_Request( - 'http://api.letvcloud.com/gpc.php?cf=html5&sign=signxxxxx&ver=2.2&format=json&' + - 'uu=' + uu + '&vu=' + vu) - play_json = self._download_json(play_json_req, media_id, 'Downloading playJson data') + @staticmethod + def sign_data(obj): + if obj['cf'] == 'flash': + salt = '2f9d6924b33a165a6d8b5d3d42f4f987' + items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu'] + elif obj['cf'] == 'html5': + salt = 'fbeh5player12c43eccf2bec3300344' + items = ['cf', 'ran', 'uu', 'bver', 'vu'] + input_data = ''.join([item + obj[item] for item in items]) + salt + obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest() + + def _get_formats(self, cf, uu, vu, media_id): + def get_play_json(cf, timestamp): + data = { + 'cf': cf, + 'ver': '2.2', + 'bver': 'firefox44.0', + 'format': 'json', + 'uu': uu, + 'vu': vu, + 'ran': compat_str(timestamp), + } + self.sign_data(data) + return self._download_json( + 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data), + media_id, 'Downloading playJson data for type %s' % cf) + + play_json = get_play_json(cf, time.time()) + # The server time may be different from local time + if play_json.get('code') == 10071: + play_json = get_play_json(cf, play_json['timestamp']) if not play_json.get('data'): if play_json.get('message'): @@ -312,6 +332,21 @@ class LetvCloudIE(InfoExtractor): 'width': int_or_none(play_url.get('vwidth')), 'height': int_or_none(play_url.get('vheight')), }) + + return formats + + def _real_extract(self, url): + uu_mobj = re.search('uu=([\w]+)', url) + vu_mobj = re.search('vu=([\w]+)', url) + + if not uu_mobj or not vu_mobj: + raise ExtractorError('Invalid URL: %s' % url, expected=True) + + uu = uu_mobj.group(1) + vu = vu_mobj.group(1) + media_id = uu + '_' + vu + + formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index fb03dd527..1a0625ac3 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -40,7 +40,8 @@ class LimelightBaseIE(InfoExtractor): if not stream_url: continue if '.f4m' in stream_url: - formats.extend(self._extract_f4m_formats(stream_url, video_id)) + formats.extend(self._extract_f4m_formats( + stream_url, video_id, fatal=False)) else: fmt = { 'url': stream_url, @@ -72,8 +73,8 @@ class LimelightBaseIE(InfoExtractor): format_id = mobile_url.get('targetMediaPlatform') if determine_ext(media_url) == 'm3u8': formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=-1, m3u8_id=format_id)) + media_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) else: formats.append({ 'url': media_url, diff --git a/youtube_dl/extractor/matchtv.py b/youtube_dl/extractor/matchtv.py new file mode 100644 index 000000000..28e0dfe63 --- /dev/null +++ b/youtube_dl/extractor/matchtv.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ( + sanitized_Request, + xpath_text, +) + + +class MatchTVIE(InfoExtractor): + _VALID_URL = r'https?://matchtv\.ru/?#live-player' + _TEST = { + 'url': 'http://matchtv.ru/#live-player', + 'info_dict': { + 'id': 'matchtv-live', + 'ext': 'flv', + 'title': 're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = 'matchtv-live' + request = sanitized_Request( + 'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse.urlencode({ + 'ts': '', + 'quality': 'SD', + 'contentId': '561d2c0df7159b37178b4567', + 'sign': '', + 'includeHighlights': '0', + 'userId': '', + 'sessionId': random.randint(1, 1000000000), + 'contentType': 'channel', + 'timeShift': '0', + 'platform': 'portal', + }), + headers={ + 'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf', + }) + video_url = self._download_json(request, video_id)['data']['videoUrl'] + f4m_url = xpath_text(self._download_xml(video_url, video_id), './to') + formats = self._extract_f4m_formats(f4m_url, video_id) + return { + 'id': video_id, + 'title': self._live_title('Матч ТВ - Прямой эфир'), + 'is_live': True, + 'formats': formats, + } diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 9d26030d3..a071378b6 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -18,13 +18,17 @@ class NBAIE(InfoExtractor): 'md5': '9e7729d3010a9c71506fd1248f74e4f4', 'info_dict': { 'id': '0021200253-okc-bkn-recap', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, 'timestamp': 1354638466, 'upload_date': '20121204', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, @@ -68,7 +72,7 @@ class NBAIE(InfoExtractor): if video_url.startswith('/'): continue if video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) elif video_url.endswith('.f4m'): formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds', fatal=False)) else: diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 1dd54c2f1..18d01f423 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -19,32 +19,39 @@ class NBCIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.nbc.com/the-tonight-show/segments/112966', - # md5 checksum is not stable 'info_dict': { - 'id': 'c9xnCo0YPOPH', - 'ext': 'flv', + 'id': '112966', + 'ext': 'mp4', 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.nbc.com/the-tonight-show/episodes/176', 'info_dict': { - 'id': 'XwU9KZkp98TH', + 'id': '176', 'ext': 'flv', 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen', 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.', }, - 'skip': 'Only works from US', + 'skip': '404 Not Found', }, { 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', 'info_dict': { - 'id': '8iUuyzWDdYUZ', - 'ext': 'flv', + 'id': '2832821', + 'ext': 'mp4', 'title': 'Star Wars Teaser', 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'skip': 'Only works from US', }, { @@ -66,7 +73,11 @@ class NBCIE(InfoExtractor): webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) if theplatform_url.startswith('//'): theplatform_url = 'http:' + theplatform_url - return self.url_result(smuggle_url(theplatform_url, {'source_url': url})) + return { + '_type': 'url_transparent', + 'url': smuggle_url(theplatform_url, {'source_url': url}), + 'id': video_id, + } class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index eb12fb810..87f5675c7 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -189,7 +189,7 @@ class NPOIE(NPOBaseIE): if not video_url: continue if format_id == 'adaptive': - formats.extend(self._extract_m3u8_formats(video_url, video_id)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) else: formats.append({ 'url': video_url, @@ -406,6 +406,38 @@ class NPORadioFragmentIE(InfoExtractor): } +class SchoolTVIE(InfoExtractor): + IE_NAME = 'schooltv' + _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P<id>[^/?#&]+)' + + _TEST = { + 'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/', + 'info_dict': { + 'id': 'WO_NTR_429477', + 'display_id': 'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam', + 'title': 'Ademhaling: De hele dag haal je adem. Maar wat gebeurt er dan eigenlijk in je lichaam?', + 'ext': 'mp4', + 'description': 'md5:abfa0ff690adb73fd0297fd033aaa631' + }, + 'params': { + # Skip because of m3u8 download + 'skip_download': True + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'data-mid=(["\'])(?P<id>.+?)\1', webpage, 'video_id', group='id') + return { + '_type': 'url_transparent', + 'ie_key': 'NPO', + 'url': 'npo:%s' % video_id, + 'display_id': display_id + } + + class VPROIE(NPOIE): IE_NAME = 'vpro' _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html' diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 6ff13050d..a126f5054 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -133,26 +133,32 @@ class NRKTVIE(InfoExtractor): _TESTS = [ { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': 'adf2c5454fa2bf032f47a9f8fb351342', 'info_dict': { 'id': 'MUHH48000314', - 'ext': 'flv', + 'ext': 'mp4', 'title': '20 spørsmål', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'upload_date': '20140523', 'duration': 1741.52, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'md5': '383650ece2b25ecec996ad7b5bb2a384', 'info_dict': { 'id': 'mdfp15000514', - 'ext': 'flv', - 'title': 'Kunnskapskanalen: Grunnlovsjubiléet - Stor ståhei for ingenting', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', 'upload_date': '20140524', - 'duration': 4605.0, + 'duration': 4605.08, + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, { diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 05f93904c..e5d62a139 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -71,7 +71,7 @@ class ScreenwaveMediaIE(InfoExtractor): formats = [] for source in sources: if source['type'] == 'hls': - formats.extend(self._extract_m3u8_formats(source['file'], video_id)) + formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4')) else: file_ = source.get('file') if not file_: @@ -107,7 +107,11 @@ class TeamFourIE(InfoExtractor): 'upload_date': '20130401', 'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar', 'title': 'A Moment With TFS Episode 4', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 474ebb49b..990ea0fa8 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -53,17 +53,25 @@ class SenateISVPIE(InfoExtractor): 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { 'id': 'judiciary031715', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Integrated Senate Video Player', 'thumbnail': 're:^https?://.*\.(?:jpg|png)$', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', 'info_dict': { 'id': 'commerce011514', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Integrated Senate Video Player' - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', # checksum differs each time diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index fa338b936..1457e524e 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -17,18 +17,21 @@ class TV2IE(InfoExtractor): _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' _TEST = { 'url': 'http://www.tv2.no/v/916509/', - 'md5': '9cb9e3410b18b515d71892f27856e9b1', 'info_dict': { 'id': '916509', - 'ext': 'flv', - 'title': 'Se Gryttens hyllest av Steven Gerrard', + 'ext': 'mp4', + 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', 'timestamp': 1431715610, 'upload_date': '20150515', 'duration': 156.967, 'view_count': int, 'categories': list, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 86ba70ed9..14e945d49 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -86,10 +86,9 @@ class VGTVIE(XstreamIE): { # streamType: wasLive 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', - 'md5': '458f4841239dab414343b50e5af8869c', 'info_dict': { 'id': '113063', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', @@ -98,6 +97,10 @@ class VGTVIE(XstreamIE): 'upload_date': '20150530', 'view_count': int, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 2ba9f31df..7c6e98026 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import smuggle_url class VidziIE(InfoExtractor): @@ -13,6 +14,11 @@ class VidziIE(InfoExtractor): 'id': 'cghql9yq6emu', 'ext': 'mp4', 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', + 'uploader': 'vidzi.tv', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @@ -20,19 +26,14 @@ class VidziIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_host = self._html_search_regex( - r'id=\'vplayer\'><img src="http://(.*?)/i', webpage, - 'video host') - video_hash = self._html_search_regex( - r'\|([a-z0-9]+)\|hls\|type', webpage, 'video_hash') - ext = self._html_search_regex( - r'\|tracks\|([a-z0-9]+)\|', webpage, 'video ext') - video_url = 'http://' + video_host + '/' + video_hash + '/v.' + ext title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') + # Vidzi now uses jwplayer, which can be handled by GenericIE return { + '_type': 'url_transparent', 'id': video_id, 'title': title, - 'url': video_url, + 'url': smuggle_url(url, {'to_generic': True}), + 'ie_key': 'Generic', } diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 525e303d4..315984bf9 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -45,6 +45,10 @@ class ViideaIE(InfoExtractor): 'upload_date': '20130627', 'duration': 565, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 8bbac54e2..2466410fa 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -34,19 +34,20 @@ class XuiteIE(InfoExtractor): }, }, { # Video with only one format - 'url': 'http://vlog.xuite.net/play/TkRZNjhULTM0NDE2MjkuZmx2', - 'md5': 'c45737fc8ac5dc8ac2f92ecbcecf505e', + 'url': 'http://vlog.xuite.net/play/WUxxR2xCLTI1OTI1MDk5LmZsdg==', + 'md5': '21f7b39c009b5a4615b4463df6eb7a46', 'info_dict': { - 'id': '3441629', + 'id': '25925099', 'ext': 'mp4', - 'title': '孫燕姿 - 眼淚成詩', + 'title': 'BigBuckBunny_320x180', 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 217.399, - 'timestamp': 1299383640, - 'upload_date': '20110306', - 'uploader': 'Valen', - 'uploader_id': '10400126', - 'categories': ['影視娛樂'], + 'duration': 596.458, + 'timestamp': 1454242500, + 'upload_date': '20160131', + 'uploader': 'yan12125', + 'uploader_id': '12158353', + 'categories': ['個人短片'], + 'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4', }, }, { # Video with two formats diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index dd724085a..b29baafc4 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -114,15 +114,13 @@ class YouPornIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - description = self._html_search_regex( - r'(?s)<div[^>]+class=["\']video-description["\'][^>]*>(.+?)</div>', - webpage, 'description', default=None) + description = self._og_search_description(webpage, default=None) thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') uploader = self._html_search_regex( - r'(?s)<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>', + r'(?s)<div[^>]+class=["\']videoInfoBy(?:\s+[^"\']+)?["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d6fef39e9..a7f8c968e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -181,7 +181,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return -class YoutubeEntryListBaseInfoExtractor(InfoExtractor): +class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Extract entries from page with "Load more" button def _entries(self, page, playlist_id): more_widget_html = content_html = page @@ -233,7 +233,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content): + for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -1543,7 +1543,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): +class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? |