diff options
Diffstat (limited to 'youtube_dl/extractor')
29 files changed, 897 insertions, 162 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index df4a7419a..d7e8138be 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -106,6 +106,7 @@ from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE +from .dhm import DHMIE from .dotsub import DotsubIE from .douyutv import DouyuTVIE from .dreisat import DreiSatIE @@ -114,6 +115,7 @@ from .drtuber import DrTuberIE from .drtv import DRTVIE from .dvtv import DVTVIE from .dump import DumpIE +from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .divxstage import DivxStageIE @@ -274,6 +276,7 @@ from .metacritic import MetacriticIE from .mgoon import MgoonIE from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE +from .miomio import MioMioIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE from .mixcloud import MixcloudIE @@ -309,6 +312,8 @@ from .nba import NBAIE from .nbc import ( NBCIE, NBCNewsIE, + NBCSportsIE, + NBCSportsVPlayerIE, ) from .ndr import NDRIE from .ndtv import NDTVIE @@ -421,6 +426,10 @@ from .rutube import ( ) from .rutv import RUTVIE from .sandia import SandiaIE +from .safari import ( + SafariIE, + SafariCourseIE, +) from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE @@ -553,6 +562,7 @@ from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE +from .vessel import VesselIE from .vesti import VestiIE from .vevo import VevoIE from .vgtv import VGTVIE diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 4a88ccd13..0dca29b71 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -6,32 +6,39 @@ from .common import InfoExtractor class BloombergIE(InfoExtractor): - _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<id>.+?)\.html' + _VALID_URL = r'https?://www\.bloomberg\.com/news/videos/[^/]+/(?P<id>[^/?#]+)' _TEST = { - 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', + 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', # The md5 checksum changes 'info_dict': { 'id': 'qurhIVlJSB6hzkVi229d8g', 'ext': 'flv', 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', - 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88', + 'description': 'md5:a8ba0302912d03d246979735c17d2761', }, } def _real_extract(self, url): name = self._match_id(url) webpage = self._download_webpage(url, name) - - f4m_url = self._search_regex( - r'<source src="(https?://[^"]+\.f4m.*?)"', webpage, - 'f4m url') + video_id = self._search_regex(r'"bmmrId":"(.+?)"', webpage, 'id') title = re.sub(': Video$', '', self._og_search_title(webpage)) + embed_info = self._download_json( + 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + formats = [] + for stream in embed_info['streams']: + if stream["muxing_format"] == "TS": + formats.extend(self._extract_m3u8_formats(stream['url'], video_id)) + else: + formats.extend(self._extract_f4m_formats(stream['url'], video_id)) + self._sort_formats(formats) + return { - 'id': name.split('-')[-1], + 'id': video_id, 'title': title, - 'formats': self._extract_f4m_formats(f4m_url, name), + 'formats': formats, 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 90ea07438..0a77e951c 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln|ktvk)(?:-ap)?|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', @@ -45,6 +45,9 @@ class CNNIE(InfoExtractor): 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', } + }, { + 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 4f67c3aac..47d58330b 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -25,8 +25,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): def _build_request(url): """Build a request with the family filter disabled""" request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - request.add_header('Cookie', 'ff=off') + request.add_header('Cookie', 'family_filter=off; ff=off') return request @@ -112,8 +111,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') + embed_request = self._build_request(embed_url) + embed_page = self._download_webpage( + embed_request, video_id, 'Downloading embed page') info = self._search_regex(r'var info = ({.*?}),$', embed_page, 'video info', flags=re.MULTILINE) info = json.loads(info) diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py new file mode 100644 index 000000000..3ed1f1663 --- /dev/null +++ b/youtube_dl/extractor/dhm.py @@ -0,0 +1,73 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + parse_duration, +) + + +class DHMIE(InfoExtractor): + IE_DESC = 'Filmarchiv - Deutsches Historisches Museum' + _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/', + 'md5': '11c475f670209bf6acca0b2b7ef51827', + 'info_dict': { + 'id': 'the-marshallplan-at-work-in-west-germany', + 'ext': 'flv', + 'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE', + 'description': 'md5:1fabd480c153f97b07add61c44407c82', + 'duration': 660, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/', + 'md5': '09890226332476a3e3f6f2cb74734aa5', + 'info_dict': { + 'id': 'rolle-1', + 'ext': 'flv', + 'title': 'ROLLE 1', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + playlist_url = self._search_regex( + r"file\s*:\s*'([^']+)'", webpage, 'playlist url') + + playlist = self._download_xml(playlist_url, video_id) + + track = playlist.find( + './{http://xspf.org/ns/0/}trackList/{http://xspf.org/ns/0/}track') + + video_url = xpath_text( + track, './{http://xspf.org/ns/0/}location', + 'video url', fatal=True) + thumbnail = xpath_text( + track, './{http://xspf.org/ns/0/}image', + 'thumbnail') + + title = self._search_regex( + [r'dc:title="([^"]+)"', r'<title> »([^<]+)</title>'], + webpage, 'title').strip() + description = self._html_search_regex( + r'<p><strong>Description:</strong>(.+?)</p>', + webpage, 'description', default=None) + duration = parse_duration(self._search_regex( + r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)', + webpage, 'duration', default=None)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index d7956e6e4..479430c51 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -1,19 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals +import hashlib +import time from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import (ExtractorError, unescapeHTML) +from ..compat import (compat_str, compat_basestring) class DouyuTVIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { - 'id': 'iseven', + 'id': '17732', + 'display_id': 'iseven', 'ext': 'flv', 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:9e525642c25a0a24302869937cf69d17', + 'description': 'md5:c93d6692dde6fe33809a46edcbecca44', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', 'uploader_id': '431925', @@ -22,22 +26,52 @@ class DouyuTVIE(InfoExtractor): 'params': { 'skip_download': True, } - } + }, { + 'url': 'http://www.douyutv.com/85982', + 'info_dict': { + 'id': '85982', + 'display_id': '85982', + 'ext': 'flv', + 'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:746a2f7a253966a06755a912f0acc0d2', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'douyu小漠', + 'uploader_id': '3769985', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + }] def _real_extract(self, url): video_id = self._match_id(url) + if video_id.isdigit(): + room_id = video_id + else: + page = self._download_webpage(url, video_id) + room_id = self._html_search_regex( + r'"room_id"\s*:\s*(\d+),', page, 'room id') + + prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( + room_id, int(time.time())) + + auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() config = self._download_json( - 'http://www.douyutv.com/api/client/room/%s' % video_id, video_id) + 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), + video_id) data = config['data'] error_code = config.get('error', 0) - show_status = data.get('show_status') if error_code is not 0: - raise ExtractorError( - 'Server reported error %i' % error_code, expected=True) + error_desc = 'Server reported error %i' % error_code + if isinstance(data, (compat_str, compat_basestring)): + error_desc += ': ' + data + raise ExtractorError(error_desc, expected=True) + show_status = data.get('show_status') # 1 = live, 2 = offline if show_status == '2': raise ExtractorError( @@ -46,7 +80,7 @@ class DouyuTVIE(InfoExtractor): base_url = data['rtmp_url'] live_path = data['rtmp_live'] - title = self._live_title(data['room_name']) + title = self._live_title(unescapeHTML(data['room_name'])) description = data.get('show_details') thumbnail = data.get('room_src') @@ -66,7 +100,8 @@ class DouyuTVIE(InfoExtractor): self._sort_formats(formats) return { - 'id': video_id, + 'id': room_id, + 'display_id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py new file mode 100644 index 000000000..e43bc81b2 --- /dev/null +++ b/youtube_dl/extractor/dumpert.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 + +from .common import InfoExtractor +from ..utils import qualities + + +class DumpertIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)' + _TEST = { + 'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/', + 'md5': '1b9318d7d5054e7dcb9dc7654f21d643', + 'info_dict': { + 'id': '6646981/951bc60f', + 'ext': 'mp4', + 'title': 'Ik heb nieuws voor je', + 'description': 'Niet schrikken hoor', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + files_base64 = self._search_regex( + r'data-files="([^"]+)"', webpage, 'data files') + + files = self._parse_json( + base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'), + video_id) + + quality = qualities(['flv', 'mobile', 'tablet', '720p']) + + formats = [{ + 'url': video_url, + 'format_id': format_id, + 'quality': quality(format_id), + } for format_id, video_url in files.items() if format_id != 'still'] + self._sort_formats(formats) + + title = self._html_search_meta( + 'title', webpage) or self._og_search_title(webpage) + description = self._html_search_meta( + 'description', webpage) or self._og_search_description(webpage) + thumbnail = files.get('still') or self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats + } diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 79e2fbd39..0cbca90b0 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -1,11 +1,17 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ExtractorError class EroProfileIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)' - _TEST = { + _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?' + _NETRC_MACHINE = 'eroprofile' + _TESTS = [{ 'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore', 'md5': 'c26f351332edf23e1ea28ce9ec9de32f', 'info_dict': { @@ -16,13 +22,55 @@ class EroProfileIE(InfoExtractor): 'thumbnail': 're:https?://.*\.jpg', 'age_limit': 18, } - } + }, { + 'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file', + 'md5': '1baa9602ede46ce904c431f5418d8916', + 'info_dict': { + 'id': '1133519', + 'ext': 'm4v', + 'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file', + 'thumbnail': 're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'skip': 'Requires login', + }] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + query = compat_urllib_parse.urlencode({ + 'username': username, + 'password': password, + 'url': 'http://www.eroprofile.com/', + }) + login_url = self._LOGIN_URL + query + login_page = self._download_webpage(login_url, None, False) + + m = re.search(r'Your username or password was incorrect\.', login_page) + if m: + raise ExtractorError( + 'Wrong username and/or password.', expected=True) + + self.report_login() + redirect_url = self._search_regex( + r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url') + self._download_webpage(redirect_url, None, False) + + def _real_initialize(self): + self._login() def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + m = re.search(r'You must be logged in to view this video\.', webpage) + if m: + raise ExtractorError( + 'This video requires login. Please specify a username and password and try again.', expected=True) + video_id = self._search_regex( [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], webpage, 'video id', default=None) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8a49b0b54..2ff002643 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,6 +29,7 @@ from ..utils import ( xpath_text, ) from .brightcove import BrightcoveIE +from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .smotri import SmotriIE @@ -620,6 +621,16 @@ class GenericIE(InfoExtractor): 'age_limit': 0, }, }, + # 5min embed + { + 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', + 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', + 'info_dict': { + 'id': '518726732', + 'ext': 'mp4', + 'title': 'Facebook Creates "On This Day" | Crunch Report', + }, + }, # RSS feed with enclosure { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', @@ -629,6 +640,16 @@ class GenericIE(InfoExtractor): 'upload_date': '20150228', 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', } + }, + # NBC Sports vplayer embed + { + 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a', + 'info_dict': { + 'id': 'ln7x1qSThw4k', + 'ext': 'flv', + 'title': "PFT Live: New leader in the 'new-look' defense", + 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', + }, } ] @@ -1236,6 +1257,17 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Pladform') + # Look for 5min embeds + mobj = re.search( + r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) + if mobj is not None: + return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') + + # Look for NBC Sports VPlayer embeds + nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) + if nbc_sports_url: + return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py new file mode 100644 index 000000000..cc3f27194 --- /dev/null +++ b/youtube_dl/extractor/miomio.py @@ -0,0 +1,93 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + int_or_none, +) + + +class MioMioIE(InfoExtractor): + IE_NAME = 'miomio.tv' + _VALID_URL = r'https?://(?:www\.)?miomio\.tv/watch/cc(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.miomio.tv/watch/cc179734/', + 'md5': '48de02137d0739c15b440a224ad364b9', + 'info_dict': { + 'id': '179734', + 'ext': 'flv', + 'title': '手绘动漫鬼泣但丁全程画法', + 'duration': 354, + }, + }, { + 'url': 'http://www.miomio.tv/watch/cc184024/', + 'info_dict': { + 'id': '43729', + 'title': '《动漫同人插画绘制》', + }, + 'playlist_mincount': 86, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + 'description', webpage, 'title', fatal=True) + + mioplayer_path = self._search_regex( + r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') + + xml_config = self._search_regex( + r'flashvars="type=sina&(.+?)&', + webpage, 'xml config') + + # skipping the following page causes lags and eventually connection drop-outs + self._request_webpage( + 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)), + video_id) + + # the following xml contains the actual configuration information on the video file(s) + vid_config = self._download_xml( + 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), + video_id) + + http_headers = { + 'Referer': 'http://www.miomio.tv%s' % mioplayer_path, + } + + entries = [] + for f in vid_config.findall('./durl'): + segment_url = xpath_text(f, 'url', 'video url') + if not segment_url: + continue + order = xpath_text(f, 'order', 'order') + segment_id = video_id + segment_title = title + if order: + segment_id += '-%s' % order + segment_title += ' part %s' % order + entries.append({ + 'id': segment_id, + 'url': segment_url, + 'title': segment_title, + 'duration': int_or_none(xpath_text(f, 'length', 'duration'), 1000), + 'http_headers': http_headers, + }) + + if len(entries) == 1: + segment = entries[0] + segment['id'] = video_id + segment['title'] = title + return segment + + return { + '_type': 'multi_video', + 'id': video_id, + 'entries': entries, + 'title': title, + 'http_headers': http_headers, + } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 21aea0c55..84f291558 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -97,7 +97,7 @@ class MixcloudIE(InfoExtractor): r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) description = self._og_search_description(webpage) like_count = str_to_int(self._search_regex( - r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"', + r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"', webpage, 'like count', fatal=False)) view_count = str_to_int(self._search_regex( [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 3645d3033..ecd0ac8b1 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -14,7 +14,7 @@ from ..utils import ( class NBCIE(InfoExtractor): - _VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' + _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' _TESTS = [ { @@ -50,6 +50,57 @@ class NBCIE(InfoExtractor): return self.url_result(theplatform_url) +class NBCSportsVPlayerIE(InfoExtractor): + _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' + + _TESTS = [{ + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + } + }, { + 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + iframe_m = re.search( + r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage) + if iframe_m: + return iframe_m.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + theplatform_url = self._og_search_video_url(webpage) + return self.url_result(theplatform_url, 'ThePlatform') + + +class NBCSportsIE(InfoExtractor): + # Does not include https becuase its certificate is invalid + _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + + _TEST = { + 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'info_dict': { + 'id': 'PHJSaFWbrTY9', + 'ext': 'flv', + 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', + 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + return self.url_result( + NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') + + class NBCNewsIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P<id>\d+)| diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index a20672c0c..46cebc0d7 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url class PhoenixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', - 'info_dict': { - 'id': '884301', - 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', - } - } + _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ + (?: + phoenix/die_sendungen/(?:[^/]+/)? + )? + (?P<id>[0-9]+)''' + _TESTS = [ + { + 'url': 'http://www.phoenix.de/content/884301', + 'md5': 'ed249f045256150c92e72dbb70eadec6', + 'info_dict': { + 'id': '884301', + 'ext': 'mp4', + 'title': 'Michael Krons mit Hans-Werner Sinn', + 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', + 'upload_date': '20141025', + 'uploader': 'Im Dialog', + } + }, + { + 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', + 'only_matching': True, + }, + { + 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', + 'only_matching': True, + }, + ] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py index 9576aed0e..e766ccca3 100644 --- a/youtube_dl/extractor/playfm.py +++ b/youtube_dl/extractor/playfm.py @@ -4,85 +4,72 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) +from ..compat import compat_str from ..utils import ( ExtractorError, - float_or_none, int_or_none, - str_to_int, + parse_iso8601, ) class PlayFMIE(InfoExtractor): IE_NAME = 'play.fm' - _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P<upload_date>[0-9]{8})(?P<id>[0-9]{6})(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])' _TEST = { - 'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220', + 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12', 'md5': 'c505f8307825a245d0c7ad1850001f22', 'info_dict': { - 'id': '137220', + 'id': '71276', 'ext': 'mp3', - 'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', - 'uploader': 'Sven Tasnadi', - 'uploader_id': 'sventasnadi', - 'duration': 5627.428, - 'upload_date': '20140712', + 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12', + 'description': '', + 'duration': 5627, + 'timestamp': 1406033781, + 'upload_date': '20140722', + 'uploader': 'Dan Drastic', + 'uploader_id': '71170', 'view_count': int, 'comment_count': int, - 'thumbnail': 're:^https?://.*\.jpg$', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - upload_date = mobj.group('upload_date') - - rec_data = compat_urllib_parse.urlencode({'rec_id': video_id}) - req = compat_urllib_request.Request( - 'http://www.play.fm/flexRead/recording', data=rec_data) - req.add_header('Content-Type', 'application/x-www-form-urlencoded') - rec_doc = self._download_xml(req, video_id) + slug = mobj.group('slug') - error_node = rec_doc.find('./error') - if error_node is not None: - raise ExtractorError('An error occured: %s (code %s)' % ( - error_node.text, rec_doc.find('./status').text)) + recordings = self._download_json( + 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id) - recording = rec_doc.find('./recording') - title = recording.find('./title').text - view_count = str_to_int(recording.find('./stats/playcount').text) - comment_count = str_to_int(recording.find('./stats/comments').text) - duration = float_or_none(recording.find('./duration').text, scale=1000) - thumbnail = recording.find('./image').text + error = recordings.get('error') + if isinstance(error, dict): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error.get('message')), + expected=True) - artist = recording.find('./artists/artist') - uploader = artist.find('./name').text - uploader_id = artist.find('./slug').text - - video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % ( - 'http:', recording.find('./url').text, - recording.find('./_class').text, recording.find('./file_id').text, - rec_doc.find('./uuid').text, video_id, - rec_doc.find('./jingle/file_id').text, - 'http%3A%2F%2Fwww.play.fm%2Fplayer', - ) + audio_url = recordings['audio'] + video_id = compat_str(recordings.get('id') or video_id) + title = recordings['title'] + description = recordings.get('description') + duration = int_or_none(recordings.get('recordingDuration')) + timestamp = parse_iso8601(recordings.get('created_at')) + uploader = recordings.get('page', {}).get('title') + uploader_id = compat_str(recordings.get('page', {}).get('id')) + view_count = int_or_none(recordings.get('playCount')) + comment_count = int_or_none(recordings.get('commentCount')) + categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')] return { 'id': video_id, - 'url': video_url, - 'ext': 'mp3', - 'filesize': int_or_none(recording.find('./size').text), + 'url': audio_url, 'title': title, - 'upload_date': upload_date, - 'view_count': view_count, - 'comment_count': comment_count, + 'description': description, 'duration': duration, - 'thumbnail': thumbnail, + 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3a27e3789..0c8b731cf 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,10 +33,8 @@ class PornHubIE(InfoExtractor): } def _extract_count(self, pattern, webpage, name): - count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False) - if count: - count = str_to_int(count) - return count + return str_to_int(self._search_regex( + pattern, webpage, '%s count' % name, fatal=False)) def _real_extract(self, url): video_id = self._match_id(url) @@ -62,11 +60,14 @@ class PornHubIE(InfoExtractor): if thumbnail: thumbnail = compat_urllib_parse.unquote(thumbnail) - view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') - like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') - dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') + view_count = self._extract_count( + r'<span class="count">([\d,\.]+)</span> views', webpage, 'view') + like_count = self._extract_count( + r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like') + dislike_count = self._extract_count( + r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike') comment_count = self._extract_count( - r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment') + r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) if webpage.find('"encrypted":true') != -1: diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 385681d06..7cc799664 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -10,6 +10,7 @@ from ..compat import ( ) from ..utils import ( unified_strdate, + int_or_none, ) @@ -24,7 +25,7 @@ class ProSiebenSat1IE(InfoExtractor): 'info_dict': { 'id': '2104602', 'ext': 'mp4', - 'title': 'Staffel 2, Episode 18 - Jahresrückblick', + 'title': 'Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', 'duration': 5845.04, @@ -266,6 +267,9 @@ class ProSiebenSat1IE(InfoExtractor): urls_sources = urls_sources.values() def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate for source in urls_sources: diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py new file mode 100644 index 000000000..10251f29e --- /dev/null +++ b/youtube_dl/extractor/safari.py @@ -0,0 +1,157 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE + +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, +) +from ..utils import ( + ExtractorError, + smuggle_url, + std_headers, +) + + +class SafariBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' + _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' + _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com' + _NETRC_MACHINE = 'safari' + + _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' + _API_FORMAT = 'json' + + LOGGED_IN = False + + def _real_initialize(self): + # We only need to log in once for courses or individual videos + if not self.LOGGED_IN: + self._login() + SafariBaseIE.LOGGED_IN = True + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + raise ExtractorError( + self._ACCOUNT_CREDENTIALS_HINT, + expected=True) + + headers = std_headers + if 'Referer' not in headers: + headers['Referer'] = self._LOGIN_URL + + login_page = self._download_webpage( + self._LOGIN_URL, None, + 'Downloading login form') + + csrf = self._html_search_regex( + r"name='csrfmiddlewaretoken'\s+value='([^']+)'", + login_page, 'csrf token') + + login_form = { + 'csrfmiddlewaretoken': csrf, + 'email': username, + 'password1': password, + 'login': 'Sign In', + 'next': '', + } + + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form), headers=headers) + login_page = self._download_webpage( + request, None, 'Logging in as %s' % username) + + if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: + raise ExtractorError( + 'Login failed; make sure your credentials are correct and try again.', + expected=True) + + self.to_screen('Login successful') + + +class SafariIE(SafariBaseIE): + IE_NAME = 'safari' + IE_DESC = 'safaribooksonline.com online video' + _VALID_URL = r'''(?x)https?:// + (?:www\.)?safaribooksonline\.com/ + (?: + library/view/[^/]+| + api/v1/book + )/ + (?P<course_id>\d+)/ + (?:chapter(?:-content)?/)? + (?P<part>part\d+)\.html + ''' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', + 'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', + 'info_dict': { + 'id': '2842601850001', + 'ext': 'mp4', + 'title': 'Introduction', + }, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('course_id') + part = mobj.group('part') + + webpage = self._download_webpage( + '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), + part) + + bc_url = BrightcoveIE._extract_brightcove_url(webpage) + if not bc_url: + raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True) + + return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'Brightcove') + + +class SafariCourseIE(SafariBaseIE): + IE_NAME = 'safari:course' + IE_DESC = 'safaribooksonline.com online courses' + + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)' + + _TESTS = [{ + 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', + 'info_dict': { + 'id': '9780133392838', + 'title': 'Hadoop Fundamentals LiveLessons', + }, + 'playlist_count': 22, + 'skip': 'Requires safaribooksonline account credentials', + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json', + 'only_matching': True, + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + + course_json = self._download_json( + '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), + course_id, 'Downloading course JSON') + + if 'chapters' not in course_json: + raise ExtractorError( + 'No chapters found for course %s' % course_id, expected=True) + + entries = [ + self.url_result(chapter, 'Safari') + for chapter in course_json['chapters']] + + course_title = course_json['title'] + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 9f79ff5c1..0b717a1e4 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -30,7 +30,7 @@ class SlideshareIE(InfoExtractor): page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( - r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=', + r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);', webpage, 'slideshare object') info = json.loads(slideshare_obj) if info['slideshow']['type'] != 'video': diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 9d4505972..316b2c90f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -242,7 +242,7 @@ class SoundcloudIE(InfoExtractor): class SoundcloudSetIE(SoundcloudIE): - _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' IE_NAME = 'soundcloud:set' _TESTS = [{ 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', @@ -287,7 +287,7 @@ class SoundcloudSetIE(SoundcloudIE): class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' IE_NAME = 'soundcloud:user' _TESTS = [{ 'url': 'https://soundcloud.com/the-concept-band', diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 7cb06f351..a46a7ecba 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -54,7 +54,7 @@ class TeamcocoIE(InfoExtractor): embed_url, video_id, 'Downloading embed page') player_data = self._parse_json(self._search_regex( - r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id) + r'Y\.Ginger\.Module\.Player(?:;var\s*player\s*=\s*new\s*m)?\((\{.*?\})\);', embed, 'player data'), video_id) data = self._parse_json( base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index feac666f7..0e3e627f4 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -92,7 +92,7 @@ class ThePlatformIE(InfoExtractor): error_msg = next( n.attrib['abstract'] for n in meta.findall(_x('.//smil:ref')) - if n.attrib.get('title') == 'Geographic Restriction') + if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired') except StopIteration: pass else: diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py index 06554a1be..96c809eaf 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/ultimedia.py @@ -42,7 +42,6 @@ class UltimediaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) deliver_url = self._search_regex( @@ -81,8 +80,8 @@ class UltimediaIE(InfoExtractor): title = clean_html(( self._html_search_regex( r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', - webpage, 'title', default=None) - or self._search_regex( + webpage, 'title', default=None) or + self._search_regex( r"var\s+nameVideo\s*=\s*'([^']+)'", deliver_page, 'title'))) diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py index eb49586cc..9369abaf8 100644 --- a/youtube_dl/extractor/varzesh3.py +++ b/youtube_dl/extractor/varzesh3.py @@ -1,48 +1,45 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) -import re class Varzesh3IE(InfoExtractor): - _VALID_URL = r'(?P<url>(https?://(?:www\.)?video\.varzesh3\.com)/(?P<id>.+))' - _TEST ={ + _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' + _TEST = { 'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/', 'md5': '2a933874cb7dce4366075281eb49e855', 'info_dict': { - 'url': 'http://dl1.video.varzesh3.com/video/clip94/1/video/namayeshi/saves_week26.mp4', 'id': '76337', 'ext': 'mp4', 'title': '۵ واکنش برتر دروازهبانان؛هفته ۲۶ بوندسلیگا', - 'thumbnail': 'http://video.varzesh3.com/wp-content/uploads/230315_saves_week26.jpg', 'description': 'فصل ۲۰۱۵-۲۰۱۴', + 'thumbnail': 're:^https?://.*\.jpg$', } } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex( + r'<source[^>]+src="([^"]+)"', webpage, 'video url') - if not 'shortlink' in webpage: - raise ExtractorError('URL has no videos or there is a problem.') + title = self._og_search_title(webpage) + description = self._html_search_regex( + r'(?s)<div class="matn">(.+?)</div>', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) - title = self._html_search_regex(r'meta[^>]+property="og:title"[^>]+content="([^"]+)"', webpage, 'title') - video_link = self._html_search_regex(r'source[^>]+src="([^"]+)"', webpage, 'video_link') - vid_id = self._html_search_regex(r"link[^>]+rel='canonical'[^>]+href='\/\?p=([^']+)'\/>", webpage, 'vid_id') - try: - description = self._html_search_regex(r'<div class="matn">(.*?)</div>', webpage, 'description', flags=re.DOTALL) - except: - description = title - thumbnail = self._html_search_regex(r'link[^>]+rel="image_src"[^>]+href="([^"]+)"', webpage, 'thumbnail') + video_id = self._search_regex( + r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", + webpage, display_id, default=display_id) return { - 'url': video_link, - 'id': vid_id, + 'url': video_url, + 'id': video_id, 'title': title, - 'ext': video_link.split(".")[-1], 'description': description, 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py new file mode 100644 index 000000000..6215f0642 --- /dev/null +++ b/youtube_dl/extractor/vessel.py @@ -0,0 +1,127 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_urllib_request +from ..utils import ( + ExtractorError, + parse_iso8601, +) + + +class VesselIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)' + _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s' + _LOGIN_URL = 'https://www.vessel.com/api/account/login' + _NETRC_MACHINE = 'vessel' + _TEST = { + 'url': 'https://www.vessel.com/videos/HDN7G5UMs', + 'md5': '455cdf8beb71c6dd797fd2f3818d05c4', + 'info_dict': { + 'id': 'HDN7G5UMs', + 'ext': 'mp4', + 'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20150317', + 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?', + 'timestamp': int, + }, + } + + @staticmethod + def make_json_request(url, data): + payload = json.dumps(data).encode('utf-8') + req = compat_urllib_request.Request(url, payload) + req.add_header('Content-Type', 'application/json; charset=utf-8') + return req + + @staticmethod + def find_assets(data, asset_type): + for asset in data.get('assets', []): + if asset.get('type') == asset_type: + yield asset + + def _check_access_rights(self, data): + access_info = data.get('__view', {}) + if not access_info.get('allow_access', True): + err_code = access_info.get('error_code') or '' + if err_code == 'ITEM_PAID_ONLY': + raise ExtractorError( + 'This video requires subscription.', expected=True) + else: + raise ExtractorError( + 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + data = { + 'client_id': 'web', + 'type': 'password', + 'user_key': username, + 'password': password, + } + login_request = VesselIE.make_json_request(self._LOGIN_URL, data) + self._download_webpage(login_request, None, False, 'Wrong login info') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + data = self._parse_json(self._search_regex( + r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id) + asset_id = data['model']['data']['id'] + + req = VesselIE.make_json_request( + self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) + data = self._download_json(req, video_id) + + self._check_access_rights(data) + + try: + video_asset = next(VesselIE.find_assets(data, 'video')) + except StopIteration: + raise ExtractorError('No video assets found') + + formats = [] + for f in video_asset.get('sources', []): + if f['name'] == 'hls-index': + formats.extend(self._extract_m3u8_formats( + f['location'], video_id, ext='mp4', m3u8_id='m3u8')) + else: + formats.append({ + 'format_id': f['name'], + 'tbr': f.get('bitrate'), + 'height': f.get('height'), + 'width': f.get('width'), + 'url': f['location'], + }) + self._sort_formats(formats) + + thumbnails = [] + for im_asset in VesselIE.find_assets(data, 'image'): + thumbnails.append({ + 'url': im_asset['location'], + 'width': im_asset.get('width', 0), + 'height': im_asset.get('height', 0), + }) + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'thumbnails': thumbnails, + 'description': data.get('short_description'), + 'duration': data.get('duration'), + 'comment_count': data.get('comment_count'), + 'like_count': data.get('like_count'), + 'view_count': data.get('view_count'), + 'timestamp': parse_iso8601(data.get('released_at')), + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bd09652cd..28bcc89cd 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -244,6 +244,16 @@ class VimeoIE(VimeoBaseInfoExtractor): # and latter we extract those that are Vimeo specific. self.report_extraction(video_id) + vimeo_config = self._search_regex( + r'vimeo\.config\s*=\s*({.+?});', webpage, + 'vimeo config', default=None) + if vimeo_config: + seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {}) + if seed_status.get('state') == 'failed': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, seed_status['title']), + expected=True) + # Extract the config JSON try: try: diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 4971965f9..81d885fdc 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -69,18 +69,26 @@ class XuiteIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def base64_decode_utf8(data): + return base64.b64decode(data.encode('utf-8')).decode('utf-8') + + @staticmethod + def base64_encode_utf8(data): + return base64.b64encode(data.encode('utf-8')).decode('utf-8') + def _extract_flv_config(self, media_id): - base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8') + base64_media_id = self.base64_encode_utf8(media_id) flv_config = self._download_xml( 'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id, 'flv config') prop_dict = {} for prop in flv_config.findall('./property'): - prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8') + prop_id = self.base64_decode_utf8(prop.attrib['id']) # CDATA may be empty in flv config if not prop.text: continue - encoded_content = base64.b64decode(prop.text).decode('utf-8') + encoded_content = self.base64_decode_utf8(prop.text) prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content) return prop_dict diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 97dbac4cc..b777159c5 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -17,6 +17,8 @@ from ..utils import ( int_or_none, ) +from .nbc import NBCSportsVPlayerIE + class YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' @@ -129,6 +131,15 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', 'only_matching': True, + }, { + 'note': 'NBC Sports embeds', + 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', + 'info_dict': { + 'id': '9CsDKds0kvHI', + 'ext': 'flv', + 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', + 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', + } } ] @@ -151,6 +162,10 @@ class YahooIE(InfoExtractor): items = json.loads(items_json) video_id = items[0]['id'] return self._get_info(video_id, display_id, webpage) + # Look for NBCSports iframes + nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) + if nbc_sports_url: + return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index e4c855ee0..6abe72f73 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -52,7 +52,7 @@ class YouPornIE(InfoExtractor): webpage, 'JSON parameters') try: params = json.loads(json_params) - except: + except ValueError: raise ExtractorError('Invalid JSON') self.report_extraction(video_id) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27c8c4453..5488101e1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1263,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, title) - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - playlist_id = mobj.group(1) or mobj.group(2) - - # Check if it's a video-specific URL - query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - if 'v' in query_dict: - video_id = query_dict['v'][0] - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - - if playlist_id.startswith('RD') or playlist_id.startswith('UL'): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) - + def _extract_playlist(self, playlist_id): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) more_widget_html = content_html = page @@ -1327,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + playlist_id = mobj.group(1) or mobj.group(2) + + # Check if it's a video-specific URL + query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + if 'v' in query_dict: + video_id = query_dict['v'][0] + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + return self.url_result(video_id, 'Youtube', video_id=video_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + + if playlist_id.startswith('RD') or playlist_id.startswith('UL'): + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) + + return self._extract_playlist(playlist_id) + class YoutubeChannelIE(InfoExtractor): IE_DESC = 'YouTube.com channels' @@ -1643,21 +1646,26 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:recommended' IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' -class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): +class YoutubeWatchLaterIE(YoutubePlaylistIE): + IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' - _FEED_NAME = 'watch_later' - _PLAYLIST_TITLE = 'Youtube Watch Later' - _PERSONAL_FEED = True + _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + + _TESTS = [] # override PlaylistIE tests + + def _real_extract(self, url): + return self._extract_playlist('WL') class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_NAME = 'youtube:history' IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' |