Diffstat (limited to 'youtube_dl')
-rw-r--r--   youtube_dl/extractor/__init__.py     1
-rw-r--r--   youtube_dl/extractor/generic.py      7
-rw-r--r--   youtube_dl/extractor/krasview.py     6
-rw-r--r--   youtube_dl/extractor/mixcloud.py    69
-rw-r--r--   youtube_dl/extractor/nytimes.py     40
-rw-r--r--   youtube_dl/extractor/ultimedia.py  104
-rw-r--r--   youtube_dl/extractor/videomega.py   45
-rw-r--r--   youtube_dl/extractor/vine.py        15
-rw-r--r--   youtube_dl/version.py                2
9 files changed, 208 insertions, 81 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 73c17aa84..7eb9b4fbb 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -538,6 +538,7 @@ from .udemy import (
     UdemyIE,
     UdemyCourseIE
 )
+from .ultimedia import UltimediaIE
 from .unistra import UnistraIE
 from .urort import UrortIE
 from .ustream import UstreamIE, UstreamChannelIE
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index dc5755d12..8716e4503 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1006,6 +1006,13 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'))
 
+        # Look for NYTimes player
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
         # Look for Ooyala videos
         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py
index e46954b47..96f95979a 100644
--- a/youtube_dl/extractor/krasview.py
+++ b/youtube_dl/extractor/krasview.py
@@ -40,8 +40,10 @@ class KrasViewIE(InfoExtractor):
         description = self._og_search_description(webpage, default=None)
         thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)
         duration = int_or_none(flashvars.get('duration'))
-        width = int_or_none(self._og_search_property('video:width', webpage, 'video width'))
-        height = int_or_none(self._og_search_property('video:height', webpage, 'video height'))
+        width = int_or_none(self._og_search_property(
+            'video:width', webpage, 'video width', default=None))
+        height = int_or_none(self._og_search_property(
+            'video:height', webpage, 'video height', default=None))
 
         return {
             'id': video_id,
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 1831c6749..21aea0c55 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 
 import re
+import itertools
 
 from .common import InfoExtractor
 from ..compat import (
@@ -10,7 +11,6 @@ from ..utils import (
     ExtractorError,
     HEADRequest,
     str_to_int,
-    parse_iso8601,
 )
 
 
@@ -27,8 +27,6 @@ class MixcloudIE(InfoExtractor):
             'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
             'uploader': 'Daniel Holbach',
             'uploader_id': 'dholbach',
-            'upload_date': '20111115',
-            'timestamp': 1321359578,
             'thumbnail': 're:https?://.*\.jpg',
             'view_count': int,
             'like_count': int,
@@ -37,31 +35,30 @@
         'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
         'info_dict': {
             'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
-            'ext': 'm4a',
-            'title': 'Electric Relaxation vol. 3',
+            'ext': 'mp3',
+            'title': 'Caribou 7 inch Vinyl Mix & Chat',
             'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
-            'uploader': 'Daniel Drumz',
+            'uploader': 'Gilles Peterson Worldwide',
             'uploader_id': 'gillespeterson',
-            'thumbnail': 're:https?://.*\.jpg',
+            'thumbnail': 're:https?://.*/images/',
             'view_count': int,
             'like_count': int,
         },
     }]
 
-    def _get_url(self, track_id, template_url):
-        server_count = 30
-        for i in range(server_count):
-            url = template_url % i
+    def _get_url(self, track_id, template_url, server_number):
+        boundaries = (1, 30)
+        for nr in server_numbers(server_number, boundaries):
+            url = template_url % nr
             try:
                 # We only want to know if the request succeed
                 # don't download the whole file
                 self._request_webpage(
                     HEADRequest(url), track_id,
-                    'Checking URL %d/%d ...' % (i + 1, server_count + 1))
+                    'Checking URL %d/%d ...' % (nr, boundaries[-1]))
                 return url
             except ExtractorError:
                 pass
-
         return None
 
     def _real_extract(self, url):
@@ -75,17 +72,18 @@ class MixcloudIE(InfoExtractor):
         preview_url = self._search_regex(
             r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
         song_url = preview_url.replace('/previews/', '/c/originals/')
+        server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))
         template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
-        final_song_url = self._get_url(track_id, template_url)
+        final_song_url = self._get_url(track_id, template_url, server_number)
         if final_song_url is None:
             self.to_screen('Trying with m4a extension')
             template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
-            final_song_url = self._get_url(track_id, template_url)
+            final_song_url = self._get_url(track_id, template_url, server_number)
             if final_song_url is None:
                 raise ExtractorError('Unable to extract track url')
 
         PREFIX = (
-            r'<span class="play-button[^"]*?"'
+            r'm-play-on-spacebar[^>]+'
             r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
         title = self._html_search_regex(
             PREFIX + r'm-title="([^"]+)"', webpage, 'title')
@@ -99,16 +97,12 @@ class MixcloudIE(InfoExtractor):
             r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
         description = self._og_search_description(webpage)
         like_count = str_to_int(self._search_regex(
-            [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"',
-             r'/favorites/?">([0-9]+)<'],
+            r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"',
            webpage, 'like count', fatal=False))
         view_count = str_to_int(self._search_regex(
             [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
              r'/listeners/?">([0-9,.]+)</a>'],
             webpage, 'play count', fatal=False))
-        timestamp = parse_iso8601(self._search_regex(
-            r'<time itemprop="dateCreated" datetime="([^"]+)">',
-            webpage, 'upload date', default=None))
 
         return {
             'id': track_id,
@@ -118,7 +112,38 @@ class MixcloudIE(InfoExtractor):
             'thumbnail': thumbnail,
             'uploader': uploader,
             'uploader_id': uploader_id,
-            'timestamp': timestamp,
             'view_count': view_count,
             'like_count': like_count,
         }
+
+
+def server_numbers(first, boundaries):
+    """ Server numbers to try in descending order of probable availability.
+    Starting from first (i.e. the number of the server hosting the preview file)
+    and going further and further up to the higher boundary and down to the
+    lower one in an alternating fashion. Namely:
+
+        server_numbers(2, (1, 5))
+
+        # Where the preview server is 2, min number is 1 and max is 5.
+        # Yields: 2, 3, 1, 4, 5
+
+    Why not random numbers or increasing sequences? Since from what I've seen,
+    full length files seem to be hosted on servers whose number is closer to
+    that of the preview; to be confirmed.
+    """
+    zip_longest = getattr(itertools, 'zip_longest', None)
+    if zip_longest is None:
+        # python 2.x
+        zip_longest = itertools.izip_longest
+
+    if len(boundaries) != 2:
+        raise ValueError("boundaries should be a two-element tuple")
+    min, max = boundaries
+    highs = range(first + 1, max + 1)
+    lows = range(first - 1, min - 1, -1)
+    rest = filter(
+        None, itertools.chain.from_iterable(zip_longest(highs, lows)))
+    yield first
+    for n in rest:
+        yield n
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
index 56e1cad3b..03f0a4de6 100644
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@@ -1,15 +1,17 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..utils import parse_iso8601
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+)
 
 
 class NYTimesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
         'md5': '18a525a510f942ada2720db5f31644c0',
         'info_dict': {
@@ -22,18 +24,21 @@ class NYTimesIE(InfoExtractor):
             'uploader': 'Brett Weiner',
             'duration': 419,
         }
-    }
+    }, {
+        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         video_data = self._download_json(
-            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON')
+            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id,
+            video_id, 'Downloading video JSON')
 
         title = video_data['headline']
-        description = video_data['summary']
-        duration = video_data['duration'] / 1000.0
+        description = video_data.get('summary')
+        duration = float_or_none(video_data.get('duration'), 1000)
 
         uploader = video_data['byline']
         timestamp = parse_iso8601(video_data['publication_date'][:-8])
@@ -49,11 +54,11 @@ class NYTimesIE(InfoExtractor):
         formats = [
             {
                 'url': video['url'],
-                'format_id': video['type'],
-                'vcodec': video['video_codec'],
-                'width': video['width'],
-                'height': video['height'],
-                'filesize': get_file_size(video['fileSize']),
+                'format_id': video.get('type'),
+                'vcodec': video.get('video_codec'),
+                'width': int_or_none(video.get('width')),
+                'height': int_or_none(video.get('height')),
+                'filesize': get_file_size(video.get('fileSize')),
             } for video in video_data['renditions']
         ]
         self._sort_formats(formats)
@@ -61,7 +66,8 @@ class NYTimesIE(InfoExtractor):
         thumbnails = [
             {
                 'url': 'http://www.nytimes.com/%s' % image['url'],
-                'resolution': '%dx%d' % (image['width'], image['height']),
+                'width': int_or_none(image.get('width')),
+                'height': int_or_none(image.get('height')),
             } for image in video_data['images']
         ]
 
diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py
new file mode 100644
index 000000000..06554a1be
--- /dev/null
+++ b/youtube_dl/extractor/ultimedia.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    qualities,
+    unified_strdate,
+    clean_html,
+)
+
+
+class UltimediaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)'
+    _TESTS = [{
+        # news
+        'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
+        'md5': '276a0e49de58c7e85d32b057837952a2',
+        'info_dict': {
+            'id': 's8uk0r',
+            'ext': 'mp4',
+            'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées',
+            'description': 'md5:3e5c8fd65791487333dda5db8aed32af',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'upload_date': '20150317',
+        },
+    }, {
+        # music
+        'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8',
+        'md5': '2ea3513813cf230605c7e2ffe7eca61c',
+        'info_dict': {
+            'id': 'xvpfp8',
+            'ext': 'mp4',
+            'title': "Two - C'est la vie (Clip)",
+            'description': 'Two',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'upload_date': '20150224',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        deliver_url = self._search_regex(
+            r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
+            webpage, 'deliver URL')
+
+        deliver_page = self._download_webpage(
+            deliver_url, video_id, 'Downloading iframe page')
+
+        if '>This video is currently not available' in deliver_page:
+            raise ExtractorError(
+                'Video %s is currently not available' % video_id, expected=True)
+
+        player = self._parse_json(
+            self._search_regex(
+                r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'),
+            video_id)
+
+        quality = qualities(['flash', 'html5'])
+        formats = []
+        for mode in player['modes']:
+            video_url = mode.get('config', {}).get('file')
+            if not video_url:
+                continue
+            if re.match(r'https?://www\.youtube\.com/.+?', video_url):
+                return self.url_result(video_url, 'Youtube')
+            formats.append({
+                'url': video_url,
+                'format_id': mode.get('type'),
+                'quality': quality(mode.get('type')),
+            })
+        self._sort_formats(formats)
+
+        thumbnail = player.get('image')
+
+        title = clean_html((
+            self._html_search_regex(
+                r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>',
+                webpage, 'title', default=None)
+            or self._search_regex(
+                r"var\s+nameVideo\s*=\s*'([^']+)'",
+                deliver_page, 'title')))
+
+        description = clean_html(self._html_search_regex(
+            r'(?s)<span>Description</span>(.+?)</p>', webpage,
+            'description', fatal=False))
+
+        upload_date = unified_strdate(self._search_regex(
+            r'Ajouté le\s*<span>([^<]+)', webpage,
+            'upload date', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 273030316..eb309a7cd 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -4,28 +4,21 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urllib_request,
-)
-from ..utils import (
-    ExtractorError,
-    remove_start,
-)
+from ..compat import compat_urllib_request
 
 
 class VideoMegaIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?:www\.)?videomega\.tv/
-        (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
+        (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
         '''
     _TEST = {
-        'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ',
+        'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4',
         'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
         'info_dict': {
-            'id': 'QR0HCUHI1661IHUCH0RQ',
+            'id': '4GNA688SU99US886ANG4',
             'ext': 'mp4',
-            'title': 'Big Buck Bunny',
+            'title': 'BigBuckBunny_320x180',
             'thumbnail': 're:^https?://.*\.jpg$',
         }
     }
@@ -33,34 +26,24 @@
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
+        iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id
         req = compat_urllib_request.Request(iframe_url)
         req.add_header('Referer', url)
         webpage = self._download_webpage(req, video_id)
 
-        try:
-            escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1]
-        except IndexError:
-            raise ExtractorError('Unable to extract escaped data')
-
-        playlist = compat_urllib_parse.unquote(escaped_data)
-
+        title = self._html_search_regex(
+            r'<title>(.*?)</title>', webpage, 'title')
+        title = re.sub(
+            r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title)
         thumbnail = self._search_regex(
-            r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False)
-        video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
-        title = remove_start(self._html_search_regex(
-            r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ')
-
-        formats = [{
-            'format_id': 'sd',
-            'url': video_url,
-        }]
-        self._sort_formats(formats)
+            r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+        video_url = self._search_regex(
+            r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')
 
         return {
             'id': video_id,
             'title': title,
-            'formats': formats,
+            'url': video_url,
             'thumbnail': thumbnail,
             'http_headers': {
                 'Referer': iframe_url,
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index 0b58fe0fe..c3187cfeb 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -33,14 +33,13 @@ class VineIE(InfoExtractor):
             r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
 
         formats = [{
-            'url': data['videoLowURL'],
-            'ext': 'mp4',
-            'format_id': 'low',
-        }, {
-            'url': data['videoUrl'],
-            'ext': 'mp4',
-            'format_id': 'standard',
-        }]
+            'format_id': '%(format)s-%(rate)s' % f,
+            'vcodec': f['format'],
+            'quality': f['rate'],
+            'url': f['videoUrl'],
+        } for f in data['videoUrls'] if f.get('rate')]
+
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 7ed07c375..51b4260aa 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.03.15'
+__version__ = '2015.03.18'
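
For readers skimming the mixcloud.py change above: the probing order produced by the new server_numbers() helper is easiest to see by running it. The sketch below is illustrative only and is not part of the commit; it restates the helper in Python 3 form (the committed version also covers Python 2 via itertools.izip_longest) and prints the order in which candidate stream servers would be checked.

# Illustrative sketch, not part of the commit above: mirrors the
# server_numbers() helper added to mixcloud.py (Python 3 only).
import itertools


def server_numbers(first, boundaries):
    # Start at the server hosting the preview, then alternate outwards
    # towards the upper and lower boundaries of the server range.
    low, high = boundaries
    highs = range(first + 1, high + 1)
    lows = range(first - 1, low - 1, -1)
    rest = filter(
        None, itertools.chain.from_iterable(itertools.zip_longest(highs, lows)))
    yield first
    for n in rest:
        yield n


print(list(server_numbers(2, (1, 5))))    # [2, 3, 1, 4, 5]
print(list(server_numbers(17, (1, 30))))  # [17, 18, 16, 19, 15, 20, 14, ...]

The alternating order reflects the observation recorded in the helper's docstring: full-length files tend to sit on a server numbered close to the one serving the preview, so near neighbours are tried before the far ends of the 1-30 range.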