about summary refs log tree commit diff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rw-r--r--youtube_dl/extractor/__init__.py1
-rw-r--r--youtube_dl/extractor/generic.py7
-rw-r--r--youtube_dl/extractor/krasview.py6
-rw-r--r--youtube_dl/extractor/mixcloud.py69
-rw-r--r--youtube_dl/extractor/nytimes.py40
-rw-r--r--youtube_dl/extractor/ultimedia.py104
-rw-r--r--youtube_dl/extractor/videomega.py45
-rw-r--r--youtube_dl/extractor/vine.py15
-rw-r--r--youtube_dl/version.py2
9 files changed, 208 insertions, 81 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 73c17aa84..7eb9b4fbb 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -538,6 +538,7 @@ from .udemy import (
UdemyIE,
UdemyCourseIE
)
+from .ultimedia import UltimediaIE
from .unistra import UnistraIE
from .urort import UrortIE
from .ustream import UstreamIE, UstreamChannelIE
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index dc5755d12..8716e4503 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1006,6 +1006,13 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'))
+ # Look for NYTimes player
+ mobj = re.search(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
# Look for Ooyala videos
mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py
index e46954b47..96f95979a 100644
--- a/youtube_dl/extractor/krasview.py
+++ b/youtube_dl/extractor/krasview.py
@@ -40,8 +40,10 @@ class KrasViewIE(InfoExtractor):
description = self._og_search_description(webpage, default=None)
thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)
duration = int_or_none(flashvars.get('duration'))
- width = int_or_none(self._og_search_property('video:width', webpage, 'video width'))
- height = int_or_none(self._og_search_property('video:height', webpage, 'video height'))
+ width = int_or_none(self._og_search_property(
+ 'video:width', webpage, 'video width', default=None))
+ height = int_or_none(self._og_search_property(
+ 'video:height', webpage, 'video height', default=None))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 1831c6749..21aea0c55 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
import re
+import itertools
from .common import InfoExtractor
from ..compat import (
@@ -10,7 +11,6 @@ from ..utils import (
ExtractorError,
HEADRequest,
str_to_int,
- parse_iso8601,
)
@@ -27,8 +27,6 @@ class MixcloudIE(InfoExtractor):
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
'uploader_id': 'dholbach',
- 'upload_date': '20111115',
- 'timestamp': 1321359578,
'thumbnail': 're:https?://.*\.jpg',
'view_count': int,
'like_count': int,
@@ -37,31 +35,30 @@ class MixcloudIE(InfoExtractor):
'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
'info_dict': {
'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
- 'ext': 'm4a',
- 'title': 'Electric Relaxation vol. 3',
+ 'ext': 'mp3',
+ 'title': 'Caribou 7 inch Vinyl Mix & Chat',
'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
- 'uploader': 'Daniel Drumz',
+ 'uploader': 'Gilles Peterson Worldwide',
'uploader_id': 'gillespeterson',
- 'thumbnail': 're:https?://.*\.jpg',
+ 'thumbnail': 're:https?://.*/images/',
'view_count': int,
'like_count': int,
},
}]
- def _get_url(self, track_id, template_url):
- server_count = 30
- for i in range(server_count):
- url = template_url % i
+ def _get_url(self, track_id, template_url, server_number):
+ boundaries = (1, 30)
+ for nr in server_numbers(server_number, boundaries):
+ url = template_url % nr
try:
# We only want to know if the request succeed
# don't download the whole file
self._request_webpage(
HEADRequest(url), track_id,
- 'Checking URL %d/%d ...' % (i + 1, server_count + 1))
+ 'Checking URL %d/%d ...' % (nr, boundaries[-1]))
return url
except ExtractorError:
pass
-
return None
def _real_extract(self, url):
@@ -75,17 +72,18 @@ class MixcloudIE(InfoExtractor):
preview_url = self._search_regex(
r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
song_url = preview_url.replace('/previews/', '/c/originals/')
+ server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))
template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
- final_song_url = self._get_url(track_id, template_url)
+ final_song_url = self._get_url(track_id, template_url, server_number)
if final_song_url is None:
self.to_screen('Trying with m4a extension')
template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
- final_song_url = self._get_url(track_id, template_url)
+ final_song_url = self._get_url(track_id, template_url, server_number)
if final_song_url is None:
raise ExtractorError('Unable to extract track url')
PREFIX = (
- r'<span class="play-button[^"]*?"'
+ r'm-play-on-spacebar[^>]+'
r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
title = self._html_search_regex(
PREFIX + r'm-title="([^"]+)"', webpage, 'title')
@@ -99,16 +97,12 @@ class MixcloudIE(InfoExtractor):
r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
description = self._og_search_description(webpage)
like_count = str_to_int(self._search_regex(
- [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"',
- r'/favorites/?">([0-9]+)<'],
+ r'\bbutton-favorite\b.+m-ajax-toggle-count="([^"]+)"',
webpage, 'like count', fatal=False))
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
r'/listeners/?">([0-9,.]+)</a>'],
webpage, 'play count', fatal=False))
- timestamp = parse_iso8601(self._search_regex(
- r'<time itemprop="dateCreated" datetime="([^"]+)">',
- webpage, 'upload date', default=None))
return {
'id': track_id,
@@ -118,7 +112,38 @@ class MixcloudIE(InfoExtractor):
'thumbnail': thumbnail,
'uploader': uploader,
'uploader_id': uploader_id,
- 'timestamp': timestamp,
'view_count': view_count,
'like_count': like_count,
}
+
+
+def server_numbers(first, boundaries):
+ """ Server numbers to try in descending order of probable availability.
+ Starting from first (i.e. the number of the server hosting the preview file)
+ and going further and further up to the higher boundary and down to the
+ lower one in an alternating fashion. Namely:
+
+ server_numbers(2, (1, 5))
+
+ # Where the preview server is 2, min number is 1 and max is 5.
+ # Yields: 2, 3, 1, 4, 5
+
+ Why not random numbers or increasing sequences? Since from what I've seen,
+ full length files seem to be hosted on servers whose number is closer to
+ that of the preview; to be confirmed.
+ """
+ zip_longest = getattr(itertools, 'zip_longest', None)
+ if zip_longest is None:
+ # python 2.x
+ zip_longest = itertools.izip_longest
+
+ if len(boundaries) != 2:
+ raise ValueError("boundaries should be a two-element tuple")
+ min, max = boundaries
+ highs = range(first + 1, max + 1)
+ lows = range(first - 1, min - 1, -1)
+ rest = filter(
+ None, itertools.chain.from_iterable(zip_longest(highs, lows)))
+ yield first
+ for n in rest:
+ yield n
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
index 56e1cad3b..03f0a4de6 100644
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@@ -1,15 +1,17 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import parse_iso8601
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+)
class NYTimesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
'md5': '18a525a510f942ada2720db5f31644c0',
'info_dict': {
@@ -22,18 +24,21 @@ class NYTimesIE(InfoExtractor):
'uploader': 'Brett Weiner',
'duration': 419,
}
- }
+ }, {
+ 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
video_data = self._download_json(
- 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON')
+ 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id,
+ video_id, 'Downloading video JSON')
title = video_data['headline']
- description = video_data['summary']
- duration = video_data['duration'] / 1000.0
+ description = video_data.get('summary')
+ duration = float_or_none(video_data.get('duration'), 1000)
uploader = video_data['byline']
timestamp = parse_iso8601(video_data['publication_date'][:-8])
@@ -49,11 +54,11 @@ class NYTimesIE(InfoExtractor):
formats = [
{
'url': video['url'],
- 'format_id': video['type'],
- 'vcodec': video['video_codec'],
- 'width': video['width'],
- 'height': video['height'],
- 'filesize': get_file_size(video['fileSize']),
+ 'format_id': video.get('type'),
+ 'vcodec': video.get('video_codec'),
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ 'filesize': get_file_size(video.get('fileSize')),
} for video in video_data['renditions']
]
self._sort_formats(formats)
@@ -61,7 +66,8 @@ class NYTimesIE(InfoExtractor):
thumbnails = [
{
'url': 'http://www.nytimes.com/%s' % image['url'],
- 'resolution': '%dx%d' % (image['width'], image['height']),
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
} for image in video_data['images']
]
diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py
new file mode 100644
index 000000000..06554a1be
--- /dev/null
+++ b/youtube_dl/extractor/ultimedia.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ qualities,
+ unified_strdate,
+ clean_html,
+)
+
+
+class UltimediaIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)'
+ _TESTS = [{
+ # news
+ 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
+ 'md5': '276a0e49de58c7e85d32b057837952a2',
+ 'info_dict': {
+ 'id': 's8uk0r',
+ 'ext': 'mp4',
+ 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées',
+ 'description': 'md5:3e5c8fd65791487333dda5db8aed32af',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'upload_date': '20150317',
+ },
+ }, {
+ # music
+ 'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8',
+ 'md5': '2ea3513813cf230605c7e2ffe7eca61c',
+ 'info_dict': {
+ 'id': 'xvpfp8',
+ 'ext': 'mp4',
+ 'title': "Two - C'est la vie (Clip)",
+ 'description': 'Two',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'upload_date': '20150224',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ deliver_url = self._search_regex(
+ r'<iframe[^>]+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
+ webpage, 'deliver URL')
+
+ deliver_page = self._download_webpage(
+ deliver_url, video_id, 'Downloading iframe page')
+
+ if '>This video is currently not available' in deliver_page:
+ raise ExtractorError(
+ 'Video %s is currently not available' % video_id, expected=True)
+
+ player = self._parse_json(
+ self._search_regex(
+ r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'),
+ video_id)
+
+ quality = qualities(['flash', 'html5'])
+ formats = []
+ for mode in player['modes']:
+ video_url = mode.get('config', {}).get('file')
+ if not video_url:
+ continue
+ if re.match(r'https?://www\.youtube\.com/.+?', video_url):
+ return self.url_result(video_url, 'Youtube')
+ formats.append({
+ 'url': video_url,
+ 'format_id': mode.get('type'),
+ 'quality': quality(mode.get('type')),
+ })
+ self._sort_formats(formats)
+
+ thumbnail = player.get('image')
+
+ title = clean_html((
+ self._html_search_regex(
+ r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>',
+ webpage, 'title', default=None)
+ or self._search_regex(
+ r"var\s+nameVideo\s*=\s*'([^']+)'",
+ deliver_page, 'title')))
+
+ description = clean_html(self._html_search_regex(
+ r'(?s)<span>Description</span>(.+?)</p>', webpage,
+ 'description', fatal=False))
+
+ upload_date = unified_strdate(self._search_regex(
+ r'Ajouté le\s*<span>([^<]+)', webpage,
+ 'upload date', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 273030316..eb309a7cd 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -4,28 +4,21 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
-from ..utils import (
- ExtractorError,
- remove_start,
-)
+from ..compat import compat_urllib_request
class VideoMegaIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?:www\.)?videomega\.tv/
- (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
+ (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
'''
_TEST = {
- 'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ',
+ 'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4',
'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
'info_dict': {
- 'id': 'QR0HCUHI1661IHUCH0RQ',
+ 'id': '4GNA688SU99US886ANG4',
'ext': 'mp4',
- 'title': 'Big Buck Bunny',
+ 'title': 'BigBuckBunny_320x180',
'thumbnail': 're:^https?://.*\.jpg$',
}
}
@@ -33,34 +26,24 @@ class VideoMegaIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
+ iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id
req = compat_urllib_request.Request(iframe_url)
req.add_header('Referer', url)
webpage = self._download_webpage(req, video_id)
- try:
- escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1]
- except IndexError:
- raise ExtractorError('Unable to extract escaped data')
-
- playlist = compat_urllib_parse.unquote(escaped_data)
-
+ title = self._html_search_regex(
+ r'<title>(.*?)</title>', webpage, 'title')
+ title = re.sub(
+ r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title)
thumbnail = self._search_regex(
- r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False)
- video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
- title = remove_start(self._html_search_regex(
- r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ')
-
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- }]
- self._sort_formats(formats)
+ r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+ video_url = self._search_regex(
+ r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')
return {
'id': video_id,
'title': title,
- 'formats': formats,
+ 'url': video_url,
'thumbnail': thumbnail,
'http_headers': {
'Referer': iframe_url,
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index 0b58fe0fe..c3187cfeb 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -33,14 +33,13 @@ class VineIE(InfoExtractor):
r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
formats = [{
- 'url': data['videoLowURL'],
- 'ext': 'mp4',
- 'format_id': 'low',
- }, {
- 'url': data['videoUrl'],
- 'ext': 'mp4',
- 'format_id': 'standard',
- }]
+ 'format_id': '%(format)s-%(rate)s' % f,
+ 'vcodec': f['format'],
+ 'quality': f['rate'],
+ 'url': f['videoUrl'],
+ } for f in data['videoUrls'] if f.get('rate')]
+
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 7ed07c375..51b4260aa 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2015.03.15'
+__version__ = '2015.03.18'