aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py8
-rw-r--r--youtube_dl/extractor/arte.py38
-rw-r--r--youtube_dl/extractor/audiomack.py69
-rw-r--r--youtube_dl/extractor/bild.py39
-rw-r--r--youtube_dl/extractor/cinemassacre.py17
-rw-r--r--youtube_dl/extractor/common.py14
-rw-r--r--youtube_dl/extractor/crunchyroll.py11
-rw-r--r--youtube_dl/extractor/francetv.py4
-rw-r--r--youtube_dl/extractor/funnyordie.py2
-rw-r--r--youtube_dl/extractor/generic.py73
-rw-r--r--youtube_dl/extractor/glide.py40
-rw-r--r--youtube_dl/extractor/hark.py48
-rw-r--r--youtube_dl/extractor/lrt.py2
-rw-r--r--youtube_dl/extractor/mitele.py13
-rw-r--r--youtube_dl/extractor/motherless.py56
-rw-r--r--youtube_dl/extractor/nhl.py34
-rw-r--r--youtube_dl/extractor/pbs.py20
-rw-r--r--youtube_dl/extractor/soundcloud.py15
-rw-r--r--youtube_dl/extractor/sportbox.py4
-rw-r--r--youtube_dl/extractor/telecinco.py19
-rw-r--r--youtube_dl/extractor/tumblr.py35
-rw-r--r--youtube_dl/extractor/viddler.py108
-rw-r--r--youtube_dl/extractor/vidzi.py33
-rw-r--r--youtube_dl/extractor/vrt.py95
-rw-r--r--youtube_dl/extractor/youtube.py5
25 files changed, 601 insertions, 201 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 0dd763006..8e31de93d 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -20,12 +20,14 @@ from .arte import (
ArteTVDDCIE,
ArteTVEmbedIE,
)
+from .audiomack import AudiomackIE
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbccouk import BBCCoUkIE
from .beeg import BeegIE
from .behindkink import BehindKinkIE
+from .bild import BildIE
from .bilibili import BiliBiliIE
from .blinkx import BlinkxIE
from .bliptv import BlipTVIE, BlipTVUserIE
@@ -137,6 +139,7 @@ from .gamestar import GameStarIE
from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
+from .glide import GlideIE
from .globo import GloboIE
from .godtube import GodTubeIE
from .golem import GolemIE
@@ -370,6 +373,7 @@ from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
+from .telecinco import TelecincoIE
from .telemb import TeleMBIE
from .tenplay import TenPlayIE
from .testurl import TestURLIE
@@ -424,6 +428,7 @@ from .videopremium import VideoPremiumIE
from .videott import VideoTtIE
from .videoweed import VideoWeedIE
from .vidme import VidmeIE
+from .vidzi import VidziIE
from .vimeo import (
VimeoIE,
VimeoAlbumIE,
@@ -443,6 +448,7 @@ from .viki import VikiIE
from .vk import VKIE
from .vodlocker import VodlockerIE
from .vporn import VpornIE
+from .vrt import VRTIE
from .vube import VubeIE
from .vuclip import VuClipIE
from .vulture import VultureIE
@@ -492,10 +498,8 @@ from .youtube import (
YoutubeUserIE,
YoutubeWatchLaterIE,
)
-
from .zdf import ZDFIE
-
_ALL_CLASSES = [
klass
for name, klass in globals().items()
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 3a34d1ecc..b9a9440c0 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -10,8 +10,8 @@ from ..utils import (
unified_strdate,
determine_ext,
get_element_by_id,
- compat_str,
get_element_by_attribute,
+ int_or_none,
)
# There are different sources of video in arte.tv, the extraction process
@@ -90,15 +90,24 @@ class ArteTVPlus7IE(InfoExtractor):
if not upload_date_str:
upload_date_str = player_info.get('VDA', '').split(' ')[0]
+ title = player_info['VTI'].strip()
+ subtitle = player_info.get('VSU', '').strip()
+ if subtitle:
+ title += ' - %s' % subtitle
+
info_dict = {
'id': player_info['VID'],
- 'title': player_info['VTI'],
+ 'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
- all_formats = player_info['VSR'].values()
+ all_formats = []
+ for format_id, format_dict in player_info['VSR'].items():
+ fmt = dict(format_dict)
+ fmt['format_id'] = format_id
+ all_formats.append(fmt)
# Some formats use the m3u8 protocol
all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats))
def _match_lang(f):
@@ -149,25 +158,12 @@ class ArteTVPlus7IE(InfoExtractor):
)
formats = sorted(formats, key=sort_key)
def _format(format_info):
- quality = ''
- height = format_info.get('height')
- if height is not None:
- quality = compat_str(height)
- bitrate = format_info.get('bitrate')
- if bitrate is not None:
- quality += '-%d' % bitrate
- if format_info.get('versionCode') is not None:
- format_id = '%s-%s' % (quality, format_info['versionCode'])
- else:
- format_id = quality
- media_type = format_info.get('mediaType')
- if media_type is not None:
- format_id += '-%s' % media_type
info = {
- 'format_id': format_id,
- 'format_note': format_info.get('versionLibelle'),
- 'width': format_info.get('width'),
- 'height': height,
+ 'format_id': format_info['format_id'],
+ 'format_note': '%s, %s' % (format_info.get('versionCode'), format_info.get('versionLibelle')),
+ 'width': int_or_none(format_info.get('width')),
+ 'height': int_or_none(format_info.get('height')),
+ 'tbr': int_or_none(format_info.get('bitrate')),
}
if format_info['mediaType'] == 'rtmp':
info['url'] = format_info['streamer']
diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py
new file mode 100644
index 000000000..57446fddd
--- /dev/null
+++ b/youtube_dl/extractor/audiomack.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .soundcloud import SoundcloudIE
+from ..utils import ExtractorError
+import datetime
+import time
+
+
+class AudiomackIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)'
+ IE_NAME = 'audiomack'
+ _TESTS = [
+ #hosted on audiomack
+ {
+ 'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary',
+ 'info_dict':
+ {
+ 'id' : 'roosh-williams/extraordinary',
+ 'ext': 'mp3',
+ 'title': 'Roosh Williams - Extraordinary'
+ }
+ },
+ #hosted on soundcloud via audiomack
+ {
+ 'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare',
+ 'file': '172419696.mp3',
+ 'info_dict':
+ {
+ 'ext': 'mp3',
+ 'title': 'Young Thug ft Lil Wayne - Take Kare',
+ "upload_date": "20141016",
+ "description": "New track produced by London On Da Track called “Take Kare\"\n\nhttp://instagram.com/theyoungthugworld\nhttps://www.facebook.com/ThuggerThuggerCashMoney\n",
+ "uploader": "Young Thug World"
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ api_response = self._download_json(
+ "http://www.audiomack.com/api/music/url/song/%s?_=%d" % (
+ video_id, time.time()),
+ video_id)
+
+ if "url" not in api_response:
+ raise ExtractorError("Unable to deduce api url of song")
+ realurl = api_response["url"]
+
+ #Audiomack wraps a lot of soundcloud tracks in their branded wrapper
+ # - if so, pass the work off to the soundcloud extractor
+ if SoundcloudIE.suitable(realurl):
+ return {'_type': 'url', 'url': realurl, 'ie_key': 'Soundcloud'}
+
+ webpage = self._download_webpage(url, video_id)
+ artist = self._html_search_regex(
+ r'<span class="artist">(.*?)</span>', webpage, "artist")
+ songtitle = self._html_search_regex(
+ r'<h1 class="profile-title song-title"><span class="artist">.*?</span>(.*?)</h1>',
+ webpage, "title")
+ title = artist + " - " + songtitle
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': realurl,
+ }
diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py
new file mode 100644
index 000000000..0269d1174
--- /dev/null
+++ b/youtube_dl/extractor/bild.py
@@ -0,0 +1,39 @@
+#coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class BildIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
+ IE_DESC = 'Bild.de'
+ _TEST = {
+ 'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
+ 'md5': 'dd495cbd99f2413502a1713a1156ac8a',
+ 'info_dict': {
+ 'id': '38184146',
+ 'ext': 'mp4',
+ 'title': 'BILD hat sie getestet',
+ 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg',
+ 'duration': 196,
+ 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml"
+ doc = self._download_xml(xml_url, video_id)
+
+ duration = int_or_none(doc.attrib.get('duration'), scale=1000)
+
+ return {
+ 'id': video_id,
+ 'title': doc.attrib['ueberschrift'],
+ 'description': doc.attrib.get('text'),
+ 'url': doc.attrib['src'],
+ 'thumbnail': doc.attrib.get('img'),
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index 496271be4..d064a28f9 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -42,7 +42,7 @@ class CinemassacreIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
- mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
+ mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
if not mobj:
raise ExtractorError('Can\'t extract embed url and video id')
playerdata_url = mobj.group('embed_url')
@@ -53,17 +53,22 @@ class CinemassacreIE(InfoExtractor):
video_description = self._html_search_regex(
r'<div class="entry-content">(?P<description>.+?)</div>',
webpage, 'description', flags=re.DOTALL, fatal=False)
+ video_thumbnail = self._og_search_thumbnail(webpage)
playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage')
- video_thumbnail = self._search_regex(
- r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
- sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file')
- videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url')
+ vidurl = self._search_regex(
+ r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/')
+ vidid = self._search_regex(
+ r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid')
+ videoserver = self._html_search_regex(
+ r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver')
+
+ videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
formats = []
- baseurl = sd_url[:sd_url.rfind('/')+1]
+ baseurl = vidurl[:vidurl.rfind('/')+1]
for video in videolist.findall('.//video'):
src = video.get('src')
if not src:
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e8366f7f9..e1bd6bb49 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -89,6 +89,10 @@ class InfoExtractor(object):
format, irrespective of the file format.
-1 for default (order by other properties),
-2 or smaller for less than default.
+ * source_preference Order number for this video source
+ (quality takes higher priority)
+ -1 for default (order by other properties),
+ -2 or smaller for less than default.
* http_referer HTTP Referer header value to set.
* http_method HTTP method to use for the download.
* http_headers A dictionary of additional HTTP headers
@@ -238,7 +242,6 @@ class InfoExtractor(object):
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns a tuple (page content as string, URL handle) """
-
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
@@ -247,6 +250,10 @@ class InfoExtractor(object):
if urlh is False:
assert not fatal
return False
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+ return (content, urlh)
+
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -305,7 +312,7 @@ class InfoExtractor(object):
msg += ' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True)
- return (content, urlh)
+ return content
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the data of the page as a string """
@@ -613,12 +620,13 @@ class InfoExtractor(object):
audio_ext_preference,
f.get('filesize') if f.get('filesize') is not None else -1,
f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
+ f.get('source_preference') if f.get('source_preference') is not None else -1,
f.get('format_id'),
)
formats.sort(key=_formats_key)
def http_scheme(self):
- """ Either "https:" or "https:", depending on the user's preferences """
+ """ Either "http:" or "https:", depending on the user's preferences """
return (
'http:'
if self._downloader.params.get('prefer_insecure', False)
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 9ac86c2be..2dca52660 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -40,6 +40,7 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
'uploader': 'Yomiuri Telecasting Corporation (YTV)',
'upload_date': '20131013',
+ 'url': 're:(?!.*&amp)',
},
'params': {
# rtmp
@@ -238,12 +239,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
- streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format)
- video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url')
- video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path')
+ streamdata = self._download_xml(
+ streamdata_req, video_id,
+ note='Downloading media info for %s' % video_format)
+ video_url = streamdata.find('.//host').text
+ video_play_path = streamdata.find('.//file').text
formats.append({
'url': video_url,
- 'play_path': video_play_path,
+ 'play_path': video_play_path,
'ext': 'flv',
'format': video_format,
'format_id': video_format,
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 0b3374d97..566e20d76 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -46,7 +46,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
f4m_format['preference'] = 1
formats.extend(f4m_formats)
elif video_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(video_url, video_id))
+ formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4'))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
@@ -58,7 +58,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
formats.append({
'url': video_url,
'format_id': format_id,
- 'preference': 2,
+ 'preference': -1,
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index d966e8403..ec6d96ada 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -37,7 +37,7 @@ class FunnyOrDieIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage)
+ links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage)
if not links:
raise ExtractorError('No media links available for %s' % video_id)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 9057a6beb..51dbbc8db 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -380,6 +380,17 @@ class GenericIE(InfoExtractor):
'uploader': 'education-portal.com',
},
},
+ {
+ 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
+ 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
+ 'info_dict': {
+ 'id': 'uxjb0lwrcz',
+ 'ext': 'mp4',
+ 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+ 'duration': 1715.0,
+ 'uploader': 'thoughtworks.wistia.com',
+ },
+ },
]
def report_following_redirect(self, new_url):
@@ -476,7 +487,8 @@ class GenericIE(InfoExtractor):
'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
) % (url, url), expected=True)
else:
- assert ':' in default_search
+ if ':' not in default_search:
+ default_search += ':'
return self.url_result(default_search + url)
url, smuggled_data = unsmuggle_url(url)
@@ -491,14 +503,14 @@ class GenericIE(InfoExtractor):
self.to_screen('%s: Requesting header' % video_id)
head_req = HEADRequest(url)
- response = self._request_webpage(
+ head_response = self._request_webpage(
head_req, video_id,
note=False, errnote='Could not send HEAD request to %s' % url,
fatal=False)
- if response is not False:
+ if head_response is not False:
# Check for redirect
- new_url = response.geturl()
+ new_url = head_response.geturl()
if url != new_url:
self.report_following_redirect(new_url)
if force_videoid:
@@ -506,34 +518,35 @@ class GenericIE(InfoExtractor):
new_url, {'force_videoid': force_videoid})
return self.url_result(new_url)
- # Check for direct link to a video
- content_type = response.headers.get('Content-Type', '')
- m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
- if m:
- upload_date = response.headers.get('Last-Modified')
- if upload_date:
- upload_date = unified_strdate(upload_date)
- return {
- 'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
- 'formats': [{
- 'format_id': m.group('format_id'),
- 'url': url,
- 'vcodec': 'none' if m.group('type') == 'audio' else None
- }],
- 'upload_date': upload_date,
- }
+ full_response = None
+ if head_response is False:
+ full_response = self._request_webpage(url, video_id)
+ head_response = full_response
+
+ # Check for direct link to a video
+ content_type = head_response.headers.get('Content-Type', '')
+ m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+ if m:
+ upload_date = unified_strdate(
+ head_response.headers.get('Last-Modified'))
+ return {
+ 'id': video_id,
+ 'title': os.path.splitext(url_basename(url))[0],
+ 'formats': [{
+ 'format_id': m.group('format_id'),
+ 'url': url,
+ 'vcodec': 'none' if m.group('type') == 'audio' else None
+ }],
+ 'upload_date': upload_date,
+ }
if not self._downloader.params.get('test', False) and not is_intentional:
self._downloader.report_warning('Falling back on generic information extractor.')
- try:
+ if full_response:
+ webpage = _webpage_read_content(url, video_id)
+ else:
webpage = self._download_webpage(url, video_id)
- except ValueError:
- # since this is the last-resort InfoExtractor, if
- # this error is thrown, it'll be thrown here
- raise ExtractorError('Failed to download URL: %s' % url)
-
self.report_extraction(video_id)
# Is it an RSS feed?
@@ -623,7 +636,8 @@ class GenericIE(InfoExtractor):
<iframe[^>]+?src=|
data-video-url=|
<embed[^>]+?src=|
- embedSWF\(?:\s*
+ embedSWF\(?:\s*|
+ new\s+SWFObject\(
)
(["\'])
(?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
@@ -652,7 +666,7 @@ class GenericIE(InfoExtractor):
# Look for embedded Wistia player
match = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
if match:
embed_url = self._proto_relative_url(
unescapeHTML(match.group('url')))
@@ -664,6 +678,7 @@ class GenericIE(InfoExtractor):
'title': video_title,
'id': video_id,
}
+
match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
if match:
return {
diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py
new file mode 100644
index 000000000..9561ed5fb
--- /dev/null
+++ b/youtube_dl/extractor/glide.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GlideIE(InfoExtractor):
+ IE_DESC = 'Glide mobile video messages (glide.me)'
+ _VALID_URL = r'https?://share\.glide\.me/(?P<id>[A-Za-z0-9\-=_+]+)'
+ _TEST = {
+ 'url': 'http://share.glide.me/UZF8zlmuQbe4mr+7dCiQ0w==',
+ 'md5': '4466372687352851af2d131cfaa8a4c7',
+ 'info_dict': {
+ 'id': 'UZF8zlmuQbe4mr+7dCiQ0w==',
+ 'ext': 'mp4',
+ 'title': 'Damon Timm\'s Glide message',
+ 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(
+ r'<title>(.*?)</title>', webpage, 'title')
+ video_url = self.http_scheme() + self._search_regex(
+ r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL')
+ thumbnail_url = self._search_regex(
+ r'<img id="video-thumbnail" src="(.*?)"',
+ webpage, 'thumbnail url', fatal=False)
+ thumbnail = (
+ thumbnail_url if thumbnail_url is None
+ else self.http_scheme() + thumbnail_url)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py
index 5bdd08afa..b6cc15b6f 100644
--- a/youtube_dl/extractor/hark.py
+++ b/youtube_dl/extractor/hark.py
@@ -1,37 +1,33 @@
# -*- coding: utf-8 -*-
-
-import re
-import json
+from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import determine_ext
+
class HarkIE(InfoExtractor):
- _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+'
+ _VALID_URL = r'https?://www\.hark\.com/clips/(?P<id>.+?)-.+'
_TEST = {
- u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
- u'file': u'mmbzyhkgny.mp3',
- u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',
- u'info_dict': {
- u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013",
- u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
- u'duration': 11,
+ 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
+ 'md5': '6783a58491b47b92c7c1af5a77d4cbee',
+ 'info_dict': {
+ 'id': 'mmbzyhkgny',
+ 'ext': 'mp3',
+ 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013',
+ 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
+ 'duration': 11,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
- json_url = "http://www.hark.com/clips/%s.json" %(video_id)
- info_json = self._download_webpage(json_url, video_id)
- info = json.loads(info_json)
- final_url = info['url']
+ video_id = self._match_id(url)
+ data = self._download_json(
+ 'http://www.hark.com/clips/%s.json' % video_id, video_id)
- return {'id': video_id,
- 'url' : final_url,
- 'title': info['name'],
- 'ext': determine_ext(final_url),
- 'description': info['description'],
- 'thumbnail': info['image_original'],
- 'duration': info['duration'],
- }
+ return {
+ 'id': video_id,
+ 'url': data['url'],
+ 'title': data['name'],
+ 'description': data.get('description'),
+ 'thumbnail': data.get('image_original'),
+ 'duration': data.get('duration'),
+ }
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
index fca0bfef0..db5df4078 100644
--- a/youtube_dl/extractor/lrt.py
+++ b/youtube_dl/extractor/lrt.py
@@ -22,7 +22,7 @@ class LRTIE(InfoExtractor):
'id': '54391',
'ext': 'mp4',
'title': 'Septynios Kauno dienos',
- 'description': 'Kauno miesto ir apskrities naujienos',
+ 'description': 'md5:24d84534c7dc76581e59f5689462411a',
'duration': 1783,
},
'params': {
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 979f3d692..6691521e5 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -6,6 +6,7 @@ import json
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
+ compat_urlparse,
get_element_by_attribute,
parse_duration,
strip_jsonp,
@@ -39,13 +40,21 @@ class MiTeleIE(InfoExtractor):
).replace('\'', '"')
embed_data = json.loads(embed_data_json)
- info_url = embed_data['flashvars']['host']
+ domain = embed_data['mediaUrl']
+ if not domain.startswith('http'):
+ # only happens in telecinco.es videos
+ domain = 'http://' + domain
+ info_url = compat_urlparse.urljoin(
+ domain,
+ compat_urllib_parse.unquote(embed_data['flashvars']['host'])
+ )
info_el = self._download_xml(info_url, episode).find('./video/info')
video_link = info_el.find('videoUrl/link').text
token_query = compat_urllib_parse.urlencode({'id': video_link})
token_info = self._download_json(
- 'http://token.mitele.es/?' + token_query, episode,
+ embed_data['flashvars']['ov_tk'] + '?' + token_query,
+ episode,
transform_source=strip_jsonp
)
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 6229b2173..3621ff99e 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -5,20 +5,20 @@ import re
from .common import InfoExtractor
from ..utils import (
- int_or_none,
+ str_to_int,
unified_strdate,
)
class MotherlessIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)'
+ _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
_TESTS = [
{
'url': 'http://motherless.com/AC3FFE1',
- 'md5': '5527fef81d2e529215dad3c2d744a7d9',
+ 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
'info_dict': {
'id': 'AC3FFE1',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Fucked in the ass while playing PS3',
'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
'upload_date': '20100913',
@@ -40,33 +40,51 @@ class MotherlessIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg',
'age_limit': 18,
}
+ },
+ {
+ 'url': 'http://motherless.com/g/cosplay/633979F',
+ 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
+ 'info_dict': {
+ 'id': '633979F',
+ 'ext': 'mp4',
+ 'title': 'Turtlette',
+ 'categories': ['superheroine heroine superher'],
+ 'upload_date': '20140827',
+ 'uploader_id': 'shade0230',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
+ }
}
]
- def _real_extract(self,url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
-
- video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url')
+ title = self._html_search_regex(
+ r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
+ video_url = self._html_search_regex(
+ r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL')
age_limit = self._rta_search(webpage)
-
- view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count')
+ view_count = str_to_int(self._html_search_regex(
+ r'<strong>Views</strong>\s+([^<]+)<',
+ webpage, 'view count', fatal=False))
+ like_count = str_to_int(self._html_search_regex(
+ r'<strong>Favorited</strong>\s+([^<]+)<',
+ webpage, 'like count', fatal=False))
- upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date')
+ upload_date = self._html_search_regex(
+ r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload date')
if 'Ago' in upload_date:
days = int(re.search(r'([0-9]+)', upload_date).group(1))
upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')
else:
upload_date = unified_strdate(upload_date)
- like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count')
-
comment_count = webpage.count('class="media-comment-contents"')
- uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id')
+ uploader_id = self._html_search_regex(
+ r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
+ webpage, 'uploader_id')
categories = self._html_search_meta('keywords', webpage)
if categories:
@@ -79,8 +97,8 @@ class MotherlessIE(InfoExtractor):
'uploader_id': uploader_id,
'thumbnail': self._og_search_thumbnail(webpage),
'categories': categories,
- 'view_count': int_or_none(view_count.replace(',', '')),
- 'like_count': int_or_none(like_count.replace(',', '')),
+ 'view_count': view_count,
+ 'like_count': like_count,
'comment_count': comment_count,
'age_limit': age_limit,
'url': video_url,
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 072d9cf8e..d66c2c6f8 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -22,21 +22,23 @@ class NHLBaseInfoExtractor(InfoExtractor):
self.report_extraction(video_id)
initial_video_url = info['publishPoint']
- data = compat_urllib_parse.urlencode({
- 'type': 'fvod',
- 'path': initial_video_url.replace('.mp4', '_sd.mp4'),
- })
- path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
- path_doc = self._download_xml(
- path_url, video_id, 'Downloading final video url')
- video_url = path_doc.find('path').text
+ if info['formats'] == '1':
+ data = compat_urllib_parse.urlencode({
+ 'type': 'fvod',
+ 'path': initial_video_url.replace('.mp4', '_sd.mp4'),
+ })
+ path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
+ path_doc = self._download_xml(
+ path_url, video_id, 'Downloading final video url')
+ video_url = path_doc.find('path').text
+ else:
+ video_url = initial_video_url
join = compat_urlparse.urljoin
return {
'id': video_id,
'title': info['name'],
'url': video_url,
- 'ext': determine_ext(video_url),
'description': info['description'],
'duration': int(info['duration']),
'thumbnail': join(join(video_url, '/u/'), info['bigImage']),
@@ -46,10 +48,11 @@ class NHLBaseInfoExtractor(InfoExtractor):
class NHLIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com'
- _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9a-z-]+)'
_TESTS = [{
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
+ 'md5': 'db704a4ea09e8d3988c85e36cc892d09',
'info_dict': {
'id': '453614',
'ext': 'mp4',
@@ -59,6 +62,17 @@ class NHLIE(NHLBaseInfoExtractor):
'upload_date': '20131006',
},
}, {
+ 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h',
+ 'md5': 'd22e82bc592f52d37d24b03531ee9696',
+ 'info_dict': {
+ 'id': '2014020024-628-h',
+ 'ext': 'mp4',
+ 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)',
+ 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014',
+ 'duration': 0,
+ 'upload_date': '20141011',
+ },
+ }, {
'url': 'http://video.flames.nhl.com/videocenter/console?id=630616',
'only_matching': True,
}]
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 8f140d626..6118ed5c2 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -80,8 +80,14 @@ class PBSIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
'upload_date': '20140122',
}
+ },
+ {
+ 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
+ 'info_dict': {
+ 'id': 'united-states-of-secrets',
+ },
+ 'playlist_count': 2,
}
-
]
def _extract_webpage(self, url):
@@ -96,6 +102,12 @@ class PBSIE(InfoExtractor):
r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
webpage, 'upload date', default=None))
+ # tabbed frontline videos
+ tabbed_videos = re.findall(
+ r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage)
+ if tabbed_videos:
+ return tabbed_videos, presumptive_id, upload_date
+
MEDIA_ID_REGEXES = [
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
r'class="coveplayerid">([^<]+)<', # coveplayer
@@ -130,6 +142,12 @@ class PBSIE(InfoExtractor):
def _real_extract(self, url):
video_id, display_id, upload_date = self._extract_webpage(url)
+ if isinstance(video_id, list):
+ entries = [self.url_result(
+ 'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id)
+ for vid_id in video_id]
+ return self.playlist_result(entries, display_id)
+
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info = self._download_json(info_url, display_id)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 4719ba45c..c77671fd3 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -40,14 +40,15 @@ class SoundcloudIE(InfoExtractor):
_TESTS = [
{
'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
- 'file': '62986583.mp3',
'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
'info_dict': {
- "upload_date": "20121011",
- "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
- "uploader": "E.T. ExTerrestrial Music",
- "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1",
- "duration": 143,
+ 'id': '62986583',
+ 'ext': 'mp3',
+ 'upload_date': '20121011',
+ 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
+ 'uploader': 'E.T. ExTerrestrial Music',
+ 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
+ 'duration': 143,
}
},
# not streamable song
@@ -103,7 +104,7 @@ class SoundcloudIE(InfoExtractor):
'id': '128590877',
'ext': 'mp3',
'title': 'Bus Brakes',
- 'description': 'md5:0170be75dd395c96025d210d261c784e',
+ 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
'uploader': 'oddsamples',
'upload_date': '20140109',
'duration': 17,
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
index 19cc976e3..b9cd35109 100644
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@@ -26,7 +26,6 @@ class SportBoxIE(InfoExtractor):
'timestamp': 1411896237,
'upload_date': '20140928',
'duration': 4846,
- 'view_count': int,
},
'params': {
# m3u8 download
@@ -65,8 +64,6 @@ class SportBoxIE(InfoExtractor):
r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
duration = parse_duration(self._html_search_regex(
r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
- view_count = int_or_none(self._html_search_regex(
- r'<span>Просмотров: (\d+)</span>', player, 'view count', fatal=False))
return {
'id': video_id,
@@ -76,6 +73,5 @@ class SportBoxIE(InfoExtractor):
'thumbnail': thumbnail,
'timestamp': timestamp,
'duration': duration,
- 'view_count': view_count,
'formats': formats,
}
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
new file mode 100644
index 000000000..db9788c18
--- /dev/null
+++ b/youtube_dl/extractor/telecinco.py
@@ -0,0 +1,19 @@
+#coding: utf-8
+from __future__ import unicode_literals
+
+from .mitele import MiTeleIE
+
+
+class TelecincoIE(MiTeleIE):
+ IE_NAME = 'telecinco.es'
+ _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<episode>.*?)\.html'
+
+ _TEST = {
+ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
+ 'info_dict': {
+ 'id': 'MDSVID20141015_0058',
+ 'ext': 'mp4',
+ 'title': 'Con Martín Berasategui, hacer un bacalao al ...',
+ 'duration': 662,
+ },
+ }
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 306fe8974..40c53ff17 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -4,9 +4,6 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
-)
class TumblrIE(InfoExtractor):
@@ -18,7 +15,7 @@ class TumblrIE(InfoExtractor):
'id': '54196191430',
'ext': 'mp4',
'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...',
- 'description': 'md5:dfac39636969fe6bf1caa2d50405f069',
+ 'description': 'md5:37db8211e40b50c7c44e95da14f630b7',
'thumbnail': 're:http://.*\.jpg',
}
}, {
@@ -27,7 +24,7 @@ class TumblrIE(InfoExtractor):
'info_dict': {
'id': '90208453769',
'ext': 'mp4',
- 'title': '5SOS STRUM ;)',
+ 'title': '5SOS STRUM ;]',
'description': 'md5:dba62ac8639482759c8eb10ce474586a',
'thumbnail': 're:http://.*\.jpg',
}
@@ -41,18 +38,12 @@ class TumblrIE(InfoExtractor):
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
webpage = self._download_webpage(url, video_id)
- re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
- video = re.search(re_video, webpage)
- if video is None:
- raise ExtractorError('Unable to extract video')
- video_url = video.group('video_url')
- ext = video.group('ext')
-
- video_thumbnail = self._search_regex(
- r'posters.*?\[\\x22(.*?)\\x22',
- webpage, 'thumbnail', fatal=False) # We pick the first poster
- if video_thumbnail:
- video_thumbnail = video_thumbnail.replace('\\\\/', '/')
+ iframe_url = self._search_regex(
+ r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
+ webpage, 'iframe url')
+ iframe = self._download_webpage(iframe_url, video_id)
+ video_url = self._search_regex(r'<source src="([^"]+)"',
+ iframe, 'video url')
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
@@ -62,9 +53,9 @@ class TumblrIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
- 'title': video_title,
- 'description': self._html_search_meta('description', webpage),
- 'thumbnail': video_thumbnail,
- 'ext': ext,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': video_title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
}
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py
index 9328ef4a2..0faa729c6 100644
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@@ -1,55 +1,85 @@
-import json
-import re
+from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ int_or_none,
+)
class ViddlerIE(InfoExtractor):
- _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
_TEST = {
- u"url": u"http://www.viddler.com/v/43903784",
- u'file': u'43903784.mp4',
- u'md5': u'fbbaedf7813e514eb7ca30410f439ac9',
- u'info_dict': {
- u"title": u"Video Made Easy",
- u"uploader": u"viddler",
- u"duration": 100.89,
+ "url": "http://www.viddler.com/v/43903784",
+ 'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4',
+ 'info_dict': {
+ 'id': '43903784',
+ 'ext': 'mp4',
+ "title": "Video Made Easy",
+ 'description': 'You don\'t need to be a professional to make high-quality video content. Viddler provides some quick and easy tips on how to produce great video content with limited resources. ',
+ "uploader": "viddler",
+ 'timestamp': 1335371429,
+ 'upload_date': '20120425',
+ "duration": 100.89,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'view_count': int,
+ 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'],
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- embed_url = mobj.group('domain') + u'/embed/' + video_id
- webpage = self._download_webpage(embed_url, video_id)
-
- video_sources_code = self._search_regex(
- r"(?ms)sources\s*:\s*(\{.*?\})", webpage, u'video URLs')
- video_sources = json.loads(video_sources_code.replace("'", '"'))
-
- formats = [{
- 'url': video_url,
- 'format': format_id,
- } for video_url, format_id in video_sources.items()]
-
- title = self._html_search_regex(
- r"title\s*:\s*'([^']*)'", webpage, u'title')
- uploader = self._html_search_regex(
- r"authorName\s*:\s*'([^']*)'", webpage, u'uploader', fatal=False)
- duration_s = self._html_search_regex(
- r"duration\s*:\s*([0-9.]*)", webpage, u'duration', fatal=False)
- duration = float(duration_s) if duration_s else None
- thumbnail = self._html_search_regex(
- r"thumbnail\s*:\s*'([^']*)'",
- webpage, u'thumbnail', fatal=False)
+ video_id = self._match_id(url)
+
+ json_url = (
+ 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' %
+ video_id)
+ data = self._download_json(json_url, video_id)['video']
+
+ formats = []
+ for filed in data['files']:
+ if filed.get('status', 'ready') != 'ready':
+ continue
+ f = {
+ 'format_id': filed['profile_id'],
+ 'format_note': filed['profile_name'],
+ 'url': self._proto_relative_url(filed['url']),
+ 'width': int_or_none(filed.get('width')),
+ 'height': int_or_none(filed.get('height')),
+ 'filesize': int_or_none(filed.get('size')),
+ 'ext': filed.get('ext'),
+ 'source_preference': -1,
+ }
+ formats.append(f)
+
+ if filed.get('cdn_url'):
+ f = f.copy()
+ f['url'] = self._proto_relative_url(filed['cdn_url'])
+ f['format_id'] = filed['profile_id'] + '-cdn'
+ f['source_preference'] = 1
+ formats.append(f)
+
+ if filed.get('html5_video_source'):
+ f = f.copy()
+ f['url'] = self._proto_relative_url(
+ filed['html5_video_source'])
+ f['format_id'] = filed['profile_id'] + '-html5'
+ f['source_preference'] = 0
+ formats.append(f)
+ self._sort_formats(formats)
+
+ categories = [
+ t.get('text') for t in data.get('tags', []) if 'text' in t]
return {
'_type': 'video',
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'duration': duration,
+ 'title': data['title'],
'formats': formats,
+ 'description': data.get('description'),
+ 'timestamp': int_or_none(data.get('upload_time')),
+ 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')),
+ 'uploader': data.get('author'),
+ 'duration': float_or_none(data.get('length')),
+ 'view_count': int_or_none(data.get('view_count')),
+ 'categories': categories,
}
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
new file mode 100644
index 000000000..669979e13
--- /dev/null
+++ b/youtube_dl/extractor/vidzi.py
@@ -0,0 +1,33 @@
+#coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class VidziIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
+ _TEST = {
+ 'url': 'http://vidzi.tv/cghql9yq6emu.html',
+ 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
+ 'info_dict': {
+ 'id': 'cghql9yq6emu',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ video_url = self._html_search_regex(
+ r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url')
+ title = self._html_search_regex(
+ r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ }
+ \ No newline at end of file
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
new file mode 100644
index 000000000..57ef8dc30
--- /dev/null
+++ b/youtube_dl/extractor/vrt.py
@@ -0,0 +1,95 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class VRTIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:deredactie|sporza|cobra)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*'
+ _TESTS = [
+ # deredactie.be
+ {
+ 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL',
+ 'md5': '4cebde1eb60a53782d4f3992cbd46ec8',
+ 'info_dict': {
+ 'id': '2129880',
+ 'ext': 'flv',
+ 'title': 'Het journaal L - 25/10/14',
+ 'description': None,
+ 'timestamp': 1414271750.949,
+ 'upload_date': '20141025',
+ 'duration': 929,
+ }
+ },
+ # sporza.be
+ {
+ 'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time',
+ 'md5': '11f53088da9bf8e7cfc42456697953ff',
+ 'info_dict': {
+ 'id': '2124639',
+ 'ext': 'flv',
+ 'title': 'Bekijk Extra Time van 20 oktober',
+ 'description': 'md5:83ac5415a4f1816c6a93f8138aef2426',
+ 'timestamp': 1413835980.560,
+ 'upload_date': '20141020',
+ 'duration': 3238,
+ }
+ },
+ # cobra.be
+ {
+ 'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari',
+ 'md5': '78a2b060a5083c4f055449a72477409d',
+ 'info_dict': {
+ 'id': '2126050',
+ 'ext': 'flv',
+ 'title': 'Bret Easton Ellis in Café Corsari',
+ 'description': 'md5:f699986e823f32fd6036c1855a724ee9',
+ 'timestamp': 1413967500.494,
+ 'upload_date': '20141022',
+ 'duration': 661,
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_id = self._search_regex(
+ r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False)
+
+ formats = []
+ mobj = re.search(
+ r'data-video-iphone-server="(?P<server>[^"]+)"\s+data-video-iphone-path="(?P<path>[^"]+)"',
+ webpage)
+ if mobj:
+ formats.extend(self._extract_m3u8_formats(
+ '%s/%s' % (mobj.group('server'), mobj.group('path')),
+ video_id, 'mp4'))
+ mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage)
+ if mobj:
+ formats.extend(self._extract_f4m_formats(
+ '%s/manifest.f4m' % mobj.group('src'), video_id))
+ self._sort_formats(formats)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage, default=None)
+ thumbnail = self._og_search_thumbnail(webpage)
+ timestamp = float_or_none(self._search_regex(
+ r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000)
+ duration = float_or_none(self._search_regex(
+ r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index cfae2de89..4ab56e0ac 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -191,8 +191,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _real_initialize(self):
if self._downloader is None:
return
- if not self._set_language():
- return
+ if self._get_login_info()[0] is not None:
+ if not self._set_language():
+ return
if not self._login():
return
self._confirm_age()