aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py4
-rw-r--r--youtube_dl/extractor/arte.py17
-rw-r--r--youtube_dl/extractor/bambuser.py80
-rw-r--r--youtube_dl/extractor/brightcove.py4
-rw-r--r--youtube_dl/extractor/common.py5
-rw-r--r--youtube_dl/extractor/dailymotion.py23
-rw-r--r--youtube_dl/extractor/exfm.py2
-rw-r--r--youtube_dl/extractor/extremetube.py50
-rw-r--r--youtube_dl/extractor/keezmovies.py2
-rw-r--r--youtube_dl/extractor/livestream.py10
-rw-r--r--youtube_dl/extractor/metacafe.py51
-rw-r--r--youtube_dl/extractor/mtv.py2
-rw-r--r--youtube_dl/extractor/myspace.py48
-rw-r--r--youtube_dl/extractor/vevo.py94
-rw-r--r--youtube_dl/extractor/vimeo.py15
-rw-r--r--youtube_dl/extractor/vk.py45
-rw-r--r--youtube_dl/extractor/youtube.py28
17 files changed, 401 insertions, 79 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 7efd097e4..8dad38a00 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -9,6 +9,7 @@ from .arte import (
ArteTVFutureIE,
)
from .auengine import AUEngineIE
+from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
@@ -39,6 +40,7 @@ from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .escapist import EscapistIE
from .exfm import ExfmIE
+from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .fktv import (
@@ -83,6 +85,7 @@ from .mit import TechTVMITIE, MITIE
from .mixcloud import MixcloudIE
from .mtv import MTVIE
from .muzu import MuzuTVIE
+from .myspace import MySpaceIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .naver import NaverIE
@@ -141,6 +144,7 @@ from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
from .vimeo import VimeoIE, VimeoChannelIE
from .vine import VineIE
+from .vk import VKIE
from .wat import WatIE
from .websurg import WeBSurgIE
from .weibo import WeiboIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index d39b48951..e10c74c11 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -158,7 +158,9 @@ class ArteTVPlus7IE(InfoExtractor):
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
- formats = player_info['VSR'].values()
+ all_formats = player_info['VSR'].values()
+ # Some formats use the m3u8 protocol
+ all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats))
def _match_lang(f):
if f.get('versionCode') is None:
return True
@@ -170,11 +172,16 @@ class ArteTVPlus7IE(InfoExtractor):
regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
return any(re.match(r, f['versionCode']) for r in regexes)
# Some formats may not be in the same language as the url
- formats = filter(_match_lang, formats)
- # Some formats use the m3u8 protocol
- formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats)
- # We order the formats by quality
+ formats = filter(_match_lang, all_formats)
formats = list(formats) # in python3 filter returns an iterator
+ if not formats:
+ # Some videos are only available in the 'Originalversion'
+ # they aren't tagged as being in French or German
+ if all(f['versionCode'] == 'VO' for f in all_formats):
+ formats = all_formats
+ else:
+ raise ExtractorError(u'The formats list is empty')
+ # We order the formats by quality
if re.match(r'[A-Z]Q', formats[0]['quality']) is not None:
sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
else:
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py
new file mode 100644
index 000000000..f3b36f473
--- /dev/null
+++ b/youtube_dl/extractor/bambuser.py
@@ -0,0 +1,80 @@
+import re
+import json
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_request,
+)
+
+
+class BambuserIE(InfoExtractor):
+    """Information extractor for single broadcasts at bambuser.com/v/<id>.
+
+    The numeric id is taken from the URL and looked up through the
+    getVideo.json endpoint; the 'result' object of the JSON response
+    supplies the fields of the returned info dictionary.
+    """
+
+    IE_NAME = u'bambuser'
+    _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
+    # Sent as the api_key query parameter of the getVideo.json request
+    _API_KEY = '005f64509e19a868399060af746a00aa'
+
+    _TEST = {
+        u'url': u'http://bambuser.com/v/4050584',
+        u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
+        u'info_dict': {
+            u'id': u'4050584',
+            u'ext': u'flv',
+            u'title': u'Education engineering days - lightning talks',
+            u'duration': 3741,
+            u'uploader': u'pixelversity',
+            u'uploader_id': u'344706',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dictionary for the broadcast behind *url*."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+        api_url = 'http://player-c.api.bambuser.com/getVideo.json?&api_key=%s&vid=%s' % (
+            self._API_KEY, video_id)
+        api_response = json.loads(self._download_webpage(api_url, video_id))
+        broadcast = api_response['result']
+
+        return {
+            'id': video_id,
+            'title': broadcast['title'],
+            'url': broadcast['url'],
+            # The preview image is optional, every other field is required
+            'thumbnail': broadcast.get('preview'),
+            'duration': int(broadcast['length']),
+            'view_count': int(broadcast['views_total']),
+            'uploader': broadcast['username'],
+            'uploader_id': broadcast['uid'],
+        }
+
+
+class BambuserChannelIE(InfoExtractor):
+    """Information extractor for bambuser.com channel pages.
+
+    Pages through the xhr-api broadcast listing of a user, requesting
+    _STEP entries at a time, and returns a playlist with one
+    self.url_result(..., 'Bambuser') entry per listed broadcast.
+    """
+
+    IE_NAME = u'bambuser:channel'
+    # Host dots are escaped and https is accepted, like BambuserIE._VALID_URL
+    _VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
+    # The maximum number we can get with each request
+    _STEP = 50
+
+    def _real_extract(self, url):
+        """Return a playlist dictionary with all broadcasts of the channel."""
+        mobj = re.match(self._VALID_URL, url)
+        user = mobj.group('user')
+        urls = []
+        # Empty for the first request; afterwards the 'vid' of the last
+        # entry received, sent as vid_older_than to fetch the next page
+        last_id = ''
+        for i in itertools.count(1):
+            req_url = ('http://bambuser.com/xhr-api/index.php?username={user}'
+                '&sort=created&access_mode=0%2C1%2C2&limit={count}'
+                '&method=broadcast&format=json&vid_older_than={last}'
+                ).format(user=user, count=self._STEP, last=last_id)
+            req = compat_urllib_request.Request(req_url)
+            # Without setting this header, we wouldn't get any result
+            req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
+            info_json = self._download_webpage(req, user,
+                u'Downloading page %d' % i)
+            results = json.loads(info_json)['result']
+            if not results:
+                # An empty page means the listing is exhausted
+                break
+            last_id = results[-1]['vid']
+            urls.extend(self.url_result(v['page'], 'Bambuser') for v in results)
+
+        return {
+            '_type': 'playlist',
+            'title': user,
+            'entries': urls,
+        }
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 1392f382a..0d9b87a34 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -23,7 +23,7 @@ class BrightcoveIE(InfoExtractor):
# From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
u'file': u'2371591881001.mp4',
- u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+ u'md5': u'8eccab865181d29ec2958f32a6a754f5',
u'note': u'Test Brightcove downloads and detection in GenericIE',
u'info_dict': {
u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
@@ -122,12 +122,10 @@ class BrightcoveIE(InfoExtractor):
best_format = renditions[-1]
info.update({
'url': best_format['defaultURL'],
- 'ext': 'mp4',
})
elif video_info.get('FLVFullLengthURL') is not None:
info.update({
'url': video_info['FLVFullLengthURL'],
- 'ext': 'flv',
})
else:
raise ExtractorError(u'Unable to extract video url for %s' % info['id'])
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index ce349fe20..e0ccba533 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -63,7 +63,7 @@ class InfoExtractor(object):
* ext Will be calculated from url if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
- Calculated from the format_id, width, height
+ Calculated from the format_id, width, height,
and format_note fields if missing.
* format_id A short description of the format
("mp4_h264_opus" or "19")
@@ -71,6 +71,9 @@ class InfoExtractor(object):
("3D" or "DASH video")
* width Width of the video, if known
* height Height of the video, if known
+ webpage_url: The URL of the video webpage; passing it to youtube-dl
+ should yield the same result again. (It will be set by
+ YoutubeDL if it's missing.)
Unless mentioned otherwise, the fields should be Unicode strings.
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 4c0488245..355b4ed0a 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -21,6 +21,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
"""Build a request with the family filter disabled"""
request = compat_urllib_request.Request(url)
request.add_header('Cookie', 'family_filter=off')
+ request.add_header('Cookie', 'ff=off')
return request
class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
@@ -61,6 +62,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
},
u'skip': u'VEVO is only available in some countries',
},
+ # age-restricted video
+ {
+ u'url': u'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+ u'file': u'xyh2zz.mp4',
+ u'md5': u'0d667a7b9cebecc3c89ee93099c4159d',
+ u'info_dict': {
+ u'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+ u'uploader': 'HotWaves1012',
+ u'age_limit': 18,
+ }
+
+ }
]
def _real_extract(self, url):
@@ -90,7 +103,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
# Looking for official user
r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
- webpage, 'video uploader')
+ webpage, 'video uploader', fatal=False)
+ age_limit = self._rta_search(webpage)
video_upload_date = None
mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
@@ -132,15 +146,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
self._list_available_subtitles(video_id)
return
- return [{
+ return {
'id': video_id,
'formats': formats,
'uploader': video_uploader,
'upload_date': video_upload_date,
'title': self._og_search_title(webpage),
'subtitles': video_subtitles,
- 'thumbnail': info['thumbnail_url']
- }]
+ 'thumbnail': info['thumbnail_url'],
+ 'age_limit': age_limit,
+ }
def _get_available_subtitles(self, video_id):
try:
diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py
index c74556579..a51d79b08 100644
--- a/youtube_dl/extractor/exfm.py
+++ b/youtube_dl/extractor/exfm.py
@@ -21,6 +21,7 @@ class ExfmIE(InfoExtractor):
u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
},
u'note': u'Soundcloud song',
+ u'skip': u'The site is down too often',
},
{
u'url': u'http://ex.fm/song/wddt8',
@@ -30,6 +31,7 @@ class ExfmIE(InfoExtractor):
u'title': u'Safe and Sound',
u'uploader': u'Capital Cities',
},
+ u'skip': u'The site is down too often',
},
]
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
new file mode 100644
index 000000000..0f1eec40f
--- /dev/null
+++ b/youtube_dl/extractor/extremetube.py
@@ -0,0 +1,50 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse_urlparse,
+ compat_urllib_request,
+ compat_urllib_parse,
+)
+
+class ExtremeTubeIE(InfoExtractor):
+    """Information extractor for extremetube.com video pages.
+
+    The page is requested with an age_verified cookie; title, uploader
+    and the media URL are scraped from the returned HTML.
+    """
+
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+    _TEST = {
+        u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+        u'file': u'652431.mp4',
+        u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0',
+        u'info_dict': {
+            u"title": u"Music Video 14 british euro brit european cumshots swallow",
+            u"uploader": u"unknown",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dictionary for the video page at *url*."""
+        url_match = re.match(self._VALID_URL, url)
+        video_id = url_match.group('videoid')
+        # Rebuild the address in a canonical http://www. form
+        page_url = 'http://www.' + url_match.group('url')
+
+        page_request = compat_urllib_request.Request(page_url)
+        page_request.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(page_request, video_id)
+
+        video_title = self._html_search_regex(
+            r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title')
+        uploader = self._html_search_regex(
+            r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader',
+            fatal=False)
+        quoted_video_url = self._html_search_regex(
+            r'video_url=(.+?)&amp;', webpage, u'video_url')
+        video_url = compat_urllib_parse.unquote(quoted_video_url)
+
+        media_path = compat_urllib_parse_urlparse(video_url).path
+        # splitext keeps the leading dot, drop it
+        extension = os.path.splitext(media_path)[1][1:]
+        # The format label is built from the first two '_'-separated
+        # tokens of the sixth '/'-separated component of the media path
+        label_tokens = media_path.split('/')[5].split('_')[:2]
+        format_label = "-".join(label_tokens)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'uploader': uploader,
+            'url': video_url,
+            'ext': extension,
+            'format': format_label,
+            'format_id': format_label,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
index 5e05900da..786924445 100644
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -12,7 +12,7 @@ from ..aes import (
)
class KeezMoviesIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))'
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
_TEST = {
u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
u'file': u'1214711.mp4',
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index d04da98c8..4531fd6ab 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -40,13 +40,9 @@ class LivestreamIE(InfoExtractor):
if video_id is None:
# This is an event page:
- player = get_meta_content('twitter:player', webpage)
- if player is None:
- raise ExtractorError('Couldn\'t extract event api url')
- api_url = player.replace('/player', '')
- api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url)
- info = json.loads(self._download_webpage(api_url, event_name,
- u'Downloading event info'))
+ config_json = self._search_regex(r'window.config = ({.*?});',
+ webpage, u'window config')
+ info = json.loads(config_json)['event']
videos = [self._extract_video_info(video_data['data'])
for video_data in info['feed']['data'] if video_data['type'] == u'video']
return self.playlist_result(videos, info['id'], info['full_name'])
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index 234b9e80f..91480ba87 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -20,7 +20,9 @@ class MetacafeIE(InfoExtractor):
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = u'metacafe'
- _TESTS = [{
+ _TESTS = [
+ # Youtube video
+ {
u"add_ie": ["Youtube"],
u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
u"file": u"_aUehQsCQtM.mp4",
@@ -32,15 +34,42 @@ class MetacafeIE(InfoExtractor):
u"uploader_id": u"PBS"
}
},
+ # Normal metacafe video
+ {
+ u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
+ u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad',
+ u'info_dict': {
+ u'id': u'11121940',
+ u'ext': u'mp4',
+ u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4',
+ u'uploader': u'ign',
+ u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
+ },
+ },
+ # AnyClip video
{
u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
u"file": u"an-dVVXnuY7Jh77J.mp4",
u"info_dict": {
u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
u"uploader": u"anyclip",
- u"description": u"md5:38c711dd98f5bb87acf973d573442e67"
- }
- }]
+ u"description": u"md5:38c711dd98f5bb87acf973d573442e67",
+ },
+ },
+ # age-restricted video
+ {
+ u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
+ u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09',
+ u'info_dict': {
+ u'id': u'5186653',
+ u'ext': u'mp4',
+ u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
+ u'uploader': u'Dwayne Pipe',
+ u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b',
+ u'age_limit': 18,
+ },
+ },
+ ]
def report_disclaimer(self):
@@ -62,6 +91,7 @@ class MetacafeIE(InfoExtractor):
'submit': "Continue - I'm over 18",
}
request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
try:
self.report_age_confirmation()
compat_urllib_request.urlopen(request).read()
@@ -83,7 +113,12 @@ class MetacafeIE(InfoExtractor):
# Retrieve video webpage to extract further information
req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
- req.headers['Cookie'] = 'flashVersion=0;'
+
+ # AnyClip videos require the flashversion cookie so that we get the link
+ # to the mp4 file
+ mobj_an = re.match(r'^an-(.*?)$', video_id)
+ if mobj_an:
+ req.headers['Cookie'] = 'flashVersion=0;'
webpage = self._download_webpage(req, video_id)
# Extract URL, uploader and title from webpage
@@ -125,6 +160,11 @@ class MetacafeIE(InfoExtractor):
r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
webpage, u'uploader nickname', fatal=False)
+ if re.search(r'"contentRating":"restricted"', webpage) is not None:
+ age_limit = 18
+ else:
+ age_limit = 0
+
return {
'_type': 'video',
'id': video_id,
@@ -134,4 +174,5 @@ class MetacafeIE(InfoExtractor):
'upload_date': None,
'title': video_title,
'ext': video_ext,
+ 'age_limit': age_limit,
}
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index e520e2bb4..e96d3952c 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -80,6 +80,8 @@ class MTVIE(InfoExtractor):
video_id = self._id_from_uri(uri)
self.report_extraction(video_id)
mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
+ # Remove the templates, like &device={device}
+ mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url)
if 'acceptMethods' not in mediagen_url:
mediagen_url += '&acceptMethods=fms'
mediagen_page = self._download_webpage(mediagen_url, video_id,
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py
new file mode 100644
index 000000000..050f54a5a
--- /dev/null
+++ b/youtube_dl/extractor/myspace.py
@@ -0,0 +1,48 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_str,
+)
+
+
+class MySpaceIE(InfoExtractor):
+    """Information extractor for myspace.com video pages.
+
+    The metadata comes from the JSON object assigned to ``context`` in
+    the page source; its 'video' entry carries the stream location and
+    the descriptive fields.
+    """
+
+    _VALID_URL = r'https?://myspace\.com/([^/]+)/video/[^/]+/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'https://myspace.com/coldplay/video/viva-la-vida/100008689',
+        u'info_dict': {
+            u'id': u'100008689',
+            u'ext': u'flv',
+            u'title': u'Viva La Vida',
+            u'description': u'The official Viva La Vida video, directed by Hype Williams',
+            u'uploader': u'Coldplay',
+            u'uploader_id': u'coldplay',
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dictionary for the video page at *url*."""
+        page_id = re.match(self._VALID_URL, url).group('id')
+        webpage = self._download_webpage(url, page_id)
+        context_json = self._search_regex(
+            r'context = ({.*?});', webpage, u'context')
+        video = json.loads(context_json)['video']
+        # streamUrl holds two parts separated by the first ';': the part
+        # used as download URL and the part used as play path
+        stream_parts = video['streamUrl'].split(';', 1)
+        rtmp_url, play_path = stream_parts
+
+        return {
+            'id': compat_str(video['mediaId']),
+            'title': video['title'],
+            'url': rtmp_url,
+            'play_path': play_path,
+            'ext': 'flv',
+            'description': video['description'],
+            'thumbnail': video['imageUrl'],
+            'uploader': video['artistName'],
+            'uploader_id': video['artistUsername'],
+        }
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index 1c1cc418d..3f6020f74 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -5,7 +5,7 @@ import datetime
from .common import InfoExtractor
from ..utils import (
- determine_ext,
+ compat_HTTPError,
ExtractorError,
)
@@ -16,26 +16,22 @@ class VevoIE(InfoExtractor):
(currently used by MTVIE)
"""
_VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)'
- _TEST = {
+ _TESTS = [{
u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
u'file': u'GB1101300280.mp4',
+ u"md5": u"06bea460acb744eab74a9d7dcb4bfd61",
u'info_dict': {
u"upload_date": u"20130624",
u"uploader": u"Hurts",
u"title": u"Somebody to Die For",
- u'duration': 230,
+ u"duration": 230,
+ u"width": 1920,
+ u"height": 1080,
}
- }
+ }]
+ _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
- info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
-
- self.report_extraction(video_id)
- video_info = json.loads(info_json)['video']
+ def _formats_from_json(self, video_info):
last_version = {'version': -1}
for version in video_info['videoVersions']:
# These are the HTTP downloads, other types are for different manifests
@@ -50,17 +46,74 @@ class VevoIE(InfoExtractor):
# Already sorted from worst to best quality
for rend in renditions.findall('rendition'):
attr = rend.attrib
- f_url = attr['url']
+ format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr
formats.append({
- 'url': f_url,
- 'ext': determine_ext(f_url),
+ 'url': attr['url'],
+ 'format_id': attr['name'],
+ 'format_note': format_note,
'height': int(attr['frameheight']),
'width': int(attr['frameWidth']),
})
+ return formats
+
+    def _formats_from_smil(self, smil_xml):
+        """Build format dictionaries from the video elements of a SMIL document.
+
+        smil_xml is the document as text. Every SMIL <video> element whose
+        'src' attribute matches the expected file name layout yields one
+        format; elements that do not match are skipped.
+        """
+        video_tag = './/{http://www.w3.org/2001/SMIL20/Language}video'
+        smil_root = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
+        formats = []
+        for video_el in smil_root.findall(video_tag):
+            match = re.match(r'''(?xi)
+                (?P<ext>[a-z0-9]+):
+                (?P<path>
+                    [/a-z0-9]+     # The directory and main part of the URL
+                    _(?P<cbr>[0-9]+)k
+                    _(?P<width>[0-9]+)x(?P<height>[0-9]+)
+                    _(?P<vcodec>[a-z0-9]+)
+                    _(?P<vbr>[0-9]+)
+                    _(?P<acodec>[a-z0-9]+)
+                    _(?P<abr>[0-9]+)
+                    \.[a-z0-9]+  # File extension
+                )''', video_el.attrib['src'])
+            if match is None:
+                continue
+
+            fields = match.groupdict()
+            formats.append({
+                'url': self._SMIL_BASE_URL + fields['path'],
+                'format_id': u'SMIL_' + fields['cbr'],
+                'format_note': '%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' % fields,
+                'ext': fields['ext'],
+                'width': int(fields['width']),
+                'height': int(fields['height']),
+            })
+        return formats
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
+ info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
+ video_info = json.loads(info_json)['video']
+
+ formats = self._formats_from_json(video_info)
+ try:
+ smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
+ self._SMIL_BASE_URL, video_id, video_id.lower())
+ smil_xml = self._download_webpage(smil_url, video_id,
+ u'Downloading SMIL info')
+ formats.extend(self._formats_from_smil(smil_xml))
+ except ExtractorError as ee:
+ if not isinstance(ee.cause, compat_HTTPError):
+ raise
+ self._downloader.report_warning(
+ u'Cannot download SMIL information, falling back to JSON ..')
+
+ timestamp_ms = int(self._search_regex(
+ r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))
+ upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
info = {
'id': video_id,
'title': video_info['title'],
@@ -71,7 +124,4 @@ class VevoIE(InfoExtractor):
'duration': video_info['duration'],
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
-
return info
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index b4dbcd2ee..62273fd33 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -20,14 +20,14 @@ class VimeoIE(InfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
- _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
+ _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
_NETRC_MACHINE = 'vimeo'
IE_NAME = u'vimeo'
_TESTS = [
{
u'url': u'http://vimeo.com/56015672#at=0',
u'file': u'56015672.mp4',
- u'md5': u'ae7a1d8b183758a0506b0622f37dfa14',
+ u'md5': u'8879b6cc097e987f02484baf890129e5',
u'info_dict': {
u"upload_date": u"20121220",
u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
@@ -128,11 +128,9 @@ class VimeoIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
- if not mobj.group('proto'):
- url = 'https://' + url
- elif mobj.group('pro'):
+ if mobj.group('pro') or mobj.group('player'):
url = 'http://player.vimeo.com/video/' + video_id
- elif mobj.group('direct_link'):
+ else:
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
@@ -234,7 +232,7 @@ class VimeoIE(InfoExtractor):
if len(formats) == 0:
raise ExtractorError(u'No known codec found')
- return [{
+ return {
'id': video_id,
'uploader': video_uploader,
'uploader_id': video_uploader_id,
@@ -243,7 +241,8 @@ class VimeoIE(InfoExtractor):
'thumbnail': video_thumbnail,
'description': video_description,
'formats': formats,
- }]
+ 'webpage_url': url,
+ }
class VimeoChannelIE(InfoExtractor):
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
new file mode 100644
index 000000000..90d8a6d07
--- /dev/null
+++ b/youtube_dl/extractor/vk.py
@@ -0,0 +1,45 @@
+# encoding: utf-8
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_str,
+ unescapeHTML,
+)
+
+
+class VKIE(InfoExtractor):
+    """Information extractor for vk.com video pages.
+
+    The al_video.php page for the video is downloaded; if it embeds a
+    YouTube player the extraction is handed over to the 'Youtube'
+    extractor, otherwise the ``var vars`` JSON object of the page
+    supplies the metadata.
+    """
+
+    IE_NAME = u'vk.com'
+    _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
+
+    _TEST = {
+        u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
+        u'md5': u'0deae91935c54e00003c2a00646315f0',
+        u'info_dict': {
+            u'id': u'162222515',
+            u'ext': u'flv',
+            u'title': u'ProtivoGunz - Хуёвая песня',
+            u'uploader': u'Noize MC',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dictionary (or a url result) for *url*."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+        info_page = self._download_webpage(
+            'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id,
+            video_id)
+
+        youtube_match = re.search(
+            r'src="(http://www.youtube.com/.*?)"', info_page)
+        if youtube_match is not None:
+            self.to_screen(u'Youtube video detected')
+            return self.url_result(youtube_match.group(1), 'Youtube')
+
+        player_vars = json.loads(self._search_regex(
+            r'var vars = ({.*?});', info_page, u'vars'))
+
+        return {
+            'id': compat_str(player_vars['vid']),
+            'url': player_vars['url240'],
+            'title': unescapeHTML(player_vars['md_title']),
+            'thumbnail': player_vars['jpg'],
+            'uploader': player_vars['md_author'],
+        }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index d05d0a8c1..6ddd6ef06 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -74,14 +74,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
return False
- galx = None
- dsh = None
- match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
- if match:
- galx = match.group(1)
- match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
- if match:
- dsh = match.group(1)
+ galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
+ login_page, u'Login GALX parameter')
# Log in
login_form_strs = {
@@ -95,7 +89,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
u'checkConnection': u'',
u'checkedDomains': u'youtube',
u'dnConn': u'',
- u'dsh': dsh,
u'pstMsg': u'0',
u'rmShown': u'1',
u'secTok': u'',
@@ -347,18 +340,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
}
},
{
- u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
- u"file": u"1ltcDfZMA3U.mp4",
- u"note": u"Test VEVO video (#897)",
- u"info_dict": {
- u"upload_date": u"20070518",
- u"title": u"Maps - It Will Find You",
- u"description": u"Music video by Maps performing It Will Find You.",
- u"uploader": u"MuteUSA",
- u"uploader_id": u"MuteUSA"
- }
- },
- {
u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
u"file": u"UxxajLWwzqY.mp4",
u"note": u"Test generic use_cipher_signature video (#897)",
@@ -1118,7 +1099,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'lang': lang,
'v': video_id,
'fmt': self._downloader.params.get('subtitlesformat'),
- 'name': l[0],
+ 'name': l[0].encode('utf-8'),
})
url = u'http://www.youtube.com/api/timedtext?' + params
sub_lang_list[lang] = url
@@ -1504,7 +1485,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'subtitles': video_subtitles,
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
- 'annotations': video_annotations
+ 'annotations': video_annotations,
+ 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
})
return results