about summary refs log tree commit diff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- youtube_dl/extractor/__init__.py | 1
-rw-r--r-- youtube_dl/extractor/brightcove.py | 62
-rw-r--r-- youtube_dl/extractor/dotsub.py | 29
-rw-r--r-- youtube_dl/extractor/gamespot.py | 20
-rw-r--r-- youtube_dl/extractor/generic.py | 41
-rw-r--r-- youtube_dl/extractor/steam.py | 6
-rw-r--r-- youtube_dl/extractor/veoh.py | 47
-rw-r--r-- youtube_dl/extractor/youtube.py | 60
8 files changed, 220 insertions, 46 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index d2a71a6f1..4b67f333b 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -59,6 +59,7 @@ from .tumblr import TumblrIE
from .tutv import TutvIE
from .ustream import UstreamIE
from .vbox7 import Vbox7IE
+from .veoh import VeohIE
from .vevo import VevoIE
from .vimeo import VimeoIE
from .vine import VineIE
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index f85acbb5d..68ee5292b 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -1,28 +1,80 @@
import re
import json
+import xml.etree.ElementTree
from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+)
class BrightcoveIE(InfoExtractor):
- _VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'
+ _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+ _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
+ _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'
+
+ # There is a test for Brightcove in GenericIE; that way we test both the download
+ # and the detection of videos, and we don't have to find a URL that is always valid
+
+ @classmethod
+ def _build_brighcove_url(cls, object_str):
+ """
+ Build a Brightcove URL from an XML string containing
+ <object class="BrightcoveExperience">{params}</object>
+ """
+ object_doc = xml.etree.ElementTree.fromstring(object_str)
+ assert u'BrightcoveExperience' in object_doc.attrib['class']
+ params = {'flashID': object_doc.attrib['id'],
+ 'playerID': object_doc.find('./param[@name="playerID"]').attrib['value'],
+ }
+ playerKey = object_doc.find('./param[@name="playerKey"]')
+ # Not all pages define this value
+ if playerKey is not None:
+ params['playerKey'] = playerKey.attrib['value']
+ videoPlayer = object_doc.find('./param[@name="@videoPlayer"]')
+ if videoPlayer is not None:
+ params['@videoPlayer'] = videoPlayer.attrib['value']
+ data = compat_urllib_parse.urlencode(params)
+ return cls._FEDERATED_URL_TEMPLATE % data
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
query = mobj.group('query')
- video_id = mobj.group('id')
- request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
+ m_video_id = re.search(r'videoPlayer=(\d+)', query)
+ if m_video_id is not None:
+ video_id = m_video_id.group(1)
+ return self._get_video_info(video_id, query)
+ else:
+ player_key = self._search_regex(r'playerKey=(.+?)(&|$)', query, 'playlist_id')
+ return self._get_playlist_info(player_key)
+
+ def _get_video_info(self, video_id, query):
+ request_url = self._FEDERATED_URL_TEMPLATE % query
webpage = self._download_webpage(request_url, video_id)
self.report_extraction(video_id)
info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
info = json.loads(info)['data']
video_info = info['programmedContent']['videoPlayer']['mediaDTO']
+
+ return self._extract_video_info(video_info)
+
+ def _get_playlist_info(self, player_key):
+ playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
+ player_key, u'Downloading playlist information')
+
+ playlist_info = json.loads(playlist_info)['videoList']
+ videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
+
+ return self.playlist_result(videos, playlist_id=playlist_info['id'],
+ playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
+
+ def _extract_video_info(self, video_info):
renditions = video_info['renditions']
renditions = sorted(renditions, key=lambda r: r['size'])
best_format = renditions[-1]
-
- return {'id': video_id,
+
+ return {'id': video_info['id'],
'title': video_info['displayName'],
'url': best_format['defaultURL'],
'ext': 'mp4',
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
index 2afeaba07..0ee9a684e 100644
--- a/youtube_dl/extractor/dotsub.py
+++ b/youtube_dl/extractor/dotsub.py
@@ -1,5 +1,7 @@
import re
import json
+import time
+
from .common import InfoExtractor
@@ -13,7 +15,8 @@ class DotsubIE(InfoExtractor):
u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
u"uploader": u"4v4l0n42",
u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
- u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p'
+ u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
+ u'upload_date': u'20101213',
}
}
@@ -23,20 +26,16 @@ class DotsubIE(InfoExtractor):
info_url = "https://dotsub.com/api/media/%s/metadata" %(video_id)
webpage = self._download_webpage(info_url, video_id)
info = json.loads(webpage)
- video_url = info['mediaURI']
- uploader = info['user']
- description = info['description']
- view_count = info['numberOfViews']
- title = info['title']
- thumbnail_url = info['screenshotURI']
- ext = 'flv'
+ date = time.gmtime(info['dateCreated']/1000) # The timestamp is in milliseconds
+
return [{
'id': video_id,
- 'url': video_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': thumbnail_url,
- 'description': description,
- 'uploader': uploader,
- 'view_count': view_count,
+ 'url': info['mediaURI'],
+ 'ext': 'flv',
+ 'title': info['title'],
+ 'thumbnail': info['screenshotURI'],
+ 'description': info['description'],
+ 'uploader': info['user'],
+ 'view_count': info['numberOfViews'],
+ 'upload_date': u'%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
}]
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index cec3b7ac8..7585b7061 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -4,14 +4,15 @@ import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
unified_strdate,
+ compat_urllib_parse,
)
class GameSpotIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/([^/]+)/videos/([^/]+)-([^/d]+)/'
+ _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
_TEST = {
u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
u"file": u"6410818.mp4",
- u"md5": u"5569d64ca98db01f0177c934fe8c1e9b",
+ u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
u"info_dict": {
u"title": u"Arma III - Community Guide: SITREP I",
u"upload_date": u"20130627",
@@ -21,13 +22,22 @@ class GameSpotIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(3).split("-")[-1]
- info_url = "http://www.gamespot.com/pages/video_player/xml.php?id="+str(video_id)
+ page_id = mobj.group('page_id')
+ webpage = self._download_webpage(url, page_id)
+ video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
+ r'http://www\.gamespot\.com/videoembed/(\d+)'],
+ webpage, 'video id')
+ data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
+ info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
info_xml = self._download_webpage(info_url, video_id)
doc = xml.etree.ElementTree.fromstring(info_xml)
clip_el = doc.find('./playList/clip')
- video_url = clip_el.find('./URI').text
+ http_urls = [{'url': node.find('filePath').text,
+ 'rate': int(node.find('rate').text)}
+ for node in clip_el.find('./httpURI')]
+ best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
+ video_url = best_quality['url']
title = clip_el.find('./title').text
ext = video_url.rpartition('.')[2]
thumbnail_url = clip_el.find('./screenGrabURI').text
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 20bc53330..33790741f 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
import os
import re
@@ -9,20 +11,34 @@ from ..utils import (
ExtractorError,
)
+from .brightcove import BrightcoveIE
class GenericIE(InfoExtractor):
IE_DESC = u'Generic downloader that works on some sites'
_VALID_URL = r'.*'
IE_NAME = u'generic'
- _TEST = {
- u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
- u'file': u'13601338388002.mp4',
- u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
- u'info_dict': {
- u"uploader": u"www.hodiho.fr",
- u"title": u"R\u00e9gis plante sa Jeep"
- }
- }
+ _TESTS = [
+ {
+ u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+ u'file': u'13601338388002.mp4',
+ u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
+ u'info_dict': {
+ u"uploader": u"www.hodiho.fr",
+ u"title": u"R\u00e9gis plante sa Jeep"
+ }
+ },
+ {
+ u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
+ u'file': u'2371591881001.mp4',
+ u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+ u'note': u'Test Brightcove downloads and detection in GenericIE',
+ u'info_dict': {
+ u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+ u'uploader': u'8TV',
+ u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
+ }
+ },
+ ]
def report_download_webpage(self, video_id):
"""Report webpage download."""
@@ -103,6 +119,13 @@ class GenericIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
self.report_extraction(video_id)
+ # Look for Brightcove:
+ m_brightcove = re.search(r'<object.+?class=".*?BrightcoveExperience.*?".+?</object>', webpage, re.DOTALL)
+ if m_brightcove is not None:
+ self.to_screen(u'Brightcove video detected.')
+ bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
+ return self.url_result(bc_url, 'Brightcove')
+
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py
index ecac4ec40..91658f892 100644
--- a/youtube_dl/extractor/steam.py
+++ b/youtube_dl/extractor/steam.py
@@ -23,14 +23,16 @@ class SteamIE(InfoExtractor):
u"file": u"81300.flv",
u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
u"info_dict": {
- u"title": u"Terraria 1.1 Trailer"
+ u"title": u"Terraria 1.1 Trailer",
+ u'playlist_index': 1,
}
},
{
u"file": u"80859.flv",
u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
u"info_dict": {
- u"title": u"Terraria Trailer"
+ u"title": u"Terraria Trailer",
+ u'playlist_index': 2,
}
}
]
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
new file mode 100644
index 000000000..00672c9e5
--- /dev/null
+++ b/youtube_dl/extractor/veoh.py
@@ -0,0 +1,47 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+)
+
+class VeohIE(InfoExtractor):
+ _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)'
+
+ _TEST = {
+ u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+ u'file': u'56314296.mp4',
+ u'md5': u'620e68e6a3cff80086df3348426c9ca3',
+ u'info_dict': {
+ u'title': u'Straight Backs Are Stronger',
+ u'uploader': u'LUMOback',
+ u'description': u'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+
+ m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
+ if m_youtube is not None:
+ youtube_id = m_youtube.group(1)
+ self.to_screen(u'%s: detected Youtube video.' % video_id)
+ return self.url_result(youtube_id, 'Youtube')
+
+ self.report_extraction(video_id)
+ info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
+ info = json.loads(info)
+ video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
+
+ return {'id': info['videoId'],
+ 'title': info['title'],
+ 'ext': determine_ext(video_url),
+ 'url': video_url,
+ 'uploader': info['username'],
+ 'thumbnail': info.get('highResImage') or info.get('medResImage'),
+ 'description': info['description'],
+ 'view_count': info['views'],
+ }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 61b7b561f..87f9994ba 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -117,7 +117,19 @@ class YoutubeIE(InfoExtractor):
u"uploader": u"IconaPop",
u"uploader_id": u"IconaPop"
}
- }
+ },
+ {
+ u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
+ u"file": u"07FYdnEawAQ.mp4",
+ u"note": u"Test VEVO video with age protection (#956)",
+ u"info_dict": {
+ u"upload_date": u"20130703",
+ u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
+ u"description": u"md5:64249768eec3bc4276236606ea996373",
+ u"uploader": u"justintimberlakeVEVO",
+ u"uploader_id": u"justintimberlakeVEVO"
+ }
+ },
]
@@ -178,7 +190,7 @@ class YoutubeIE(InfoExtractor):
elif len(s) == 84:
return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
elif len(s) == 83:
- return s[52] + s[81:55:-1] + s[2] + s[54:52:-1] + s[82] + s[51:36:-1] + s[55] + s[35:2:-1] + s[36]
+ return s[:81]
elif len(s) == 82:
return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
@@ -410,15 +422,35 @@ class YoutubeIE(InfoExtractor):
# Get video info
self.report_video_info_webpage_download(video_id)
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
- % (video_id, el_type))
+ if re.search(r'player-age-gate-content">', video_webpage) is not None:
+ self.report_age_confirmation()
+ age_gate = True
+ # We simulate access to the video from www.youtube.com/v/{video_id},
+ # which can be viewed without logging in to YouTube
+ data = compat_urllib_parse.urlencode({'video_id': video_id,
+ 'el': 'embedded',
+ 'gl': 'US',
+ 'hl': 'en',
+ 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+ 'asv': 3,
+ 'sts':'1588',
+ })
+ video_info_url = 'https://www.youtube.com/get_video_info?' + data
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
- break
+ else:
+ age_gate = False
+ for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ % (video_id, el_type))
+ video_info_webpage = self._download_webpage(video_info_url, video_id,
+ note=False,
+ errnote='unable to download video info webpage')
+ video_info = compat_parse_qs(video_info_webpage)
+ if 'token' in video_info:
+ break
if 'token' not in video_info:
if 'reason' in video_info:
raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
@@ -535,6 +567,8 @@ class YoutubeIE(InfoExtractor):
self.report_rtmp_download()
video_url_list = [(None, video_info['conn'][0])]
elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
+ if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
+ raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
url_map = {}
for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
url_data = compat_parse_qs(url_data_str)
@@ -545,9 +579,15 @@ class YoutubeIE(InfoExtractor):
elif 's' in url_data:
if self._downloader.params.get('verbose'):
s = url_data['s'][0]
- player = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
- 'html5 player', fatal=False)
- self.to_screen('encrypted signature length %d (%d.%d), itag %s, html5 player %s' %
+ if age_gate:
+ player_version = self._search_regex(r'ad3-(.+?)\.swf',
+ video_info['ad3_module'][0], 'flash player',
+ fatal=False)
+ player = 'flash player %s' % player_version
+ else:
+ player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
+ 'html5 player', fatal=False)
+ self.to_screen('encrypted signature length %d (%d.%d), itag %s, %s' %
(len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player))
signature = self._decrypt_signature(url_data['s'][0])
url += '&signature=' + signature