aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/InfoExtractors.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/InfoExtractors.py')
-rwxr-xr-xyoutube_dl/InfoExtractors.py96
1 files changed, 75 insertions, 21 deletions
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index a7fdf1607..ae36558d7 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -115,7 +115,8 @@ class InfoExtractor(object):
""" Returns the response handle """
if note is None:
note = u'Downloading video webpage'
- self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
+ if note is not False:
+ self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
try:
return compat_urllib_request.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -133,6 +134,14 @@ class InfoExtractor(object):
else:
encoding = 'utf-8'
webpage_bytes = urlh.read()
+ if self._downloader.params.get('dump_intermediate_pages', False):
+ try:
+ url = url_or_request.get_full_url()
+ except AttributeError:
+ url = url_or_request
+ self._downloader.to_screen(u'Dumping request to ' + url)
+ dump = base64.b64encode(webpage_bytes).decode('ascii')
+ self._downloader.to_screen(dump)
return webpage_bytes.decode(encoding, 'replace')
#Methods for following #608
@@ -485,18 +494,14 @@ class YoutubeIE(InfoExtractor):
# Get video info
self.report_video_info_webpage_download(video_id)
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
- request = compat_urllib_request.Request(video_info_url)
- try:
- video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
- video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
- video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
- break
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
- return
+ video_info_webpage = self._download_webpage(video_info_url, video_id,
+ note=False,
+ errnote='unable to download video info webpage')
+ video_info = compat_parse_qs(video_info_webpage)
+ if 'token' in video_info:
+ break
if 'token' not in video_info:
if 'reason' in video_info:
self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
@@ -1151,7 +1156,7 @@ class VimeoIE(InfoExtractor):
# Extract video description
video_description = get_element_by_attribute("itemprop", "description", webpage)
if video_description: video_description = clean_html(video_description)
- else: video_description = ''
+ else: video_description = u''
# Extract upload date
video_upload_date = None
@@ -1794,9 +1799,13 @@ class YoutubePlaylistIE(InfoExtractor):
self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
return
- if not 'feed' in response or not 'entry' in response['feed']:
+ if 'feed' not in response:
self._downloader.report_error(u'Got a malformed response from YouTube API')
return
+ if 'entry' not in response['feed']:
+ # Number of videos is a multiple of self._MAX_RESULTS
+ break
+
videos += [ (entry['yt$position']['$t'], entry['content']['src'])
for entry in response['feed']['entry']
if 'content' in entry ]
@@ -2144,7 +2153,7 @@ class FacebookIE(InfoExtractor):
url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
webpage = self._download_webpage(url, video_id)
- BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
+ BEFORE = '{swf.addParam(param[0], param[1]);});\n'
AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
if not m:
@@ -2152,12 +2161,14 @@ class FacebookIE(InfoExtractor):
data = dict(json.loads(m.group(1)))
params_raw = compat_urllib_parse.unquote(data['params'])
params = json.loads(params_raw)
- video_url = params['hd_src']
+ video_data = params['video_data'][0]
+ video_url = video_data.get('hd_src')
if not video_url:
- video_url = params['sd_src']
+ video_url = video_data['sd_src']
if not video_url:
raise ExtractorError(u'Cannot find video URL')
- video_duration = int(params['video_duration'])
+ video_duration = int(video_data['video_duration'])
+ thumbnail = video_data['thumbnail_src']
m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
if not m:
@@ -2170,7 +2181,7 @@ class FacebookIE(InfoExtractor):
'url': video_url,
'ext': 'mp4',
'duration': video_duration,
- 'thumbnail': params['thumbnail_src'],
+ 'thumbnail': thumbnail,
}
return [info]
@@ -3685,7 +3696,9 @@ class FunnyOrDieIE(InfoExtractor):
m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
if not m:
- self._downloader.trouble(u'Cannot find video title')
+ m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
+ if not m:
+ self._downloader.trouble(u'Cannot find video title')
title = clean_html(m.group('title'))
m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
@@ -4119,7 +4132,7 @@ class KeekIE(InfoExtractor):
video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
+ m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
title = unescapeHTML(m.group('title'))
m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
uploader = clean_html(m.group('uploader'))
@@ -4344,6 +4357,46 @@ class LiveLeakIE(InfoExtractor):
return [info]
+class ARDIE(InfoExtractor):
+ _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+ _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
+ _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
+
+ def _real_extract(self, url):
+ # determine video id from url
+ m = re.match(self._VALID_URL, url)
+
+ numid = re.search(r'documentId=([0-9]+)', url)
+ if numid:
+ video_id = numid.group(1)
+ else:
+ video_id = m.group('video_id')
+
+ # determine title and media streams from webpage
+ html = self._download_webpage(url, video_id)
+ title = re.search(self._TITLE, html).group('title')
+ streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+ if not streams:
+ assert '"fsk"' in html
+ self._downloader.report_error(u'this video is only available after 8:00 pm')
+ return
+
+ # choose default media type and highest quality for now
+ stream = max([s for s in streams if int(s["media_type"]) == 0],
+ key=lambda s: int(s["quality"]))
+
+ # there's two possibilities: RTMP stream or HTTP download
+ info = {'id': video_id, 'title': title, 'ext': 'mp4'}
+ if stream['rtmp_url']:
+ self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
+ assert stream['video_url'].startswith('mp4:')
+ info["url"] = stream["rtmp_url"]
+ info["play_path"] = stream['video_url']
+ else:
+ assert stream["video_url"].endswith('.mp4')
+ info["url"] = stream["video_url"]
+ return [info]
+
def gen_extractors():
""" Return a list of an instance of every supported extractor.
@@ -4397,5 +4450,6 @@ def gen_extractors():
MySpassIE(),
SpiegelIE(),
LiveLeakIE(),
+ ARDIE(),
GenericIE()
]