about summary refs log tree commit diff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rw-r--r-- youtube_dl/FileDownloader.py 89
-rw-r--r-- youtube_dl/YoutubeDL.py 243
-rw-r--r-- youtube_dl/__init__.py 62
-rw-r--r-- youtube_dl/extractor/__init__.py 7
-rw-r--r-- youtube_dl/extractor/appletrailers.py 23
-rw-r--r-- youtube_dl/extractor/arte.py 35
-rw-r--r-- youtube_dl/extractor/bliptv.py 39
-rw-r--r-- youtube_dl/extractor/brightcove.py 22
-rw-r--r-- youtube_dl/extractor/cbs.py 30
-rw-r--r-- youtube_dl/extractor/channel9.py 267
-rw-r--r-- youtube_dl/extractor/clipsyndicate.py 10
-rw-r--r-- youtube_dl/extractor/common.py 38
-rw-r--r-- youtube_dl/extractor/dailymotion.py 16
-rw-r--r-- youtube_dl/extractor/daum.py 2
-rw-r--r-- youtube_dl/extractor/ign.py 2
-rw-r--r-- youtube_dl/extractor/metacafe.py 19
-rw-r--r-- youtube_dl/extractor/metacritic.py 9
-rw-r--r-- youtube_dl/extractor/mixcloud.py 23
-rw-r--r-- youtube_dl/extractor/mtv.py 9
-rw-r--r-- youtube_dl/extractor/naver.py 2
-rw-r--r-- youtube_dl/extractor/ndtv.py 66
-rw-r--r-- youtube_dl/extractor/pornhd.py 38
-rw-r--r-- youtube_dl/extractor/pornhub.py 2
-rw-r--r-- youtube_dl/extractor/rtlnow.py 5
-rw-r--r-- youtube_dl/extractor/smotri.py 107
-rw-r--r-- youtube_dl/extractor/soundcloud.py 56
-rw-r--r-- youtube_dl/extractor/stanfordoc.py 20
-rw-r--r-- youtube_dl/extractor/theplatform.py 12
-rw-r--r-- youtube_dl/extractor/vimeo.py 12
-rw-r--r-- youtube_dl/extractor/wimp.py 23
-rw-r--r-- youtube_dl/extractor/xhamster.py 4
-rw-r--r-- youtube_dl/extractor/youtube.py 135
-rw-r--r-- youtube_dl/extractor/zdf.py 4
-rw-r--r-- youtube_dl/utils.py 59
-rw-r--r-- youtube_dl/version.py 2
35 files changed, 1158 insertions, 334 deletions
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py
index 3ff9716b3..47124932f 100644
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -204,11 +204,27 @@ class FileDownloader(object):
"""Report destination filename."""
self.to_screen(u'[download] Destination: ' + filename)
+ def _report_progress_status(self, msg, is_last_line=False):
+ fullmsg = u'[download] ' + msg
+ if self.params.get('progress_with_newline', False):
+ self.to_screen(fullmsg)
+ else:
+ if os.name == 'nt':
+ prev_len = getattr(self, '_report_progress_prev_line_length',
+ 0)
+ if prev_len > len(fullmsg):
+ fullmsg += u' ' * (prev_len - len(fullmsg))
+ self._report_progress_prev_line_length = len(fullmsg)
+ clear_line = u'\r'
+ else:
+ clear_line = (u'\r\x1b[K' if sys.stderr.isatty() else u'\r')
+ self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line)
+ self.to_console_title(u'youtube-dl ' + msg)
+
def report_progress(self, percent, data_len_str, speed, eta):
"""Report download progress."""
if self.params.get('noprogress', False):
return
- clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'')
if eta is not None:
eta_str = self.format_eta(eta)
else:
@@ -218,14 +234,29 @@ class FileDownloader(object):
else:
percent_str = 'Unknown %'
speed_str = self.format_speed(speed)
- if self.params.get('progress_with_newline', False):
- self.to_screen(u'[download] %s of %s at %s ETA %s' %
- (percent_str, data_len_str, speed_str, eta_str))
+
+ msg = (u'%s of %s at %s ETA %s' %
+ (percent_str, data_len_str, speed_str, eta_str))
+ self._report_progress_status(msg)
+
+ def report_progress_live_stream(self, downloaded_data_len, speed, elapsed):
+ if self.params.get('noprogress', False):
+ return
+ downloaded_str = format_bytes(downloaded_data_len)
+ speed_str = self.format_speed(speed)
+ elapsed_str = FileDownloader.format_seconds(elapsed)
+ msg = u'%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str)
+ self._report_progress_status(msg)
+
+ def report_finish(self, data_len_str, tot_time):
+ """Report download finished."""
+ if self.params.get('noprogress', False):
+ self.to_screen(u'[download] Download completed')
else:
- self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' %
- (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
- self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' %
- (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
+ self._report_progress_status(
+ (u'100%% of %s in %s' %
+ (data_len_str, self.format_seconds(tot_time))),
+ is_last_line=True)
def report_resuming_byte(self, resume_len):
"""Report attempt to resume at given byte."""
@@ -246,16 +277,7 @@ class FileDownloader(object):
"""Report it was impossible to resume download."""
self.to_screen(u'[download] Unable to resume')
- def report_finish(self, data_len_str, tot_time):
- """Report download finished."""
- if self.params.get('noprogress', False):
- self.to_screen(u'[download] Download completed')
- else:
- clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'')
- self.to_screen(u'\r%s[download] 100%% of %s in %s' %
- (clear_line, data_len_str, self.format_seconds(tot_time)))
-
- def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live):
+ def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live, conn):
def run_rtmpdump(args):
start = time.time()
resume_percent = None
@@ -301,11 +323,27 @@ class FileDownloader(object):
'eta': eta,
'speed': speed,
})
- elif self.params.get('verbose', False):
- if not cursor_in_new_line:
- self.to_screen(u'')
- cursor_in_new_line = True
- self.to_screen(u'[rtmpdump] '+line)
+ else:
+ # no percent for live streams
+ mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line)
+ if mobj:
+ downloaded_data_len = int(float(mobj.group(1))*1024)
+ time_now = time.time()
+ speed = self.calc_speed(start, time_now, downloaded_data_len)
+ self.report_progress_live_stream(downloaded_data_len, speed, time_now - start)
+ cursor_in_new_line = False
+ self._hook_progress({
+ 'downloaded_bytes': downloaded_data_len,
+ 'tmpfilename': tmpfilename,
+ 'filename': filename,
+ 'status': 'downloading',
+ 'speed': speed,
+ })
+ elif self.params.get('verbose', False):
+ if not cursor_in_new_line:
+ self.to_screen(u'')
+ cursor_in_new_line = True
+ self.to_screen(u'[rtmpdump] '+line)
proc.wait()
if not cursor_in_new_line:
self.to_screen(u'')
@@ -338,6 +376,8 @@ class FileDownloader(object):
basic_args += ['--stop', '1']
if live:
basic_args += ['--live']
+ if conn:
+ basic_args += ['--conn', conn]
args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
if sys.platform == 'win32' and sys.version_info < (3, 0):
@@ -479,7 +519,8 @@ class FileDownloader(object):
info_dict.get('page_url', None),
info_dict.get('play_path', None),
info_dict.get('tc_url', None),
- info_dict.get('rtmp_live', False))
+ info_dict.get('rtmp_live', False),
+ info_dict.get('rtmp_conn', None))
# Attempt to download using mplayer
if url.startswith('mms') or url.startswith('rtsp'):
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 07b36a98e..2a4ab674d 100644
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -3,6 +3,7 @@
from __future__ import absolute_import
+import collections
import errno
import io
import json
@@ -22,7 +23,6 @@ if os.name == 'nt':
from .utils import (
compat_cookiejar,
compat_http_client,
- compat_print,
compat_str,
compat_urllib_error,
compat_urllib_request,
@@ -34,6 +34,8 @@ from .utils import (
encodeFilename,
ExtractorError,
format_bytes,
+ formatSeconds,
+ get_term_width,
locked_file,
make_HTTPS_handler,
MaxDownloadsReached,
@@ -93,6 +95,7 @@ class YoutubeDL(object):
forcethumbnail: Force printing thumbnail URL.
forcedescription: Force printing description.
forcefilename: Force printing final filename.
+ forceduration: Force printing duration.
forcejson: Force printing info_dict as JSON.
simulate: Do not download the video files.
format: Video format code.
@@ -126,13 +129,24 @@ class YoutubeDL(object):
noplaylist: Download single video instead of a playlist if in doubt.
age_limit: An integer representing the user's age in years.
Unsuitable videos for the given age are skipped.
- download_archive: File name of a file where all downloads are recorded.
+ min_views: An integer representing the minimum view count the video
+ must have in order to not be skipped.
+ Videos without view count information are always
+ downloaded. None for no limit.
+ max_views: An integer representing the maximum view count.
+ Videos that are more popular than that are not
+ downloaded.
+ Videos without view count information are always
+ downloaded. None for no limit.
+ download_archive: File name of a file where all downloads are recorded.
Videos already present in the file are not downloaded
again.
cookiefile: File name where cookies should be read from and dumped to.
nocheckcertificate:Do not verify SSL certificates
proxy: URL of the proxy server to use
socket_timeout: Time to wait for unresponsive hosts, in seconds
+ bidi_workaround: Work around buggy terminals without bidirectional text
+ support, using fribidi
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
@@ -156,8 +170,30 @@ class YoutubeDL(object):
self._download_retcode = 0
self._num_downloads = 0
self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
+ self._err_file = sys.stderr
self.params = {} if params is None else params
+ if params.get('bidi_workaround', False):
+ try:
+ import pty
+ master, slave = pty.openpty()
+ width = get_term_width()
+ if width is None:
+ width_args = []
+ else:
+ width_args = ['-w', str(width)]
+ self._fribidi = subprocess.Popen(
+ ['fribidi', '-c', 'UTF-8'] + width_args,
+ stdin=subprocess.PIPE,
+ stdout=slave,
+ stderr=self._err_file)
+ self._fribidi_channel = os.fdopen(master, 'rb')
+ except OSError as ose:
+ if ose.errno == 2:
+ self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
+ else:
+ raise
+
if (sys.version_info >= (3,) and sys.platform != 'win32' and
sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
and not params['restrictfilenames']):
@@ -205,13 +241,31 @@ class YoutubeDL(object):
self._pps.append(pp)
pp.set_downloader(self)
+ def _bidi_workaround(self, message):
+ if not hasattr(self, '_fribidi_channel'):
+ return message
+
+ assert type(message) == type(u'')
+ line_count = message.count(u'\n') + 1
+ self._fribidi.stdin.write((message + u'\n').encode('utf-8'))
+ self._fribidi.stdin.flush()
+ res = u''.join(self._fribidi_channel.readline().decode('utf-8')
+ for _ in range(line_count))
+ return res[:-len(u'\n')]
+
def to_screen(self, message, skip_eol=False):
"""Print message to stdout if not in quiet mode."""
+ return self.to_stdout(message, skip_eol, check_quiet=True)
+
+ def to_stdout(self, message, skip_eol=False, check_quiet=False):
+ """Print message to stdout; quiet mode is honored only if check_quiet is set."""
if self.params.get('logger'):
self.params['logger'].debug(message)
- elif not self.params.get('quiet', False):
+ elif not check_quiet or not self.params.get('quiet', False):
+ message = self._bidi_workaround(message)
terminator = [u'\n', u''][skip_eol]
output = message + terminator
+
write_string(output, self._screen_file)
def to_stderr(self, message):
@@ -220,10 +274,9 @@ class YoutubeDL(object):
if self.params.get('logger'):
self.params['logger'].error(message)
else:
+ message = self._bidi_workaround(message)
output = message + u'\n'
- if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
- output = output.encode(preferredencoding())
- sys.stderr.write(output)
+ write_string(output, self._err_file)
def to_console_title(self, message):
if not self.params.get('consoletitle', False):
@@ -294,7 +347,7 @@ class YoutubeDL(object):
Print the message to stderr, it will be prefixed with 'WARNING:'
If stderr is a tty file the 'WARNING:' will be colored
'''
- if sys.stderr.isatty() and os.name != 'nt':
+ if self._err_file.isatty() and os.name != 'nt':
_msg_header = u'\033[0;33mWARNING:\033[0m'
else:
_msg_header = u'WARNING:'
@@ -306,29 +359,13 @@ class YoutubeDL(object):
Do the same as trouble, but prefixes the message with 'ERROR:', colored
in red if stderr is a tty file.
'''
- if sys.stderr.isatty() and os.name != 'nt':
+ if self._err_file.isatty() and os.name != 'nt':
_msg_header = u'\033[0;31mERROR:\033[0m'
else:
_msg_header = u'ERROR:'
error_message = u'%s %s' % (_msg_header, message)
self.trouble(error_message, tb)
- def report_writedescription(self, descfn):
- """ Report that the description file is being written """
- self.to_screen(u'[info] Writing video description to: ' + descfn)
-
- def report_writesubtitles(self, sub_filename):
- """ Report that the subtitles file is being written """
- self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
-
- def report_writeinfojson(self, infofn):
- """ Report that the metadata file has been written """
- self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
-
- def report_writeannotations(self, annofn):
- """ Report that the annotations file has been written. """
- self.to_screen(u'[info] Writing video annotations to: ' + annofn)
-
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
try:
@@ -355,18 +392,17 @@ class YoutubeDL(object):
template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
sanitize = lambda k, v: sanitize_filename(
- u'NA' if v is None else compat_str(v),
+ compat_str(v),
restricted=self.params.get('restrictfilenames'),
is_id=(k == u'id'))
template_dict = dict((k, sanitize(k, v))
- for k, v in template_dict.items())
+ for k, v in template_dict.items()
+ if v is not None)
+ template_dict = collections.defaultdict(lambda: u'NA', template_dict)
tmpl = os.path.expanduser(self.params['outtmpl'])
filename = tmpl % template_dict
return filename
- except KeyError as err:
- self.report_error(u'Erroneous output template')
- return None
except ValueError as err:
self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
return None
@@ -374,13 +410,14 @@ class YoutubeDL(object):
def _match_entry(self, info_dict):
""" Returns None iff the file should be downloaded """
+ video_title = info_dict.get('title', info_dict.get('id', u'video'))
if 'title' in info_dict:
# This can happen when we're just evaluating the playlist
title = info_dict['title']
matchtitle = self.params.get('matchtitle', False)
if matchtitle:
if not re.search(matchtitle, title, re.IGNORECASE):
- return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+ return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
rejecttitle = self.params.get('rejecttitle', False)
if rejecttitle:
if re.search(rejecttitle, title, re.IGNORECASE):
@@ -389,14 +426,21 @@ class YoutubeDL(object):
if date is not None:
dateRange = self.params.get('daterange', DateRange())
if date not in dateRange:
- return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+ return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+ view_count = info_dict.get('view_count', None)
+ if view_count is not None:
+ min_views = self.params.get('min_views')
+ if min_views is not None and view_count < min_views:
+ return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
+ max_views = self.params.get('max_views')
+ if max_views is not None and view_count > max_views:
+ return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
age_limit = self.params.get('age_limit')
if age_limit is not None:
if age_limit < info_dict.get('age_limit', 0):
return u'Skipping "' + title + '" because it is age restricted'
if self.in_download_archive(info_dict):
- return (u'%s has already been recorded in archive'
- % info_dict.get('title', info_dict.get('id', u'video')))
+ return u'%s has already been recorded in archive' % video_title
return None
@staticmethod
@@ -695,22 +739,25 @@ class YoutubeDL(object):
# Forced printings
if self.params.get('forcetitle', False):
- compat_print(info_dict['fulltitle'])
+ self.to_stdout(info_dict['fulltitle'])
if self.params.get('forceid', False):
- compat_print(info_dict['id'])
+ self.to_stdout(info_dict['id'])
if self.params.get('forceurl', False):
# For RTMP URLs, also include the playpath
- compat_print(info_dict['url'] + info_dict.get('play_path', u''))
+ self.to_stdout(info_dict['url'] + info_dict.get('play_path', u''))
if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
- compat_print(info_dict['thumbnail'])
+ self.to_stdout(info_dict['thumbnail'])
if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
- compat_print(info_dict['description'])
+ self.to_stdout(info_dict['description'])
if self.params.get('forcefilename', False) and filename is not None:
- compat_print(filename)
+ self.to_stdout(filename)
+ if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
+ self.to_stdout(formatSeconds(info_dict['duration']))
if self.params.get('forceformat', False):
- compat_print(info_dict['format'])
+ self.to_stdout(info_dict['format'])
if self.params.get('forcejson', False):
- compat_print(json.dumps(info_dict))
+ info_dict['_filename'] = filename
+ self.to_stdout(json.dumps(info_dict))
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
@@ -728,28 +775,34 @@ class YoutubeDL(object):
return
if self.params.get('writedescription', False):
- try:
- descfn = filename + u'.description'
- self.report_writedescription(descfn)
- with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
- descfile.write(info_dict['description'])
- except (KeyError, TypeError):
- self.report_warning(u'There\'s no description to write.')
- except (OSError, IOError):
- self.report_error(u'Cannot write description file ' + descfn)
- return
+ descfn = filename + u'.description'
+ if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
+ self.to_screen(u'[info] Video description is already present')
+ else:
+ try:
+ self.to_screen(u'[info] Writing video description to: ' + descfn)
+ with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
+ descfile.write(info_dict['description'])
+ except (KeyError, TypeError):
+ self.report_warning(u'There\'s no description to write.')
+ except (OSError, IOError):
+ self.report_error(u'Cannot write description file ' + descfn)
+ return
if self.params.get('writeannotations', False):
- try:
- annofn = filename + u'.annotations.xml'
- self.report_writeannotations(annofn)
- with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
- annofile.write(info_dict['annotations'])
- except (KeyError, TypeError):
- self.report_warning(u'There are no annotations to write.')
- except (OSError, IOError):
- self.report_error(u'Cannot write annotations file: ' + annofn)
- return
+ annofn = filename + u'.annotations.xml'
+ if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
+ self.to_screen(u'[info] Video annotations are already present')
+ else:
+ try:
+ self.to_screen(u'[info] Writing video annotations to: ' + annofn)
+ with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+ annofile.write(info_dict['annotations'])
+ except (KeyError, TypeError):
+ self.report_warning(u'There are no annotations to write.')
+ except (OSError, IOError):
+ self.report_error(u'Cannot write annotations file: ' + annofn)
+ return
subtitles_are_requested = any([self.params.get('writesubtitles', False),
self.params.get('writeautomaticsub')])
@@ -765,38 +818,48 @@ class YoutubeDL(object):
continue
try:
sub_filename = subtitles_filename(filename, sub_lang, sub_format)
- self.report_writesubtitles(sub_filename)
- with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
- subfile.write(sub)
+ if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
+ self.to_screen(u'[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
+ else:
+ self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
+ with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
+ subfile.write(sub)
except (OSError, IOError):
self.report_error(u'Cannot write subtitles file ' + descfn)
return
if self.params.get('writeinfojson', False):
infofn = os.path.splitext(filename)[0] + u'.info.json'
- self.report_writeinfojson(infofn)
- try:
- json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
- write_json_file(json_info_dict, encodeFilename(infofn))
- except (OSError, IOError):
- self.report_error(u'Cannot write metadata to JSON file ' + infofn)
- return
+ if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
+ self.to_screen(u'[info] Video description metadata is already present')
+ else:
+ self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
+ try:
+ json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
+ write_json_file(json_info_dict, encodeFilename(infofn))
+ except (OSError, IOError):
+ self.report_error(u'Cannot write metadata to JSON file ' + infofn)
+ return
if self.params.get('writethumbnail', False):
if info_dict.get('thumbnail') is not None:
thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
- thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
- self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
- (info_dict['extractor'], info_dict['id']))
- try:
- uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
- with open(thumb_filename, 'wb') as thumbf:
- shutil.copyfileobj(uf, thumbf)
- self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
- (info_dict['extractor'], info_dict['id'], thumb_filename))
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self.report_warning(u'Unable to download thumbnail "%s": %s' %
- (info_dict['thumbnail'], compat_str(err)))
+ thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format
+ if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
+ self.to_screen(u'[%s] %s: Thumbnail is already present' %
+ (info_dict['extractor'], info_dict['id']))
+ else:
+ self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
+ (info_dict['extractor'], info_dict['id']))
+ try:
+ uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
+ with open(thumb_filename, 'wb') as thumbf:
+ shutil.copyfileobj(uf, thumbf)
+ self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
+ (info_dict['extractor'], info_dict['id'], thumb_filename))
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self.report_warning(u'Unable to download thumbnail "%s": %s' %
+ (info_dict['thumbnail'], compat_str(err)))
if not self.params.get('skip_download', False):
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
@@ -841,6 +904,20 @@ class YoutubeDL(object):
return self._download_retcode
+ def download_with_info_file(self, info_filename):
+ with io.open(info_filename, 'r', encoding='utf-8') as f:
+ info = json.load(f)
+ try:
+ self.process_ie_result(info, download=True)
+ except DownloadError:
+ webpage_url = info.get('webpage_url')
+ if webpage_url is not None:
+ self.report_warning(u'The info failed to download, trying with "%s"' % webpage_url)
+ return self.download([webpage_url])
+ else:
+ raise
+ return self._download_retcode
+
def post_process(self, filename, ie_info):
"""Run all the postprocessors on the given file."""
info = dict(ie_info)
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index d2446b670..0775b72fd 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -37,6 +37,7 @@ __authors__ = (
'Anton Larionov',
'Takuya Tsuchida',
'Sergey M.',
+ 'Michael Orlitzky',
)
__license__ = 'Public Domain'
@@ -48,7 +49,6 @@ import os
import random
import re
import shlex
-import subprocess
import sys
@@ -57,11 +57,13 @@ from .utils import (
DateRange,
decodeOption,
determine_ext,
+ get_term_width,
DownloadError,
get_cachedir,
MaxDownloadsReached,
preferredencoding,
SameFileError,
+ setproctitle,
std_headers,
write_string,
)
@@ -113,19 +115,6 @@ def parseOpts(overrideArguments=None):
def _comma_separated_values_options_callback(option, opt_str, value, parser):
setattr(parser.values, option.dest, value.split(','))
- def _find_term_columns():
- columns = os.environ.get('COLUMNS', None)
- if columns:
- return int(columns)
-
- try:
- sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- out,err = sp.communicate()
- return int(out.split()[1])
- except:
- pass
- return None
-
def _hide_login_info(opts):
opts = list(opts)
for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
@@ -140,7 +129,7 @@ def parseOpts(overrideArguments=None):
max_help_position = 80
# No need to wrap help messages if we're on a wide console
- columns = _find_term_columns()
+ columns = get_term_width()
if columns: max_width = columns
fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
@@ -204,6 +193,9 @@ def parseOpts(overrideArguments=None):
general.add_option(
'--socket-timeout', dest='socket_timeout',
type=float, default=None, help=optparse.SUPPRESS_HELP)
+ general.add_option(
+ '--bidi-workaround', dest='bidi_workaround', action='store_true',
+ help=u'Work around terminals that lack bidirectional text support. Requires fribidi executable in PATH')
selection.add_option('--playlist-start',
@@ -220,6 +212,14 @@ def parseOpts(overrideArguments=None):
selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None)
selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None)
selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None)
+ selection.add_option(
+ '--min-views', metavar='COUNT', dest='min_views',
+ default=None, type=int,
+ help="Do not download any videos with less than COUNT views",)
+ selection.add_option(
+ '--max-views', metavar='COUNT', dest='max_views',
+ default=None, type=int,
+ help="Do not download any videos with more than COUNT views",)
selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False)
selection.add_option('--age-limit', metavar='YEARS', dest='age_limit',
help='download only videos suitable for the given age',
@@ -300,6 +300,9 @@ def parseOpts(overrideArguments=None):
verbosity.add_option('--get-description',
action='store_true', dest='getdescription',
help='simulate, quiet but print video description', default=False)
+ verbosity.add_option('--get-duration',
+ action='store_true', dest='getduration',
+ help='simulate, quiet but print video length', default=False)
verbosity.add_option('--get-filename',
action='store_true', dest='getfilename',
help='simulate, quiet but print output filename', default=False)
@@ -360,6 +363,9 @@ def parseOpts(overrideArguments=None):
help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False)
filesystem.add_option('-a', '--batch-file',
dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
+ filesystem.add_option('--load-info',
+ dest='load_info_filename', metavar='FILE',
+ help='json file containing the video information (created with the "--write-json" option')
filesystem.add_option('-w', '--no-overwrites',
action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
@@ -467,12 +473,15 @@ def parseOpts(overrideArguments=None):
return parser, opts, args
+
def _real_main(argv=None):
# Compatibility fixes for Windows
if sys.platform == 'win32':
# https://github.com/rg3/youtube-dl/issues/820
codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
+ setproctitle(u'youtube-dl')
+
parser, opts, args = parseOpts(argv)
# Set user agent
@@ -611,27 +620,30 @@ def _real_main(argv=None):
or (opts.useid and u'%(id)s.%(ext)s')
or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
or u'%(title)s-%(id)s.%(ext)s')
- if '%(ext)s' not in outtmpl and opts.extractaudio:
+ if not os.path.splitext(outtmpl)[1] and opts.extractaudio:
parser.error(u'Cannot download a video and extract audio into the same'
- u' file! Use "%%(ext)s" instead of %r' %
- determine_ext(outtmpl, u''))
+ u' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
+ u' template'.format(outtmpl))
+
+ any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson
ydl_opts = {
'usenetrc': opts.usenetrc,
'username': opts.username,
'password': opts.password,
'videopassword': opts.videopassword,
- 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
+ 'quiet': (opts.quiet or any_printing),
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forceid': opts.getid,
'forcethumbnail': opts.getthumbnail,
'forcedescription': opts.getdescription,
+ 'forceduration': opts.getduration,
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
'forcejson': opts.dumpjson,
'simulate': opts.simulate,
- 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
+ 'skip_download': (opts.skip_download or opts.simulate or any_printing),
'format': opts.format,
'format_limit': opts.format_limit,
'listformats': opts.listformats,
@@ -675,6 +687,8 @@ def _real_main(argv=None):
'keepvideo': opts.keepvideo,
'min_filesize': opts.min_filesize,
'max_filesize': opts.max_filesize,
+ 'min_views': opts.min_views,
+ 'max_views': opts.max_views,
'daterange': date,
'cachedir': opts.cachedir,
'youtube_print_sig_code': opts.youtube_print_sig_code,
@@ -684,6 +698,7 @@ def _real_main(argv=None):
'nocheckcertificate': opts.no_check_certificate,
'proxy': opts.proxy,
'socket_timeout': opts.socket_timeout,
+ 'bidi_workaround': opts.bidi_workaround,
}
with YoutubeDL(ydl_opts) as ydl:
@@ -706,14 +721,17 @@ def _real_main(argv=None):
update_self(ydl.to_screen, opts.verbose)
# Maybe do nothing
- if len(all_urls) < 1:
+ if (len(all_urls) < 1) and (opts.load_info_filename is None):
if not opts.update_self:
parser.error(u'you must provide at least one URL')
else:
sys.exit()
try:
- retcode = ydl.download(all_urls)
+ if opts.load_info_filename is not None:
+ retcode = ydl.download_with_info_file(opts.load_info_filename)
+ else:
+ retcode = ydl.download(all_urls)
except MaxDownloadsReached:
ydl.to_screen(u'--max-download limit reached, aborting.')
retcode = 101
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 8a063648c..cebb8717f 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -8,6 +8,7 @@ from .arte import (
ArteTVPlus7IE,
ArteTVCreativeIE,
ArteTVFutureIE,
+ ArteTVDDCIE,
)
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
@@ -19,6 +20,8 @@ from .brightcove import BrightcoveIE
from .c56 import C56IE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
+from .cbs import CBSIE
+from .channel9 import Channel9IE
from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .clipsyndicate import ClipsyndicateIE
@@ -100,6 +103,7 @@ from .myvideo import MyVideoIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import NBCNewsIE
+from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE
from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE
@@ -110,6 +114,7 @@ from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
from .podomatic import PodomaticIE
+from .pornhd import PornHdIE
from .pornhub import PornHubIE
from .pornotube import PornotubeIE
from .pyvideo import PyvideoIE
@@ -128,6 +133,7 @@ from .smotri import (
SmotriIE,
SmotriCommunityIE,
SmotriUserIE,
+ SmotriBroadcastIE,
)
from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
@@ -210,6 +216,7 @@ from .youtube import (
YoutubeWatchLaterIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
+ YoutubeTopListIE,
)
from .zdf import ZDFIE
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index a527f10de..ef5644aa5 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
import json
from .common import InfoExtractor
@@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):
uploader_id = mobj.group('company')
playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
- playlist_snippet = self._download_webpage(playlist_url, movie)
- playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
- playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
- # The ' in the onClick attributes are not escaped, it couldn't be parsed
- # with xml.etree.ElementTree.fromstring
- # like: http://trailers.apple.com/trailers/wb/gravity/
- def _clean_json(m):
- return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
- playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
- playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+ def fix_html(s):
+ s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+ s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
+ # The ' in the onClick attributes are not escaped, it couldn't be parsed
+ # like: http://trailers.apple.com/trailers/wb/gravity/
+ def _clean_json(m):
+ return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+ s = re.sub(self._JSON_RE, _clean_json, s)
+ s = u'<html>' + s + u'</html>'
+ return s
+ doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
- doc = xml.etree.ElementTree.fromstring(playlist_html)
playlist = []
for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick']
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 56a5d009f..4b7bef775 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -10,6 +10,7 @@ from ..utils import (
determine_ext,
get_element_by_id,
compat_str,
+ get_element_by_attribute,
)
# There are different sources of video in arte.tv, the extraction process
@@ -142,7 +143,9 @@ class ArteTVPlus7IE(InfoExtractor):
def _extract_from_webpage(self, webpage, video_id, lang):
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+ return self._extract_from_json_url(json_url, video_id, lang)
+ def _extract_from_json_url(self, json_url, video_id, lang):
json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
self.report_extraction(video_id)
info = json.loads(json_info)
@@ -257,3 +260,35 @@ class ArteTVFutureIE(ArteTVPlus7IE):
webpage = self._download_webpage(url, anchor_id)
row = get_element_by_id(anchor_id, webpage)
return self._extract_from_webpage(row, anchor_id, lang)
+
+
+class ArteTVDDCIE(ArteTVPlus7IE):
+ IE_NAME = u'arte.tv:ddc'
+ _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
+
+ _TEST = {
+ u'url': u'http://ddc.arte.tv/folge/neues-aus-mauretanien',
+ u'file': u'049881-009_PLUS7-D.flv',
+ u'info_dict': {
+ u'title': u'Mit offenen Karten',
+ u'description': u'md5:57929b0eaeddeb8a0c983f58e9ebd3b6',
+ u'upload_date': u'20131207',
+ },
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id, lang = self._extract_url_info(url)
+ if lang == 'folge':
+ lang = 'de'
+ elif lang == 'emission':
+ lang = 'fr'
+ webpage = self._download_webpage(url, video_id)
+ scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage)
+ script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url')
+ javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
+ json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url')
+ return self._extract_from_json_url(json_url, video_id, lang)
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index 493504f75..5e33a69df 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -51,8 +51,7 @@ class BlipTVIE(InfoExtractor):
url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
urlp = compat_urllib_parse_urlparse(url)
if urlp.path.startswith('/play/'):
- request = compat_urllib_request.Request(url)
- response = compat_urllib_request.urlopen(request)
+ response = self._request_webpage(url, None, False)
redirecturl = response.geturl()
rurlp = compat_urllib_parse_urlparse(redirecturl)
file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
@@ -69,25 +68,23 @@ class BlipTVIE(InfoExtractor):
request.add_header('User-Agent', 'iTunes/10.6.1')
self.report_extraction(mobj.group(1))
info = None
- try:
- urlh = compat_urllib_request.urlopen(request)
- if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
- basename = url.split('/')[-1]
- title,ext = os.path.splitext(basename)
- title = title.decode('UTF-8')
- ext = ext.replace('.', '')
- self.report_direct_download(title)
- info = {
- 'id': title,
- 'url': url,
- 'uploader': None,
- 'upload_date': None,
- 'title': title,
- 'ext': ext,
- 'urlhandle': urlh
- }
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
+ urlh = self._request_webpage(request, None, False,
+ u'unable to download video info webpage')
+ if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
+ basename = url.split('/')[-1]
+ title,ext = os.path.splitext(basename)
+ title = title.decode('UTF-8')
+ ext = ext.replace('.', '')
+ self.report_direct_download(title)
+ info = {
+ 'id': title,
+ 'url': url,
+ 'uploader': None,
+ 'upload_date': None,
+ 'title': title,
+ 'ext': ext,
+ 'urlhandle': urlh
+ }
if info is None: # Regular URL
try:
json_code_bytes = urlh.read()
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 66fe0ac9a..b1b7526ca 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -55,6 +55,18 @@ class BrightcoveIE(InfoExtractor):
u'uploader': u'Mashable',
},
},
+ {
+ # test that the default referer works
+ # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
+ u'url': u'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
+ u'info_dict': {
+ u'id': u'2878862109001',
+ u'ext': u'mp4',
+ u'title': u'Lost in Motion II',
+ u'description': u'md5:363109c02998fee92ec02211bd8000df',
+ u'uploader': u'National Ballet of Canada',
+ },
+ },
]
@classmethod
@@ -118,17 +130,21 @@ class BrightcoveIE(InfoExtractor):
videoPlayer = query.get('@videoPlayer')
if videoPlayer:
- return self._get_video_info(videoPlayer[0], query_str, query)
+ return self._get_video_info(videoPlayer[0], query_str, query,
+ # We set the original url as the default 'Referer' header
+ referer=url)
else:
player_key = query['playerKey']
return self._get_playlist_info(player_key[0])
- def _get_video_info(self, video_id, query_str, query):
+ def _get_video_info(self, video_id, query_str, query, referer=None):
request_url = self._FEDERATED_URL_TEMPLATE % query_str
req = compat_urllib_request.Request(request_url)
linkBase = query.get('linkBaseURL')
if linkBase is not None:
- req.add_header('Referer', linkBase[0])
+ referer = linkBase[0]
+ if referer is not None:
+ req.add_header('Referer', referer)
webpage = self._download_webpage(req, video_id)
self.report_extraction(video_id)
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
new file mode 100644
index 000000000..ac0315853
--- /dev/null
+++ b/youtube_dl/extractor/cbs.py
@@ -0,0 +1,30 @@
+import re
+
+from .common import InfoExtractor
+
+
+class CBSIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P<id>[^/]+)/.*'
+
+ _TEST = {
+ u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
+ u'file': u'4JUVEwq3wUT7.flv',
+ u'info_dict': {
+ u'title': u'Connect Chat feat. Garth Brooks',
+ u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
+ u'duration': 1495,
+ },
+ u'params': {
+ # rtmp download
+ u'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ real_id = self._search_regex(
+ r"video\.settings\.pid\s*=\s*'([^']+)';",
+ webpage, u'real video ID')
+ return self.url_result(u'theplatform:%s' % real_id)
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
new file mode 100644
index 000000000..ae70ea229
--- /dev/null
+++ b/youtube_dl/extractor/channel9.py
@@ -0,0 +1,267 @@
+# encoding: utf-8
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+class Channel9IE(InfoExtractor):
+ '''
+ Common extractor for channel9.msdn.com.
+
+ The type of provided URL (video or playlist) is determined according to
+ meta Search.PageType from web page HTML rather than URL itself, as it is
+ not always possible to do.
+ '''
+ IE_DESC = u'Channel 9'
+ IE_NAME = u'channel9'
+ _VALID_URL = r'^https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
+
+ _TESTS = [
+ {
+ u'url': u'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+ u'file': u'Events_TechEd_Australia_2013_KOS002.mp4',
+ u'md5': u'bbd75296ba47916b754e73c3a4bbdf10',
+ u'info_dict': {
+ u'title': u'Developer Kick-Off Session: Stuff We Love',
+ u'description': u'md5:c08d72240b7c87fcecafe2692f80e35f',
+ u'duration': 4576,
+ u'thumbnail': u'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+ u'session_code': u'KOS002',
+ u'session_day': u'Day 1',
+ u'session_room': u'Arena 1A',
+ u'session_speakers': [ u'Ed Blankenship', u'Andrew Coates', u'Brady Gaster', u'Patrick Klug', u'Mads Kristensen' ],
+ },
+ },
+ {
+ u'url': u'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+ u'file': u'posts_Self-service-BI-with-Power-BI-nuclear-testing.mp4',
+ u'md5': u'b43ee4529d111bc37ba7ee4f34813e68',
+ u'info_dict': {
+ u'title': u'Self-service BI with Power BI - nuclear testing',
+ u'description': u'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+ u'duration': 1540,
+ u'thumbnail': u'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+ u'authors': [ u'Mike Wilmot' ],
+ },
+ }
+ ]
+
+ _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
+
+ # Sorted by quality
+ _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
+
+ def _restore_bytes(self, formatted_size):
+ if not formatted_size:
+ return 0
+ m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
+ if not m:
+ return 0
+ units = m.group('units')
+ try:
+ exponent = [u'B', u'KB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB'].index(units.upper())
+ except ValueError:
+ return 0
+ size = float(m.group('size'))
+ return int(size * (1024 ** exponent))
+
+ def _formats_from_html(self, html):
+ FORMAT_REGEX = r'''
+ (?x)
+ <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
+ <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
+ (?:<div\s+class="popup\s+rounded">\s*
+ <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
+ </div>)? # File size part may be missing
+ '''
+ # Extract known formats
+ formats = [{'url': x.group('url'),
+ 'format_id': x.group('quality'),
+ 'format_note': x.group('note'),
+ 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
+ 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
+ } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
+ # Sort according to known formats list
+ formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
+ return formats
+
+ def _extract_title(self, html):
+ title = self._html_search_meta(u'title', html, u'title')
+ if title is None:
+ title = self._og_search_title(html)
+ TITLE_SUFFIX = u' (Channel 9)'
+ if title is not None and title.endswith(TITLE_SUFFIX):
+ title = title[:-len(TITLE_SUFFIX)]
+ return title
+
+ def _extract_description(self, html):
+ DESCRIPTION_REGEX = r'''(?sx)
+ <div\s+class="entry-content">\s*
+ <div\s+id="entry-body">\s*
+ (?P<description>.+?)\s*
+ </div>\s*
+ </div>
+ '''
+ m = re.search(DESCRIPTION_REGEX, html)
+ if m is not None:
+ return m.group('description')
+ return self._html_search_meta(u'description', html, u'description')
+
+ def _extract_duration(self, html):
+ m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+ return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
+
+ def _extract_slides(self, html):
+ m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
+ return m.group('slidesurl') if m is not None else None
+
+ def _extract_zip(self, html):
+ m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
+ return m.group('zipurl') if m is not None else None
+
+ def _extract_avg_rating(self, html):
+ m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
+ return float(m.group('avgrating')) if m is not None else 0
+
+ def _extract_rating_count(self, html):
+ m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
+ return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
+
+ def _extract_view_count(self, html):
+ m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
+ return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
+
+ def _extract_comment_count(self, html):
+ m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
+ return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
+
+ def _fix_count(self, count):
+ return int(str(count).replace(',', '')) if count is not None else None
+
+ def _extract_authors(self, html):
+ m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
+ if m is None:
+ return None
+ return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
+
+ def _extract_session_code(self, html):
+ m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
+ return m.group('code') if m is not None else None
+
+ def _extract_session_day(self, html):
+ m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
+ return m.group('day') if m is not None else None
+
+ def _extract_session_room(self, html):
+ m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
+ return m.group('room') if m is not None else None
+
+ def _extract_session_speakers(self, html):
+ return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
+
+ def _extract_content(self, html, content_path):
+ # Look for downloadable content
+ formats = self._formats_from_html(html)
+ slides = self._extract_slides(html)
+ zip_ = self._extract_zip(html)
+
+ # Nothing to download
+ if len(formats) == 0 and slides is None and zip_ is None:
+ self._downloader.report_warning(u'None of recording, slides or zip are available for %s' % content_path)
+ return
+
+ # Extract meta
+ title = self._extract_title(html)
+ description = self._extract_description(html)
+ thumbnail = self._og_search_thumbnail(html)
+ duration = self._extract_duration(html)
+ avg_rating = self._extract_avg_rating(html)
+ rating_count = self._extract_rating_count(html)
+ view_count = self._extract_view_count(html)
+ comment_count = self._extract_comment_count(html)
+
+ common = {'_type': 'video',
+ 'id': content_path,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'avg_rating': avg_rating,
+ 'rating_count': rating_count,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ }
+
+ result = []
+
+ if slides is not None:
+ d = common.copy()
+ d.update({ 'title': title + '-Slides', 'url': slides })
+ result.append(d)
+
+ if zip_ is not None:
+ d = common.copy()
+ d.update({ 'title': title + '-Zip', 'url': zip_ })
+ result.append(d)
+
+ if len(formats) > 0:
+ d = common.copy()
+ d.update({ 'title': title, 'formats': formats })
+ result.append(d)
+
+ return result
+
+ def _extract_entry_item(self, html, content_path):
+ contents = self._extract_content(html, content_path)
+ if contents is None:
+ return contents
+
+ authors = self._extract_authors(html)
+
+ for content in contents:
+ content['authors'] = authors
+
+ return contents
+
+ def _extract_session(self, html, content_path):
+ contents = self._extract_content(html, content_path)
+ if contents is None:
+ return contents
+
+ session_meta = {'session_code': self._extract_session_code(html),
+ 'session_day': self._extract_session_day(html),
+ 'session_room': self._extract_session_room(html),
+ 'session_speakers': self._extract_session_speakers(html),
+ }
+
+ for content in contents:
+ content.update(session_meta)
+
+ return contents
+
+ def _extract_list(self, content_path):
+ rss = self._download_xml(self._RSS_URL % content_path, content_path, u'Downloading RSS')
+ entries = [self.url_result(session_url.text, 'Channel9')
+ for session_url in rss.findall('./channel/item/link')]
+ title_text = rss.find('./channel/title').text
+ return self.playlist_result(entries, content_path, title_text)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ content_path = mobj.group('contentpath')
+
+ webpage = self._download_webpage(url, content_path, u'Downloading web page')
+
+ page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
+ if page_type_m is None:
+ raise ExtractorError(u'Search.PageType not found, don\'t know how to process this page', expected=True)
+
+ page_type = page_type_m.group('pagetype')
+ if page_type == 'List': # List page, may contain list of 'item'-like objects
+ return self._extract_list(content_path)
+ elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
+ return self._extract_entry_item(webpage, content_path)
+ elif page_type == 'Session': # Event session page, may contain downloadable content
+ return self._extract_session(webpage, content_path)
+ else:
+ raise ExtractorError(u'Unexpected Search.PageType %s' % page_type, expected=True) \ No newline at end of file
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index d4fc86973..c60089ad3 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -1,9 +1,9 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
+ fix_xml_all_ampersand,
)
@@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor):
# it includes a required token
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
- playlist_page = self._download_webpage(
+ pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
- video_id, u'Downloading video info')
- # Fix broken xml
- playlist_page = re.sub('&', '&amp;', playlist_page)
- pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
+ video_id, u'Downloading video info',
+ transform_source=fix_xml_all_ampersand)
track_doc = pdoc.find('trackList/track')
def find_param(name):
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 92a0c5050..fe8ce9e6c 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -34,8 +34,8 @@ class InfoExtractor(object):
The dictionaries must include the following fields:
id: Video identifier.
- url: Final video URL.
title: Video title, unescaped.
+ url: Final video URL.
ext: Video filename extension.
Instead of url and ext, formats can also specified.
@@ -54,6 +54,7 @@ class InfoExtractor(object):
player_url: SWF Player URL (used for rtmpdump).
subtitles: The subtitle file contents as a dictionary in the format
{language: subtitles}.
+ duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
@@ -154,27 +155,38 @@ class InfoExtractor(object):
def IE_NAME(self):
return type(self).__name__[:-2]
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the response handle """
if note is None:
self.report_download_webpage(video_id)
elif note is not False:
- self.to_screen(u'%s: %s' % (video_id, note))
+ if video_id is None:
+ self.to_screen(u'%s' % (note,))
+ else:
+ self.to_screen(u'%s: %s' % (video_id, note))
try:
return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
- raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
+ errmsg = u'%s: %s' % (errnote, compat_str(err))
+ if fatal:
+ raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
+ else:
+ self._downloader.report_warning(errmsg)
+ return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+ if urlh is False:
+ assert not fatal
+ return False
content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -209,14 +221,22 @@ class InfoExtractor(object):
content = webpage_bytes.decode(encoding, 'replace')
return (content, urlh)
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the data of the page as a string """
- return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+ if res is False:
+ return res
+ else:
+ content, _ = res
+ return content
def _download_xml(self, url_or_request, video_id,
- note=u'Downloading XML', errnote=u'Unable to download XML'):
+ note=u'Downloading XML', errnote=u'Unable to download XML',
+ transform_source=None):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+ if transform_source:
+ xml_string = transform_source(xml_string)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def to_screen(self, msg):
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 3bd0b862c..6685c94a3 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -28,7 +28,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
"""Information Extractor for Dailymotion"""
- _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
+ _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
IE_NAME = u'dailymotion'
_FORMATS = [
@@ -81,7 +81,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
# Extract id and simplified title from URL
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1).split('_')[0].split('?')[0]
+ video_id = mobj.group('id')
url = 'http://www.dailymotion.com/video/%s' % video_id
@@ -101,10 +101,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
self.to_screen(u'Vevo video detected: %s' % vevo_id)
return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo')
- video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
- # Looking for official user
- r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
- webpage, 'video uploader', fatal=False)
age_limit = self._rta_search(webpage)
video_upload_date = None
@@ -147,13 +143,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
self._list_available_subtitles(video_id, webpage)
return
- view_count = str_to_int(self._search_regex(
- r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count'))
+ view_count = self._search_regex(
+ r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False)
+ if view_count is not None:
+ view_count = str_to_int(view_count)
return {
'id': video_id,
'formats': formats,
- 'uploader': video_uploader,
+ 'uploader': info['owner_screenname'],
'upload_date': video_upload_date,
'title': self._og_search_title(webpage),
'subtitles': video_subtitles,
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index d418ce4a8..4876ecb48 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -9,7 +9,7 @@ from ..utils import (
class DaumIE(InfoExtractor):
- _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
IE_NAME = u'daum.net'
_TEST = {
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index 57b79a336..381af91e4 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -44,7 +44,7 @@ class IGNIE(InfoExtractor):
{
u'file': u'638672ee848ae4ff108df2a296418ee2.mp4',
u'info_dict': {
- u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion',
+ u'title': u'26 Twisted Moments from GTA 5 in Slow Motion',
u'description': u'The twisted beauty of GTA 5 in stunning slow motion.',
},
},
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index e59bdd604..99d3c83a5 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -1,14 +1,10 @@
import re
-import socket
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
compat_parse_qs,
- compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
- compat_str,
determine_ext,
ExtractorError,
)
@@ -93,12 +89,8 @@ class MetacafeIE(InfoExtractor):
def _real_initialize(self):
# Retrieve disclaimer
- request = compat_urllib_request.Request(self._DISCLAIMER)
- try:
- self.report_disclaimer()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
+ self.report_disclaimer()
+ self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer')
# Confirm age
disclaimer_form = {
@@ -107,11 +99,8 @@ class MetacafeIE(InfoExtractor):
}
request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- try:
- self.report_age_confirmation()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+ self.report_age_confirmation()
+ self._download_webpage(request, None, False, u'Unable to confirm age')
def _real_extract(self, url):
# Extract id and simplified title from URL
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
index 6b95b4998..e560c1d35 100644
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -1,8 +1,10 @@
import re
-import xml.etree.ElementTree
import operator
from .common import InfoExtractor
+from ..utils import (
+ fix_xml_all_ampersand,
+)
class MetacriticIE(InfoExtractor):
@@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&'
- info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
- video_id, u'Downloading info xml').replace('&', '&amp;')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
+ video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = []
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index e2baf44d7..125d81551 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -1,13 +1,10 @@
import json
import re
-import socket
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_urllib_error,
- compat_urllib_request,
unified_strdate,
+ ExtractorError,
)
@@ -31,13 +28,18 @@ class MixcloudIE(InfoExtractor):
"""Returns 1st active url from list"""
for url in url_list:
try:
- compat_urllib_request.urlopen(url)
+ # We only want to know if the request succeed
+ # don't download the whole file
+ self._request_webpage(url, None, False)
return url
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
+ except ExtractorError:
url = None
return None
+ def _get_url(self, template_url):
+ return self.check_urls(template_url % i for i in range(30))
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -53,13 +55,18 @@ class MixcloudIE(InfoExtractor):
preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')
song_url = preview_url.replace('/previews/', '/cloudcasts/originals/')
template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
- final_song_url = self.check_urls(template_url % i for i in range(30))
+ final_song_url = self._get_url(template_url)
+ if final_song_url is None:
+ self.to_screen('Trying with m4a extension')
+ template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
+ final_song_url = self._get_url(template_url)
+ if final_song_url is None:
+ raise ExtractorError(u'Unable to extract track url')
return {
'id': track_id,
'title': info['name'],
'url': final_song_url,
- 'ext': 'mp3',
'description': info.get('description'),
'thumbnail': info['pictures'].get('extra_large'),
'uploader': info['user']['name'],
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 6b3feb560..5b2bd9633 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -82,8 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
- idoc = self._download_xml(self._FEED_URL +'?' + data, video_id,
- u'Downloading info')
+
+ def fix_ampersand(s):
+ """ Fix unencoded ampersand in XML """
+ return s.replace(u'& ', '&amp; ')
+ idoc = self._download_xml(
+ self._FEED_URL + '?' + data, video_id,
+ u'Downloading info', transform_source=fix_ampersand)
return [self._get_video_info(item) for item in idoc.findall('.//item')]
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index c012ec0cf..4cab30631 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -9,7 +9,7 @@ from ..utils import (
class NaverIE(InfoExtractor):
- _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
_TEST = {
u'url': u'http://tvcast.naver.com/v/81652',
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py
new file mode 100644
index 000000000..2e8501f99
--- /dev/null
+++ b/youtube_dl/extractor/ndtv.py
@@ -0,0 +1,66 @@
+import json
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import month_by_name
+
+
+class NDTVIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)'
+
+ _TEST = {
+ u"url": u"http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710",
+ u"file": u"300710.mp4",
+ u"md5": u"39f992dbe5fb531c395d8bbedb1e5e88",
+ u"info_dict": {
+ u"title": u"NDTV exclusive: Don't need character certificate from Rahul Gandhi, says Arvind Kejriwal",
+ u"description": u"In an exclusive interview to NDTV, Aam Aadmi Party's Arvind Kejriwal says it makes no difference to him that Rahul Gandhi said the Congress needs to learn from his party.",
+ u"upload_date": u"20131208",
+ u"duration": 1327,
+ u"thumbnail": u"http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg",
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ filename = self._search_regex(
+ r"__filename='([^']+)'", webpage, u'video filename')
+ video_url = (u'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' %
+ filename)
+
+ duration_str = filename = self._search_regex(
+ r"__duration='([^']+)'", webpage, u'duration', fatal=False)
+ duration = None if duration_str is None else int(duration_str)
+
+ date_m = re.search(r'''(?x)
+ <p\s+class="vod_dateline">\s*
+ Published\s+On:\s*
+ (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+)
+ ''', webpage)
+ upload_date = None
+ assert date_m
+ if date_m is not None:
+ month = month_by_name(date_m.group('monthname'))
+ if month is not None:
+ upload_date = '%s%02d%02d' % (
+ date_m.group('year'), month, int(date_m.group('day')))
+
+ description = self._og_search_description(webpage)
+ READ_MORE = u' (Read more)'
+ if description.endswith(READ_MORE):
+ description = description[:-len(READ_MORE)]
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': self._og_search_title(webpage),
+ 'description': description,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'duration': duration,
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
new file mode 100644
index 000000000..71abd5013
--- /dev/null
+++ b/youtube_dl/extractor/pornhd.py
@@ -0,0 +1,38 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import compat_urllib_parse
+
+
+class PornHdIE(InfoExtractor):
+ _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
+ _TEST = {
+ u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
+ u'file': u'1962.flv',
+ u'md5': u'35272469887dca97abd30abecc6cdf75',
+ u'info_dict': {
+ u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video",
+ u"age_limit": 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('video_id')
+ video_title = mobj.group('video_title')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._html_search_regex(
+ r'&hd=(http.+?)&', webpage, u'video URL')
+ video_url = compat_urllib_parse.unquote(video_url)
+ age_limit = 18
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'flv',
+ 'title': video_title,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 8b3471919..d9135c6b9 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -12,7 +12,7 @@ from ..aes import (
)
class PornHubIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))'
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))'
_TEST = {
u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015',
u'file': u'648719015.mp4',
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
index 2f238de35..511674d8d 100644
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@@ -7,14 +7,15 @@ from ..utils import (
ExtractorError,
)
+
class RTLnowIE(InfoExtractor):
"""Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
- _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de/|rtl2now\.rtl2\.de/|(?:www\.)?voxnow\.de/|(?:www\.)?rtlnitronow\.de/|(?:www\.)?superrtlnow\.de/|(?:www\.)?n-tvnow\.de/)[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
+ _VALID_URL = r'(?:http://)?(?P<url>(?P<base_url>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
_TESTS = [{
u'url': u'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
u'file': u'90419.flv',
u'info_dict': {
- u'upload_date': u'20070416',
+ u'upload_date': u'20070416',
u'title': u'Ahornallee - Folge 1 - Der Einzug',
u'description': u'Folge 1 - Der Einzug',
},
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 5a28bc820..4ea89bf85 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -3,10 +3,13 @@
import re
import json
import hashlib
+import uuid
from .common import InfoExtractor
from ..utils import (
- ExtractorError
+ compat_urllib_parse,
+ compat_urllib_request,
+ ExtractorError,
)
@@ -249,3 +252,105 @@ class SmotriUserIE(InfoExtractor):
u'user nickname')
return self.playlist_result(entries, user_id, user_nickname)
+
+
+class SmotriBroadcastIE(InfoExtractor):
+ IE_DESC = u'Smotri.com broadcasts'
+ IE_NAME = u'smotri:broadcast'
+ _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ broadcast_id = mobj.group('broadcastid')
+
+ broadcast_url = 'http://' + mobj.group('url')
+ broadcast_page = self._download_webpage(broadcast_url, broadcast_id, u'Downloading broadcast page')
+
+ if re.search(u'>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None:
+ raise ExtractorError(u'Broadcast %s does not exist' % broadcast_id, expected=True)
+
+ # Adult content
+ if re.search(u'EroConfirmText">', broadcast_page) is not None:
+
+ (username, password) = self._get_login_info()
+ if username is None:
+ raise ExtractorError(u'Erotic broadcasts allowed only for registered users, '
+ u'use --username and --password options to provide account credentials.', expected=True)
+
+ # Log in
+ login_form_strs = {
+ u'login-hint53': '1',
+ u'confirm_erotic': '1',
+ u'login': username,
+ u'password': password,
+ }
+ # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+ # chokes on unicode
+ login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+ login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
+ login_url = broadcast_url + '/?no_redirect=1'
+ request = compat_urllib_request.Request(login_url, login_data)
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ broadcast_page = self._download_webpage(
+ request, broadcast_id, note=u'Logging in and confirming age')
+
+ if re.search(u'>Неверный логин или пароль<', broadcast_page) is not None:
+ raise ExtractorError(u'Unable to log in: bad username or password', expected=True)
+
+ adult_content = True
+ else:
+ adult_content = False
+
+ ticket = self._html_search_regex(
+ u'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);',
+ broadcast_page, u'broadcast ticket')
+
+ url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
+
+ broadcast_password = self._downloader.params.get('videopassword', None)
+ if broadcast_password:
+ url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
+
+ broadcast_json_page = self._download_webpage(url, broadcast_id, u'Downloading broadcast JSON')
+
+ try:
+ broadcast_json = json.loads(broadcast_json_page)
+
+ protected_broadcast = broadcast_json['_pass_protected'] == 1
+ if protected_broadcast and not broadcast_password:
+ raise ExtractorError(u'This broadcast is protected by a password, use the --video-password option', expected=True)
+
+ broadcast_offline = broadcast_json['is_play'] == 0
+ if broadcast_offline:
+ raise ExtractorError(u'Broadcast %s is offline' % broadcast_id, expected=True)
+
+ rtmp_url = broadcast_json['_server']
+ if not rtmp_url.startswith('rtmp://'):
+ raise ExtractorError(u'Unexpected broadcast rtmp URL')
+
+ broadcast_playpath = broadcast_json['_streamName']
+ broadcast_thumbnail = broadcast_json['_imgURL']
+ broadcast_title = broadcast_json['title']
+ broadcast_description = broadcast_json['description']
+ broadcaster_nick = broadcast_json['nick']
+ broadcaster_login = broadcast_json['login']
+ rtmp_conn = 'S:%s' % uuid.uuid4().hex
+ except KeyError:
+ if protected_broadcast:
+ raise ExtractorError(u'Bad broadcast password', expected=True)
+ raise ExtractorError(u'Unexpected broadcast JSON')
+
+ return {
+ 'id': broadcast_id,
+ 'url': rtmp_url,
+ 'title': broadcast_title,
+ 'thumbnail': broadcast_thumbnail,
+ 'description': broadcast_description,
+ 'uploader': broadcaster_nick,
+ 'uploader_id': broadcaster_login,
+ 'age_limit': 18 if adult_content else 0,
+ 'ext': 'flv',
+ 'play_path': broadcast_playpath,
+ 'rtmp_live': True,
+ 'rtmp_conn': rtmp_conn
+ }
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index cb6dedab7..cbba4094b 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
import json
import re
import itertools
@@ -23,7 +24,10 @@ class SoundcloudIE(InfoExtractor):
"""
_VALID_URL = r'''^(?:https?://)?
- (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$)
+ (?:(?:(?:www\.)?soundcloud\.com/
+ (?P<uploader>[\w\d-]+)/
+ (?!sets/)(?P<title>[\w\d-]+)/?
+ (?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
|(?P<widget>w\.soundcloud\.com/player/?.*?url=.*)
)
@@ -56,6 +60,32 @@ class SoundcloudIE(InfoExtractor):
u'skip_download': True,
},
},
+ # private link
+ {
+ u'url': u'https://soundcloud.com/jaimemf/youtube-dl-test-video-a-y-baw/s-8Pjrp',
+ u'md5': u'aa0dd32bfea9b0c5ef4f02aacd080604',
+ u'info_dict': {
+ u'id': u'123998367',
+ u'ext': u'mp3',
+ u'title': u'Youtube - Dl Test Video \'\' Ä↭',
+ u'uploader': u'jaimeMF',
+ u'description': u'test chars: \"\'/\\ä↭',
+ u'upload_date': u'20131209',
+ },
+ },
+ # downloadable song
+ {
+ u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1',
+ u'md5': u'56a8b69568acaa967b4c49f9d1d52d19',
+ u'info_dict': {
+ u'id': u'105614606',
+ u'ext': u'wav',
+ u'title': u'Just Your Problem Baby (Acapella)',
+ u'description': u'Vocals',
+ u'uploader': u'Sim Gretina',
+ u'upload_date': u'20130815',
+ },
+ },
]
_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
@@ -73,7 +103,7 @@ class SoundcloudIE(InfoExtractor):
def _resolv_url(cls, url):
return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
- def _extract_info_dict(self, info, full_title=None, quiet=False):
+ def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
track_id = compat_str(info['id'])
name = full_title or track_id
if quiet:
@@ -82,7 +112,7 @@ class SoundcloudIE(InfoExtractor):
thumbnail = info['artwork_url']
if thumbnail is not None:
thumbnail = thumbnail.replace('-large', '-t500x500')
- ext = info.get('original_format', u'mp3')
+ ext = u'mp3'
result = {
'id': track_id,
'uploader': info['user']['username'],
@@ -98,14 +128,16 @@ class SoundcloudIE(InfoExtractor):
track_id, self._CLIENT_ID))
result['formats'] = [{
'format_id': 'download',
- 'ext': ext,
+ 'ext': info.get('original_format', u'mp3'),
'url': format_url,
'vcodec': 'none',
}]
else:
# We have to retrieve the url
+ streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
+ 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
stream_json = self._download_webpage(
- 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID),
+ streams_url,
track_id, u'Downloading track url')
formats = []
@@ -157,6 +189,7 @@ class SoundcloudIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
track_id = mobj.group('track_id')
+ token = None
if track_id is not None:
info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
full_title = track_id
@@ -165,19 +198,22 @@ class SoundcloudIE(InfoExtractor):
return self.url_result(query['url'][0], ie='Soundcloud')
else:
# extract uploader (which is in the url)
- uploader = mobj.group(1)
+ uploader = mobj.group('uploader')
# extract simple title (uploader + slug of song title)
- slug_title = mobj.group(2)
- full_title = '%s/%s' % (uploader, slug_title)
+ slug_title = mobj.group('title')
+ token = mobj.group('token')
+ full_title = resolve_title = '%s/%s' % (uploader, slug_title)
+ if token:
+ resolve_title += '/%s' % token
self.report_resolve(full_title)
- url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
+ url = 'http://soundcloud.com/%s' % resolve_title
info_json_url = self._resolv_url(url)
info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON')
info = json.loads(info_json)
- return self._extract_info_dict(info, full_title)
+ return self._extract_info_dict(info, full_title, secret_token=token)
class SoundcloudSetIE(SoundcloudIE):
_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py
index d54e01a12..44c52c718 100644
--- a/youtube_dl/extractor/stanfordoc.py
+++ b/youtube_dl/extractor/stanfordoc.py
@@ -1,14 +1,7 @@
import re
-import socket
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_str,
- compat_urllib_error,
- compat_urllib_request,
-
ExtractorError,
orderedSet,
unescapeHTML,
@@ -45,11 +38,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
self.report_extraction(info['id'])
baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
xmlUrl = baseUrl + video + '.xml'
- try:
- metaXml = compat_urllib_request.urlopen(xmlUrl).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
- mdoc = xml.etree.ElementTree.fromstring(metaXml)
+ mdoc = self._download_xml(xmlUrl, info['id'])
try:
info['title'] = mdoc.findall('./title')[0].text
info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
@@ -95,12 +84,9 @@ class StanfordOpenClassroomIE(InfoExtractor):
'upload_date': None,
}
- self.report_download_webpage(info['id'])
rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
- try:
- rootpage = compat_urllib_request.urlopen(rootURL).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
+ rootpage = self._download_webpage(rootURL, info['id'],
+ errnote=u'Unable to download course info page')
info['title'] = info['id']
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 61452e47d..cec65261b 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -3,6 +3,7 @@ import json
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
xpath_with_ns,
)
@@ -32,6 +33,17 @@ class ThePlatformIE(InfoExtractor):
smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
'format=smil&mbr=true'.format(video_id))
meta = self._download_xml(smil_url, video_id)
+
+ try:
+ error_msg = next(
+ n.attrib['abstract']
+ for n in meta.findall(_x('.//smil:ref'))
+ if n.attrib.get('title') == u'Geographic Restriction')
+ except StopIteration:
+ pass
+ else:
+ raise ExtractorError(error_msg, expected=True)
+
info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id)
info_json = self._download_webpage(info_url, video_id)
info = json.loads(info_json)
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index fb2bd225a..ea4409528 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -115,7 +115,7 @@ class VimeoIE(InfoExtractor):
def _real_initialize(self):
self._login()
- def _real_extract(self, url, new_video=True):
+ def _real_extract(self, url):
url, data = unsmuggle_url(url)
headers = std_headers
if data is not None:
@@ -151,8 +151,14 @@ class VimeoIE(InfoExtractor):
config = json.loads(config_json)
except RegexNotFoundError:
# For pro videos or player.vimeo.com urls
- config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'],
- webpage, u'info section', flags=re.DOTALL)
+ # We try to find out to which variable is assigned the config dic
+ m_variable_name = re.search('(\w)\.video\.id', webpage)
+ if m_variable_name is not None:
+ config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1))
+ else:
+ config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
+ config = self._search_regex(config_re, webpage, u'info section',
+ flags=re.DOTALL)
config = json.loads(config)
except Exception as e:
if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index b9c3b13f9..82a626e0e 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -11,7 +11,8 @@ class WimpIE(InfoExtractor):
u'file': u'deerfence.flv',
u'md5': u'8b215e2e0168c6081a1cf84b2846a2b5',
u'info_dict': {
- u"title": u"Watch Till End: Herd of deer jump over a fence."
+ u"title": u"Watch Till End: Herd of deer jump over a fence.",
+ u"description": u"These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.",
}
}
@@ -19,18 +20,14 @@ class WimpIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
- title = self._search_regex(r'<meta name="description" content="(.+?)" />',webpage, 'video title')
- thumbnail_url = self._search_regex(r'<meta property="og\:image" content="(.+?)" />', webpage,'video thumbnail')
googleString = self._search_regex("googleCode = '(.*?)'", webpage, 'file url')
googleString = base64.b64decode(googleString).decode('ascii')
- final_url = self._search_regex('","(.*?)"', googleString,'final video url')
- ext = final_url.rpartition(u'.')[2]
-
- return [{
- 'id': video_id,
- 'url': final_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': thumbnail_url,
- }]
+ final_url = self._search_regex('","(.*?)"', googleString, u'final video url')
+ return {
+ 'id': video_id,
+ 'url': final_url,
+ 'title': self._og_search_title(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 279f75e7a..ef9997ee4 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -26,7 +26,7 @@ class XHamsterIE(InfoExtractor):
{
u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
u'file': u'2221348.flv',
- u'md5': u'970a94178ca4118c5aa3aaea21211b81',
+ u'md5': u'e767b9475de189320f691f49c679c4c7',
u'info_dict': {
u"upload_date": u"20130914",
u"uploader_id": u"jojo747400",
@@ -46,7 +46,7 @@ class XHamsterIE(InfoExtractor):
return mobj.group('server')+'/key='+mobj.group('file')
def is_hd(webpage):
- return webpage.find('<div class=\'icon iconHD\'>') != -1
+ return webpage.find('<div class=\'icon iconHD\'') != -1
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 01715024c..a68a214ca 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -7,7 +7,6 @@ import itertools
import json
import os.path
import re
-import socket
import string
import struct
import traceback
@@ -17,9 +16,7 @@ from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
compat_chr,
- compat_http_client,
compat_parse_qs,
- compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
@@ -45,19 +42,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- def report_lang(self):
- """Report attempt to set language."""
- self.to_screen(u'Setting language')
-
def _set_language(self):
- request = compat_urllib_request.Request(self._LANG_URL)
- try:
- self.report_lang()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
- return False
- return True
+ return bool(self._download_webpage(
+ self._LANG_URL, None,
+ note=u'Setting language', errnote='unable to set language',
+ fatal=False))
def _login(self):
(username, password) = self._get_login_info()
@@ -67,12 +56,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
return False
- request = compat_urllib_request.Request(self._LOGIN_URL)
- try:
- login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
- return False
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None,
+ note=u'Downloading login page',
+ errnote=u'unable to fetch login page', fatal=False)
+ if login_page is False:
+ return
galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
login_page, u'Login GALX parameter')
@@ -102,29 +91,28 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# chokes on unicode
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
- request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
- try:
- self.report_login()
- login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
- if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username or password')
- return False
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+
+ req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+ login_results = self._download_webpage(
+ req, None,
+ note=u'Logging in', errnote=u'unable to log in', fatal=False)
+ if login_results is False:
+ return False
+ if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
+ self._downloader.report_warning(u'unable to log in: bad username or password')
return False
return True
def _confirm_age(self):
age_form = {
- 'next_url': '/',
- 'action_confirm': 'Confirm',
- }
- request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
- try:
- self.report_age_confirmation()
- compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+ 'next_url': '/',
+ 'action_confirm': 'Confirm',
+ }
+ req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+
+ self._download_webpage(
+ req, None,
+ note=u'Confirming age', errnote=u'Unable to confirm age')
return True
def _real_initialize(self):
@@ -1389,9 +1377,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if 'length_seconds' not in video_info:
self._downloader.report_warning(u'unable to extract video duration')
- video_duration = ''
+ video_duration = None
else:
- video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
+ video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
# annotations
video_annotations = None
@@ -1584,6 +1572,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
if playlist_id.startswith('RD'):
# Mixes require a custom extraction process
return self._extract_mix(playlist_id)
+ if playlist_id.startswith('TL'):
+ raise ExtractorError(u'For downloading YouTube.com top lists, use '
+ u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
# Extract the video ids from the playlist pages
ids = []
@@ -1606,6 +1597,38 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
return self.playlist_result(url_results, playlist_id, playlist_title)
+class YoutubeTopListIE(YoutubePlaylistIE):
+ IE_NAME = u'youtube:toplist'
+ IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
+ u' (Example: "yttoplist:music:Top Tracks")')
+ _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ channel = mobj.group('chann')
+ title = mobj.group('title')
+ query = compat_urllib_parse.urlencode({'title': title})
+ playlist_re = 'href="([^"]+?%s[^"]+?)"' % re.escape(query)
+ channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
+ link = self._html_search_regex(playlist_re, channel_page, u'list')
+ url = compat_urlparse.urljoin('https://www.youtube.com/', link)
+
+ video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
+ ids = []
+ # sometimes the webpage doesn't contain the videos
+ # retry until we get them
+ for i in itertools.count(0):
+ msg = u'Downloading Youtube mix'
+ if i > 0:
+ msg += ', retry #%d' % i
+ webpage = self._download_webpage(url, title, msg)
+ ids = orderedSet(re.findall(video_re, webpage))
+ if ids:
+ break
+ url_results = self._ids_to_results(ids)
+ return self.playlist_result(url_results, playlist_title=title)
+
+
class YoutubeChannelIE(InfoExtractor):
IE_DESC = u'YouTube.com channels'
_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
@@ -1631,10 +1654,11 @@ class YoutubeChannelIE(InfoExtractor):
video_ids = []
url = 'https://www.youtube.com/channel/%s/videos' % channel_id
channel_page = self._download_webpage(url, channel_id)
- if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
- autogenerated = True
- else:
- autogenerated = False
+ autogenerated = re.search(r'''(?x)
+ class="[^"]*?(?:
+ channel-header-autogenerated-label|
+ yt-channel-title-autogenerated
+ )[^"]*"''', channel_page) is not None
if autogenerated:
# The videos are contained in a single page
@@ -1736,10 +1760,6 @@ class YoutubeSearchIE(SearchInfoExtractor):
IE_NAME = u'youtube:search'
_SEARCH_KEY = 'ytsearch'
- def report_download_page(self, query, pagenum):
- """Report attempt to download search page with given number."""
- self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
-
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
@@ -1748,16 +1768,15 @@ class YoutubeSearchIE(SearchInfoExtractor):
limit = n
while (50 * pagenum) < limit:
- self.report_download_page(query, pagenum+1)
result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
- request = compat_urllib_request.Request(result_url)
- try:
- data = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
- api_response = json.loads(data)['data']
-
- if not 'items' in api_response:
+ data_json = self._download_webpage(
+ result_url, video_id=u'query "%s"' % query,
+ note=u'Downloading page %s' % (pagenum + 1),
+ errnote=u'Unable to download API page')
+ data = json.loads(data_json)
+ api_response = data['data']
+
+ if 'items' not in api_response:
raise ExtractorError(u'[youtube] No video results')
new_ids = list(video['id'] for video in api_response['items'])
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 689f19735..35ece354a 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -73,14 +73,14 @@ class ZDFIE(InfoExtractor):
try:
proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
except ValueError:
- proto_pref = 999
+ proto_pref = -999
quality = fnode.find('./quality').text
QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
try:
quality_pref = -QUALITY_ORDER.index(quality)
except ValueError:
- quality_pref = 999
+ quality_pref = -999
abr = int(fnode.find('./audioBitrate').text) // 1000
vbr = int(fnode.find('./videoBitrate').text) // 1000
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 7b5878830..bd46a2da2 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+import ctypes
import datetime
import email.utils
import errno
@@ -15,6 +16,7 @@ import platform
import re
import ssl
import socket
+import subprocess
import sys
import traceback
import zlib
@@ -547,7 +549,7 @@ def make_HTTPS_handler(opts_no_check_certificate):
def connect(self):
sock = socket.create_connection((self.host, self.port), self.timeout)
- if self._tunnel_host:
+ if getattr(self, '_tunnel_host', False):
self.sock = sock
self._tunnel()
try:
@@ -561,11 +563,14 @@ def make_HTTPS_handler(opts_no_check_certificate):
return HTTPSHandlerV3()
else:
context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
- context.set_default_verify_paths()
-
context.verify_mode = (ssl.CERT_NONE
if opts_no_check_certificate
else ssl.CERT_REQUIRED)
+ context.set_default_verify_paths()
+ try:
+ context.load_default_certs()
+ except AttributeError:
+ pass # Python < 3.4
return compat_urllib_request.HTTPSHandler(context=context)
class ExtractorError(Exception):
@@ -1021,6 +1026,54 @@ def format_bytes(bytes):
converted = float(bytes) / float(1024 ** exponent)
return u'%.2f%s' % (converted, suffix)
+
def str_to_int(int_str):
int_str = re.sub(r'[,\.]', u'', int_str)
return int(int_str)
+
+
+def get_term_width():
+ columns = os.environ.get('COLUMNS', None)
+ if columns:
+ return int(columns)
+
+ try:
+ sp = subprocess.Popen(
+ ['stty', 'size'],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = sp.communicate()
+ return int(out.split()[1])
+ except:
+ pass
+ return None
+
+
+def month_by_name(name):
+ """ Return the number of a month by (locale-independently) English name """
+
+ ENGLISH_NAMES = [
+ u'January', u'February', u'March', u'April', u'May', u'June',
+ u'July', u'August', u'September', u'October', u'November', u'December']
+ try:
+ return ENGLISH_NAMES.index(name) + 1
+ except ValueError:
+ return None
+
+
+def fix_xml_all_ampersand(xml_str):
+ """Replace all the '&' by '&amp;' in XML"""
+ return xml_str.replace(u'&', u'&amp;')
+
+
+def setproctitle(title):
+ try:
+ libc = ctypes.cdll.LoadLibrary("libc.so.6")
+ except OSError:
+ return
+ title = title
+ buf = ctypes.create_string_buffer(len(title) + 1)
+ buf.value = title
+ try:
+ libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
+ except AttributeError:
+ return # Strange libc, just skip this
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 68b30bfd4..5bc7fd774 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2013.12.04'
+__version__ = '2013.12.16'