path: root/youtube_dl
diff options
Diffstat (limited to 'youtube_dl')
52 files changed, 2307 insertions, 1730 deletions
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py
index 47124932f..5c8e676a2 100644
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -1,724 +1,12 @@
-import os
-import re
-import subprocess
-import sys
-import time
-from .utils import (
- compat_urllib_error,
- compat_urllib_request,
- ContentTooShortError,
- determine_ext,
- encodeFilename,
- format_bytes,
- sanitize_open,
- timeconvert,
-class FileDownloader(object):
- """File Downloader class.
- File downloader objects are the ones responsible of downloading the
- actual video file and writing it to disk.
- File downloaders accept a lot of parameters. In order not to saturate
- the object constructor with arguments, it receives a dictionary of
- options instead.
- Available options:
- verbose: Print additional info to stdout.
- quiet: Do not print messages to stdout.
- ratelimit: Download speed limit, in bytes/sec.
- retries: Number of times to retry for HTTP error 5xx
- buffersize: Size of download buffer in bytes.
- noresizebuffer: Do not automatically resize the download buffer.
- continuedl: Try to continue downloads if possible.
- noprogress: Do not print the progress bar.
- logtostderr: Log messages to stderr instead of stdout.
- consoletitle: Display progress in console window's titlebar.
- nopart: Do not use temporary .part files.
- updatetime: Use the Last-modified header to set output file timestamps.
- test: Download only first bytes to test the downloader.
- min_filesize: Skip files smaller than this size
- max_filesize: Skip files larger than this size
- """
- params = None
- def __init__(self, ydl, params):
- """Create a FileDownloader object with the given options."""
- self.ydl = ydl
- self._progress_hooks = []
- self.params = params
- @staticmethod
- def format_seconds(seconds):
- (mins, secs) = divmod(seconds, 60)
- (hours, mins) = divmod(mins, 60)
- if hours > 99:
- return '--:--:--'
- if hours == 0:
- return '%02d:%02d' % (mins, secs)
- else:
- return '%02d:%02d:%02d' % (hours, mins, secs)
- @staticmethod
- def calc_percent(byte_counter, data_len):
- if data_len is None:
- return None
- return float(byte_counter) / float(data_len) * 100.0
- @staticmethod
- def format_percent(percent):
- if percent is None:
- return '---.-%'
- return '%6s' % ('%3.1f%%' % percent)
- @staticmethod
- def calc_eta(start, now, total, current):
- if total is None:
- return None
- dif = now - start
- if current == 0 or dif < 0.001: # One millisecond
- return None
- rate = float(current) / dif
- return int((float(total) - float(current)) / rate)
- @staticmethod
- def format_eta(eta):
- if eta is None:
- return '--:--'
- return FileDownloader.format_seconds(eta)
- @staticmethod
- def calc_speed(start, now, bytes):
- dif = now - start
- if bytes == 0 or dif < 0.001: # One millisecond
- return None
- return float(bytes) / dif
- @staticmethod
- def format_speed(speed):
- if speed is None:
- return '%10s' % '---b/s'
- return '%10s' % ('%s/s' % format_bytes(speed))
- @staticmethod
- def best_block_size(elapsed_time, bytes):
- new_min = max(bytes / 2.0, 1.0)
- new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
- if elapsed_time < 0.001:
- return int(new_max)
- rate = bytes / elapsed_time
- if rate > new_max:
- return int(new_max)
- if rate < new_min:
- return int(new_min)
- return int(rate)
- @staticmethod
- def parse_bytes(bytestr):
- """Parse a string indicating a byte quantity into an integer."""
- matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
- if matchobj is None:
- return None
- number = float(matchobj.group(1))
- multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
- return int(round(number * multiplier))
- def to_screen(self, *args, **kargs):
- self.ydl.to_screen(*args, **kargs)
- def to_stderr(self, message):
- self.ydl.to_screen(message)
- def to_console_title(self, message):
- self.ydl.to_console_title(message)
- def trouble(self, *args, **kargs):
- self.ydl.trouble(*args, **kargs)
- def report_warning(self, *args, **kargs):
- self.ydl.report_warning(*args, **kargs)
- def report_error(self, *args, **kargs):
- self.ydl.report_error(*args, **kargs)
- def slow_down(self, start_time, byte_counter):
- """Sleep if the download speed is over the rate limit."""
- rate_limit = self.params.get('ratelimit', None)
- if rate_limit is None or byte_counter == 0:
- return
- now = time.time()
- elapsed = now - start_time
- if elapsed <= 0.0:
- return
- speed = float(byte_counter) / elapsed
- if speed > rate_limit:
- time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
- def temp_name(self, filename):
- """Returns a temporary filename for the given filename."""
- if self.params.get('nopart', False) or filename == u'-' or \
- (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))):
- return filename
- return filename + u'.part'
- def undo_temp_name(self, filename):
- if filename.endswith(u'.part'):
- return filename[:-len(u'.part')]
- return filename
- def try_rename(self, old_filename, new_filename):
- try:
- if old_filename == new_filename:
- return
- os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
- except (IOError, OSError):
- self.report_error(u'unable to rename file')
- def try_utime(self, filename, last_modified_hdr):
- """Try to set the last-modified time of the given file."""
- if last_modified_hdr is None:
- return
- if not os.path.isfile(encodeFilename(filename)):
- return
- timestr = last_modified_hdr
- if timestr is None:
- return
- filetime = timeconvert(timestr)
- if filetime is None:
- return filetime
- # Ignore obviously invalid dates
- if filetime == 0:
- return
- try:
- os.utime(filename, (time.time(), filetime))
- except:
- pass
- return filetime
- def report_destination(self, filename):
- """Report destination filename."""
- self.to_screen(u'[download] Destination: ' + filename)
- def _report_progress_status(self, msg, is_last_line=False):
- fullmsg = u'[download] ' + msg
- if self.params.get('progress_with_newline', False):
- self.to_screen(fullmsg)
- else:
- if os.name == 'nt':
- prev_len = getattr(self, '_report_progress_prev_line_length',
- 0)
- if prev_len > len(fullmsg):
- fullmsg += u' ' * (prev_len - len(fullmsg))
- self._report_progress_prev_line_length = len(fullmsg)
- clear_line = u'\r'
- else:
- clear_line = (u'\r\x1b[K' if sys.stderr.isatty() else u'\r')
- self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line)
- self.to_console_title(u'youtube-dl ' + msg)
- def report_progress(self, percent, data_len_str, speed, eta):
- """Report download progress."""
- if self.params.get('noprogress', False):
- return
- if eta is not None:
- eta_str = self.format_eta(eta)
- else:
- eta_str = 'Unknown ETA'
- if percent is not None:
- percent_str = self.format_percent(percent)
- else:
- percent_str = 'Unknown %'
- speed_str = self.format_speed(speed)
- msg = (u'%s of %s at %s ETA %s' %
- (percent_str, data_len_str, speed_str, eta_str))
- self._report_progress_status(msg)
- def report_progress_live_stream(self, downloaded_data_len, speed, elapsed):
- if self.params.get('noprogress', False):
- return
- downloaded_str = format_bytes(downloaded_data_len)
- speed_str = self.format_speed(speed)
- elapsed_str = FileDownloader.format_seconds(elapsed)
- msg = u'%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str)
- self._report_progress_status(msg)
- def report_finish(self, data_len_str, tot_time):
- """Report download finished."""
- if self.params.get('noprogress', False):
- self.to_screen(u'[download] Download completed')
- else:
- self._report_progress_status(
- (u'100%% of %s in %s' %
- (data_len_str, self.format_seconds(tot_time))),
- is_last_line=True)
- def report_resuming_byte(self, resume_len):
- """Report attempt to resume at given byte."""
- self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
- def report_retry(self, count, retries):
- """Report retry in case of HTTP error 5xx"""
- self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
- def report_file_already_downloaded(self, file_name):
- """Report file has already been fully downloaded."""
- try:
- self.to_screen(u'[download] %s has already been downloaded' % file_name)
- except UnicodeEncodeError:
- self.to_screen(u'[download] The file has already been downloaded')
- def report_unable_to_resume(self):
- """Report it was impossible to resume download."""
- self.to_screen(u'[download] Unable to resume')
- def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live, conn):
- def run_rtmpdump(args):
- start = time.time()
- resume_percent = None
- resume_downloaded_data_len = None
- proc = subprocess.Popen(args, stderr=subprocess.PIPE)
- cursor_in_new_line = True
- proc_stderr_closed = False
- while not proc_stderr_closed:
- # read line from stderr
- line = u''
- while True:
- char = proc.stderr.read(1)
- if not char:
- proc_stderr_closed = True
- break
- if char in [b'\r', b'\n']:
- break
- line += char.decode('ascii', 'replace')
- if not line:
- # proc_stderr_closed is True
- continue
- mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
- if mobj:
- downloaded_data_len = int(float(mobj.group(1))*1024)
- percent = float(mobj.group(2))
- if not resume_percent:
- resume_percent = percent
- resume_downloaded_data_len = downloaded_data_len
- eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent)
- speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len)
- data_len = None
- if percent > 0:
- data_len = int(downloaded_data_len * 100 / percent)
- data_len_str = u'~' + format_bytes(data_len)
- self.report_progress(percent, data_len_str, speed, eta)
- cursor_in_new_line = False
- self._hook_progress({
- 'downloaded_bytes': downloaded_data_len,
- 'total_bytes': data_len,
- 'tmpfilename': tmpfilename,
- 'filename': filename,
- 'status': 'downloading',
- 'eta': eta,
- 'speed': speed,
- })
- else:
- # no percent for live streams
- mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line)
- if mobj:
- downloaded_data_len = int(float(mobj.group(1))*1024)
- time_now = time.time()
- speed = self.calc_speed(start, time_now, downloaded_data_len)
- self.report_progress_live_stream(downloaded_data_len, speed, time_now - start)
- cursor_in_new_line = False
- self._hook_progress({
- 'downloaded_bytes': downloaded_data_len,
- 'tmpfilename': tmpfilename,
- 'filename': filename,
- 'status': 'downloading',
- 'speed': speed,
- })
- elif self.params.get('verbose', False):
- if not cursor_in_new_line:
- self.to_screen(u'')
- cursor_in_new_line = True
- self.to_screen(u'[rtmpdump] '+line)
- proc.wait()
- if not cursor_in_new_line:
- self.to_screen(u'')
- return proc.returncode
- self.report_destination(filename)
- tmpfilename = self.temp_name(filename)
- test = self.params.get('test', False)
- # Check for rtmpdump first
- try:
- subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
- except (OSError, IOError):
- self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
- return False
- # Download using rtmpdump. rtmpdump returns exit code 2 when
- # the connection was interrumpted and resuming appears to be
- # possible. This is part of rtmpdump's normal usage, AFAIK.
- basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename]
- if player_url is not None:
- basic_args += ['--swfVfy', player_url]
- if page_url is not None:
- basic_args += ['--pageUrl', page_url]
- if play_path is not None:
- basic_args += ['--playpath', play_path]
- if tc_url is not None:
- basic_args += ['--tcUrl', url]
- if test:
- basic_args += ['--stop', '1']
- if live:
- basic_args += ['--live']
- if conn:
- basic_args += ['--conn', conn]
- args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
- if sys.platform == 'win32' and sys.version_info < (3, 0):
- # Windows subprocess module does not actually support Unicode
- # on Python 2.x
- # See http://stackoverflow.com/a/9951851/35070
- subprocess_encoding = sys.getfilesystemencoding()
- args = [a.encode(subprocess_encoding, 'ignore') for a in args]
- else:
- subprocess_encoding = None
- if self.params.get('verbose', False):
- if subprocess_encoding:
- str_args = [
- a.decode(subprocess_encoding) if isinstance(a, bytes) else a
- for a in args]
- else:
- str_args = args
- try:
- import pipes
- shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
- except ImportError:
- shell_quote = repr
- self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
- retval = run_rtmpdump(args)
- while (retval == 2 or retval == 1) and not test:
- prevsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
- time.sleep(5.0) # This seems to be needed
- retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
- cursize = os.path.getsize(encodeFilename(tmpfilename))
- if prevsize == cursize and retval == 1:
- break
- # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
- if prevsize == cursize and retval == 2 and cursize > 1024:
- self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
- retval = 0
- break
- if retval == 0 or (test and retval == 2):
- fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'[rtmpdump] %s bytes' % fsize)
- self.try_rename(tmpfilename, filename)
- self._hook_progress({
- 'downloaded_bytes': fsize,
- 'total_bytes': fsize,
- 'filename': filename,
- 'status': 'finished',
- })
- return True
- else:
- self.to_stderr(u"\n")
- self.report_error(u'rtmpdump exited with code %d' % retval)
- return False
- def _download_with_mplayer(self, filename, url):
- self.report_destination(filename)
- tmpfilename = self.temp_name(filename)
- args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', tmpfilename, url]
- # Check for mplayer first
- try:
- subprocess.call(['mplayer', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
- except (OSError, IOError):
- self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0] )
- return False
- # Download using mplayer.
- retval = subprocess.call(args)
- if retval == 0:
- fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize))
- self.try_rename(tmpfilename, filename)
- self._hook_progress({
- 'downloaded_bytes': fsize,
- 'total_bytes': fsize,
- 'filename': filename,
- 'status': 'finished',
- })
- return True
- else:
- self.to_stderr(u"\n")
- self.report_error(u'mplayer exited with code %d' % retval)
- return False
- def _download_m3u8_with_ffmpeg(self, filename, url):
- self.report_destination(filename)
- tmpfilename = self.temp_name(filename)
- args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy',
- '-bsf:a', 'aac_adtstoasc', tmpfilename]
- for program in ['avconv', 'ffmpeg']:
- try:
- subprocess.call([program, '-version'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
- break
- except (OSError, IOError):
- pass
- else:
- self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found')
- cmd = [program] + args
- retval = subprocess.call(cmd)
- if retval == 0:
- fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize))
- self.try_rename(tmpfilename, filename)
- self._hook_progress({
- 'downloaded_bytes': fsize,
- 'total_bytes': fsize,
- 'filename': filename,
- 'status': 'finished',
- })
- return True
- else:
- self.to_stderr(u"\n")
- self.report_error(u'ffmpeg exited with code %d' % retval)
- return False
+# Legacy file for backwards compatibility, use youtube_dl.downloader instead!
+from .downloader import FileDownloader as RealFileDownloader
+from .downloader import get_suitable_downloader
+# This class reproduces the old behaviour of FileDownloader
+class FileDownloader(RealFileDownloader):
def _do_download(self, filename, info_dict):
- url = info_dict['url']
- # Check file already present
- if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False):
- self.report_file_already_downloaded(filename)
- self._hook_progress({
- 'filename': filename,
- 'status': 'finished',
- 'total_bytes': os.path.getsize(encodeFilename(filename)),
- })
- return True
- # Attempt to download using rtmpdump
- if url.startswith('rtmp'):
- return self._download_with_rtmpdump(filename, url,
- info_dict.get('player_url', None),
- info_dict.get('page_url', None),
- info_dict.get('play_path', None),
- info_dict.get('tc_url', None),
- info_dict.get('rtmp_live', False),
- info_dict.get('rtmp_conn', None))
- # Attempt to download using mplayer
- if url.startswith('mms') or url.startswith('rtsp'):
- return self._download_with_mplayer(filename, url)
- # m3u8 manifest are downloaded with ffmpeg
- if determine_ext(url) == u'm3u8':
- return self._download_m3u8_with_ffmpeg(filename, url)
- tmpfilename = self.temp_name(filename)
- stream = None
- # Do not include the Accept-Encoding header
- headers = {'Youtubedl-no-compression': 'True'}
- if 'user_agent' in info_dict:
- headers['Youtubedl-user-agent'] = info_dict['user_agent']
- basic_request = compat_urllib_request.Request(url, None, headers)
- request = compat_urllib_request.Request(url, None, headers)
- if self.params.get('test', False):
- request.add_header('Range','bytes=0-10240')
- # Establish possible resume length
- if os.path.isfile(encodeFilename(tmpfilename)):
- resume_len = os.path.getsize(encodeFilename(tmpfilename))
- else:
- resume_len = 0
- open_mode = 'wb'
- if resume_len != 0:
- if self.params.get('continuedl', False):
- self.report_resuming_byte(resume_len)
- request.add_header('Range','bytes=%d-' % resume_len)
- open_mode = 'ab'
- else:
- resume_len = 0
- count = 0
- retries = self.params.get('retries', 0)
- while count <= retries:
- # Establish connection
- try:
- if count == 0 and 'urlhandle' in info_dict:
- data = info_dict['urlhandle']
- data = compat_urllib_request.urlopen(request)
- break
- except (compat_urllib_error.HTTPError, ) as err:
- if (err.code < 500 or err.code >= 600) and err.code != 416:
- # Unexpected HTTP error
- raise
- elif err.code == 416:
- # Unable to resume (requested range not satisfiable)
- try:
- # Open the connection again without the range header
- data = compat_urllib_request.urlopen(basic_request)
- content_length = data.info()['Content-Length']
- except (compat_urllib_error.HTTPError, ) as err:
- if err.code < 500 or err.code >= 600:
- raise
- else:
- # Examine the reported length
- if (content_length is not None and
- (resume_len - 100 < int(content_length) < resume_len + 100)):
- # The file had already been fully downloaded.
- # Explanation to the above condition: in issue #175 it was revealed that
- # YouTube sometimes adds or removes a few bytes from the end of the file,
- # changing the file size slightly and causing problems for some users. So
- # I decided to implement a suggested change and consider the file
- # completely downloaded if the file size differs less than 100 bytes from
- # the one in the hard drive.
- self.report_file_already_downloaded(filename)
- self.try_rename(tmpfilename, filename)
- self._hook_progress({
- 'filename': filename,
- 'status': 'finished',
- })
- return True
- else:
- # The length does not match, we start the download over
- self.report_unable_to_resume()
- open_mode = 'wb'
- break
- # Retry
- count += 1
- if count <= retries:
- self.report_retry(count, retries)
- if count > retries:
- self.report_error(u'giving up after %s retries' % retries)
- return False
- data_len = data.info().get('Content-length', None)
- if data_len is not None:
- data_len = int(data_len) + resume_len
- min_data_len = self.params.get("min_filesize", None)
- max_data_len = self.params.get("max_filesize", None)
- if min_data_len is not None and data_len < min_data_len:
- self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
- return False
- if max_data_len is not None and data_len > max_data_len:
- self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
- return False
- data_len_str = format_bytes(data_len)
- byte_counter = 0 + resume_len
- block_size = self.params.get('buffersize', 1024)
- start = time.time()
- while True:
- # Download and write
- before = time.time()
- data_block = data.read(block_size)
- after = time.time()
- if len(data_block) == 0:
- break
- byte_counter += len(data_block)
- # Open file just in time
- if stream is None:
- try:
- (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
- assert stream is not None
- filename = self.undo_temp_name(tmpfilename)
- self.report_destination(filename)
- except (OSError, IOError) as err:
- self.report_error(u'unable to open for writing: %s' % str(err))
- return False
- try:
- stream.write(data_block)
- except (IOError, OSError) as err:
- self.to_stderr(u"\n")
- self.report_error(u'unable to write data: %s' % str(err))
- return False
- if not self.params.get('noresizebuffer', False):
- block_size = self.best_block_size(after - before, len(data_block))
- # Progress message
- speed = self.calc_speed(start, time.time(), byte_counter - resume_len)
- if data_len is None:
- eta = percent = None
- else:
- percent = self.calc_percent(byte_counter, data_len)
- eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
- self.report_progress(percent, data_len_str, speed, eta)
- self._hook_progress({
- 'downloaded_bytes': byte_counter,
- 'total_bytes': data_len,
- 'tmpfilename': tmpfilename,
- 'filename': filename,
- 'status': 'downloading',
- 'eta': eta,
- 'speed': speed,
- })
- # Apply rate limit
- self.slow_down(start, byte_counter - resume_len)
- if stream is None:
- self.to_stderr(u"\n")
- self.report_error(u'Did not get any data blocks')
- return False
- stream.close()
- self.report_finish(data_len_str, (time.time() - start))
- if data_len is not None and byte_counter != data_len:
- raise ContentTooShortError(byte_counter, int(data_len))
- self.try_rename(tmpfilename, filename)
- # Update file modification time
- if self.params.get('updatetime', True):
- info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
- self._hook_progress({
- 'downloaded_bytes': byte_counter,
- 'total_bytes': byte_counter,
- 'filename': filename,
- 'status': 'finished',
- })
- return True
- def _hook_progress(self, status):
+ real_fd = get_suitable_downloader(info_dict)(self.ydl, self.params)
for ph in self._progress_hooks:
- ph(status)
- def add_progress_hook(self, ph):
- """ ph gets called on download progress, with a dictionary with the entries
- * filename: The final filename
- * status: One of "downloading" and "finished"
- It can also have some of the following entries:
- * downloaded_bytes: Bytes on disks
- * total_bytes: Total bytes, None if unknown
- * tmpfilename: The filename we're currently writing to
- * eta: The estimated time in seconds, None if unknown
- * speed: The download speed in bytes/second, None if unknown
- Hooks are guaranteed to be called at least once (with status "finished")
- if the download is successful.
- """
- self._progress_hooks.append(ph)
+ real_fd.add_progress_hook(ph)
+ return real_fd.download(filename, info_dict)
diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py
index da95f1a87..481c07a94 100644
--- a/youtube_dl/PostProcessor.py
+++ b/youtube_dl/PostProcessor.py
@@ -10,6 +10,7 @@ from .utils import (
+ prepend_extension,
@@ -85,10 +86,10 @@ class FFmpegPostProcessor(PostProcessor):
files_cmd = []
for path in input_paths:
- files_cmd.extend(['-i', encodeFilename(path)])
+ files_cmd.extend(['-i', encodeFilename(path, True)])
cmd = ([self._exes['avconv'] or self._exes['ffmpeg'], '-y'] + files_cmd
+ opts +
- [encodeFilename(self._ffmpeg_filename_argument(out_path))])
+ [encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
if self._downloader.params.get('verbose', False):
self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
@@ -122,7 +123,10 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
if not self._exes['ffprobe'] and not self._exes['avprobe']:
raise PostProcessingError(u'ffprobe or avprobe not found. Please install one.')
- cmd = [self._exes['avprobe'] or self._exes['ffprobe'], '-show_streams', encodeFilename(self._ffmpeg_filename_argument(path))]
+ cmd = [
+ self._exes['avprobe'] or self._exes['ffprobe'],
+ '-show_streams',
+ encodeFilename(self._ffmpeg_filename_argument(path), True)]
handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE)
output = handle.communicate()[0]
if handle.wait() != 0:
@@ -499,13 +503,11 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
return True, info
filename = info['filepath']
- ext = os.path.splitext(filename)[1][1:]
- temp_filename = filename + u'.temp'
+ temp_filename = prepend_extension(filename, 'temp')
options = ['-c', 'copy']
for (name, value) in metadata.items():
options.extend(['-metadata', '%s=%s' % (name, value)])
- options.extend(['-f', ext])
self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename)
self.run_ffmpeg(filename, temp_filename, options)
@@ -514,6 +516,13 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
return True, info
+class FFmpegMergerPP(FFmpegPostProcessor):
+ def run(self, info):
+ filename = info['filepath']
+ args = ['-c', 'copy']
+ self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args)
+ return True, info
class XAttrMetadataPP(PostProcessor):
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 2a078adfb..5748ceaf3 100644
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
import collections
import errno
@@ -51,9 +51,11 @@ from .utils import (
+ prepend_extension,
from .extractor import get_info_extractor, gen_extractors
-from .FileDownloader import FileDownloader
+from .downloader import get_suitable_downloader
+from .PostProcessor import FFmpegMergerPP
from .version import __version__
@@ -148,6 +150,7 @@ class YoutubeDL(object):
socket_timeout: Time to wait for unresponsive hosts, in seconds
bidi_workaround: Work around buggy terminals without bidirectional text
support, using fridibi
+ debug_printtraffic:Print out sent and received HTTP traffic
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
@@ -164,6 +167,8 @@ class YoutubeDL(object):
def __init__(self, params=None):
"""Create a FileDownloader object with the given options."""
+ if params is None:
+ params = {}
self._ies = []
self._ies_instances = {}
self._pps = []
@@ -172,7 +177,7 @@ class YoutubeDL(object):
self._num_downloads = 0
self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
self._err_file = sys.stderr
- self.params = {} if params is None else params
+ self.params = params
if params.get('bidi_workaround', False):
@@ -183,15 +188,21 @@ class YoutubeDL(object):
width_args = []
width_args = ['-w', str(width)]
- self._fribidi = subprocess.Popen(
- ['fribidi', '-c', 'UTF-8'] + width_args,
+ sp_kwargs = dict(
- self._fribidi_channel = os.fdopen(master, 'rb')
+ try:
+ self._output_process = subprocess.Popen(
+ ['bidiv'] + width_args, **sp_kwargs
+ )
+ except OSError:
+ self._output_process = subprocess.Popen(
+ ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
+ self._output_channel = os.fdopen(master, 'rb')
except OSError as ose:
if ose.errno == 2:
- self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
+ self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
@@ -200,15 +211,13 @@ class YoutubeDL(object):
and not params['restrictfilenames']):
# On Python 3, the Unicode filesystem API will throw errors (#1474)
- u'Assuming --restrict-filenames since file system encoding '
- u'cannot encode all charactes. '
- u'Set the LC_ALL environment variable to fix this.')
+ 'Assuming --restrict-filenames since file system encoding '
+ 'cannot encode all charactes. '
+ 'Set the LC_ALL environment variable to fix this.')
self.params['restrictfilenames'] = True
- self.fd = FileDownloader(self, self.params)
if '%(stitle)s' in self.params.get('outtmpl', ''):
- self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
+ self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
@@ -242,17 +251,22 @@ class YoutubeDL(object):
+ def add_progress_hook(self, ph):
+ """Add the progress hook (currently only for the file downloader)"""
+ self._progress_hooks.append(ph)
def _bidi_workaround(self, message):
- if not hasattr(self, '_fribidi_channel'):
+ if not hasattr(self, '_output_channel'):
return message
- assert type(message) == type(u'')
- line_count = message.count(u'\n') + 1
- self._fribidi.stdin.write((message + u'\n').encode('utf-8'))
- self._fribidi.stdin.flush()
- res = u''.join(self._fribidi_channel.readline().decode('utf-8')
+ assert hasattr(self, '_output_process')
+ assert type(message) == type('')
+ line_count = message.count('\n') + 1
+ self._output_process.stdin.write((message + '\n').encode('utf-8'))
+ self._output_process.stdin.flush()
+ res = ''.join(self._output_channel.readline().decode('utf-8')
for _ in range(line_count))
- return res[:-len(u'\n')]
+ return res[:-len('\n')]
def to_screen(self, message, skip_eol=False):
"""Print message to stdout if not in quiet mode."""
@@ -264,19 +278,19 @@ class YoutubeDL(object):
elif not check_quiet or not self.params.get('quiet', False):
message = self._bidi_workaround(message)
- terminator = [u'\n', u''][skip_eol]
+ terminator = ['\n', ''][skip_eol]
output = message + terminator
write_string(output, self._screen_file)
def to_stderr(self, message):
"""Print message to stderr."""
- assert type(message) == type(u'')
+ assert type(message) == type('')
if self.params.get('logger'):
message = self._bidi_workaround(message)
- output = message + u'\n'
+ output = message + '\n'
write_string(output, self._err_file)
def to_console_title(self, message):
@@ -287,21 +301,21 @@ class YoutubeDL(object):
# already of type unicode()
elif 'TERM' in os.environ:
- write_string(u'\033]0;%s\007' % message, self._screen_file)
+ write_string('\033]0;%s\007' % message, self._screen_file)
def save_console_title(self):
if not self.params.get('consoletitle', False):
if 'TERM' in os.environ:
# Save the title on stack
- write_string(u'\033[22;0t', self._screen_file)
+ write_string('\033[22;0t', self._screen_file)
def restore_console_title(self):
if not self.params.get('consoletitle', False):
if 'TERM' in os.environ:
# Restore the title from stack
- write_string(u'\033[23;0t', self._screen_file)
+ write_string('\033[23;0t', self._screen_file)
def __enter__(self):
@@ -327,13 +341,13 @@ class YoutubeDL(object):
if self.params.get('verbose'):
if tb is None:
if sys.exc_info()[0]: # if .trouble has been called from an except block
- tb = u''
+ tb = ''
if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
- tb += u''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
+ tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
tb += compat_str(traceback.format_exc())
tb_data = traceback.format_list(traceback.extract_stack())
- tb = u''.join(tb_data)
+ tb = ''.join(tb_data)
if not self.params.get('ignoreerrors', False):
if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
@@ -349,10 +363,10 @@ class YoutubeDL(object):
If stderr is a tty file the 'WARNING:' will be colored
if self._err_file.isatty() and os.name != 'nt':
- _msg_header = u'\033[0;33mWARNING:\033[0m'
+ _msg_header = '\033[0;33mWARNING:\033[0m'
- _msg_header = u'WARNING:'
- warning_message = u'%s %s' % (_msg_header, message)
+ _msg_header = 'WARNING:'
+ warning_message = '%s %s' % (_msg_header, message)
def report_error(self, message, tb=None):
@@ -361,18 +375,18 @@ class YoutubeDL(object):
in red if stderr is a tty file.
if self._err_file.isatty() and os.name != 'nt':
- _msg_header = u'\033[0;31mERROR:\033[0m'
+ _msg_header = '\033[0;31mERROR:\033[0m'
- _msg_header = u'ERROR:'
- error_message = u'%s %s' % (_msg_header, message)
+ _msg_header = 'ERROR:'
+ error_message = '%s %s' % (_msg_header, message)
self.trouble(error_message, tb)
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
- self.to_screen(u'[download] %s has already been downloaded' % file_name)
+ self.to_screen('[download] %s has already been downloaded' % file_name)
except UnicodeEncodeError:
- self.to_screen(u'[download] The file has already been downloaded')
+ self.to_screen('[download] The file has already been downloaded')
def increment_downloads(self):
"""Increment the ordinal that assigns a number to each file."""
@@ -387,61 +401,61 @@ class YoutubeDL(object):
autonumber_size = self.params.get('autonumber_size')
if autonumber_size is None:
autonumber_size = 5
- autonumber_templ = u'%0' + str(autonumber_size) + u'd'
+ autonumber_templ = '%0' + str(autonumber_size) + 'd'
template_dict['autonumber'] = autonumber_templ % self._num_downloads
if template_dict.get('playlist_index') is not None:
- template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
+ template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
sanitize = lambda k, v: sanitize_filename(
- is_id=(k == u'id'))
+ is_id=(k == 'id'))
template_dict = dict((k, sanitize(k, v))
for k, v in template_dict.items()
if v is not None)
- template_dict = collections.defaultdict(lambda: u'NA', template_dict)
+ template_dict = collections.defaultdict(lambda: 'NA', template_dict)
tmpl = os.path.expanduser(self.params['outtmpl'])
filename = tmpl % template_dict
return filename
except ValueError as err:
- self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
+ self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
return None
def _match_entry(self, info_dict):
""" Returns None iff the file should be downloaded """
- video_title = info_dict.get('title', info_dict.get('id', u'video'))
+ video_title = info_dict.get('title', info_dict.get('id', 'video'))
if 'title' in info_dict:
# This can happen when we're just evaluating the playlist
title = info_dict['title']
matchtitle = self.params.get('matchtitle', False)
if matchtitle:
if not re.search(matchtitle, title, re.IGNORECASE):
- return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
+ return '"' + title + '" title did not match pattern "' + matchtitle + '"'
rejecttitle = self.params.get('rejecttitle', False)
if rejecttitle:
if re.search(rejecttitle, title, re.IGNORECASE):
- return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+ return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
date = info_dict.get('upload_date', None)
if date is not None:
dateRange = self.params.get('daterange', DateRange())
if date not in dateRange:
- return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+ return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
view_count = info_dict.get('view_count', None)
if view_count is not None:
min_views = self.params.get('min_views')
if min_views is not None and view_count < min_views:
- return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
+ return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
max_views = self.params.get('max_views')
if max_views is not None and view_count > max_views:
- return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
+ return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
age_limit = self.params.get('age_limit')
if age_limit is not None:
if age_limit < info_dict.get('age_limit', 0):
- return u'Skipping "' + title + '" because it is age restricted'
+ return 'Skipping "' + title + '" because it is age restricted'
if self.in_download_archive(info_dict):
- return u'%s has already been recorded in archive' % video_title
+ return '%s has already been recorded in archive' % video_title
return None
@@ -468,8 +482,8 @@ class YoutubeDL(object):
if not ie.working():
- self.report_warning(u'The program functionality for this site has been marked as broken, '
- u'and will probably not work.')
+ self.report_warning('The program functionality for this site has been marked as broken, '
+ 'and will probably not work.')
ie_result = ie.extract(url)
@@ -502,7 +516,7 @@ class YoutubeDL(object):
- self.report_error(u'no suitable InfoExtractor: %s' % url)
+ self.report_error('no suitable InfoExtractor: %s' % url)
def process_ie_result(self, ie_result, download=True, extra_info={}):
@@ -533,7 +547,7 @@ class YoutubeDL(object):
def make_result(embedded_info):
new_result = ie_result.copy()
for f in ('_type', 'url', 'ext', 'player_url', 'formats',
- 'entries', 'urlhandle', 'ie_key', 'duration',
+ 'entries', 'ie_key', 'duration',
'subtitles', 'annotations', 'format',
'thumbnail', 'thumbnails'):
if f in new_result:
@@ -553,7 +567,7 @@ class YoutubeDL(object):
elif result_type == 'playlist':
# We process each entry in the playlist
playlist = ie_result.get('title', None) or ie_result.get('id', None)
- self.to_screen(u'[download] Downloading playlist: %s' % playlist)
+ self.to_screen('[download] Downloading playlist: %s' % playlist)
playlist_results = []
@@ -568,11 +582,11 @@ class YoutubeDL(object):
n_entries = len(entries)
- u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
+ "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
(ie_result['extractor'], playlist, n_all_entries, n_entries))
for i, entry in enumerate(entries, 1):
- self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
+ self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
extra = {
'playlist': playlist,
'playlist_index': i + playliststart,
@@ -584,7 +598,7 @@ class YoutubeDL(object):
reason = self._match_entry(entry)
if reason is not None:
- self.to_screen(u'[download] ' + reason)
+ self.to_screen('[download] ' + reason)
entry_result = self.process_ie_result(entry,
@@ -617,7 +631,7 @@ class YoutubeDL(object):
elif format_spec == 'worst':
return available_formats[0]
- extensions = [u'mp4', u'flv', u'webm', u'3gp']
+ extensions = ['mp4', 'flv', 'webm', '3gp']
if format_spec in extensions:
filter_f = lambda f: f['ext'] == format_spec
@@ -636,7 +650,7 @@ class YoutubeDL(object):
info_dict['playlist_index'] = None
# This extractors handle format selection themselves
- if info_dict['extractor'] in [u'youtube', u'Youku']:
+ if info_dict['extractor'] in ['Youku']:
if download:
return info_dict
@@ -653,33 +667,32 @@ class YoutubeDL(object):
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
if format.get('format') is None:
- format['format'] = u'{id} - {res}{note}'.format(
+ format['format'] = '{id} - {res}{note}'.format(
- note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
+ note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
# Automatically determine file extension if missing
if 'ext' not in format:
format['ext'] = determine_ext(format['url'])
- if self.params.get('listformats', None):
- self.list_formats(info_dict)
- return
format_limit = self.params.get('format_limit', None)
if format_limit:
formats = list(takewhile_inclusive(
lambda f: f['format_id'] != format_limit, formats
- if self.params.get('prefer_free_formats'):
- def _free_formats_key(f):
- try:
- ext_ord = [u'flv', u'mp4', u'webm'].index(f['ext'])
- except ValueError:
- ext_ord = -1
- # We only compare the extension if they have the same height and width
- return (f.get('height'), f.get('width'), ext_ord)
- formats = sorted(formats, key=_free_formats_key)
+ # TODO Central sorting goes here
+ if formats[0] is not info_dict:
+ # only set the 'formats' field if the original info_dict lists them;
+ # otherwise we end up with a circular reference: the first (and only)
+ # element in the 'formats' field in info_dict is info_dict itself,
+ # which can't be exported to json
+ info_dict['formats'] = formats
+ if self.params.get('listformats', None):
+ self.list_formats(info_dict)
+ return
req_format = self.params.get('format', 'best')
if req_format is None:
@@ -689,21 +702,35 @@ class YoutubeDL(object):
if req_format in ('-1', 'all'):
formats_to_download = formats
- # We can accept formats requestd in the format: 34/5/best, we pick
+ # We can accept formats requested in the format: 34/5/best, we pick
# the first that is available, starting from left
req_formats = req_format.split('/')
for rf in req_formats:
- selected_format = self.select_format(rf, formats)
+ if re.match(r'.+?\+.+?', rf) is not None:
+ # Two formats have been requested like '137+139'
+ format_1, format_2 = rf.split('+')
+ formats_info = (self.select_format(format_1, formats),
+ self.select_format(format_2, formats))
+ if all(formats_info):
+ selected_format = {
+ 'requested_formats': formats_info,
+ 'format': rf,
+ 'ext': formats_info[0]['ext'],
+ }
+ else:
+ selected_format = None
+ else:
+ selected_format = self.select_format(rf, formats)
if selected_format is not None:
formats_to_download = [selected_format]
if not formats_to_download:
- raise ExtractorError(u'requested format not available',
+ raise ExtractorError('requested format not available',
if download:
if len(formats_to_download) > 1:
- self.to_screen(u'[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
+ self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
for format in formats_to_download:
new_info = dict(info_dict)
@@ -721,7 +748,7 @@ class YoutubeDL(object):
info_dict['fulltitle'] = info_dict['title']
if len(info_dict['title']) > 200:
- info_dict['title'] = info_dict['title'][:197] + u'...'
+ info_dict['title'] = info_dict['title'][:197] + '...'
# Keep for backwards compatibility
info_dict['stitle'] = info_dict['title']
@@ -731,7 +758,7 @@ class YoutubeDL(object):
reason = self._match_entry(info_dict)
if reason is not None:
- self.to_screen(u'[download] ' + reason)
+ self.to_screen('[download] ' + reason)
max_downloads = self.params.get('max_downloads')
@@ -748,7 +775,7 @@ class YoutubeDL(object):
if self.params.get('forceurl', False):
# For RTMP URLs, also include the playpath
- self.to_stdout(info_dict['url'] + info_dict.get('play_path', u''))
+ self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
@@ -775,37 +802,37 @@ class YoutubeDL(object):
if dn != '' and not os.path.exists(dn):
except (OSError, IOError) as err:
- self.report_error(u'unable to create directory ' + compat_str(err))
+ self.report_error('unable to create directory ' + compat_str(err))
if self.params.get('writedescription', False):
- descfn = filename + u'.description'
+ descfn = filename + '.description'
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
- self.to_screen(u'[info] Video description is already present')
+ self.to_screen('[info] Video description is already present')
- self.to_screen(u'[info] Writing video description to: ' + descfn)
+ self.to_screen('[info] Writing video description to: ' + descfn)
with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
except (KeyError, TypeError):
- self.report_warning(u'There\'s no description to write.')
+ self.report_warning('There\'s no description to write.')
except (OSError, IOError):
- self.report_error(u'Cannot write description file ' + descfn)
+ self.report_error('Cannot write description file ' + descfn)
if self.params.get('writeannotations', False):
- annofn = filename + u'.annotations.xml'
+ annofn = filename + '.annotations.xml'
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
- self.to_screen(u'[info] Video annotations are already present')
+ self.to_screen('[info] Video annotations are already present')
- self.to_screen(u'[info] Writing video annotations to: ' + annofn)
+ self.to_screen('[info] Writing video annotations to: ' + annofn)
with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
except (KeyError, TypeError):
- self.report_warning(u'There are no annotations to write.')
+ self.report_warning('There are no annotations to write.')
except (OSError, IOError):
- self.report_error(u'Cannot write annotations file: ' + annofn)
+ self.report_error('Cannot write annotations file: ' + annofn)
subtitles_are_requested = any([self.params.get('writesubtitles', False),
@@ -823,46 +850,45 @@ class YoutubeDL(object):
sub_filename = subtitles_filename(filename, sub_lang, sub_format)
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
- self.to_screen(u'[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
+ self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
- self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
+ self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
except (OSError, IOError):
- self.report_error(u'Cannot write subtitles file ' + descfn)
+ self.report_error('Cannot write subtitles file ' + descfn)
if self.params.get('writeinfojson', False):
- infofn = os.path.splitext(filename)[0] + u'.info.json'
+ infofn = os.path.splitext(filename)[0] + '.info.json'
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
- self.to_screen(u'[info] Video description metadata is already present')
+ self.to_screen('[info] Video description metadata is already present')
- self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
+ self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
- json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
- write_json_file(json_info_dict, encodeFilename(infofn))
+ write_json_file(info_dict, encodeFilename(infofn))
except (OSError, IOError):
- self.report_error(u'Cannot write metadata to JSON file ' + infofn)
+ self.report_error('Cannot write metadata to JSON file ' + infofn)
if self.params.get('writethumbnail', False):
if info_dict.get('thumbnail') is not None:
- thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
- thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format
+ thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
+ thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
- self.to_screen(u'[%s] %s: Thumbnail is already present' %
+ self.to_screen('[%s] %s: Thumbnail is already present' %
(info_dict['extractor'], info_dict['id']))
- self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
+ self.to_screen('[%s] %s: Downloading thumbnail ...' %
(info_dict['extractor'], info_dict['id']))
uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
with open(thumb_filename, 'wb') as thumbf:
shutil.copyfileobj(uf, thumbf)
- self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
+ self.to_screen('[%s] %s: Writing thumbnail to: %s' %
(info_dict['extractor'], info_dict['id'], thumb_filename))
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self.report_warning(u'Unable to download thumbnail "%s": %s' %
+ self.report_warning('Unable to download thumbnail "%s": %s' %
(info_dict['thumbnail'], compat_str(err)))
if not self.params.get('skip_download', False):
@@ -870,21 +896,41 @@ class YoutubeDL(object):
success = True
- success = self.fd._do_download(filename, info_dict)
+ def dl(name, info):
+ fd = get_suitable_downloader(info)(self, self.params)
+ for ph in self._progress_hooks:
+ fd.add_progress_hook(ph)
+ return fd.download(name, info)
+ if info_dict.get('requested_formats') is not None:
+ downloaded = []
+ success = True
+ for f in info_dict['requested_formats']:
+ new_info = dict(info_dict)
+ new_info.update(f)
+ fname = self.prepare_filename(new_info)
+ fname = prepend_extension(fname, 'f%s' % f['format_id'])
+ downloaded.append(fname)
+ partial_success = dl(fname, new_info)
+ success = success and partial_success
+ info_dict['__postprocessors'] = [FFmpegMergerPP(self)]
+ info_dict['__files_to_merge'] = downloaded
+ else:
+ # Just a single file
+ success = dl(filename, info_dict)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self.report_error(u'unable to download video data: %s' % str(err))
+ self.report_error('unable to download video data: %s' % str(err))
except (OSError, IOError) as err:
raise UnavailableVideoError(err)
except (ContentTooShortError, ) as err:
- self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
+ self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
if success:
self.post_process(filename, info_dict)
except (PostProcessingError) as err:
- self.report_error(u'postprocessing: %s' % str(err))
+ self.report_error('postprocessing: %s' % str(err))
@@ -901,9 +947,9 @@ class YoutubeDL(object):
#It also downloads the videos
except UnavailableVideoError:
- self.report_error(u'unable to download video')
+ self.report_error('unable to download video')
except MaxDownloadsReached:
- self.to_screen(u'[info] Maximum number of downloaded files reached.')
+ self.to_screen('[info] Maximum number of downloaded files reached.')
return self._download_retcode
@@ -916,7 +962,7 @@ class YoutubeDL(object):
except DownloadError:
webpage_url = info.get('webpage_url')
if webpage_url is not None:
- self.report_warning(u'The info failed to download, trying with "%s"' % webpage_url)
+ self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
return self.download([webpage_url])
@@ -927,7 +973,11 @@ class YoutubeDL(object):
info = dict(ie_info)
info['filepath'] = filename
keep_video = None
- for pp in self._pps:
+ pps_chain = []
+ if ie_info.get('__postprocessors') is not None:
+ pps_chain.extend(ie_info['__postprocessors'])
+ pps_chain.extend(self._pps)
+ for pp in pps_chain:
keep_video_wish, new_info = pp.run(info)
if keep_video_wish is not None:
@@ -940,10 +990,10 @@ class YoutubeDL(object):
if keep_video is False and not self.params.get('keepvideo', False):
- self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
+ self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
except (IOError, OSError):
- self.report_warning(u'Unable to remove downloaded video file')
+ self.report_warning('Unable to remove downloaded video file')
def _make_archive_id(self, info_dict):
# Future-proof against any change in case
@@ -954,7 +1004,7 @@ class YoutubeDL(object):
extractor = info_dict.get('ie_key') # key in a playlist
if extractor is None:
return None # Incomplete video information
- return extractor.lower() + u' ' + info_dict['id']
+ return extractor.lower() + ' ' + info_dict['id']
def in_download_archive(self, info_dict):
fn = self.params.get('download_archive')
@@ -982,53 +1032,59 @@ class YoutubeDL(object):
vid_id = self._make_archive_id(info_dict)
assert vid_id
with locked_file(fn, 'a', encoding='utf-8') as archive_file:
- archive_file.write(vid_id + u'\n')
+ archive_file.write(vid_id + '\n')
def format_resolution(format, default='unknown'):
if format.get('vcodec') == 'none':
return 'audio only'
- if format.get('_resolution') is not None:
- return format['_resolution']
+ if format.get('resolution') is not None:
+ return format['resolution']
if format.get('height') is not None:
if format.get('width') is not None:
- res = u'%sx%s' % (format['width'], format['height'])
+ res = '%sx%s' % (format['width'], format['height'])
- res = u'%sp' % format['height']
+ res = '%sp' % format['height']
+ elif format.get('width') is not None:
+ res = '?x%d' % format['width']
res = default
return res
def list_formats(self, info_dict):
def format_note(fdict):
- res = u''
+ res = ''
+ if fdict.get('ext') in ['f4f', 'f4m']:
+ res += '(unsupported) '
if fdict.get('format_note') is not None:
- res += fdict['format_note'] + u' '
+ res += fdict['format_note'] + ' '
+ if fdict.get('tbr') is not None:
+ res += '%4dk ' % fdict['tbr']
if (fdict.get('vcodec') is not None and
fdict.get('vcodec') != 'none'):
- res += u'%-5s' % fdict['vcodec']
- elif fdict.get('vbr') is not None:
- res += u'video'
+ res += '%-5s@' % fdict['vcodec']
+ elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
+ res += 'video@'
if fdict.get('vbr') is not None:
- res += u'@%4dk' % fdict['vbr']
+ res += '%4dk' % fdict['vbr']
if fdict.get('acodec') is not None:
if res:
- res += u', '
- res += u'%-5s' % fdict['acodec']
+ res += ', '
+ res += '%-5s' % fdict['acodec']
elif fdict.get('abr') is not None:
if res:
- res += u', '
+ res += ', '
res += 'audio'
if fdict.get('abr') is not None:
- res += u'@%3dk' % fdict['abr']
+ res += '@%3dk' % fdict['abr']
if fdict.get('filesize') is not None:
if res:
- res += u', '
+ res += ', '
res += format_bytes(fdict['filesize'])
return res
def line(format, idlen=20):
- return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
+ return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
@@ -1036,7 +1092,7 @@ class YoutubeDL(object):
formats = info_dict.get('formats', [info_dict])
- idlen = max(len(u'format code'),
+ idlen = max(len('format code'),
max(len(f['format_id']) for f in formats))
formats_s = [line(f, idlen) for f in formats]
if len(formats) > 1:
@@ -1044,10 +1100,10 @@ class YoutubeDL(object):
formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
header_line = line({
- 'format_id': u'format code', 'ext': u'extension',
- '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
- self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
- (info_dict['id'], header_line, u"\n".join(formats_s)))
+ 'format_id': 'format code', 'ext': 'extension',
+ 'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
+ self.to_screen('[info] Available formats for %s:\n%s\n%s' %
+ (info_dict['id'], header_line, '\n'.join(formats_s)))
def urlopen(self, req):
""" Start an HTTP download """
@@ -1056,7 +1112,7 @@ class YoutubeDL(object):
def print_debug_header(self):
if not self.params.get('verbose'):
- write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
+ write_string('[debug] youtube-dl version ' + __version__ + '\n')
sp = subprocess.Popen(
['git', 'rev-parse', '--short', 'HEAD'],
@@ -1065,20 +1121,20 @@ class YoutubeDL(object):
out, err = sp.communicate()
out = out.decode().strip()
if re.match('[0-9a-f]+', out):
- write_string(u'[debug] Git HEAD: ' + out + u'\n')
+ write_string('[debug] Git HEAD: ' + out + '\n')
- write_string(u'[debug] Python version %s - %s' %
- (platform.python_version(), platform_name()) + u'\n')
+ write_string('[debug] Python version %s - %s' %
+ (platform.python_version(), platform_name()) + '\n')
proxy_map = {}
for handler in self._opener.handlers:
if hasattr(handler, 'proxies'):
- write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
+ write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
def _setup_opener(self):
timeout_val = self.params.get('socket_timeout')
@@ -1108,10 +1164,13 @@ class YoutubeDL(object):
if 'http' in proxies and 'https' not in proxies:
proxies['https'] = proxies['http']
proxy_handler = compat_urllib_request.ProxyHandler(proxies)
+ debuglevel = 1 if self.params.get('debug_printtraffic') else 0
https_handler = make_HTTPS_handler(
- self.params.get('nocheckcertificate', False))
+ self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
+ ydlh = YoutubeDLHandler(debuglevel=debuglevel)
opener = compat_urllib_request.build_opener(
- https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
+ https_handler, proxy_handler, cookie_processor, ydlh)
# Delete the default user-agent header, which would otherwise apply in
# cases where our custom HTTP handler doesn't come into play
# (See https://github.com/rg3/youtube-dl/issues/1309 for details)
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 03f98f504..ba243d4d2 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -45,6 +45,7 @@ __license__ = 'Public Domain'
import codecs
import getpass
+import locale
import optparse
import os
import random
@@ -187,16 +188,16 @@ def parseOpts(overrideArguments=None):
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
'--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
- help='Location in the filesystem where youtube-dl can store downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl .')
+ help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.')
'--no-cache-dir', action='store_const', const=None, dest='cachedir',
help='Disable filesystem caching')
'--socket-timeout', dest='socket_timeout',
- type=float, default=None, help=optparse.SUPPRESS_HELP)
+ type=float, default=None, help=u'Time to wait before giving up, in seconds')
'--bidi-workaround', dest='bidi_workaround', action='store_true',
- help=u'Work around terminals that lack bidirectional text support. Requires fribidi executable in PATH')
+ help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
@@ -335,7 +336,9 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='youtube_print_sig_code', default=False,
+ verbosity.add_option('--print-traffic',
+ dest='debug_printtraffic', action='store_true', default=False,
+ help=optparse.SUPPRESS_HELP)
filesystem.add_option('-t', '--title',
action='store_true', dest='usetitle', help='use title in file name (default)', default=False)
@@ -477,6 +480,8 @@ def parseOpts(overrideArguments=None):
write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')
write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')
write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')
+ write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' %
+ (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding()))
return parser, opts, args
@@ -521,6 +526,8 @@ def _real_main(argv=None):
sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
all_urls = [url.strip() for url in all_urls]
+ _enc = preferredencoding()
+ all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
extractors = gen_extractors()
@@ -697,6 +704,7 @@ def _real_main(argv=None):
'proxy': opts.proxy,
'socket_timeout': opts.socket_timeout,
'bidi_workaround': opts.bidi_workaround,
+ 'debug_printtraffic': opts.debug_printtraffic,
with YoutubeDL(ydl_opts) as ydl:
diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py
new file mode 100644
index 000000000..f19b490f1
--- /dev/null
+++ b/youtube_dl/downloader/__init__.py
@@ -0,0 +1,23 @@
+from .common import FileDownloader
+from .hls import HlsFD
+from .http import HttpFD
+from .mplayer import MplayerFD
+from .rtmp import RtmpFD
+from ..utils import (
+ determine_ext,
+def get_suitable_downloader(info_dict):
+ """Get the downloader class that can handle the info dict."""
+ url = info_dict['url']
+ if url.startswith('rtmp'):
+ return RtmpFD
+ if determine_ext(url) == u'm3u8':
+ return HlsFD
+ if url.startswith('mms') or url.startswith('rtsp'):
+ return MplayerFD
+ else:
+ return HttpFD
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
new file mode 100644
index 000000000..10143d56a
--- /dev/null
+++ b/youtube_dl/downloader/common.py
@@ -0,0 +1,317 @@
+import os
+import re
+import sys
+import time
+from ..utils import (
+ encodeFilename,
+ timeconvert,
+ format_bytes,
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead.

    Available options:

    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    ratelimit:         Download speed limit, in bytes/sec.
    retries:           Number of times to retry for HTTP error 5xx
    buffersize:        Size of download buffer in bytes.
    noresizebuffer:    Do not automatically resize the download buffer.
    continuedl:        Try to continue downloads if possible.
    noprogress:        Do not print the progress bar.
    logtostderr:       Log messages to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    nopart:            Do not use temporary .part files.
    updatetime:        Use the Last-modified header to set output file timestamps.
    test:              Download only first bytes to test the downloader.
    min_filesize:      Skip files smaller than this size
    max_filesize:      Skip files larger than this size

    Subclasses of this one must re-define the real_download method.
    """

    params = None

    def __init__(self, ydl, params):
        """Create a FileDownloader object with the given options.

        ydl is the object all screen/error reporting is delegated to;
        params is the options dictionary described in the class docstring.
        """
        self.ydl = ydl
        self._progress_hooks = []
        self.params = params

    @staticmethod
    def format_seconds(seconds):
        """Format a duration as MM:SS, HH:MM:SS, or '--:--:--' above 99 hours."""
        (mins, secs) = divmod(seconds, 60)
        (hours, mins) = divmod(mins, 60)
        if hours > 99:
            return '--:--:--'
        if hours == 0:
            return '%02d:%02d' % (mins, secs)
        else:
            return '%02d:%02d:%02d' % (hours, mins, secs)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage, or None if data_len is unknown."""
        if data_len is None:
            return None
        return float(byte_counter) / float(data_len) * 100.0

    @staticmethod
    def format_percent(percent):
        """Render a percentage right-aligned in 6 columns ('---.-%' for None)."""
        if percent is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % percent)

    @staticmethod
    def calc_eta(start, now, total, current):
        """Estimate the remaining seconds from the average rate so far.

        Returns None when the total is unknown, nothing has been
        transferred yet, or less than a millisecond has elapsed.
        """
        if total is None:
            return None
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return None
        rate = float(current) / dif
        return int((float(total) - float(current)) / rate)

    @staticmethod
    def format_eta(eta):
        """Render an ETA in seconds via format_seconds ('--:--' for None)."""
        if eta is None:
            return '--:--'
        return FileDownloader.format_seconds(eta)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average speed in bytes/second, or None if not measurable."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return None
        return float(bytes) / dif

    @staticmethod
    def format_speed(speed):
        """Render a speed right-aligned in 10 columns ('---b/s' for None)."""
        if speed is None:
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % format_bytes(speed))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Pick the next read size from the throughput of the last read.

        The result is the measured rate, clamped between half and double
        the size of the last block, and never above 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into an integer.

        Accepts a decimal number followed by an optional, case-insensitive
        suffix from kMGTPEZY (powers of 1024). Returns None if the string
        does not match.
        """
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # An empty suffix yields index 0, i.e. a multiplier of 1
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return int(round(number * multiplier))

    def to_screen(self, *args, **kargs):
        """Forward to ydl.to_screen."""
        self.ydl.to_screen(*args, **kargs)

    def to_stderr(self, message):
        """Forward the message to ydl.to_screen."""
        # NOTE(review): despite the method name this goes to ydl.to_screen;
        # kept as-is, confirm whether a stderr method on ydl was intended.
        self.ydl.to_screen(message)

    def to_console_title(self, message):
        """Forward to ydl.to_console_title."""
        self.ydl.to_console_title(message)

    def trouble(self, *args, **kargs):
        """Forward to ydl.trouble."""
        self.ydl.trouble(*args, **kargs)

    def report_warning(self, *args, **kargs):
        """Forward to ydl.report_warning."""
        self.ydl.report_warning(*args, **kargs)

    def report_error(self, *args, **kargs):
        """Forward to ydl.report_error."""
        self.ydl.report_error(*args, **kargs)

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        elapsed = time.time() - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep exactly long enough for the average to drop to the limit
            time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename.

        The name is returned unchanged when 'nopart' is set, when it is
        u'-', or when it exists but is not a regular file.
        """
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))):
            return filename
        return filename + u'.part'

    def undo_temp_name(self, filename):
        """Strip a trailing u'.part' suffix, if present."""
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
        return filename

    def try_rename(self, old_filename, new_filename):
        """Rename a file, reporting (not raising) IOError/OSError."""
        try:
            if old_filename == new_filename:
                return
            os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
        except (IOError, OSError) as err:
            self.report_error(u'unable to rename file: %s' % str(err))

    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file.

        Returns the converted timestamp, or None when there is no header,
        the file does not exist, or the date is unusable. Failures of the
        actual os.utime call are deliberately ignored (best effort).
        """
        if last_modified_hdr is None:
            return
        if not os.path.isfile(encodeFilename(filename)):
            return
        filetime = timeconvert(last_modified_hdr)
        if filetime is None:
            return filetime
        # Ignore obviously invalid dates
        if filetime == 0:
            return
        try:
            # Use the same encoded name as the isfile() check above
            os.utime(encodeFilename(filename), (time.time(), filetime))
        except Exception:
            # Best effort only; do not swallow KeyboardInterrupt/SystemExit
            pass
        return filetime

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: ' + filename)

    def _report_progress_status(self, msg, is_last_line=False):
        """Print a progress line, overwriting the previous one in place.

        With 'progress_with_newline' every message gets its own line.
        Otherwise the line is rewritten after a carriage return; on
        os.name == 'nt' the message is padded with spaces to cover the
        previous one, elsewhere the erase-line escape is used when
        sys.stderr is a tty. The console title is updated in all cases.
        """
        fullmsg = u'[download] ' + msg
        if self.params.get('progress_with_newline', False):
            self.to_screen(fullmsg)
        else:
            if os.name == 'nt':
                prev_len = getattr(self, '_report_progress_prev_line_length',
                                   0)
                if prev_len > len(fullmsg):
                    fullmsg += u' ' * (prev_len - len(fullmsg))
                self._report_progress_prev_line_length = len(fullmsg)
                clear_line = u'\r'
            else:
                clear_line = (u'\r\x1b[K' if sys.stderr.isatty() else u'\r')
            self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line)
        self.to_console_title(u'youtube-dl ' + msg)

    def report_progress(self, percent, data_len_str, speed, eta):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        if eta is not None:
            eta_str = self.format_eta(eta)
        else:
            eta_str = 'Unknown ETA'
        if percent is not None:
            percent_str = self.format_percent(percent)
        else:
            percent_str = 'Unknown %'
        speed_str = self.format_speed(speed)

        msg = (u'%s of %s at %s ETA %s' %
               (percent_str, data_len_str, speed_str, eta_str))
        self._report_progress_status(msg)

    def report_progress_live_stream(self, downloaded_data_len, speed, elapsed):
        """Report progress when only downloaded size and elapsed time are known."""
        if self.params.get('noprogress', False):
            return
        downloaded_str = format_bytes(downloaded_data_len)
        speed_str = self.format_speed(speed)
        elapsed_str = FileDownloader.format_seconds(elapsed)
        msg = u'%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str)
        self._report_progress_status(msg)

    def report_finish(self, data_len_str, tot_time):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            self._report_progress_status(
                (u'100%% of %s in %s' %
                 (data_len_str, self.format_seconds(tot_time))),
                is_last_line=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            # Fall back to a message without the (unprintable) file name
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def download(self, filename, info_dict):
        """Download to a filename using the info from info_dict
        Return True on success and False otherwise
        """
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            self._hook_progress({
                'filename': filename,
                'status': 'finished',
                'total_bytes': os.path.getsize(encodeFilename(filename)),
            })
            return True

        return self.real_download(filename, info_dict)

    def real_download(self, filename, info_dict):
        """Real download process. Redefine in subclasses."""
        raise NotImplementedError(u'This method must be implemented by subclasses')

    def _hook_progress(self, status):
        """Call every registered progress hook with the status dictionary."""
        for ph in self._progress_hooks:
            ph(status)

    def add_progress_hook(self, ph):
        """ ph gets called on download progress, with a dictionary with the entries
        * filename: The final filename
        * status: One of "downloading" and "finished"

        It can also have some of the following entries:

        * downloaded_bytes: Bytes on disks
        * total_bytes: Total bytes, None if unknown
        * tmpfilename: The filename we're currently writing to
        * eta: The estimated time in seconds, None if unknown
        * speed: The download speed in bytes/second, None if unknown

        Hooks are guaranteed to be called at least once (with status "finished")
        if the download is successful.
        """
        self._progress_hooks.append(ph)
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
new file mode 100644
index 000000000..51e8c4778
--- /dev/null
+++ b/youtube_dl/downloader/hls.py
@@ -0,0 +1,44 @@
+import os
+import subprocess
+from .common import FileDownloader
+from ..utils import (
+ encodeFilename,
class HlsFD(FileDownloader):
    """Downloader that hands the URL to avconv or ffmpeg and remuxes to mp4."""

    def real_download(self, filename, info_dict):
        """Download info_dict['url'] into filename using avconv/ffmpeg.

        The first of 'avconv' and 'ffmpeg' that can be started is used.
        Returns True on success (after renaming the temporary file and
        firing the 'finished' progress hook) and False if no converter is
        available or the converter exits with a non-zero code.
        """
        url = info_dict['url']
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy',
                '-bsf:a', 'aac_adtstoasc', tmpfilename]

        for program in ['avconv', 'ffmpeg']:
            try:
                # Probe only; close the devnull handle once the probe is done
                with open(os.path.devnull, 'w') as devnull:
                    subprocess.call([program, '-version'], stdout=devnull, stderr=subprocess.STDOUT)
                break
            except (OSError, IOError):
                pass
        else:
            self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found')
            # Without a converter there is nothing to run
            return False
        cmd = [program] + args

        retval = subprocess.call(cmd)
        if retval == 0:
            fsize = os.path.getsize(encodeFilename(tmpfilename))
            # cmd[0] is the program name (args[0] would be the '-y' flag)
            self.to_screen(u'\r[%s] %s bytes' % (cmd[0], fsize))
            self.try_rename(tmpfilename, filename)
            self._hook_progress({
                'downloaded_bytes': fsize,
                'total_bytes': fsize,
                'filename': filename,
                'status': 'finished',
            })
            return True
        else:
            self.to_stderr(u"\n")
            # Name the program that actually ran (avconv or ffmpeg)
            self.report_error(u'%s exited with code %d' % (program, retval))
            return False
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
new file mode 100644
index 000000000..8407727ba
--- /dev/null
+++ b/youtube_dl/downloader/http.py
@@ -0,0 +1,186 @@
+import os
+import time
+from .common import FileDownloader
+from ..utils import (
+ compat_urllib_request,
+ compat_urllib_error,
+ ContentTooShortError,
+ encodeFilename,
+ sanitize_open,
+ format_bytes,
class HttpFD(FileDownloader):
    """Downloader for URLs fetched through compat_urllib_request."""

    def real_download(self, filename, info_dict):
        """Download info_dict['url'] to filename, resuming if possible.

        An existing temporary file is continued with a Range request when
        the 'continuedl' option is set. Server errors (HTTP 5xx) are
        retried up to 'retries' times; other HTTP errors are re-raised.

        Returns True on success and False on a reported failure.
        Raises ContentTooShortError if fewer bytes than announced arrived.
        """
        url = info_dict['url']
        tmpfilename = self.temp_name(filename)
        stream = None

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        if 'user_agent' in info_dict:
            headers['Youtubedl-user-agent'] = info_dict['user_agent']
        # basic_request never gets a Range header; it is the fallback
        # used when the server rejects the resume request with 416
        basic_request = compat_urllib_request.Request(url, None, headers)
        request = compat_urllib_request.Request(url, None, headers)

        if self.params.get('test', False):
            request.add_header('Range', 'bytes=0-10240')

        # Establish possible resume length
        if os.path.isfile(encodeFilename(tmpfilename)):
            resume_len = os.path.getsize(encodeFilename(tmpfilename))
        else:
            resume_len = 0

        open_mode = 'wb'
        if resume_len != 0:
            if self.params.get('continuedl', False):
                self.report_resuming_byte(resume_len)
                request.add_header('Range', 'bytes=%d-' % resume_len)
                open_mode = 'ab'
            else:
                resume_len = 0

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = compat_urllib_request.urlopen(request)
                break
            except (compat_urllib_error.HTTPError, ) as err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = compat_urllib_request.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (compat_urllib_error.HTTPError, ) as err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < int(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            self._hook_progress({
                                'filename': filename,
                                'status': 'finished',
                            })
                            return True
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            # The file is rewritten from byte 0, so the old
                            # partial size must no longer be counted
                            resume_len = 0
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.report_error(u'giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Content-length covers only what is still to be sent
            data_len = int(data_len) + resume_len
            min_data_len = self.params.get("min_filesize", None)
            max_data_len = self.params.get("max_filesize", None)
            if min_data_len is not None and data_len < min_data_len:
                self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
                return False
            if max_data_len is not None and data_len > max_data_len:
                self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
                return False

        data_len_str = format_bytes(data_len)
        byte_counter = 0 + resume_len
        block_size = self.params.get('buffersize', 1024)
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            if len(data_block) == 0:
                break
            byte_counter += len(data_block)

            # Open file just in time
            if stream is None:
                try:
                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                    assert stream is not None
                    # sanitize_open may hand back a different name
                    filename = self.undo_temp_name(tmpfilename)
                    self.report_destination(filename)
                except (OSError, IOError) as err:
                    self.report_error(u'unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError) as err:
                self.to_stderr(u"\n")
                self.report_error(u'unable to write data: %s' % str(err))
                return False
            if not self.params.get('noresizebuffer', False):
                block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            # Speed and ETA only consider bytes fetched in this session
            speed = self.calc_speed(start, time.time(), byte_counter - resume_len)
            if data_len is None:
                eta = percent = None
            else:
                percent = self.calc_percent(byte_counter, data_len)
                eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            self.report_progress(percent, data_len_str, speed, eta)

            self._hook_progress({
                'downloaded_bytes': byte_counter,
                'total_bytes': data_len,
                'tmpfilename': tmpfilename,
                'filename': filename,
                'status': 'downloading',
                'eta': eta,
                'speed': speed,
            })

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        if stream is None:
            self.to_stderr(u"\n")
            self.report_error(u'Did not get any data blocks')
            return False
        stream.close()
        self.report_finish(data_len_str, (time.time() - start))
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, int(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

        self._hook_progress({
            'downloaded_bytes': byte_counter,
            'total_bytes': byte_counter,
            'filename': filename,
            'status': 'finished',
        })

        return True
diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py
new file mode 100644
index 000000000..67e0e4189
--- /dev/null
+++ b/youtube_dl/downloader/mplayer.py
@@ -0,0 +1,40 @@
+import os
+import subprocess
+from .common import FileDownloader
+from ..utils import (
+ encodeFilename,
class MplayerFD(FileDownloader):
    """Downloader that dumps a stream to disk with mplayer."""

    def real_download(self, filename, info_dict):
        """Dump info_dict['url'] into filename via ``mplayer -dumpstream``.

        Returns True on success (after renaming the temporary file and
        firing the 'finished' progress hook) and False if mplayer cannot
        be started or exits with a non-zero code.
        """
        url = info_dict['url']
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', tmpfilename, url]
        # Check for mplayer first
        try:
            # Probe only; close the devnull handle once the probe is done
            with open(os.path.devnull, 'w') as devnull:
                subprocess.call(['mplayer', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0])
            return False

        # Download using mplayer.
        retval = subprocess.call(args)
        if retval == 0:
            fsize = os.path.getsize(encodeFilename(tmpfilename))
            self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize))
            self.try_rename(tmpfilename, filename)
            self._hook_progress({
                'downloaded_bytes': fsize,
                'total_bytes': fsize,
                'filename': filename,
                'status': 'finished',
            })
            return True
        else:
            self.to_stderr(u"\n")
            self.report_error(u'mplayer exited with code %d' % retval)
            return False
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
new file mode 100644
index 000000000..b165e396f
--- /dev/null
+++ b/youtube_dl/downloader/rtmp.py
@@ -0,0 +1,178 @@
+import os
+import re
+import subprocess
+import sys
+import time
+from .common import FileDownloader
+from ..utils import (
+ encodeFilename,
+ format_bytes,
class RtmpFD(FileDownloader):
    """Downloader that drives the external rtmpdump program."""

    def real_download(self, filename, info_dict):
        """Download info_dict['url'] into filename using rtmpdump.

        Optional info_dict entries 'player_url', 'page_url', 'play_path',
        'tc_url', 'rtmp_live' and 'rtmp_conn' are turned into the matching
        rtmpdump options. rtmpdump is re-run while it exits with code 1 or
        2, until it succeeds or makes no further progress.

        Returns True on success and False if rtmpdump cannot be started or
        finally exits with an error code.
        """
        def run_rtmpdump(args):
            """Run rtmpdump, turn its stderr output into progress reports
            and return its exit code."""
            start = time.time()
            # First progress sample; ETA and speed are measured from it
            resume_percent = None
            resume_downloaded_data_len = None
            proc = subprocess.Popen(args, stderr=subprocess.PIPE)
            cursor_in_new_line = True
            proc_stderr_closed = False
            while not proc_stderr_closed:
                # read line from stderr
                # (byte by byte, because '\r' also terminates a line)
                line = u''
                while True:
                    char = proc.stderr.read(1)
                    if not char:
                        proc_stderr_closed = True
                        break
                    if char in [b'\r', b'\n']:
                        break
                    line += char.decode('ascii', 'replace')
                if not line:
                    # empty line, or stderr was closed
                    continue
                mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
                if mobj:
                    downloaded_data_len = int(float(mobj.group(1))*1024)
                    percent = float(mobj.group(2))
                    # 'is None' so that a first reading of 0.0% is kept as
                    # the baseline instead of being treated as unset
                    if resume_percent is None:
                        resume_percent = percent
                        resume_downloaded_data_len = downloaded_data_len
                    eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent)
                    speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len)
                    data_len = None
                    if percent > 0:
                        # Extrapolate the total size from the percentage
                        data_len = int(downloaded_data_len * 100 / percent)
                    data_len_str = u'~' + format_bytes(data_len)
                    self.report_progress(percent, data_len_str, speed, eta)
                    cursor_in_new_line = False
                    self._hook_progress({
                        'downloaded_bytes': downloaded_data_len,
                        'total_bytes': data_len,
                        'tmpfilename': tmpfilename,
                        'filename': filename,
                        'status': 'downloading',
                        'eta': eta,
                        'speed': speed,
                    })
                else:
                    # no percent for live streams
                    mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line)
                    if mobj:
                        downloaded_data_len = int(float(mobj.group(1))*1024)
                        time_now = time.time()
                        speed = self.calc_speed(start, time_now, downloaded_data_len)
                        self.report_progress_live_stream(downloaded_data_len, speed, time_now - start)
                        cursor_in_new_line = False
                        self._hook_progress({
                            'downloaded_bytes': downloaded_data_len,
                            'tmpfilename': tmpfilename,
                            'filename': filename,
                            'status': 'downloading',
                            'speed': speed,
                        })
                    elif self.params.get('verbose', False):
                        if not cursor_in_new_line:
                            self.to_screen(u'')
                        cursor_in_new_line = True
                        self.to_screen(u'[rtmpdump] '+line)
            proc.wait()
            if not cursor_in_new_line:
                self.to_screen(u'')
            return proc.returncode

        url = info_dict['url']
        player_url = info_dict.get('player_url', None)
        page_url = info_dict.get('page_url', None)
        play_path = info_dict.get('play_path', None)
        tc_url = info_dict.get('tc_url', None)
        live = info_dict.get('rtmp_live', False)
        conn = info_dict.get('rtmp_conn', None)

        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)
        test = self.params.get('test', False)

        # Check for rtmpdump first
        try:
            # Probe only; close the devnull handle once the probe is done
            with open(os.path.devnull, 'w') as devnull:
                subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename]
        if player_url is not None:
            basic_args += ['--swfVfy', player_url]
        if page_url is not None:
            basic_args += ['--pageUrl', page_url]
        if play_path is not None:
            basic_args += ['--playpath', play_path]
        if tc_url is not None:
            # Pass the extractor-supplied tcUrl, not the stream URL
            basic_args += ['--tcUrl', tc_url]
        if test:
            basic_args += ['--stop', '1']
        if live:
            basic_args += ['--live']
        if conn:
            basic_args += ['--conn', conn]
        if self.params.get('continuedl', False):
            args = basic_args + ['--resume', '--skip', '1']
        else:
            args = basic_args + []

        if sys.platform == 'win32' and sys.version_info < (3, 0):
            # Windows subprocess module does not actually support Unicode
            # on Python 2.x
            # See http://stackoverflow.com/a/9951851/35070
            subprocess_encoding = sys.getfilesystemencoding()
            args = [a.encode(subprocess_encoding, 'ignore') for a in args]
        else:
            subprocess_encoding = None

        if self.params.get('verbose', False):
            if subprocess_encoding:
                str_args = [
                    a.decode(subprocess_encoding) if isinstance(a, bytes) else a
                    for a in args]
            else:
                str_args = args
            try:
                import pipes
                shell_quote = lambda cmd_args: ' '.join(map(pipes.quote, cmd_args))
            except ImportError:
                shell_quote = repr
            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))

        retval = run_rtmpdump(args)

        while (retval == 2 or retval == 1) and not test:
            prevsize = os.path.getsize(encodeFilename(tmpfilename))
            self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
            time.sleep(5.0)  # This seems to be needed
            retry_args = basic_args + ['-e']
            if retval == 1:
                retry_args += ['-k', '1']
            retval = run_rtmpdump(retry_args)
            cursize = os.path.getsize(encodeFilename(tmpfilename))
            if prevsize == cursize and retval == 1:
                # No progress and still failing: give up
                break
            # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
            if prevsize == cursize and retval == 2 and cursize > 1024:
                self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                retval = 0
                break
        if retval == 0 or (test and retval == 2):
            fsize = os.path.getsize(encodeFilename(tmpfilename))
            self.to_screen(u'[rtmpdump] %s bytes' % fsize)
            self.try_rename(tmpfilename, filename)
            self._hook_progress({
                'downloaded_bytes': fsize,
                'total_bytes': fsize,
                'filename': filename,
                'status': 'finished',
            })
            return True
        else:
            self.to_stderr(u"\n")
            self.report_error(u'rtmpdump exited with code %d' % retval)
            return False
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a39a1e2f4..f1167989e 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -28,6 +28,7 @@ from .channel9 import Channel9IE
from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .clipsyndicate import ClipsyndicateIE
+from .cmt import CMTIE
from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
@@ -79,7 +80,10 @@ from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .hypem import HypemIE
from .ign import IGNIE, OneUPIE
-from .imdb import ImdbIE
+from .imdb import (
+ ImdbIE,
+ ImdbListIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
@@ -91,12 +95,18 @@ from .ivi import (
from .jeuxvideo import JeuxVideoIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
+from .jpopsukitv import JpopsukiIE
from .kankan import KankanIE
from .keezmovies import KeezMoviesIE
from .kickstarter import KickStarterIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
from .livestream import LivestreamIE, LivestreamOriginalIE
+from .lynda import (
+ LyndaIE,
+ LyndaCourseIE
+from .macgamestore import MacGameStoreIE
from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
@@ -189,6 +199,7 @@ from .vimeo import (
+ VimeoReviewIE,
from .vine import VineIE
from .viki import VikiIE
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index ef5644aa5..e7361ae06 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -110,7 +110,8 @@ class AppleTrailersIE(InfoExtractor):
'width': format['width'],
'height': int(format['height']),
- formats = sorted(formats, key=lambda f: (f['height'], f['width']))
+ self._sort_formats(formats)
'_type': 'video',
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 3a32c14c5..15aee2786 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -10,14 +10,14 @@ from ..utils import (
class BandcampIE(InfoExtractor):
- IE_NAME = u'Bandcamp'
_VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
_TESTS = [{
u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
u'file': u'1812978515.mp3',
- u'md5': u'cdeb30cdae1921719a3cbcab696ef53c',
+ u'md5': u'c557841d5e50261777a6585648adf439',
u'info_dict': {
- u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad"
+ u"title": u"youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
+ u"duration": 10,
u'skip': u'There is a limit of 200 free downloads / month for the test song'
@@ -30,29 +30,42 @@ class BandcampIE(InfoExtractor):
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if m_download is None:
m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
- if m_trackinfo:
- json_code = m_trackinfo.group(1)
- data = json.loads(json_code)
+ if m_trackinfo:
+ json_code = m_trackinfo.group(1)
+ data = json.loads(json_code)
+ d = data[0]
+ duration = int(round(d['duration']))
+ formats = []
+ for format_id, format_url in d['file'].items():
+ ext, _, abr_str = format_id.partition('-')
+ formats.append({
+ 'format_id': format_id,
+ 'url': format_url,
+ 'ext': format_id.partition('-')[0],
+ 'vcodec': 'none',
+ 'acodec': format_id.partition('-')[0],
+ 'abr': int(format_id.partition('-')[2]),
+ })
+ self._sort_formats(formats)
- for d in data:
- formats = [{
- 'format_id': 'format_id',
- 'url': format_url,
- 'ext': format_id.partition('-')[0]
- } for format_id, format_url in sorted(d['file'].items())]
return {
'id': compat_str(d['id']),
'title': d['title'],
'formats': formats,
+ 'duration': duration,
- else:
- raise ExtractorError(u'No free songs found')
+ else:
+ raise ExtractorError(u'No free songs found')
download_link = m_download.group(1)
- id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
- webpage, re.MULTILINE|re.DOTALL).group('id')
+ video_id = re.search(
+ r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
+ webpage, re.MULTILINE | re.DOTALL).group('id')
- download_webpage = self._download_webpage(download_link, id,
+ download_webpage = self._download_webpage(download_link, video_id,
'Downloading free downloads page')
# We get the dictionary of the track from some javascrip code
info = re.search(r'items: (.*?),$',
@@ -66,21 +79,21 @@ class BandcampIE(InfoExtractor):
m_url = re.match(re_url, initial_url)
#We build the url we will use to get the final track url
# This url is build in Bandcamp in the script download_bunde_*.js
- request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
+ request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
# If we could correctly generate the .rand field the url would be
#in the "download_url" key
final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
- track_info = {'id':id,
- 'title' : info[u'title'],
- 'ext' : 'mp3',
- 'url' : final_url,
- 'thumbnail' : info[u'thumb_url'],
- 'uploader' : info[u'artist']
- }
- return [track_info]
+ return {
+ 'id': video_id,
+ 'title': info[u'title'],
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'url': final_url,
+ 'thumbnail': info[u'thumb_url'],
+ 'uploader': info[u'artist'],
+ }
class BandcampAlbumIE(InfoExtractor):
@@ -117,7 +130,7 @@ class BandcampAlbumIE(InfoExtractor):
webpage = self._download_webpage(url, title)
tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
if not tracks_paths:
- raise ExtractorError(u'The page doesn\'t contain any track')
+ raise ExtractorError(u'The page doesn\'t contain any tracks')
entries = [
self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
for t_path in tracks_paths]
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py
index 144ce64cc..0229840a3 100644
--- a/youtube_dl/extractor/blinkx.py
+++ b/youtube_dl/extractor/blinkx.py
@@ -61,9 +61,10 @@ class BlinkxIE(InfoExtractor):
elif m['type'] in ('flv', 'mp4'):
vcodec = remove_start(m['vcodec'], 'ff')
acodec = remove_start(m['acodec'], 'ff')
+ tbr = (int(m['vbr']) + int(m['abr'])) // 1000
format_id = (u'%s-%sk-%s' %
- (int(m['vbr']) + int(m['abr'])) // 1000,
+ tbr,
'format_id': format_id,
@@ -72,10 +73,12 @@ class BlinkxIE(InfoExtractor):
'acodec': acodec,
'abr': int(m['abr']) // 1000,
'vbr': int(m['vbr']) // 1000,
+ 'tbr': tbr,
'width': int(m['w']),
'height': int(m['h']),
- formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr']))
+ self._sort_formats(formats)
return {
'id': display_id,
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index 5e33a69df..3ce9b5324 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -1,16 +1,15 @@
+from __future__ import unicode_literals
import datetime
import json
-import os
import re
import socket
from .common import InfoExtractor
from ..utils import (
- compat_parse_qs,
- compat_urllib_parse_urlparse,
@@ -22,42 +21,35 @@ class BlipTVIE(InfoExtractor):
"""Information extractor for blip.tv"""
_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
- _URL_EXT = r'^.*\.([a-z0-9]+)$'
- IE_NAME = u'blip.tv'
_TEST = {
- u'url': u'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
- u'file': u'5779306.m4v',
- u'md5': u'80baf1ec5c3d2019037c1c707d676b9f',
- u'info_dict': {
- u"upload_date": u"20111205",
- u"description": u"md5:9bc31f227219cde65e47eeec8d2dc596",
- u"uploader": u"Comic Book Resources - CBR TV",
- u"title": u"CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3"
+ 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
+ 'file': '5779306.mov',
+ 'md5': 'c6934ad0b6acf2bd920720ec888eb812',
+ 'info_dict': {
+ 'upload_date': '20111205',
+ 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
+ 'uploader': 'Comic Book Resources - CBR TV',
+ 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
def report_direct_download(self, title):
"""Report information extraction."""
- self.to_screen(u'%s: Direct download detected' % title)
+ self.to_screen('%s: Direct download detected' % title)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
# See https://github.com/rg3/youtube-dl/issues/857
- api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
- if api_mobj is not None:
- url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
- urlp = compat_urllib_parse_urlparse(url)
- if urlp.path.startswith('/play/'):
- response = self._request_webpage(url, None, False)
- redirecturl = response.geturl()
- rurlp = compat_urllib_parse_urlparse(redirecturl)
- file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
- url = 'http://blip.tv/a/a-' + file_id
- return self._real_extract(url)
+ embed_mobj = re.search(r'^(?:https?://)?(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url)
+ if embed_mobj:
+ info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1)
+ info_page = self._download_webpage(info_url, embed_mobj.group(1))
+ video_id = self._search_regex(r'data-episode-id="(\d+)', info_page, 'video_id')
+ return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV')
if '?' in url:
cchar = '&'
@@ -67,67 +59,55 @@ class BlipTVIE(InfoExtractor):
request = compat_urllib_request.Request(json_url)
request.add_header('User-Agent', 'iTunes/10.6.1')
- info = None
urlh = self._request_webpage(request, None, False,
- u'unable to download video info webpage')
- if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
- basename = url.split('/')[-1]
- title,ext = os.path.splitext(basename)
- title = title.decode('UTF-8')
- ext = ext.replace('.', '')
- self.report_direct_download(title)
- info = {
- 'id': title,
- 'url': url,
- 'uploader': None,
- 'upload_date': None,
- 'title': title,
- 'ext': ext,
- 'urlhandle': urlh
+ 'unable to download video info webpage')
+ try:
+ json_code_bytes = urlh.read()
+ json_code = json_code_bytes.decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ raise ExtractorError('Unable to read video info webpage: %s' % compat_str(err))
+ try:
+ json_data = json.loads(json_code)
+ if 'Post' in json_data:
+ data = json_data['Post']
+ else:
+ data = json_data
+ upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
+ formats = []
+ if 'additionalMedia' in data:
+ for f in sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])):
+ if not int(f['media_width']): # filter m3u8
+ continue
+ formats.append({
+ 'url': f['url'],
+ 'format_id': f['role'],
+ 'width': int(f['media_width']),
+ 'height': int(f['media_height']),
+ })
+ else:
+ formats.append({
+ 'url': data['media']['url'],
+ 'width': int(data['media']['width']),
+ 'height': int(data['media']['height']),
+ })
+ self._sort_formats(formats)
+ return {
+ 'id': compat_str(data['item_id']),
+ 'uploader': data['display_name'],
+ 'upload_date': upload_date,
+ 'title': data['title'],
+ 'thumbnail': data['thumbnailUrl'],
+ 'description': data['description'],
+ 'user_agent': 'iTunes/10.6.1',
+ 'formats': formats,
- if info is None: # Regular URL
- try:
- json_code_bytes = urlh.read()
- json_code = json_code_bytes.decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
- try:
- json_data = json.loads(json_code)
- if 'Post' in json_data:
- data = json_data['Post']
- else:
- data = json_data
- upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
- if 'additionalMedia' in data:
- formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height']))
- best_format = formats[-1]
- video_url = best_format['url']
- else:
- video_url = data['media']['url']
- umobj = re.match(self._URL_EXT, video_url)
- if umobj is None:
- raise ValueError('Can not determine filename extension')
- ext = umobj.group(1)
- info = {
- 'id': compat_str(data['item_id']),
- 'url': video_url,
- 'uploader': data['display_name'],
- 'upload_date': upload_date,
- 'title': data['title'],
- 'ext': ext,
- 'format': data['media']['mimeType'],
- 'thumbnail': data['thumbnailUrl'],
- 'description': data['description'],
- 'player_url': data['embedUrl'],
- 'user_agent': 'iTunes/10.6.1',
- }
- except (ValueError,KeyError) as err:
- raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
- return [info]
+ except (ValueError, KeyError) as err:
+ raise ExtractorError('Unable to parse video information: %s' % repr(err))
class BlipTVUserIE(InfoExtractor):
@@ -135,19 +115,19 @@ class BlipTVUserIE(InfoExtractor):
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
- IE_NAME = u'blip.tv:user'
+ IE_NAME = 'blip.tv:user'
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
username = mobj.group(1)
page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
- page = self._download_webpage(url, username, u'Downloading user page')
+ page = self._download_webpage(url, username, 'Downloading user page')
mobj = re.search(r'data-users-id="([^"]+)"', page)
page_base = page_base % mobj.group(1)
@@ -163,7 +143,7 @@ class BlipTVUserIE(InfoExtractor):
while True:
url = page_base + "&page=" + str(pagenum)
page = self._download_webpage(url, username,
- u'Downloading video ids from page %d' % pagenum)
+ 'Downloading video ids from page %d' % pagenum)
# Extract video identifiers
ids_in_page = []
@@ -185,6 +165,6 @@ class BlipTVUserIE(InfoExtractor):
pagenum += 1
- urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
+ urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]
url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
return [self.playlist_result(url_entries, playlist_title = username)]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index f7f0041c0..4ba3f7c42 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -1,4 +1,5 @@
# encoding: utf-8
+from __future__ import unicode_literals
import re
import json
@@ -13,6 +14,7 @@ from ..utils import (
+ unsmuggle_url,
@@ -24,47 +26,47 @@ class BrightcoveIE(InfoExtractor):
_TESTS = [
# From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
- u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
- u'file': u'2371591881001.mp4',
- u'md5': u'5423e113865d26e40624dce2e4b45d95',
- u'note': u'Test Brightcove downloads and detection in GenericIE',
- u'info_dict': {
- u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
- u'uploader': u'8TV',
- u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
+ 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
+ 'file': '2371591881001.mp4',
+ 'md5': '5423e113865d26e40624dce2e4b45d95',
+ 'note': 'Test Brightcove downloads and detection in GenericIE',
+ 'info_dict': {
+ 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+ 'uploader': '8TV',
+ 'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
# From http://medianetwork.oracle.com/video/player/1785452137001
- u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
- u'file': u'1785452137001.flv',
- u'info_dict': {
- u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
- u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.',
- u'uploader': u'Oracle',
+ 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
+ 'file': '1785452137001.flv',
+ 'info_dict': {
+ 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
+ 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
+ 'uploader': 'Oracle',
# From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
- u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
- u'info_dict': {
- u'id': u'2750934548001',
- u'ext': u'mp4',
- u'title': u'This Bracelet Acts as a Personal Thermostat',
- u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0',
- u'uploader': u'Mashable',
+ 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
+ 'info_dict': {
+ 'id': '2750934548001',
+ 'ext': 'mp4',
+ 'title': 'This Bracelet Acts as a Personal Thermostat',
+ 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
+ 'uploader': 'Mashable',
# test that the default referer works
# from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
- u'url': u'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
- u'info_dict': {
- u'id': u'2878862109001',
- u'ext': u'mp4',
- u'title': u'Lost in Motion II',
- u'description': u'md5:363109c02998fee92ec02211bd8000df',
- u'uploader': u'National Ballet of Canada',
+ 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
+ 'info_dict': {
+ 'id': '2878862109001',
+ 'ext': 'mp4',
+ 'title': 'Lost in Motion II',
+ 'description': 'md5:363109c02998fee92ec02211bd8000df',
+ 'uploader': 'National Ballet of Canada',
@@ -80,10 +82,10 @@ class BrightcoveIE(InfoExtractor):
object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',
lambda m: m.group(1) + '/>', object_str)
# Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
- object_str = object_str.replace(u'<--', u'<!--')
+ object_str = object_str.replace('<--', '<!--')
object_doc = xml.etree.ElementTree.fromstring(object_str)
- assert u'BrightcoveExperience' in object_doc.attrib['class']
+ assert 'BrightcoveExperience' in object_doc.attrib['class']
params = {'flashID': object_doc.attrib['id'],
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
@@ -120,6 +122,8 @@ class BrightcoveIE(InfoExtractor):
return None
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
# Change the 'videoId' and others field to '@videoPlayer'
url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url)
# Change bckey (used by bcove.me urls) to playerKey
@@ -130,9 +134,10 @@ class BrightcoveIE(InfoExtractor):
videoPlayer = query.get('@videoPlayer')
if videoPlayer:
- return self._get_video_info(videoPlayer[0], query_str, query,
- # We set the original url as the default 'Referer' header
- referer=url)
+ # We set the original url as the default 'Referer' header
+ referer = smuggled_data.get('Referer', url)
+ return self._get_video_info(
+ videoPlayer[0], query_str, query, referer=referer)
player_key = query['playerKey']
return self._get_playlist_info(player_key[0])
@@ -156,11 +161,11 @@ class BrightcoveIE(InfoExtractor):
def _get_playlist_info(self, player_key):
playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
- player_key, u'Downloading playlist information')
+ player_key, 'Downloading playlist information')
json_data = json.loads(playlist_info)
if 'videoList' not in json_data:
- raise ExtractorError(u'Empty playlist')
+ raise ExtractorError('Empty playlist')
playlist_info = json_data['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
@@ -189,5 +194,5 @@ class BrightcoveIE(InfoExtractor):
'url': video_info['FLVFullLengthURL'],
- raise ExtractorError(u'Unable to extract video url for %s' % info['id'])
+ raise ExtractorError('Unable to extract video url for %s' % info['id'])
return info
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index ae70ea229..574881b70 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -76,14 +76,18 @@ class Channel9IE(InfoExtractor):
</div>)? # File size part may be missing
# Extract known formats
- formats = [{'url': x.group('url'),
- 'format_id': x.group('quality'),
- 'format_note': x.group('note'),
- 'format': '%s (%s)' % (x.group('quality'), x.group('note')),
- 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
- } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
- # Sort according to known formats list
- formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
+ formats = [{
+ 'url': x.group('url'),
+ 'format_id': x.group('quality'),
+ 'format_note': x.group('note'),
+ 'format': u'%s (%s)' % (x.group('quality'), x.group('note')),
+ 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
+ 'preference': self._known_formats.index(x.group('quality')),
+ 'vcodec': 'none' if x.group('note') == 'Audio only' else None,
+ } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
+ self._sort_formats(formats)
return formats
def _extract_title(self, html):
diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py
new file mode 100644
index 000000000..88e0e9aba
--- /dev/null
+++ b/youtube_dl/extractor/cmt.py
@@ -0,0 +1,19 @@
+from .mtv import MTVIE
+class CMTIE(MTVIE):
+ IE_NAME = u'cmt.com'
+ _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml'
+ _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/'
+ _TESTS = [
+ {
+ u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061',
+ u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2',
+ u'info_dict': {
+ u'id': u'989124',
+ u'ext': u'mp4',
+ u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
+ u'description': u'Blame It All On My Roots',
+ },
+ },
+ ]
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index a034bb2fb..ecac5e0e9 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -1,7 +1,10 @@
import re
from .common import InfoExtractor
-from ..utils import determine_ext
+from ..utils import (
+ int_or_none,
+ parse_duration,
class CNNIE(InfoExtractor):
@@ -15,6 +18,8 @@ class CNNIE(InfoExtractor):
u'info_dict': {
u'title': u'Nadal wins 8th French Open title',
u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
+ u'duration': 135,
+ u'upload_date': u'20130609',
@@ -35,22 +40,58 @@ class CNNIE(InfoExtractor):
info = self._download_xml(info_url, page_title)
formats = []
+ rex = re.compile(r'''(?x)
+ (?P<width>[0-9]+)x(?P<height>[0-9]+)
+ (?:_(?P<bitrate>[0-9]+)k)?
+ ''')
for f in info.findall('files/file'):
- mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate'])
- if mf is not None:
- formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text))
- formats = sorted(formats)
- (_,_,_, video_path) = formats[-1]
- video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path
+ video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip())
+ fdct = {
+ 'format_id': f.attrib['bitrate'],
+ 'url': video_url,
+ }
+ mf = rex.match(f.attrib['bitrate'])
+ if mf:
+ fdct['width'] = int(mf.group('width'))
+ fdct['height'] = int(mf.group('height'))
+ fdct['tbr'] = int_or_none(mf.group('bitrate'))
+ else:
+ mf = rex.search(f.text)
+ if mf:
+ fdct['width'] = int(mf.group('width'))
+ fdct['height'] = int(mf.group('height'))
+ fdct['tbr'] = int_or_none(mf.group('bitrate'))
+ else:
+ mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate'])
+ if mi:
+ if mi.group(1) == 'audio':
+ fdct['vcodec'] = 'none'
+ fdct['ext'] = 'm4a'
+ else:
+ fdct['tbr'] = int(mi.group(1))
+ formats.append(fdct)
+ self._sort_formats(formats)
thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')])
thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]
- return {'id': info.attrib['id'],
- 'title': info.find('headline').text,
- 'url': video_url,
- 'ext': determine_ext(video_url),
- 'thumbnail': thumbnails[-1][1],
- 'thumbnails': thumbs_dict,
- 'description': info.find('description').text,
- }
+ metas_el = info.find('metas')
+ upload_date = (
+ metas_el.attrib.get('version') if metas_el is not None else None)
+ duration_el = info.find('length')
+ duration = parse_duration(duration_el.text)
+ return {
+ 'id': info.attrib['id'],
+ 'title': info.find('headline').text,
+ 'formats': formats,
+ 'thumbnail': thumbnails[-1][1],
+ 'thumbnails': thumbs_dict,
+ 'description': info.find('description').text,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py
index b27c1dfc5..d10b7bd0c 100644
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -1,82 +1,68 @@
+from __future__ import unicode_literals
+import json
import re
from .common import InfoExtractor
-from ..utils import (
- compat_urllib_parse_urlparse,
- determine_ext,
- ExtractorError,
class CollegeHumorIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'
_TESTS = [{
- u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
- u'file': u'6902724.mp4',
- u'md5': u'1264c12ad95dca142a9f0bf7968105a0',
- u'info_dict': {
- u'title': u'Comic-Con Cosplay Catastrophe',
- u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.',
+ 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
+ 'file': '6902724.mp4',
+ 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
+ 'info_dict': {
+ 'title': 'Comic-Con Cosplay Catastrophe',
+ 'description': 'Fans get creative this year at San Diego. Too',
+ 'age_limit': 13,
- u'url': u'http://www.collegehumor.com/video/3505939/font-conference',
- u'file': u'3505939.mp4',
- u'md5': u'c51ca16b82bb456a4397987791a835f5',
- u'info_dict': {
- u'title': u'Font Conference',
- u'description': u'This video wasn\'t long enough, so we made it double-spaced.',
+ 'url': 'http://www.collegehumor.com/video/3505939/font-conference',
+ 'file': '3505939.mp4',
+ 'md5': '72fa701d8ef38664a4dbb9e2ab721816',
+ 'info_dict': {
+ 'title': 'Font Conference',
+ 'description': 'This video wasn\'t long enough, so we made it double-spaced.',
+ 'age_limit': 10,
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('videoid')
- info = {
- 'id': video_id,
- 'uploader': None,
- 'upload_date': None,
- }
- self.report_extraction(video_id)
- xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
- mdoc = self._download_xml(xmlUrl, video_id,
- u'Downloading info XML',
- u'Unable to download video info XML')
+ jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json'
+ data = json.loads(self._download_webpage(
+ jsonUrl, video_id, 'Downloading info JSON'))
+ vdata = data['video']
- try:
- videoNode = mdoc.findall('./video')[0]
- youtubeIdNode = videoNode.find('./youtubeID')
- if youtubeIdNode is not None:
- return self.url_result(youtubeIdNode.text, 'Youtube')
- info['description'] = videoNode.findall('./description')[0].text
- info['title'] = videoNode.findall('./caption')[0].text
- info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
- next_url = videoNode.findall('./file')[0].text
- except IndexError:
- raise ExtractorError(u'Invalid metadata XML file')
- if next_url.endswith(u'manifest.f4m'):
- manifest_url = next_url + '?hdcore=2.10.3'
- adoc = self._download_xml(manifest_url, video_id,
- u'Downloading XML manifest',
- u'Unable to download video info XML')
- try:
- video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
- except IndexError:
- raise ExtractorError(u'Invalid manifest file')
- url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
- info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
- info['ext'] = 'mp4'
+ AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0}
+ rating = vdata.get('rating')
+ if rating:
+ age_limit = AGE_LIMITS.get(rating.lower())
- # Old-style direct links
- info['url'] = next_url
- info['ext'] = determine_ext(info['url'])
+ age_limit = None # None = No idea
+ PREFS = {'high_quality': 2, 'low_quality': 0}
+ formats = []
+ for format_key in ('mp4', 'webm'):
+ for qname, qurl in vdata[format_key].items():
+ formats.append({
+ 'format_id': format_key + '_' + qname,
+ 'url': qurl,
+ 'format': format_key,
+ 'preference': PREFS.get(qname),
+ })
+ self._sort_formats(formats)
- return info
+ return {
+ 'id': video_id,
+ 'title': vdata['title'],
+ 'description': vdata.get('description'),
+ 'thumbnail': vdata.get('thumbnail'),
+ 'formats': formats,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index a54ce3ee7..27bd8256e 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -12,7 +12,9 @@ from ..utils import (
class ComedyCentralIE(MTVServicesInfoExtractor):
- _VALID_URL = r'https?://(?:www.)?comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
+ _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/
+ (video-clips|episodes|cc-studios|video-collections)
+ /(?P<title>.*)'''
_FEED_URL = u'http://comedycentral.com/feeds/mrss/'
_TEST = {
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index ba46a7bc7..2a5e8076c 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -9,6 +9,7 @@ import xml.etree.ElementTree
from ..utils import (
+ compat_urllib_parse_urlparse,
@@ -37,10 +38,12 @@ class InfoExtractor(object):
id: Video identifier.
title: Video title, unescaped.
- Additionally, it must contain either a formats entry or url and ext:
+ Additionally, it must contain either a formats entry or a url one:
- formats: A list of dictionaries for each format available, it must
- be ordered from worst to best quality. Potential fields:
+ formats: A list of dictionaries for each format available, ordered
+ from worst to best quality.
+ Potential fields:
* url Mandatory. The URL of the video file
* ext Will be calculated from url if missing
* format A human-readable description of the format
@@ -48,23 +51,36 @@ class InfoExtractor(object):
Calculated from the format_id, width, height.
and format_note fields if missing.
* format_id A short description of the format
- ("mp4_h264_opus" or "19")
+ ("mp4_h264_opus" or "19").
+ Technically optional, but strongly recommended.
* format_note Additional info about the format
("3D" or "DASH video")
* width Width of the video, if known
* height Height of the video, if known
+ * resolution Textual description of width and height
+ * tbr Average bitrate of audio and video in KBit/s
* abr Average audio bitrate in KBit/s
* acodec Name of the audio codec in use
* vbr Average video bitrate in KBit/s
* vcodec Name of the video codec in use
* filesize The number of bytes, if known in advance
* player_url SWF Player URL (used for rtmpdump).
+ * protocol The protocol that will be used for the actual
+ download, lower-case.
+ "http", "https", "rtsp", "rtmp" or so.
+ * preference Order number of this format. If this field is
+ present and not None, the formats get sorted
+ by this field.
+ -1 for default (order by other properties),
+ -2 or smaller for less than default.
+ * quality Order number of the video quality of this
+ format, irrespective of the file format.
+ -1 for default (order by other properties),
+ -2 or smaller for less than default.
url: Final video URL.
ext: Video filename extension.
format: The video format, defaults to ext (used for --get-format)
player_url: SWF Player URL (used for rtmpdump).
- urlhandle: [internal] The urlHandle to be used to download the file,
- like returned by urllib.request.urlopen
The following fields are optional:
@@ -244,6 +260,11 @@ class InfoExtractor(object):
xml_string = transform_source(xml_string)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+ def report_warning(self, msg, video_id=None):
+ idstr = u'' if video_id is None else u'%s: ' % video_id
+ self._downloader.report_warning(
+ u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
@@ -361,7 +382,7 @@ class InfoExtractor(object):
def _og_regexes(prop):
content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
- property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
+ property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
@@ -426,6 +447,57 @@ class InfoExtractor(object):
return RATING_TABLE.get(rating.lower(), None)
+ def _sort_formats(self, formats):
+ def _formats_key(f):
+ # TODO remove the following workaround
+ from ..utils import determine_ext
+ if not f.get('ext') and 'url' in f:
+ f['ext'] = determine_ext(f['url'])
+ preference = f.get('preference')
+ if preference is None:
+ proto = f.get('protocol')
+ if proto is None:
+ proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
+ preference = 0 if proto in ['http', 'https'] else -0.1
+ if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
+ preference -= 0.5
+ if f.get('vcodec') == 'none': # audio only
+ if self._downloader.params.get('prefer_free_formats'):
+ ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
+ else:
+ ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
+ ext_preference = 0
+ try:
+ audio_ext_preference = ORDER.index(f['ext'])
+ except ValueError:
+ audio_ext_preference = -1
+ else:
+ if self._downloader.params.get('prefer_free_formats'):
+ ORDER = [u'flv', u'mp4', u'webm']
+ else:
+ ORDER = [u'webm', u'flv', u'mp4']
+ try:
+ ext_preference = ORDER.index(f['ext'])
+ except ValueError:
+ ext_preference = -1
+ audio_ext_preference = 0
+ return (
+ preference,
+ f.get('quality') if f.get('quality') is not None else -1,
+ f.get('height') if f.get('height') is not None else -1,
+ f.get('width') if f.get('width') is not None else -1,
+ ext_preference,
+ f.get('vbr') if f.get('vbr') is not None else -1,
+ f.get('abr') if f.get('abr') is not None else -1,
+ audio_ext_preference,
+ f.get('filesize') if f.get('filesize') is not None else -1,
+ f.get('format_id'),
+ )
+ formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index d5730684d..a2cbd4d8d 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -1,20 +1,25 @@
+from __future__ import unicode_literals
+import json
import re
from .common import InfoExtractor
from ..utils import (
- compat_urllib_parse,
+ unescapeHTML,
class CSpanIE(InfoExtractor):
_VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)'
_TEST = {
- u'url': u'http://www.c-spanvideo.org/program/HolderonV',
- u'file': u'315139.flv',
- u'md5': u'74a623266956f69e4df0068ab6c80fe4',
- u'info_dict': {
- u"title": u"Attorney General Eric Holder on Voting Rights Act Decision"
+ 'url': 'http://www.c-spanvideo.org/program/HolderonV',
+ 'file': '315139.mp4',
+ 'md5': '8e44ce11f0f725527daccc453f553eb0',
+ 'info_dict': {
+ 'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
+ 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in [Shelby County v. Holder] in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
- u'skip': u'Requires rtmpdump'
def _real_extract(self, url):
@@ -22,30 +27,22 @@ class CSpanIE(InfoExtractor):
prog_name = mobj.group(1)
webpage = self._download_webpage(url, prog_name)
video_id = self._search_regex(r'programid=(.*?)&', webpage, 'video id')
- data = compat_urllib_parse.urlencode({'programid': video_id,
- 'dynamic':'1'})
- info_url = 'http://www.c-spanvideo.org/common/services/flashXml.php?' + data
- video_info = self._download_webpage(info_url, video_id, u'Downloading video info')
- self.report_extraction(video_id)
- title = self._html_search_regex(r'<string name="title">(.*?)</string>',
- video_info, 'title')
- description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',
- webpage, 'description',
- flags=re.MULTILINE|re.DOTALL)
- url = self._search_regex(r'<string name="URL">(.*?)</string>',
- video_info, 'video url')
- url = url.replace('$(protocol)', 'rtmp').replace('$(port)', '443')
- path = self._search_regex(r'<string name="path">(.*?)</string>',
- video_info, 'rtmp play path')
- return {'id': video_id,
- 'title': title,
- 'ext': 'flv',
- 'url': url,
- 'play_path': path,
- 'description': description,
- 'thumbnail': self._og_search_thumbnail(webpage),
- }
+ title = self._html_search_regex(
+ r'<!-- title -->\n\s*<h1[^>]*>(.*?)</h1>', webpage, 'title')
+ description = self._og_search_description(webpage)
+ info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
+ data_json = self._download_webpage(
+ info_url, video_id, 'Downloading video info')
+ data = json.loads(data_json)
+ url = unescapeHTML(data['video']['files'][0]['path']['#text'])
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': url,
+ 'description': description,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py
index cb7226f82..0b11d1f10 100644
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -4,18 +4,17 @@ import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
class DreiSatIE(InfoExtractor):
IE_NAME = '3sat'
- _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+ _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
_TEST = {
u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
- u'file': u'36983.webm',
- u'md5': u'57c97d0469d71cf874f6815aa2b7c944',
+ u'file': u'36983.mp4',
+ u'md5': u'9dcfe344732808dbfcc901537973c922',
u'info_dict': {
u"title": u"Kaffeeland Schweiz",
u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...",
@@ -52,18 +51,12 @@ class DreiSatIE(InfoExtractor):
'width': int(fe.find('./width').text),
'height': int(fe.find('./height').text),
'url': fe.find('./url').text,
- 'ext': determine_ext(fe.find('./url').text),
'filesize': int(fe.find('./filesize').text),
'video_bitrate': int(fe.find('./videoBitrate').text),
- '3sat_qualityname': fe.find('./quality').text,
} for fe in format_els
if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')]
- def _sortkey(format):
- qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname'])
- prefer_http = 1 if 'rtmp' in format['url'] else 0
- return (qidx, prefer_http, format['video_bitrate'])
- formats.sort(key=_sortkey)
+ self._sort_formats(formats)
return {
'_type': 'video',
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7a14c98f9..7d0e117de 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1,9 +1,12 @@
# encoding: utf-8
+from __future__ import unicode_literals
import os
import re
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..utils import (
@@ -22,78 +25,78 @@ from .ooyala import OoyalaIE
class GenericIE(InfoExtractor):
- IE_DESC = u'Generic downloader that works on some sites'
+ IE_DESC = 'Generic downloader that works on some sites'
_VALID_URL = r'.*'
- IE_NAME = u'generic'
+ IE_NAME = 'generic'
_TESTS = [
- u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
- u'file': u'13601338388002.mp4',
- u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd',
- u'info_dict': {
- u"uploader": u"www.hodiho.fr",
- u"title": u"R\u00e9gis plante sa Jeep"
+ 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+ 'file': '13601338388002.mp4',
+ 'md5': '6e15c93721d7ec9e9ca3fdbf07982cfd',
+ 'info_dict': {
+ 'uploader': 'www.hodiho.fr',
+ 'title': 'R\u00e9gis plante sa Jeep',
# embedded vimeo video
- u'add_ie': ['Vimeo'],
- u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
- u'file': u'22444065.mp4',
- u'md5': u'2903896e23df39722c33f015af0666e2',
- u'info_dict': {
- u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
- u"uploader_id": u"skillsmatter",
- u"uploader": u"Skills Matter",
+ 'add_ie': ['Vimeo'],
+ 'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',
+ 'file': '22444065.mp4',
+ 'md5': '2903896e23df39722c33f015af0666e2',
+ 'info_dict': {
+ 'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011',
+ 'uploader_id': 'skillsmatter',
+ 'uploader': 'Skills Matter',
# bandcamp page with custom domain
- u'add_ie': ['Bandcamp'],
- u'url': u'http://bronyrock.com/track/the-pony-mash',
- u'file': u'3235767654.mp3',
- u'info_dict': {
- u'title': u'The Pony Mash',
- u'uploader': u'M_Pallante',
+ 'add_ie': ['Bandcamp'],
+ 'url': 'http://bronyrock.com/track/the-pony-mash',
+ 'file': '3235767654.mp3',
+ 'info_dict': {
+ 'title': 'The Pony Mash',
+ 'uploader': 'M_Pallante',
- u'skip': u'There is a limit of 200 free downloads / month for the test song',
+ 'skip': 'There is a limit of 200 free downloads / month for the test song',
# embedded brightcove video
# it also tests brightcove videos that need to set the 'Referer' in the
# http requests
- u'add_ie': ['Brightcove'],
- u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
- u'info_dict': {
- u'id': u'2765128793001',
- u'ext': u'mp4',
- u'title': u'Le cours de bourse : l’analyse technique',
- u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9',
- u'uploader': u'BFM BUSINESS',
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
+ 'info_dict': {
+ 'id': '2765128793001',
+ 'ext': 'mp4',
+ 'title': 'Le cours de bourse : l’analyse technique',
+ 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
+ 'uploader': 'BFM BUSINESS',
- u'params': {
- u'skip_download': True,
+ 'params': {
+ 'skip_download': True,
# Direct link to a video
- u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4',
- u'file': u'trailer.mp4',
- u'md5': u'67d406c2bcb6af27fa886f31aa934bbe',
- u'info_dict': {
- u'id': u'trailer',
- u'title': u'trailer',
- u'upload_date': u'20100513',
+ 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+ 'file': 'trailer.mp4',
+ 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+ 'info_dict': {
+ 'id': 'trailer',
+ 'title': 'trailer',
+ 'upload_date': '20100513',
# ooyala video
- u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c',
- u'info_dict': {
- u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
- u'ext': u'mp4',
- u'title': u'2cc213299525360.mov', #that's what we get
+ 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
+ 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
+ 'info_dict': {
+ 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
+ 'ext': 'mp4',
+ 'title': '2cc213299525360.mov', #that's what we get
@@ -101,12 +104,12 @@ class GenericIE(InfoExtractor):
def report_download_webpage(self, video_id):
"""Report webpage download."""
if not self._downloader.params.get('test', False):
- self._downloader.report_warning(u'Falling back on generic information extractor.')
+ self._downloader.report_warning('Falling back on generic information extractor.')
super(GenericIE, self).report_download_webpage(video_id)
def report_following_redirect(self, new_url):
"""Report information extraction."""
- self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
+ self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
def _send_head(self, url):
"""Check if it is a redirect, like url shorteners, in case return the new url."""
@@ -152,7 +155,7 @@ class GenericIE(InfoExtractor):
response = opener.open(HEADRequest(url))
if response is None:
- raise ExtractorError(u'Invalid URL protocol')
+ raise ExtractorError('Invalid URL protocol')
return response
def _real_extract(self, url):
@@ -162,6 +165,8 @@ class GenericIE(InfoExtractor):
return self.url_result('http://' + url)
video_id = os.path.splitext(url.split('/')[-1])[0]
+ self.to_screen('%s: Requesting header' % video_id)
response = self._send_head(url)
@@ -184,7 +189,7 @@ class GenericIE(InfoExtractor):
'formats': [{
'format_id': m.group('format_id'),
'url': url,
- 'vcodec': u'none' if m.group('type') == 'audio' else None
+ 'vcodec': 'none' if m.group('type') == 'audio' else None
'upload_date': upload_date,
@@ -198,7 +203,7 @@ class GenericIE(InfoExtractor):
except ValueError:
# since this is the last-resort InfoExtractor, if
# this error is thrown, it'll be thrown here
- raise ExtractorError(u'Failed to download URL: %s' % url)
+ raise ExtractorError('Failed to download URL: %s' % url)
@@ -209,18 +214,19 @@ class GenericIE(InfoExtractor):
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
video_title = self._html_search_regex(
- r'(?s)<title>(.*?)</title>', webpage, u'video title',
- default=u'video')
+ r'(?s)<title>(.*?)</title>', webpage, 'video title',
+ default='video')
# video uploader is domain name
video_uploader = self._search_regex(
- r'^(?:https?://)?([^/]*)/.*', url, u'video uploader')
+ r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
# Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage)
if bc_url is not None:
- self.to_screen(u'Brightcove video detected.')
- return self.url_result(bc_url, 'Brightcove')
+ self.to_screen('Brightcove video detected.')
+ surl = smuggle_url(bc_url, {'Referer': url})
+ return self.url_result(surl, 'Brightcove')
# Look for embedded (iframe) Vimeo player
mobj = re.search(
@@ -271,16 +277,12 @@ class GenericIE(InfoExtractor):
# Look for embedded blip.tv player
- mobj = re.search(r'<meta\s[^>]*https?://api.blip.tv/\w+/redirect/\w+/(\d+)', webpage)
+ mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
if mobj:
- return self.url_result('http://blip.tv/seo/-'+mobj.group(1), 'BlipTV')
- mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage)
+ return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')
+ mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage)
if mobj:
- player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1)
- player_page = self._download_webpage(player_url, mobj.group(1))
- blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False)
- if blip_video_id:
- return self.url_result('http://blip.tv/seo/-'+blip_video_id, 'BlipTV')
+ return self.url_result(mobj.group(1), 'BlipTV')
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
@@ -308,6 +310,9 @@ class GenericIE(InfoExtractor):
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
+ # Look for gorilla-vid style embedding
+ mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage)
+ if mobj is None:
# Broaden the search a little bit
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
@@ -327,23 +332,27 @@ class GenericIE(InfoExtractor):
# HTML5 video
mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
if mobj is None:
- raise ExtractorError(u'Unsupported URL: %s' % url)
+ raise ExtractorError('Unsupported URL: %s' % url)
# It's possible that one of the regexes
# matched, but returned an empty group:
if mobj.group(1) is None:
- raise ExtractorError(u'Did not find a valid video URL at %s' % url)
+ raise ExtractorError('Did not find a valid video URL at %s' % url)
video_url = mobj.group(1)
video_url = compat_urlparse.urljoin(url, video_url)
video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+ # Sometimes, jwplayer extraction will result in a YouTube URL
+ if YoutubeIE.suitable(video_url):
+ return self.url_result(video_url, 'Youtube')
# here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0]
return {
- 'id': video_id,
- 'url': video_url,
+ 'id': video_id,
+ 'url': video_url,
'uploader': video_uploader,
- 'title': video_title,
+ 'title': video_title,
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index e5332cce8..16926b4d3 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -55,3 +55,32 @@ class ImdbIE(InfoExtractor):
'description': descr,
'thumbnail': format_info['slate'],
+class ImdbListIE(InfoExtractor):
+ IE_NAME = u'imdb:list'
+ IE_DESC = u'Internet Movie Database lists'
+ _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ list_id = mobj.group('id')
+ # RSS XML is sometimes malformed
+ rss = self._download_webpage('http://rss.imdb.com/list/%s' % list_id, list_id, u'Downloading list RSS')
+ list_title = self._html_search_regex(r'<title>(.*?)</title>', rss, u'list title')
+ # Export is independent of actual author_id, but returns 404 if no author_id is provided.
+ # However, passing dummy author_id seems to be enough.
+ csv = self._download_webpage('http://www.imdb.com/list/export?list_id=%s&author_id=ur00000000' % list_id,
+ list_id, u'Downloading list CSV')
+ entries = []
+ for item in csv.split('\n')[1:]:
+ cols = item.split(',')
+ if len(cols) < 2:
+ continue
+ item_id = cols[1][1:-1]
+ if item_id.startswith('vi'):
+ entries.append(self.url_result('http://www.imdb.com/video/imdb/%s' % item_id, 'Imdb'))
+ return self.playlist_result(entries, list_id, list_title) \ No newline at end of file
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
index 16a6f73c8..4ddda2f1b 100644
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -5,7 +5,6 @@ from ..utils import (
- determine_ext,
@@ -63,13 +62,17 @@ class InternetVideoArchiveIE(InfoExtractor):
for content in item.findall(_bp('media:group/media:content')):
attr = content.attrib
f_url = attr['url']
+ width = int(attr['width'])
+ bitrate = int(attr['bitrate'])
+ format_id = '%d-%dk' % (width, bitrate)
+ 'format_id': format_id,
'url': f_url,
- 'ext': determine_ext(f_url),
- 'width': int(attr['width']),
- 'bitrate': int(attr['bitrate']),
+ 'width': width,
+ 'tbr': bitrate,
- formats = sorted(formats, key=lambda f: f['bitrate'])
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index 4bdf55f93..98d1d272a 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -84,14 +84,16 @@ class IviIE(InfoExtractor):
result = video_json[u'result']
- formats = [{'url': x[u'url'],
- 'format_id': x[u'content_format']
- } for x in result[u'files'] if x[u'content_format'] in self._known_formats]
- formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id']))
- if len(formats) == 0:
- self._downloader.report_warning(u'No media links available for %s' % video_id)
- return
+ formats = [{
+ 'url': x[u'url'],
+ 'format_id': x[u'content_format'],
+ 'preference': self._known_formats.index(x[u'content_format']),
+ } for x in result[u'files'] if x[u'content_format'] in self._known_formats]
+ self._sort_formats(formats)
+ if not formats:
+ raise ExtractorError(u'No media links available for %s' % video_id)
duration = result[u'duration']
compilation = result[u'compilation']
diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py
new file mode 100644
index 000000000..aad782578
--- /dev/null
+++ b/youtube_dl/extractor/jpopsukitv.py
@@ -0,0 +1,73 @@
+# coding=utf-8
+from __future__ import unicode_literals
+import re
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+class JpopsukiIE(InfoExtractor):
+ IE_NAME = 'jpopsuki.tv'
+ _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)'
+ _TEST = {
+ 'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771',
+ 'md5': '88018c0c1a9b1387940e90ec9e7e198e',
+ 'file': '00be659d23b0b40508169cdee4545771.mp4',
+ 'info_dict': {
+ 'id': '00be659d23b0b40508169cdee4545771',
+ 'title': 'ayumi hamasaki - evolution',
+ 'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution',
+ 'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg',
+ 'uploader': 'plama_chan',
+ 'uploader_id': '404',
+ 'upload_date': '20121101'
+ }
+ }
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ video_url = 'http://www.jpopsuki.tv' + self._html_search_regex(
+ r'<source src="(.*?)" type', webpage, 'video url')
+ video_title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ uploader = self._html_search_regex(
+ r'<li>from: <a href="/user/view/user/(.*?)/uid/',
+ webpage, 'video uploader', fatal=False)
+ uploader_id = self._html_search_regex(
+ r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)',
+ webpage, 'video uploader_id', fatal=False)
+ upload_date = self._html_search_regex(
+ r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date',
+ fatal=False)
+ if upload_date is not None:
+ upload_date = unified_strdate(upload_date)
+ view_count_str = self._html_search_regex(
+ r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count',
+ fatal=False)
+ comment_count_str = self._html_search_regex(
+ r'<h2>([0-9]+?) comments</h2>', webpage, 'video comment_count',
+ fatal=False)
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'upload_date': upload_date,
+ 'view_count': int_or_none(view_count_str),
+ 'comment_count': int_or_none(comment_count_str),
+ }
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
new file mode 100644
index 000000000..844ba4dcb
--- /dev/null
+++ b/youtube_dl/extractor/lynda.py
@@ -0,0 +1,142 @@
+from __future__ import unicode_literals
+import re
+import json
+from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
+from ..utils import ExtractorError
+class LyndaIE(SubtitlesInfoExtractor):
+ IE_NAME = 'lynda'
+ IE_DESC = 'lynda.com videos'
+ _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html'
+ _TEST = {
+ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
+ 'file': '114408.mp4',
+ 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
+ u"info_dict": {
+ 'title': 'Using the exercise files',
+ 'duration': 68
+ }
+ }
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+ page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id,
+ video_id, 'Downloading video JSON')
+ video_json = json.loads(page)
+ if 'Status' in video_json and video_json['Status'] == 'NotFound':
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+ if video_json['HasAccess'] is False:
+ raise ExtractorError('Video %s is only available for members' % video_id, expected=True)
+ video_id = video_json['ID']
+ duration = video_json['DurationInSeconds']
+ title = video_json['Title']
+ formats = [{'url': fmt['Url'],
+ 'ext': fmt['Extension'],
+ 'width': fmt['Width'],
+ 'height': fmt['Height'],
+ 'filesize': fmt['FileSize'],
+ 'format_id': str(fmt['Resolution'])
+ } for fmt in video_json['Formats']]
+ self._sort_formats(formats)
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, page)
+ return
+ subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page))
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'subtitles': subtitles,
+ 'formats': formats
+ }
+ _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
+ def _fix_subtitles(self, subtitles):
+ fixed_subtitles = {}
+ for k, v in subtitles.items():
+ subs = json.loads(v)
+ if len(subs) == 0:
+ continue
+ srt = ''
+ for pos in range(0, len(subs) - 1):
+ seq_current = subs[pos]
+ m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
+ if m_current is None:
+ continue
+ seq_next = subs[pos+1]
+ m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
+ if m_next is None:
+ continue
+ appear_time = m_current.group('timecode')
+ disappear_time = m_next.group('timecode')
+ text = seq_current['Caption']
+ srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text)
+ if srt:
+ fixed_subtitles[k] = srt
+ return fixed_subtitles
+ def _get_available_subtitles(self, video_id, webpage):
+ url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
+ sub = self._download_webpage(url, None, note=False)
+ sub_json = json.loads(sub)
+ return {'en': url} if len(sub_json) > 0 else {}
+class LyndaCourseIE(InfoExtractor):
+ IE_NAME = 'lynda:course'
+ IE_DESC = 'lynda.com online courses'
+ # Course link equals to welcome/introduction video link of same course
+ # We will recognize it as course link
+ _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html'
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_path = mobj.group('coursepath')
+ course_id = mobj.group('courseid')
+ page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
+ course_id, 'Downloading course JSON')
+ course_json = json.loads(page)
+ if 'Status' in course_json and course_json['Status'] == 'NotFound':
+ raise ExtractorError('Course %s does not exist' % course_id, expected=True)
+ unaccessible_videos = 0
+ videos = []
+ for chapter in course_json['Chapters']:
+ for video in chapter['Videos']:
+ if video['HasAccess'] is not True:
+ unaccessible_videos += 1
+ continue
+ videos.append(video['ID'])
+ if unaccessible_videos > 0:
+ self._downloader.report_warning('%s videos are only available for members and will not be downloaded' % unaccessible_videos)
+ entries = [
+ self.url_result('http://www.lynda.com/%s/%s-4.html' %
+ (course_path, video_id),
+ 'Lynda')
+ for video_id in videos]
+ course_title = course_json['Title']
+ return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py
new file mode 100644
index 000000000..b818cf50c
--- /dev/null
+++ b/youtube_dl/extractor/macgamestore.py
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+import re
+from .common import InfoExtractor
+from ..utils import ExtractorError
+class MacGameStoreIE(InfoExtractor):
+ IE_NAME = 'macgamestore'
+ IE_DESC = 'MacGameStore trailers'
+ _VALID_URL = r'https?://www\.macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450',
+ 'file': '2450.m4v',
+ 'md5': '8649b8ea684b6666b4c5be736ecddc61',
+ 'info_dict': {
+ 'title': 'Crow',
+ }
+ }
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id, 'Downloading trailer page')
+ if re.search(r'>Missing Media<', webpage) is not None:
+ raise ExtractorError('Trailer %s does not exist' % video_id, expected=True)
+ video_title = self._html_search_regex(
+ r'<title>MacGameStore: (.*?) Trailer</title>', webpage, 'title')
+ video_url = self._html_search_regex(
+ r'(?s)<div\s+id="video-player".*?href="([^"]+)"\s*>',
+ webpage, 'video URL')
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title
+ }
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 08ce0647f..7aa0080d7 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -52,10 +52,11 @@ class MDRIE(InfoExtractor):
'format_id': u'%s-%d' % (media_type, vbr),
- formats.sort(key=lambda f: (f.get('vbr'), f['abr']))
if not formats:
raise ExtractorError(u'Could not find any valid formats')
+ self._sort_formats(formats)
return {
'id': video_id,
'title': title,
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 52be9232f..76b717fe5 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -33,8 +33,18 @@ class TechTVMITIE(InfoExtractor):
raw_page, u'base url')
formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
u'video formats')
- formats = json.loads(formats_json)
- formats = sorted(formats, key=lambda f: f['bitrate'])
+ formats_mit = json.loads(formats_json)
+ formats = [
+ {
+ 'format_id': f['label'],
+ 'url': base_url + f['url'].partition(':')[2],
+ 'ext': f['url'].partition(':')[0],
+ 'format': f['label'],
+ 'width': f['width'],
+ 'vbr': f['bitrate'],
+ }
+ for f in formats_mit
+ ]
title = get_element_by_id('edit-title', clean_page)
description = clean_html(get_element_by_id('edit-description', clean_page))
@@ -43,8 +53,7 @@ class TechTVMITIE(InfoExtractor):
return {'id': video_id,
'title': title,
- 'url': base_url + formats[-1]['url'].replace('mp4:', ''),
- 'ext': 'mp4',
+ 'formats': formats,
'description': description,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 125d81551..7c54ea0f4 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -53,7 +53,7 @@ class MixcloudIE(InfoExtractor):
info = json.loads(json_data)
preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')
- song_url = preview_url.replace('/previews/', '/cloudcasts/originals/')
+ song_url = preview_url.replace('/previews/', '/c/originals/')
template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
final_song_url = self._get_url(template_url)
if final_song_url is None:
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index ed11f521a..f1cf41e2d 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -129,7 +129,7 @@ class MTVIE(MTVServicesInfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- uri = mobj.group('mgid')
+ uri = mobj.groupdict().get('mgid')
if uri is None:
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index 0404e6e43..6d35c7861 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -143,8 +143,10 @@ class MyVideoIE(InfoExtractor):
if mobj:
video_url = compat_urllib_parse.unquote(mobj.group(1))
if 'myvideo2flash' in video_url:
- self._downloader.report_warning(u'forcing RTMPT ...')
- video_url = video_url.replace('rtmpe://', 'rtmpt://')
+ self.report_warning(
+ u'Rewriting URL to use unencrypted rtmp:// ...',
+ video_id)
+ video_url = video_url.replace('rtmpe://', 'rtmp://')
if not video_url:
# extract non rtmp videos
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index b42eae89a..88f03608b 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -1,54 +1,98 @@
# coding: utf-8
+from __future__ import unicode_literals
-import re
-import xml.etree.ElementTree
import json
+import re
from .common import InfoExtractor
from ..utils import (
- compat_urlparse,
- ExtractorError,
- find_xpath_attr,
+ HEADRequest,
+ unified_strdate,
class ORFIE(InfoExtractor):
- _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
+ _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
+ 'file': '7319747.mp4',
+ 'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
+ 'info_dict': {
+ 'title': 'Was Sie schon immer über Klassik wissen wollten',
+ 'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
+ 'duration': 3508,
+ 'upload_date': '20140105',
+ },
+ 'skip': 'Blocked outside of Austria',
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id)
- flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')
- flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]
- flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))
- playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')
- playlist = json.loads(playlist_json)
- videos = []
- ns = '{http://tempuri.org/XMLSchema.xsd}'
- xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
- webpage_description = self._og_search_description(webpage)
- for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):
- # Get best quality url
- rtmp_url = None
- for q in ['Q6A', 'Q4A', 'Q1A']:
- video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)
- if video_url is not None:
- rtmp_url = video_url.text
- break
- if rtmp_url is None:
- raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
- description = self._html_search_regex(
- r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
- u'description', default=webpage_description, flags=re.DOTALL)
- videos.append({
+ data_json = self._search_regex(
+ r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
+ all_data = json.loads(data_json)
+ sdata = all_data[0]['values']['segments']
+ def quality_to_int(s):
+ m = re.search('([0-9]+)', s)
+ if m is None:
+ return -1
+ return int(m.group(1))
+ entries = []
+ for sd in sdata:
+ video_id = sd['id']
+ formats = [{
+ 'preference': -10 if fd['delivery'] == 'hls' else None,
+ 'format_id': '%s-%s-%s' % (
+ fd['delivery'], fd['quality'], fd['quality_string']),
+ 'url': fd['src'],
+ 'protocol': fd['protocol'],
+ 'quality': quality_to_int(fd['quality']),
+ } for fd in sd['playlist_item_array']['sources']]
+ # Check for geoblocking.
+ # There is a property is_geoprotection, but that's always false
+ geo_str = sd.get('geoprotection_string')
+ if geo_str:
+ try:
+ http_url = next(
+ f['url']
+ for f in formats
+ if re.match(r'^https?://.*\.mp4$', f['url']))
+ except StopIteration:
+ pass
+ else:
+ req = HEADRequest(http_url)
+ response = self._request_webpage(
+ req, video_id,
+ note='Testing for geoblocking',
+ errnote=((
+ 'This video seems to be blocked outside of %s. '
+ 'You may want to try the streaming-* formats.')
+ % geo_str),
+ fatal=False)
+ self._sort_formats(formats)
+ upload_date = unified_strdate(sd['created_date'])
+ entries.append({
'_type': 'video',
- 'id': info['id'],
- 'title': info['title'],
- 'url': rtmp_url,
- 'ext': 'flv',
- 'description': description,
- })
- return videos
+ 'id': video_id,
+ 'title': sd['header'],
+ 'formats': formats,
+ 'description': sd.get('description'),
+ 'duration': int(sd['duration_in_seconds']),
+ 'upload_date': upload_date,
+ 'thumbnail': sd.get('image_full_url'),
+ })
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'id': playlist_id,
+ }
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 71abd5013..e9ff8d1af 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -5,7 +5,7 @@ from ..utils import compat_urllib_parse
class PornHdIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
+ _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
_TEST = {
u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
u'file': u'1962.flv',
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index a589a893b..99f5b19d2 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -1,5 +1,6 @@
# encoding: utf-8
+import os.path
import re
import json
import hashlib
@@ -10,6 +11,7 @@ from ..utils import (
+ url_basename,
@@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor):
# We will extract some from the video web page instead
video_page_url = 'http://' + mobj.group('url')
video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
+ # Warning if video is unavailable
+ warning = self._html_search_regex(
+ r'<div class="videoUnModer">(.*?)</div>', video_page,
+ u'warning message', default=None)
+ if warning is not None:
+ self._downloader.report_warning(
+ u'Video %s may not be available; smotri said: %s ' %
+ (video_id, warning))
# Adult content
if re.search(u'EroConfirmText">', video_page) is not None:
@@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor):
# Extract the rest of meta data
video_title = self._search_meta(u'name', video_page, u'title')
if not video_title:
- video_title = video_url.rsplit('/', 1)[-1]
+ video_title = os.path.splitext(url_basename(video_url))[0]
video_description = self._search_meta(u'description', video_page)
END_TEXT = u' на сайте Smotri.com'
- if video_description.endswith(END_TEXT):
+ if video_description and video_description.endswith(END_TEXT):
video_description = video_description[:-len(END_TEXT)]
START_TEXT = u'Смотреть онлайн ролик '
- if video_description.startswith(START_TEXT):
+ if video_description and video_description.startswith(START_TEXT):
video_description = video_description[len(START_TEXT):]
video_thumbnail = self._search_meta(u'thumbnail', video_page)
upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
- upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
- video_upload_date = (
- (
- upload_date_m.group('year') +
- upload_date_m.group('month') +
- upload_date_m.group('day')
+ if upload_date_str:
+ upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
+ video_upload_date = (
+ (
+ upload_date_m.group('year') +
+ upload_date_m.group('month') +
+ upload_date_m.group('day')
+ )
+ if upload_date_m else None
- if upload_date_m else None
- )
+ else:
+ video_upload_date = None
duration_str = self._search_meta(u'duration', video_page)
- duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
- video_duration = (
- (
- (int(duration_m.group('hours')) * 60 * 60) +
- (int(duration_m.group('minutes')) * 60) +
- int(duration_m.group('seconds'))
+ if duration_str:
+ duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
+ video_duration = (
+ (
+ (int(duration_m.group('hours')) * 60 * 60) +
+ (int(duration_m.group('minutes')) * 60) +
+ int(duration_m.group('seconds'))
+ )
+ if duration_m else None
- if duration_m else None
- )
+ else:
+ video_duration = None
video_uploader = self._html_search_regex(
u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index e22ff9c38..951e977bd 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):
- |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*)
+ |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
IE_NAME = u'soundcloud'
@@ -193,7 +193,7 @@ class SoundcloudIE(InfoExtractor):
if track_id is not None:
info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
full_title = track_id
- elif mobj.group('widget'):
+ elif mobj.group('player'):
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
return self.url_result(query['url'][0], ie='Soundcloud')
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 695520524..051a34d5b 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -51,9 +51,10 @@ class SpiegelIE(InfoExtractor):
# Blacklist type 6, it's extremely LQ and not available on the same server
if n.tag.startswith('type') and n.tag != 'type6'
- formats.sort(key=lambda f: f['vbr'])
duration = float(idoc[0].findall('./duration')[0].text)
+ self._sort_formats(formats)
info = {
'id': video_id,
'title': video_title,
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index cec65261b..23172143e 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -55,15 +55,21 @@ class ThePlatformIE(InfoExtractor):
formats = []
for f in switch.findall(_x('smil:video')):
attr = f.attrib
+ width = int(attr['width'])
+ height = int(attr['height'])
+ vbr = int(attr['system-bitrate']) // 1000
+ format_id = '%dx%d_%dk' % (width, height, vbr)
+ 'format_id': format_id,
'url': base_url,
'play_path': 'mp4:' + attr['src'],
'ext': 'flv',
- 'width': int(attr['width']),
- 'height': int(attr['height']),
- 'vbr': int(attr['system-bitrate']),
+ 'width': width,
+ 'height': height,
+ 'vbr': vbr,
- formats.sort(key=lambda f: (f['height'], f['width'], f['vbr']))
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
index 3cf8c853d..b1c854a64 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/youtube_dl/extractor/veehd.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
import re
import json
@@ -8,16 +10,17 @@ from ..utils import (
class VeeHDIE(InfoExtractor):
_VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
_TEST = {
- u'url': u'http://veehd.com/video/4686958',
- u'file': u'4686958.mp4',
- u'info_dict': {
- u'title': u'Time Lapse View from Space ( ISS)',
- u'uploader_id': u'spotted',
- u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7',
+ 'url': 'http://veehd.com/video/4686958',
+ 'file': '4686958.mp4',
+ 'info_dict': {
+ 'title': 'Time Lapse View from Space ( ISS)',
+ 'uploader_id': 'spotted',
+ 'description': 'md5:f0094c4cf3a72e22bc4e4239ef767ad7',
@@ -25,24 +28,30 @@ class VeeHDIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ # VeeHD seems to send garbage on the first request.
+ # See https://github.com/rg3/youtube-dl/issues/2102
+ self._download_webpage(url, video_id, 'Requesting webpage')
webpage = self._download_webpage(url, video_id)
- player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
- webpage, u'player path')
+ player_path = self._search_regex(
+ r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
+ webpage, 'player path')
player_url = compat_urlparse.urljoin(url, player_path)
- player_page = self._download_webpage(player_url, video_id,
- u'Downloading player page')
- config_json = self._search_regex(r'value=\'config=({.+?})\'',
- player_page, u'config json')
+ self._download_webpage(player_url, video_id, 'Requesting player page')
+ player_page = self._download_webpage(
+ player_url, video_id, 'Downloading player page')
+ config_json = self._search_regex(
+ r'value=\'config=({.+?})\'', player_page, 'config json')
config = json.loads(config_json)
video_url = compat_urlparse.unquote(config['clip']['url'])
title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0])
uploader_id = self._html_search_regex(r'<a href="/profile/\d+">(.+?)</a>',
- webpage, u'uploader')
+ webpage, 'uploader')
thumbnail = self._search_regex(r'<img id="veehdpreview" src="(.+?)"',
- webpage, u'thumbnail')
+ webpage, 'thumbnail')
description = self._html_search_regex(r'<td class="infodropdown".*?<div>(.*?)<ul',
- webpage, u'description', flags=re.DOTALL)
+ webpage, 'description', flags=re.DOTALL)
return {
'_type': 'video',
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index 00672c9e5..baa57f343 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -1,22 +1,22 @@
+from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
class VeohIE(InfoExtractor):
- _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)'
+ _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/v(?P<id>\d*)'
_TEST = {
- u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3',
- u'file': u'56314296.mp4',
- u'md5': u'620e68e6a3cff80086df3348426c9ca3',
- u'info_dict': {
- u'title': u'Straight Backs Are Stronger',
- u'uploader': u'LUMOback',
- u'description': u'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+ 'file': '56314296.mp4',
+ 'md5': '620e68e6a3cff80086df3348426c9ca3',
+ 'info_dict': {
+ 'title': 'Straight Backs Are Stronger',
+ 'uploader': 'LUMOback',
+ 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
@@ -28,20 +28,20 @@ class VeohIE(InfoExtractor):
m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
if m_youtube is not None:
youtube_id = m_youtube.group(1)
- self.to_screen(u'%s: detected Youtube video.' % video_id)
+ self.to_screen('%s: detected Youtube video.' % video_id)
return self.url_result(youtube_id, 'Youtube')
info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
info = json.loads(info)
- video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
- return {'id': info['videoId'],
- 'title': info['title'],
- 'ext': determine_ext(video_url),
- 'url': video_url,
- 'uploader': info['username'],
- 'thumbnail': info.get('highResImage') or info.get('medResImage'),
- 'description': info['description'],
- 'view_count': info['views'],
- }
+ video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
+ return {
+ 'id': info['videoId'],
+ 'title': info['title'],
+ 'url': video_url,
+ 'uploader': info['username'],
+ 'thumbnail': info.get('highResImage') or info.get('medResImage'),
+ 'description': info['description'],
+ 'view_count': info['views'],
+ }
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index c3623fcbe..ad86d033a 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -1,4 +1,6 @@
# encoding: utf-8
+from __future__ import unicode_literals
import json
import re
import itertools
@@ -31,54 +33,55 @@ class VimeoIE(InfoExtractor):
_NETRC_MACHINE = 'vimeo'
- IE_NAME = u'vimeo'
+ IE_NAME = 'vimeo'
_TESTS = [
- u'url': u'http://vimeo.com/56015672#at=0',
- u'file': u'56015672.mp4',
- u'md5': u'8879b6cc097e987f02484baf890129e5',
- u'info_dict': {
- u"upload_date": u"20121220",
- u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- u"uploader_id": u"user7108434",
- u"uploader": u"Filippo Valsorda",
- u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ 'url': 'http://vimeo.com/56015672#at=0',
+ 'file': '56015672.mp4',
+ 'md5': '8879b6cc097e987f02484baf890129e5',
+ 'info_dict': {
+ "upload_date": "20121220",
+ "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ "uploader_id": "user7108434",
+ "uploader": "Filippo Valsorda",
+ "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- u'url': u'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
- u'file': u'68093876.mp4',
- u'md5': u'3b5ca6aa22b60dfeeadf50b72e44ed82',
- u'note': u'Vimeo Pro video (#1197)',
- u'info_dict': {
- u'uploader_id': u'openstreetmapus',
- u'uploader': u'OpenStreetMap US',
- u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
+ 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
+ 'file': '68093876.mp4',
+ 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82',
+ 'note': 'Vimeo Pro video (#1197)',
+ 'info_dict': {
+ 'uploader_id': 'openstreetmapus',
+ 'uploader': 'OpenStreetMap US',
+ 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
- u'url': u'http://player.vimeo.com/video/54469442',
- u'file': u'54469442.mp4',
- u'md5': u'619b811a4417aa4abe78dc653becf511',
- u'note': u'Videos that embed the url in the player page',
- u'info_dict': {
- u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
- u'uploader': u'The BLN & Business of Software',
+ 'url': 'http://player.vimeo.com/video/54469442',
+ 'file': '54469442.mp4',
+ 'md5': '619b811a4417aa4abe78dc653becf511',
+ 'note': 'Videos that embed the url in the player page',
+ 'info_dict': {
+ 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software',
+ 'uploader': 'The BLN & Business of Software',
+ 'uploader_id': 'theblnbusinessofsoftware',
- u'url': u'http://vimeo.com/68375962',
- u'file': u'68375962.mp4',
- u'md5': u'aaf896bdb7ddd6476df50007a0ac0ae7',
- u'note': u'Video protected with password',
- u'info_dict': {
- u'title': u'youtube-dl password protected test video',
- u'upload_date': u'20130614',
- u'uploader_id': u'user18948128',
- u'uploader': u'Jaime Marquínez Ferrándiz',
+ 'url': 'http://vimeo.com/68375962',
+ 'file': '68375962.mp4',
+ 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
+ 'note': 'Video protected with password',
+ 'info_dict': {
+ 'title': 'youtube-dl password protected test video',
+ 'upload_date': '20130614',
+ 'uploader_id': 'user18948128',
+ 'uploader': 'Jaime Marquínez Ferrándiz',
- u'params': {
- u'videopassword': u'youtube-dl',
+ 'params': {
+ 'videopassword': 'youtube-dl',
@@ -90,7 +93,7 @@ class VimeoIE(InfoExtractor):
login_url = 'https://vimeo.com/log_in'
webpage = self._download_webpage(login_url, None, False)
- token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
+ token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
data = compat_urllib_parse.urlencode({'email': username,
'password': password,
'action': 'login',
@@ -100,13 +103,13 @@ class VimeoIE(InfoExtractor):
login_request = compat_urllib_request.Request(login_url, data)
login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
login_request.add_header('Cookie', 'xsrft=%s' % token)
- self._download_webpage(login_request, None, False, u'Wrong login info')
+ self._download_webpage(login_request, None, False, 'Wrong login info')
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
- raise ExtractorError(u'This video is protected by a password, use the --video-password option')
- token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
+ raise ExtractorError('This video is protected by a password, use the --video-password option')
+ token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
data = compat_urllib_parse.urlencode({'password': password,
'token': token})
# I didn't manage to use the password with https
@@ -118,8 +121,8 @@ class VimeoIE(InfoExtractor):
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
password_request.add_header('Cookie', 'xsrft=%s' % token)
self._download_webpage(password_request, video_id,
- u'Verifying the password',
- u'Wrong password')
+ 'Verifying the password',
+ 'Wrong password')
def _real_initialize(self):
@@ -134,7 +137,7 @@ class VimeoIE(InfoExtractor):
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group('id')
if mobj.group('pro') or mobj.group('player'):
@@ -155,7 +158,7 @@ class VimeoIE(InfoExtractor):
config_url = self._html_search_regex(
- r' data-config-url="(.+?)"', webpage, u'config URL')
+ r' data-config-url="(.+?)"', webpage, 'config URL')
config_json = self._download_webpage(config_url, video_id)
config = json.loads(config_json)
except RegexNotFoundError:
@@ -166,19 +169,23 @@ class VimeoIE(InfoExtractor):
config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1))
config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
- config = self._search_regex(config_re, webpage, u'info section',
+ config = self._search_regex(config_re, webpage, 'info section',
config = json.loads(config)
except Exception as e:
if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
- raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
+ raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
if re.search('<form[^>]+?id="pw_form"', webpage) is not None:
self._verify_video_password(url, video_id, webpage)
return self._real_extract(url)
- raise ExtractorError(u'Unable to extract info section',
+ raise ExtractorError('Unable to extract info section',
+ else:
+ if config.get('view') == 4:
+ self._verify_video_password(url, video_id, webpage)
+ return self._real_extract(url)
# Extract title
video_title = config["video"]["title"]
@@ -212,9 +219,9 @@ class VimeoIE(InfoExtractor):
video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
- view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count'))
- like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count'))
- comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count'))
+ view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
+ like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
+ comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
except RegexNotFoundError:
# This info is only available in vimeo.com/{id} urls
view_count = None
@@ -255,7 +262,7 @@ class VimeoIE(InfoExtractor):
for key in ('other', 'sd', 'hd'):
formats += files[key]
if len(formats) == 0:
- raise ExtractorError(u'No known codec found')
+ raise ExtractorError('No known codec found')
return {
'id': video_id,
@@ -274,7 +281,7 @@ class VimeoIE(InfoExtractor):
class VimeoChannelIE(InfoExtractor):
- IE_NAME = u'vimeo:channel'
+ IE_NAME = 'vimeo:channel'
_VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
_TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
@@ -283,14 +290,14 @@ class VimeoChannelIE(InfoExtractor):
return '%s/videos/page:%d/' % (base_url, pagenum)
def _extract_list_title(self, webpage):
- return self._html_search_regex(self._TITLE_RE, webpage, u'list title')
+ return self._html_search_regex(self._TITLE_RE, webpage, 'list title')
def _extract_videos(self, list_id, base_url):
video_ids = []
for pagenum in itertools.count(1):
webpage = self._download_webpage(
self._page_url(base_url, pagenum) ,list_id,
- u'Downloading page %s' % pagenum)
+ 'Downloading page %s' % pagenum)
video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
@@ -310,8 +317,8 @@ class VimeoChannelIE(InfoExtractor):
class VimeoUserIE(VimeoChannelIE):
- IE_NAME = u'vimeo:user'
- _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)'
+ IE_NAME = 'vimeo:user'
+ _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)(?:[#?]|$)'
_TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
@@ -327,7 +334,7 @@ class VimeoUserIE(VimeoChannelIE):
class VimeoAlbumIE(VimeoChannelIE):
- IE_NAME = u'vimeo:album'
+ IE_NAME = 'vimeo:album'
_VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P<id>\d+)'
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
@@ -336,12 +343,12 @@ class VimeoAlbumIE(VimeoChannelIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- album_id = mobj.group('id')
+ album_id = mobj.group('id')
return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)
class VimeoGroupsIE(VimeoAlbumIE):
- IE_NAME = u'vimeo:group'
+ IE_NAME = 'vimeo:group'
_VALID_URL = r'(?:https?://)?vimeo.\com/groups/(?P<name>[^/]+)'
def _extract_list_title(self, webpage):
@@ -351,3 +358,24 @@ class VimeoGroupsIE(VimeoAlbumIE):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name)
+class VimeoReviewIE(InfoExtractor):
+ IE_NAME = 'vimeo:review'
+ IE_DESC = 'Review pages on vimeo'
+ _VALID_URL = r'(?:https?://)?vimeo.\com/[^/]+/review/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',
+ 'file': '75524534.mp4',
+ 'md5': 'c507a72f780cacc12b2248bb4006d253',
+ 'info_dict': {
+ 'title': "DICK HARDWICK 'Comedian'",
+ 'uploader': 'Richard Hardwick',
+ }
+ }
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ player_url = 'https://player.vimeo.com/player/' + video_id
+ return self.url_result(player_url, 'Vimeo', video_id)
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index e1748c261..bc31c2e64 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -44,8 +44,10 @@ class WistiaIE(InfoExtractor):
'height': a['height'],
'filesize': a['size'],
'ext': a['ext'],
+ 'preference': 1 if atype == 'original' else None,
- formats.sort(key=lambda a: a['filesize'])
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 5c9c361b9..e17a39782 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -6,8 +6,8 @@ from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
- determine_ext,
+ int_or_none,
@@ -68,9 +68,9 @@ class YahooIE(InfoExtractor):
formats = []
for s in info['streams']:
format_info = {
- 'width': s.get('width'),
- 'height': s.get('height'),
- 'bitrate': s.get('bitrate'),
+ 'width': int_or_none(s.get('width')),
+ 'height': int_or_none(s.get('height')),
+ 'tbr': int_or_none(s.get('bitrate')),
host = s['host']
@@ -84,10 +84,10 @@ class YahooIE(InfoExtractor):
format_url = compat_urlparse.urljoin(host, path)
format_info['url'] = format_url
- format_info['ext'] = determine_ext(format_url)
- formats = sorted(formats, key=lambda f:(f['height'], f['width']))
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index bd0f2cae0..77ad423c4 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -1,5 +1,4 @@
import json
-import os
import re
import sys
@@ -16,6 +15,7 @@ from ..aes import (
class YouPornIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
_TEST = {
@@ -23,9 +23,9 @@ class YouPornIE(InfoExtractor):
u'file': u'505835.mp4',
u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',
u'info_dict': {
- u"upload_date": u"20101221",
- u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
- u"uploader": u"Ask Dan And Jennifer",
+ u"upload_date": u"20101221",
+ u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
+ u"uploader": u"Ask Dan And Jennifer",
u"title": u"Sex Ed: Is It Safe To Masturbate Daily?",
u"age_limit": 18,
@@ -71,38 +71,36 @@ class YouPornIE(InfoExtractor):
link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8')
- if not links:
- raise ExtractorError(u'ERROR: no known formats available for video')
formats = []
for link in links:
# A link looks like this:
# http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
# A path looks like this:
# /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
video_url = unescapeHTML(link)
path = compat_urllib_parse_urlparse(video_url).path
- extension = os.path.splitext(path)[1][1:]
- format = path.split('/')[4].split('_')[:2]
+ format_parts = path.split('/')[4].split('_')[:2]
- # size = format[0]
- # bitrate = format[1]
- format = "-".join(format)
- # title = u'%s-%s-%s' % (video_title, size, bitrate)
+ dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0]
+ resolution = format_parts[0]
+ height = int(resolution[:-len('p')])
+ bitrate = int(format_parts[1][:-len('k')])
+ format = u'-'.join(format_parts) + u'-' + dn
'url': video_url,
- 'ext': extension,
'format': format,
'format_id': format,
+ 'height': height,
+ 'tbr': bitrate,
+ 'resolution': resolution,
- # Sort and remove doubles
- formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-'))))
- for i in range(len(formats)-1,0,-1):
- if formats[i]['format_id'] == formats[i-1]['format_id']:
- del formats[i]
+ self._sort_formats(formats)
+ if not formats:
+ raise ExtractorError(u'ERROR: no known formats available for video')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index a68576547..9424d5e26 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -150,168 +150,69 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
(?(1).+)? # if we found the ID, everything can follow
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
- # Listed in order of quality
- _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
- # Apple HTTP Live Streaming
- '96', '95', '94', '93', '92', '132', '151',
- # 3D
- '85', '84', '102', '83', '101', '82', '100',
- # Dash video
- '138', '137', '248', '136', '247', '135', '246',
- '245', '244', '134', '243', '133', '242', '160',
- # Dash audio
- '141', '172', '140', '171', '139',
- ]
- _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
- # Apple HTTP Live Streaming
- '96', '95', '94', '93', '92', '132', '151',
- # 3D
- '85', '102', '84', '101', '83', '100', '82',
- # Dash video
- '138', '248', '137', '247', '136', '246', '245',
- '244', '135', '243', '134', '242', '133', '160',
- # Dash audio
- '172', '141', '171', '140', '139',
- ]
- _video_formats_map = {
- 'flv': ['35', '34', '6', '5'],
- '3gp': ['36', '17', '13'],
- 'mp4': ['38', '37', '22', '18'],
- 'webm': ['46', '45', '44', '43'],
- }
- _video_extensions = {
- '13': '3gp',
- '17': '3gp',
- '18': 'mp4',
- '22': 'mp4',
- '36': '3gp',
- '37': 'mp4',
- '38': 'mp4',
- '43': 'webm',
- '44': 'webm',
- '45': 'webm',
- '46': 'webm',
+ _formats = {
+ '5': {'ext': 'flv', 'width': 400, 'height': 240},
+ '6': {'ext': 'flv', 'width': 450, 'height': 270},
+ '13': {'ext': '3gp'},
+ '17': {'ext': '3gp', 'width': 176, 'height': 144},
+ '18': {'ext': 'mp4', 'width': 640, 'height': 360},
+ '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
+ '34': {'ext': 'flv', 'width': 640, 'height': 360},
+ '35': {'ext': 'flv', 'width': 854, 'height': 480},
+ '36': {'ext': '3gp', 'width': 320, 'height': 240},
+ '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
+ '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
+ '43': {'ext': 'webm', 'width': 640, 'height': 360},
+ '44': {'ext': 'webm', 'width': 854, 'height': 480},
+ '45': {'ext': 'webm', 'width': 1280, 'height': 720},
+ '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
# 3d videos
- '82': 'mp4',
- '83': 'mp4',
- '84': 'mp4',
- '85': 'mp4',
- '100': 'webm',
- '101': 'webm',
- '102': 'webm',
+ '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
+ '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
+ '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
+ '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
+ '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
+ '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
+ '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
# Apple HTTP Live Streaming
- '92': 'mp4',
- '93': 'mp4',
- '94': 'mp4',
- '95': 'mp4',
- '96': 'mp4',
- '132': 'mp4',
- '151': 'mp4',
- # Dash mp4
- '133': 'mp4',
- '134': 'mp4',
- '135': 'mp4',
- '136': 'mp4',
- '137': 'mp4',
- '138': 'mp4',
- '160': 'mp4',
+ '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
+ '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
+ '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
+ '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
+ '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
+ '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
+ '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
+ # DASH mp4 video
+ '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
+ '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
+ '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
+ '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
+ '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
+ '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
+ '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
+ '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
# Dash mp4 audio
- '139': 'm4a',
- '140': 'm4a',
- '141': 'm4a',
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
# Dash webm
- '171': 'webm',
- '172': 'webm',
- '242': 'webm',
- '243': 'webm',
- '244': 'webm',
- '245': 'webm',
- '246': 'webm',
- '247': 'webm',
- '248': 'webm',
- }
- _video_dimensions = {
- '5': '400x240',
- '6': '???',
- '13': '???',
- '17': '176x144',
- '18': '640x360',
- '22': '1280x720',
- '34': '640x360',
- '35': '854x480',
- '36': '320x240',
- '37': '1920x1080',
- '38': '4096x3072',
- '43': '640x360',
- '44': '854x480',
- '45': '1280x720',
- '46': '1920x1080',
- '82': '360p',
- '83': '480p',
- '84': '720p',
- '85': '1080p',
- '92': '240p',
- '93': '360p',
- '94': '480p',
- '95': '720p',
- '96': '1080p',
- '100': '360p',
- '101': '480p',
- '102': '720p',
- '132': '240p',
- '151': '72p',
- '133': '240p',
- '134': '360p',
- '135': '480p',
- '136': '720p',
- '137': '1080p',
- '138': '>1080p',
- '139': '48k',
- '140': '128k',
- '141': '256k',
- '160': '192p',
- '171': '128k',
- '172': '256k',
- '242': '240p',
- '243': '360p',
- '244': '480p',
- '245': '480p',
- '246': '480p',
- '247': '720p',
- '248': '1080p',
- }
- _special_itags = {
- '82': '3D',
- '83': '3D',
- '84': '3D',
- '85': '3D',
- '100': '3D',
- '101': '3D',
- '102': '3D',
- '133': 'DASH Video',
- '134': 'DASH Video',
- '135': 'DASH Video',
- '136': 'DASH Video',
- '137': 'DASH Video',
- '138': 'DASH Video',
- '139': 'DASH Audio',
- '140': 'DASH Audio',
- '141': 'DASH Audio',
- '160': 'DASH Video',
- '171': 'DASH Audio',
- '172': 'DASH Audio',
- '242': 'DASH Video',
- '243': 'DASH Video',
- '244': 'DASH Video',
- '245': 'DASH Video',
- '246': 'DASH Video',
- '247': 'DASH Video',
- '248': 'DASH Video',
+ '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
+ '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
+ '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
+ '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
+ '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
+ '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
+ '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
+ # Dash webm audio
+ '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
+ '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
IE_NAME = u'youtube'
@@ -1153,13 +1054,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return {}
- def _print_formats(self, formats):
- print('Available formats:')
- for x in formats:
- print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
- self._video_dimensions.get(x, '???'),
- ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
def _extract_id(self, url):
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
if mobj is None:
@@ -1172,48 +1066,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
Transform a dictionary in the format {itag:url} to a list of (itag, url)
with the requested formats.
- req_format = self._downloader.params.get('format', None)
- format_limit = self._downloader.params.get('format_limit', None)
- available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
- if format_limit is not None and format_limit in available_formats:
- format_list = available_formats[available_formats.index(format_limit):]
- else:
- format_list = available_formats
- existing_formats = [x for x in format_list if x in url_map]
+ existing_formats = [x for x in self._formats if x in url_map]
if len(existing_formats) == 0:
raise ExtractorError(u'no known formats available for video')
- if self._downloader.params.get('listformats', None):
- self._print_formats(existing_formats)
- return
- if req_format is None or req_format == 'best':
- video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
- elif req_format == 'worst':
- video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
- elif req_format in ('-1', 'all'):
- video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
- else:
- # Specific formats. We pick the first in a slash-delimeted sequence.
- # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
- # available in the specified format. For example,
- # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
- # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
- # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
- req_formats = req_format.split('/')
- video_url_list = None
- for rf in req_formats:
- if rf in url_map:
- video_url_list = [(rf, url_map[rf])]
- break
- if rf in self._video_formats_map:
- for srf in self._video_formats_map[rf]:
- if srf in url_map:
- video_url_list = [(srf, url_map[srf])]
- break
- else:
- continue
- break
- if video_url_list is None:
- raise ExtractorError(u'requested format not available')
+ video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
+ video_url_list.reverse() # order worst to best
return video_url_list
def _extract_from_m3u8(self, manifest_url, video_id):
@@ -1462,50 +1319,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url += '&ratebypass=yes'
url_map[url_data['itag'][0]] = url
video_url_list = self._get_video_url_list(url_map)
- if not video_url_list:
- return
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
url_map = self._extract_from_m3u8(manifest_url, video_id)
video_url_list = self._get_video_url_list(url_map)
- if not video_url_list:
- return
raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
- results = []
+ formats = []
for itag, video_real_url in video_url_list:
- # Extension
- video_extension = self._video_extensions.get(itag, 'flv')
- video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
- self._video_dimensions.get(itag, '???'),
- ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
- results.append({
- 'id': video_id,
- 'url': video_real_url,
- 'uploader': video_uploader,
- 'uploader_id': video_uploader_id,
- 'upload_date': upload_date,
- 'title': video_title,
- 'ext': video_extension,
- 'format': video_format,
+ dct = {
'format_id': itag,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- 'player_url': player_url,
- 'subtitles': video_subtitles,
- 'duration': video_duration,
- 'age_limit': 18 if age_gate else 0,
- 'annotations': video_annotations,
- 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
- 'view_count': view_count,
- 'like_count': like_count,
- 'dislike_count': dislike_count,
- })
- return results
+ 'url': video_real_url,
+ 'player_url': player_url,
+ }
+ dct.update(self._formats[itag])
+ formats.append(dct)
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ 'subtitles': video_subtitles,
+ 'duration': video_duration,
+ 'age_limit': 18 if age_gate else 0,
+ 'annotations': video_annotations,
+ 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'formats': formats,
+ }
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists'
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 35ece354a..829f002cf 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,10 +1,10 @@
# coding: utf-8
-import operator
import re
from .common import InfoExtractor
from ..utils import (
+ int_or_none,
@@ -67,29 +67,13 @@ class ZDFIE(InfoExtractor):
''', format_id)
ext = format_m.group('container')
- is_supported = ext != 'f4f'
- PROTO_ORDER = ['http', 'rtmp', 'rtsp']
- try:
- proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
- except ValueError:
- proto_pref = -999
+ proto = format_m.group('proto').lower()
quality = fnode.find('./quality').text
- QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
- try:
- quality_pref = -QUALITY_ORDER.index(quality)
- except ValueError:
- quality_pref = -999
abr = int(fnode.find('./audioBitrate').text) // 1000
vbr = int(fnode.find('./videoBitrate').text) // 1000
- pref = (is_available, is_supported,
- proto_pref, quality_pref, vbr, abr)
format_note = u''
- if not is_supported:
- format_note += u'(unsupported)'
if not format_note:
format_note = None
@@ -101,18 +85,20 @@ class ZDFIE(InfoExtractor):
'vcodec': format_m.group('vcodec'),
'abr': abr,
'vbr': vbr,
- 'width': int(fnode.find('./width').text),
- 'height': int(fnode.find('./height').text),
- 'filesize': int(fnode.find('./filesize').text),
+ 'width': int_or_none(fnode.find('./width').text),
+ 'height': int_or_none(fnode.find('./height').text),
+ 'filesize': int_or_none(fnode.find('./filesize').text),
'format_note': format_note,
- '_pref': pref,
+ 'protocol': proto,
'_available': is_available,
format_nodes = doc.findall('.//formitaeten/formitaet')
- formats = sorted(filter(lambda f: f['_available'],
- map(xml_to_format, format_nodes)),
- key=operator.itemgetter('_pref'))
+ formats = list(filter(
+ lambda f: f['_available'],
+ map(xml_to_format, format_nodes)))
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 20ebea38c..a509f8e2f 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -500,12 +500,13 @@ def unescapeHTML(s):
result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
return result
-def encodeFilename(s):
+def encodeFilename(s, for_subprocess=False):
@param s The name of the file
- assert type(s) == type(u'')
+ assert type(s) == compat_str
# Python 3 has a Unicode API
if sys.version_info >= (3, 0):
@@ -515,12 +516,18 @@ def encodeFilename(s):
# Pass u'' directly to use Unicode APIs on Windows 2000 and up
# (Detecting Windows NT 4 is tricky because 'major >= 4' would
# match Windows 9x series as well. Besides, NT 4 is obsolete.)
- return s
+ if not for_subprocess:
+ return s
+ else:
+ # For subprocess calls, encode with locale encoding
+ # Refer to http://stackoverflow.com/a/9951851/35070
+ encoding = preferredencoding()
encoding = sys.getfilesystemencoding()
- if encoding is None:
- encoding = 'utf-8'
- return s.encode(encoding, 'ignore')
+ if encoding is None:
+ encoding = 'utf-8'
+ return s.encode(encoding, 'ignore')
def decodeOption(optval):
if optval is None:
@@ -539,7 +546,8 @@ def formatSeconds(secs):
return '%d' % secs
-def make_HTTPS_handler(opts_no_check_certificate):
+def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
if sys.version_info < (3, 2):
import httplib
@@ -560,7 +568,7 @@ def make_HTTPS_handler(opts_no_check_certificate):
class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
def https_open(self, req):
return self.do_open(HTTPSConnectionV3, req)
- return HTTPSHandlerV3()
+ return HTTPSHandlerV3(**kwargs)
context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
context.verify_mode = (ssl.CERT_NONE
@@ -571,7 +579,7 @@ def make_HTTPS_handler(opts_no_check_certificate):
except AttributeError:
pass # Python < 3.4
- return compat_urllib_request.HTTPSHandler(context=context)
+ return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
class ExtractorError(Exception):
"""Error during info extraction."""
@@ -756,6 +764,7 @@ def unified_strdate(date_str):
'%Y/%m/%d %H:%M:%S',
+ '%Y-%m-%d %H:%M:%S',
'%d.%m.%Y %H:%M',
@@ -858,12 +867,22 @@ def platform_name():
def write_string(s, out=None):
if out is None:
out = sys.stderr
- assert type(s) == type(u'')
+ assert type(s) == compat_str
if ('b' in getattr(out, 'mode', '') or
sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
s = s.encode(preferredencoding(), 'ignore')
- out.write(s)
+ try:
+ out.write(s)
+ except UnicodeEncodeError:
+ # In Windows shells, this can fail even when the codec is just charmap!?
+ # See https://wiki.python.org/moin/PrintFails#Issue
+ if sys.platform == 'win32' and hasattr(out, 'encoding'):
+ s = s.encode(out.encoding, 'ignore').decode(out.encoding)
+ out.write(s)
+ else:
+ raise
@@ -1017,9 +1036,9 @@ def smuggle_url(url, data):
return url + u'#' + sdata
-def unsmuggle_url(smug_url):
+def unsmuggle_url(smug_url, default=None):
if not '#__youtubedl_smuggle' in smug_url:
- return smug_url, None
+ return smug_url, default
url, _, sdata = smug_url.rpartition(u'#')
jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
data = json.loads(jsond)
@@ -1079,7 +1098,7 @@ def fix_xml_all_ampersand(xml_str):
def setproctitle(title):
- assert isinstance(title, type(u''))
+ assert isinstance(title, compat_str)
libc = ctypes.cdll.LoadLibrary("libc.so.6")
except OSError:
@@ -1107,3 +1126,28 @@ def url_basename(url):
class HEADRequest(compat_urllib_request.Request):
def get_method(self):
return "HEAD"
+def int_or_none(v):
+ return v if v is None else int(v)
+def parse_duration(s):
+ if s is None:
+ return None
+ m = re.match(
+ r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s)
+ if not m:
+ return None
+ res = int(m.group('secs'))
+ if m.group('mins'):
+ res += int(m.group('mins')) * 60
+ if m.group('hours'):
+ res += int(m.group('hours')) * 60 * 60
+ return res
+def prepend_extension(filename, ext):
+ name, real_ext = os.path.splitext(filename)
+ return u'{0}.{1}{2}'.format(name, ext, real_ext)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index c13af8abd..246233e7e 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2013.12.20'
+__version__ = '2014.01.07'