aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rw-r--r--youtube_dl/FileDownloader.py20
-rw-r--r--youtube_dl/YoutubeDL.py182
-rw-r--r--youtube_dl/__init__.py108
-rw-r--r--youtube_dl/extractor/__init__.py12
-rw-r--r--youtube_dl/extractor/anitube.py55
-rw-r--r--youtube_dl/extractor/auengine.py31
-rw-r--r--youtube_dl/extractor/bandcamp.py70
-rw-r--r--youtube_dl/extractor/brightcove.py18
-rw-r--r--youtube_dl/extractor/canalplus.py3
-rw-r--r--youtube_dl/extractor/clipfish.py53
-rw-r--r--youtube_dl/extractor/collegehumor.py11
-rw-r--r--youtube_dl/extractor/comedycentral.py33
-rw-r--r--youtube_dl/extractor/common.py43
-rw-r--r--youtube_dl/extractor/d8.py22
-rw-r--r--youtube_dl/extractor/eighttracks.py1
-rw-r--r--youtube_dl/extractor/escapist.py66
-rw-r--r--youtube_dl/extractor/facebook.py1
-rw-r--r--youtube_dl/extractor/fktv.py1
-rw-r--r--youtube_dl/extractor/gamespot.py2
-rw-r--r--youtube_dl/extractor/generic.py42
-rw-r--r--youtube_dl/extractor/howcast.py2
-rw-r--r--youtube_dl/extractor/jeuxvideo.py2
-rw-r--r--youtube_dl/extractor/livestream.py2
-rw-r--r--youtube_dl/extractor/mixcloud.py2
-rw-r--r--youtube_dl/extractor/mtv.py1
-rw-r--r--youtube_dl/extractor/nhl.py2
-rw-r--r--youtube_dl/extractor/niconico.py131
-rw-r--r--youtube_dl/extractor/pornhub.py1
-rw-r--r--youtube_dl/extractor/soundcloud.py33
-rw-r--r--youtube_dl/extractor/spankwire.py12
-rw-r--r--youtube_dl/extractor/spiegel.py1
-rw-r--r--youtube_dl/extractor/streamcloud.py66
-rw-r--r--youtube_dl/extractor/sztvhu.py3
-rw-r--r--youtube_dl/extractor/teamcoco.py2
-rw-r--r--youtube_dl/extractor/ted.py3
-rw-r--r--youtube_dl/extractor/toutv.py74
-rw-r--r--youtube_dl/extractor/tube8.py2
-rw-r--r--youtube_dl/extractor/videopremium.py14
-rw-r--r--youtube_dl/extractor/viki.py101
-rw-r--r--youtube_dl/extractor/vimeo.py2
-rw-r--r--youtube_dl/extractor/xtube.py1
-rw-r--r--youtube_dl/extractor/youtube.py108
-rw-r--r--youtube_dl/extractor/zdf.py124
-rw-r--r--youtube_dl/update.py18
-rw-r--r--youtube_dl/utils.py66
-rw-r--r--youtube_dl/version.py2
46 files changed, 1176 insertions, 373 deletions
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py
index 84a539b82..c6276d194 100644
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -1,4 +1,3 @@
-import math
import os
import re
import subprocess
@@ -11,6 +10,7 @@ from .utils import (
ContentTooShortError,
determine_ext,
encodeFilename,
+ format_bytes,
sanitize_open,
timeconvert,
)
@@ -54,20 +54,6 @@ class FileDownloader(object):
self.params = params
@staticmethod
- def format_bytes(bytes):
- if bytes is None:
- return 'N/A'
- if type(bytes) is str:
- bytes = float(bytes)
- if bytes == 0.0:
- exponent = 0
- else:
- exponent = int(math.log(bytes, 1024.0))
- suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent]
- converted = float(bytes) / float(1024 ** exponent)
- return '%.2f%s' % (converted, suffix)
-
- @staticmethod
def format_seconds(seconds):
(mins, secs) = divmod(seconds, 60)
(hours, mins) = divmod(mins, 60)
@@ -117,7 +103,7 @@ class FileDownloader(object):
def format_speed(speed):
if speed is None:
return '%10s' % '---b/s'
- return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed))
+ return '%10s' % ('%s/s' % format_bytes(speed))
@staticmethod
def best_block_size(elapsed_time, bytes):
@@ -581,7 +567,7 @@ class FileDownloader(object):
self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
return False
- data_len_str = self.format_bytes(data_len)
+ data_len_str = format_bytes(data_len)
byte_counter = 0 + resume_len
block_size = self.params.get('buffersize', 1024)
start = time.time()
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 20eed96ca..30ba94666 100644
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -5,9 +5,12 @@ from __future__ import absolute_import
import errno
import io
+import json
import os
+import platform
import re
import shutil
+import subprocess
import socket
import sys
import time
@@ -17,6 +20,7 @@ if os.name == 'nt':
import ctypes
from .utils import (
+ compat_cookiejar,
compat_http_client,
compat_print,
compat_str,
@@ -29,9 +33,12 @@ from .utils import (
DownloadError,
encodeFilename,
ExtractorError,
+ format_bytes,
locked_file,
+ make_HTTPS_handler,
MaxDownloadsReached,
PostProcessingError,
+ platform_name,
preferredencoding,
SameFileError,
sanitize_filename,
@@ -40,9 +47,11 @@ from .utils import (
UnavailableVideoError,
write_json_file,
write_string,
+ YoutubeDLHandler,
)
from .extractor import get_info_extractor, gen_extractors
from .FileDownloader import FileDownloader
+from .version import __version__
class YoutubeDL(object):
@@ -84,6 +93,7 @@ class YoutubeDL(object):
forcethumbnail: Force printing thumbnail URL.
forcedescription: Force printing description.
forcefilename: Force printing final filename.
+ forcejson: Force printing info_dict as JSON.
simulate: Do not download the video files.
format: Video format code.
format_limit: Highest quality format to try.
@@ -95,6 +105,7 @@ class YoutubeDL(object):
playlistend: Playlist item to end at.
matchtitle: Download only matching titles.
rejecttitle: Reject downloads for matching titles.
+ logger: Log messages to a logging.Logger instance.
logtostderr: Log messages to stderr instead of stdout.
writedescription: Write the video description to a .description file
writeinfojson: Write the video description to a .info.json file
@@ -118,6 +129,9 @@ class YoutubeDL(object):
downloadarchive: File name of a file where all downloads are recorded.
Videos already present in the file are not downloaded
again.
+ cookiefile: File name where cookies should be read from and dumped to.
+ nocheckcertificate:Do not verify SSL certificates
+ proxy: URL of the proxy server to use
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
@@ -158,6 +172,8 @@ class YoutubeDL(object):
if '%(stitle)s' in self.params['outtmpl']:
self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
+ self._setup_opener()
+
def add_info_extractor(self, ie):
"""Add an InfoExtractor object to the end of the list."""
self._ies.append(ie)
@@ -190,7 +206,9 @@ class YoutubeDL(object):
def to_screen(self, message, skip_eol=False):
"""Print message to stdout if not in quiet mode."""
- if not self.params.get('quiet', False):
+ if self.params.get('logger'):
+ self.params['logger'].debug(message)
+ elif not self.params.get('quiet', False):
terminator = [u'\n', u''][skip_eol]
output = message + terminator
write_string(output, self._screen_file)
@@ -198,10 +216,13 @@ class YoutubeDL(object):
def to_stderr(self, message):
"""Print message to stderr."""
assert type(message) == type(u'')
- output = message + u'\n'
- if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
- output = output.encode(preferredencoding())
- sys.stderr.write(output)
+ if self.params.get('logger'):
+ self.params['logger'].error(message)
+ else:
+ output = message + u'\n'
+ if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
+ output = output.encode(preferredencoding())
+ sys.stderr.write(output)
def to_console_title(self, message):
if not self.params.get('consoletitle', False):
@@ -217,13 +238,15 @@ class YoutubeDL(object):
if not self.params.get('consoletitle', False):
return
if 'TERM' in os.environ:
- write_string(u'\033[22t', self._screen_file)
+ # Save the title on stack
+ write_string(u'\033[22;0t', self._screen_file)
def restore_console_title(self):
if not self.params.get('consoletitle', False):
return
if 'TERM' in os.environ:
- write_string(u'\033[23t', self._screen_file)
+ # Restore the title from stack
+ write_string(u'\033[23;0t', self._screen_file)
def __enter__(self):
self.save_console_title()
@@ -231,6 +254,9 @@ class YoutubeDL(object):
def __exit__(self, *args):
self.restore_console_title()
+
+ if self.params.get('cookiefile') is not None:
+ self.cookiejar.save()
def fixed_template(self):
"""Checks if the output template is fixed."""
@@ -351,15 +377,17 @@ class YoutubeDL(object):
def _match_entry(self, info_dict):
""" Returns None iff the file should be downloaded """
- title = info_dict['title']
- matchtitle = self.params.get('matchtitle', False)
- if matchtitle:
- if not re.search(matchtitle, title, re.IGNORECASE):
- return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
- rejecttitle = self.params.get('rejecttitle', False)
- if rejecttitle:
- if re.search(rejecttitle, title, re.IGNORECASE):
- return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+ if 'title' in info_dict:
+ # This can happen when we're just evaluating the playlist
+ title = info_dict['title']
+ matchtitle = self.params.get('matchtitle', False)
+ if matchtitle:
+ if not re.search(matchtitle, title, re.IGNORECASE):
+ return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+ rejecttitle = self.params.get('rejecttitle', False)
+ if rejecttitle:
+ if re.search(rejecttitle, title, re.IGNORECASE):
+ return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
date = info_dict.get('upload_date', None)
if date is not None:
dateRange = self.params.get('daterange', DateRange())
@@ -370,8 +398,8 @@ class YoutubeDL(object):
if age_limit < info_dict.get('age_limit', 0):
return u'Skipping "' + title + '" because it is age restricted'
if self.in_download_archive(info_dict):
- return (u'%(title)s has already been recorded in archive'
- % info_dict)
+ return (u'%s has already been recorded in archive'
+ % info_dict.get('title', info_dict.get('id', u'video')))
return None
@staticmethod
@@ -450,7 +478,7 @@ class YoutubeDL(object):
ie_key=ie_result.get('ie_key'),
extra_info=extra_info)
elif result_type == 'playlist':
- self.add_extra_info(ie_result, extra_info)
+
# We process each entry in the playlist
playlist = ie_result.get('title', None) or ie_result.get('id', None)
self.to_screen(u'[download] Downloading playlist: %s' % playlist)
@@ -480,6 +508,12 @@ class YoutubeDL(object):
'webpage_url': ie_result['webpage_url'],
'extractor_key': ie_result['extractor_key'],
}
+
+ reason = self._match_entry(entry)
+ if reason is not None:
+ self.to_screen(u'[download] ' + reason)
+ continue
+
entry_result = self.process_ie_result(entry,
download=download,
extra_info=extra)
@@ -635,7 +669,7 @@ class YoutubeDL(object):
# Forced printings
if self.params.get('forcetitle', False):
- compat_print(info_dict['title'])
+ compat_print(info_dict['fulltitle'])
if self.params.get('forceid', False):
compat_print(info_dict['id'])
if self.params.get('forceurl', False):
@@ -649,6 +683,8 @@ class YoutubeDL(object):
compat_print(filename)
if self.params.get('forceformat', False):
compat_print(info_dict['format'])
+ if self.params.get('forcejson', False):
+ compat_print(json.dumps(info_dict))
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
@@ -711,7 +747,7 @@ class YoutubeDL(object):
return
if self.params.get('writeinfojson', False):
- infofn = filename + u'.info.json'
+ infofn = os.path.splitext(filename)[0] + u'.info.json'
self.report_writeinfojson(infofn)
try:
json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
@@ -768,7 +804,7 @@ class YoutubeDL(object):
for url in url_list:
try:
#It also downloads the videos
- videos = self.extract_info(url)
+ self.extract_info(url)
except UnavailableVideoError:
self.report_error(u'unable to download video')
except MaxDownloadsReached:
@@ -804,7 +840,16 @@ class YoutubeDL(object):
fn = self.params.get('download_archive')
if fn is None:
return False
- vid_id = info_dict['extractor'] + u' ' + info_dict['id']
+ extractor = info_dict.get('extractor_id')
+ if extractor is None:
+ if 'id' in info_dict:
+ extractor = info_dict.get('ie_key') # key in a playlist
+ if extractor is None:
+ return False # Incomplete video information
+ # Future-proof against any change in case
+ # and backwards compatibility with prior versions
+ extractor = extractor.lower()
+ vid_id = extractor + u' ' + info_dict['id']
try:
with locked_file(fn, 'r', encoding='utf-8') as archive_file:
for line in archive_file:
@@ -838,9 +883,9 @@ class YoutubeDL(object):
def list_formats(self, info_dict):
def format_note(fdict):
- if fdict.get('format_note') is not None:
- return fdict['format_note']
res = u''
+ if fdict.get('format_note') is not None:
+ res += fdict['format_note'] + u' '
if fdict.get('vcodec') is not None:
res += u'%-5s' % fdict['vcodec']
elif fdict.get('vbr') is not None:
@@ -857,25 +902,100 @@ class YoutubeDL(object):
res += 'audio'
if fdict.get('abr') is not None:
res += u'@%3dk' % fdict['abr']
+ if fdict.get('filesize') is not None:
+ if res:
+ res += u', '
+ res += format_bytes(fdict['filesize'])
return res
- def line(format):
- return (u'%-20s%-10s%-12s%s' % (
+ def line(format, idlen=20):
+ return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
format['format_id'],
format['ext'],
self.format_resolution(format),
format_note(format),
- )
- )
+ ))
formats = info_dict.get('formats', [info_dict])
- formats_s = list(map(line, formats))
+ idlen = max(len(u'format code'),
+ max(len(f['format_id']) for f in formats))
+ formats_s = [line(f, idlen) for f in formats]
if len(formats) > 1:
formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
header_line = line({
'format_id': u'format code', 'ext': u'extension',
- '_resolution': u'resolution', 'format_note': u'note'})
+ '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
(info_dict['id'], header_line, u"\n".join(formats_s)))
+
+ def urlopen(self, req):
+ """ Start an HTTP download """
+ return self._opener.open(req)
+
+ def print_debug_header(self):
+ if not self.params.get('verbose'):
+ return
+ write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
+ try:
+ sp = subprocess.Popen(
+ ['git', 'rev-parse', '--short', 'HEAD'],
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+ cwd=os.path.dirname(os.path.abspath(__file__)))
+ out, err = sp.communicate()
+ out = out.decode().strip()
+ if re.match('[0-9a-f]+', out):
+ write_string(u'[debug] Git HEAD: ' + out + u'\n')
+ except:
+ try:
+ sys.exc_clear()
+ except:
+ pass
+ write_string(u'[debug] Python version %s - %s' %
+ (platform.python_version(), platform_name()) + u'\n')
+
+ proxy_map = {}
+ for handler in self._opener.handlers:
+ if hasattr(handler, 'proxies'):
+ proxy_map.update(handler.proxies)
+ write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
+
+ def _setup_opener(self, timeout=300):
+ opts_cookiefile = self.params.get('cookiefile')
+ opts_proxy = self.params.get('proxy')
+
+ if opts_cookiefile is None:
+ self.cookiejar = compat_cookiejar.CookieJar()
+ else:
+ self.cookiejar = compat_cookiejar.MozillaCookieJar(
+ opts_cookiefile)
+ if os.access(opts_cookiefile, os.R_OK):
+ self.cookiejar.load()
+
+ cookie_processor = compat_urllib_request.HTTPCookieProcessor(
+ self.cookiejar)
+ if opts_proxy is not None:
+ if opts_proxy == '':
+ proxies = {}
+ else:
+ proxies = {'http': opts_proxy, 'https': opts_proxy}
+ else:
+ proxies = compat_urllib_request.getproxies()
+ # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
+ if 'http' in proxies and 'https' not in proxies:
+ proxies['https'] = proxies['http']
+ proxy_handler = compat_urllib_request.ProxyHandler(proxies)
+ https_handler = make_HTTPS_handler(
+ self.params.get('nocheckcertificate', False))
+ opener = compat_urllib_request.build_opener(
+ https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
+ # Delete the default user-agent header, which would otherwise apply in
+ # cases where our custom HTTP handler doesn't come into play
+ # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
+ opener.addheaders = []
+ self._opener = opener
+
+ # TODO remove this global modification
+ compat_urllib_request.install_opener(opener)
+ socket.setdefaulttimeout(timeout)
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index af4c9c5c4..1f15c7eaa 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -34,50 +34,42 @@ __authors__ = (
'Andras Elso',
'Jelle van der Waa',
'Marcin Cieślak',
+ 'Anton Larionov',
+ 'Takuya Tsuchida',
)
__license__ = 'Public Domain'
import codecs
-import collections
import getpass
import optparse
import os
import random
import re
import shlex
-import socket
import subprocess
import sys
-import traceback
-import platform
from .utils import (
- compat_cookiejar,
compat_print,
- compat_str,
- compat_urllib_request,
DateRange,
decodeOption,
determine_ext,
DownloadError,
get_cachedir,
- make_HTTPS_handler,
MaxDownloadsReached,
- platform_name,
preferredencoding,
SameFileError,
std_headers,
write_string,
- YoutubeDLHandler,
)
from .update import update_self
-from .version import __version__
from .FileDownloader import (
FileDownloader,
)
from .extractor import gen_extractors
+from .version import __version__
from .YoutubeDL import YoutubeDL
from .PostProcessor import (
FFmpegMetadataPP,
@@ -306,6 +298,9 @@ def parseOpts(overrideArguments=None):
verbosity.add_option('--get-format',
action='store_true', dest='getformat',
help='simulate, quiet but print output format', default=False)
+ verbosity.add_option('-j', '--dump-json',
+ action='store_true', dest='dumpjson',
+ help='simulate, quiet but print JSON information', default=False)
verbosity.add_option('--newline',
action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
verbosity.add_option('--no-progress',
@@ -447,19 +442,6 @@ def _real_main(argv=None):
parser, opts, args = parseOpts(argv)
- # Open appropriate CookieJar
- if opts.cookiefile is None:
- jar = compat_cookiejar.CookieJar()
- else:
- try:
- jar = compat_cookiejar.MozillaCookieJar(opts.cookiefile)
- if os.access(opts.cookiefile, os.R_OK):
- jar.load()
- except (IOError, OSError) as err:
- if opts.verbose:
- traceback.print_exc()
- write_string(u'ERROR: unable to open cookie file\n')
- sys.exit(101)
# Set user agent
if opts.user_agent is not None:
std_headers['User-Agent'] = opts.user_agent
@@ -491,8 +473,6 @@ def _real_main(argv=None):
all_urls = batchurls + args
all_urls = [url.strip() for url in all_urls]
- opener = _setup_opener(jar=jar, opts=opts)
-
extractors = gen_extractors()
if opts.list_extractors:
@@ -547,7 +527,7 @@ def _real_main(argv=None):
if opts.retries is not None:
try:
opts.retries = int(opts.retries)
- except (TypeError, ValueError) as err:
+ except (TypeError, ValueError):
parser.error(u'invalid retry count specified')
if opts.buffersize is not None:
numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
@@ -558,13 +538,13 @@ def _real_main(argv=None):
opts.playliststart = int(opts.playliststart)
if opts.playliststart <= 0:
raise ValueError(u'Playlist start must be positive')
- except (TypeError, ValueError) as err:
+ except (TypeError, ValueError):
parser.error(u'invalid playlist start number specified')
try:
opts.playlistend = int(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
raise ValueError(u'Playlist end must be greater than playlist start')
- except (TypeError, ValueError) as err:
+ except (TypeError, ValueError):
parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:
@@ -608,7 +588,7 @@ def _real_main(argv=None):
'username': opts.username,
'password': opts.password,
'videopassword': opts.videopassword,
- 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
+ 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forceid': opts.getid,
@@ -616,8 +596,9 @@ def _real_main(argv=None):
'forcedescription': opts.getdescription,
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
+ 'forcejson': opts.dumpjson,
'simulate': opts.simulate,
- 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
+ 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
'format': opts.format,
'format_limit': opts.format_limit,
'listformats': opts.listformats,
@@ -666,34 +647,12 @@ def _real_main(argv=None):
'youtube_print_sig_code': opts.youtube_print_sig_code,
'age_limit': opts.age_limit,
'download_archive': opts.download_archive,
+ 'cookiefile': opts.cookiefile,
+ 'nocheckcertificate': opts.no_check_certificate,
}
with YoutubeDL(ydl_opts) as ydl:
- if opts.verbose:
- write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
- try:
- sp = subprocess.Popen(
- ['git', 'rev-parse', '--short', 'HEAD'],
- stdout=subprocess.PIPE, stderr=subprocess.PIPE,
- cwd=os.path.dirname(os.path.abspath(__file__)))
- out, err = sp.communicate()
- out = out.decode().strip()
- if re.match('[0-9a-f]+', out):
- write_string(u'[debug] Git HEAD: ' + out + u'\n')
- except:
- try:
- sys.exc_clear()
- except:
- pass
- write_string(u'[debug] Python version %s - %s' %
- (platform.python_version(), platform_name()) + u'\n')
-
- proxy_map = {}
- for handler in opener.handlers:
- if hasattr(handler, 'proxies'):
- proxy_map.update(handler.proxies)
- write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
-
+ ydl.print_debug_header()
ydl.add_default_info_extractors()
# PostProcessors
@@ -724,46 +683,9 @@ def _real_main(argv=None):
ydl.to_screen(u'--max-download limit reached, aborting.')
retcode = 101
- # Dump cookie jar if requested
- if opts.cookiefile is not None:
- try:
- jar.save()
- except (IOError, OSError):
- sys.exit(u'ERROR: unable to save cookie jar')
-
sys.exit(retcode)
-def _setup_opener(jar=None, opts=None, timeout=300):
- if opts is None:
- FakeOptions = collections.namedtuple(
- 'FakeOptions', ['proxy', 'no_check_certificate'])
- opts = FakeOptions(proxy=None, no_check_certificate=False)
-
- cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
- if opts.proxy is not None:
- if opts.proxy == '':
- proxies = {}
- else:
- proxies = {'http': opts.proxy, 'https': opts.proxy}
- else:
- proxies = compat_urllib_request.getproxies()
- # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
- if 'http' in proxies and 'https' not in proxies:
- proxies['https'] = proxies['http']
- proxy_handler = compat_urllib_request.ProxyHandler(proxies)
- https_handler = make_HTTPS_handler(opts)
- opener = compat_urllib_request.build_opener(
- https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
- # Delete the default user-agent header, which would otherwise apply in
- # cases where our custom HTTP handler doesn't come into play
- # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
- opener.addheaders = []
- compat_urllib_request.install_opener(opener)
- socket.setdefaulttimeout(timeout)
- return opener
-
-
def main(argv=None):
try:
_real_main(argv)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 2d1e3cdfd..0b4d086b7 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,5 +1,6 @@
from .appletrailers import AppleTrailersIE
from .addanime import AddAnimeIE
+from .anitube import AnitubeIE
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
from .arte import (
@@ -10,7 +11,7 @@ from .arte import (
)
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
-from .bandcamp import BandcampIE
+from .bandcamp import BandcampIE, BandcampAlbumIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
from .breakcom import BreakIE
@@ -19,12 +20,14 @@ from .c56 import C56IE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cinemassacre import CinemassacreIE
+from .clipfish import ClipfishIE
from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
-from .comedycentral import ComedyCentralIE
+from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
from .condenast import CondeNastIE
from .criterion import CriterionIE
from .cspan import CSpanIE
+from .d8 import D8IE
from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
@@ -96,6 +99,7 @@ from .nba import NBAIE
from .nbc import NBCNewsIE
from .newgrounds import NewgroundsIE
from .nhl import NHLIE, NHLVideocenterIE
+from .niconico import NiconicoIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
@@ -126,12 +130,14 @@ from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE
from .steam import SteamIE
+from .streamcloud import StreamcloudIE
from .sztvhu import SztvHuIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .tf1 import TF1IE
from .thisav import ThisAVIE
+from .toutv import TouTvIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .tube8 import Tube8IE
@@ -152,6 +158,7 @@ from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
from .vimeo import VimeoIE, VimeoChannelIE
from .vine import VineIE
+from .viki import VikiIE
from .vk import VKIE
from .wat import WatIE
from .websurg import WeBSurgIE
@@ -179,6 +186,7 @@ from .youtube import (
YoutubeTruncatedURLIE,
YoutubeWatchLaterIE,
YoutubeFavouritesIE,
+ YoutubeHistoryIE,
)
from .zdf import ZDFIE
diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py
new file mode 100644
index 000000000..691d5a844
--- /dev/null
+++ b/youtube_dl/extractor/anitube.py
@@ -0,0 +1,55 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+
+
+class AnitubeIE(InfoExtractor):
+ IE_NAME = u'anitube.se'
+ _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)'
+
+ _TEST = {
+ u'url': u'http://www.anitube.se/video/36621',
+ u'md5': u'59d0eeae28ea0bc8c05e7af429998d43',
+ u'file': u'36621.mp4',
+ u'info_dict': {
+ u'id': u'36621',
+ u'ext': u'mp4',
+ u'title': u'Recorder to Randoseru 01',
+ },
+ u'skip': u'Blocked in the US',
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)',
+ webpage, u'key')
+
+ webpage_config = self._download_webpage('http://www.anitube.se/nuevo/econfig.php?key=%s' % key,
+ key)
+ config_xml = xml.etree.ElementTree.fromstring(webpage_config.encode('utf-8'))
+
+ video_title = config_xml.find('title').text
+
+ formats = []
+ video_url = config_xml.find('file')
+ if video_url is not None:
+ formats.append({
+ 'format_id': 'sd',
+ 'url': video_url.text,
+ })
+ video_url = config_xml.find('filehd')
+ if video_url is not None:
+ formats.append({
+ 'format_id': 'hd',
+ 'url': video_url.text,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py
index 0febbff4f..95c038003 100644
--- a/youtube_dl/extractor/auengine.py
+++ b/youtube_dl/extractor/auengine.py
@@ -1,10 +1,10 @@
-import os.path
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
- compat_urllib_parse_urlparse,
+ determine_ext,
+ ExtractorError,
)
class AUEngineIE(InfoExtractor):
@@ -25,22 +25,25 @@ class AUEngineIE(InfoExtractor):
title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
webpage, u'title')
title = title.strip()
- links = re.findall(r'[^A-Za-z0-9]?(?:file|url):\s*["\'](http[^\'"&]*)', webpage)
- links = [compat_urllib_parse.unquote(l) for l in links]
+ links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
+ links = map(compat_urllib_parse.unquote, links)
+
+ thumbnail = None
+ video_url = None
for link in links:
- root, pathext = os.path.splitext(compat_urllib_parse_urlparse(link).path)
- if pathext == '.png':
+ if link.endswith('.png'):
thumbnail = link
- elif pathext == '.mp4':
- url = link
- ext = pathext
+ elif '/videos/' in link:
+ video_url = link
+ if not video_url:
+ raise ExtractorError(u'Could not find video URL')
+ ext = u'.' + determine_ext(video_url)
if ext == title[-len(ext):]:
title = title[:-len(ext)]
- ext = ext[1:]
- return [{
+
+ return {
'id': video_id,
- 'url': url,
- 'ext': ext,
+ 'url': video_url,
'title': title,
'thumbnail': thumbnail,
- }]
+ }
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 129a20f44..3a32c14c5 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -3,13 +3,16 @@ import re
from .common import InfoExtractor
from ..utils import (
+ compat_str,
+ compat_urlparse,
ExtractorError,
)
class BandcampIE(InfoExtractor):
+ IE_NAME = u'Bandcamp'
_VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
- _TEST = {
+ _TESTS = [{
u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
u'file': u'1812978515.mp3',
u'md5': u'cdeb30cdae1921719a3cbcab696ef53c',
@@ -17,7 +20,7 @@ class BandcampIE(InfoExtractor):
u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad"
},
u'skip': u'There is a limit of 200 free downloads / month for the test song'
- }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -26,6 +29,23 @@ class BandcampIE(InfoExtractor):
# We get the link to the free download page
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if m_download is None:
+ m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
+ if m_trackinfo:
+ json_code = m_trackinfo.group(1)
+ data = json.loads(json_code)
+
+ for d in data:
+ formats = [{
+ 'format_id': 'format_id',
+ 'url': format_url,
+ 'ext': format_id.partition('-')[0]
+ } for format_id, format_url in sorted(d['file'].items())]
+ return {
+ 'id': compat_str(d['id']),
+ 'title': d['title'],
+ 'formats': formats,
+ }
+ else:
raise ExtractorError(u'No free songs found')
download_link = m_download.group(1)
@@ -61,3 +81,49 @@ class BandcampIE(InfoExtractor):
}
return [track_info]
+
+
+class BandcampAlbumIE(InfoExtractor):
+ IE_NAME = u'Bandcamp:album'
+ _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)'
+
+ _TEST = {
+ u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
+ u'playlist': [
+ {
+ u'file': u'1353101989.mp3',
+ u'md5': u'39bc1eded3476e927c724321ddf116cf',
+ u'info_dict': {
+ u'title': u'Intro',
+ }
+ },
+ {
+ u'file': u'38097443.mp3',
+ u'md5': u'1a2c32e2691474643e912cc6cd4bffaa',
+ u'info_dict': {
+ u'title': u'Kero One - Keep It Alive (Blazo remix)',
+ }
+ },
+ ],
+ u'params': {
+ u'playlistend': 2
+ },
+ u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ title = mobj.group('title')
+ webpage = self._download_webpage(url, title)
+ tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
+ if not tracks_paths:
+ raise ExtractorError(u'The page doesn\'t contain any track')
+ entries = [
+ self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
+ for t_path in tracks_paths]
+ title = self._search_regex(r'album_title : "(.*?)"', webpage, u'title')
+ return {
+ '_type': 'playlist',
+ 'title': title,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index d8c35465a..66fe0ac9a 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -75,16 +75,22 @@ class BrightcoveIE(InfoExtractor):
params = {'flashID': object_doc.attrib['id'],
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
}
- playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
+ def find_param(name):
+ node = find_xpath_attr(object_doc, './param', 'name', name)
+ if node is not None:
+ return node.attrib['value']
+ return None
+ playerKey = find_param('playerKey')
# Not all pages define this value
if playerKey is not None:
- params['playerKey'] = playerKey.attrib['value']
- videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
+ params['playerKey'] = playerKey
+ # Any one of these three fields may hold the id of the video
+ videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
if videoPlayer is not None:
- params['@videoPlayer'] = videoPlayer.attrib['value']
- linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL')
+ params['@videoPlayer'] = videoPlayer
+ linkBase = find_param('linkBaseURL')
if linkBase is not None:
- params['linkBaseURL'] = linkBase.attrib['value']
+ params['linkBaseURL'] = linkBase
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 1db9b24cf..bfa2a8b40 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -5,6 +5,7 @@ import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import unified_strdate
+
class CanalplusIE(InfoExtractor):
_VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
@@ -25,7 +26,7 @@ class CanalplusIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.groupdict().get('id')
if video_id is None:
webpage = self._download_webpage(url, mobj.group('path'))
video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
new file mode 100644
index 000000000..95449da3c
--- /dev/null
+++ b/youtube_dl/extractor/clipfish.py
@@ -0,0 +1,53 @@
+import re
+import time
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+
+
+class ClipfishIE(InfoExtractor):
+ IE_NAME = u'clipfish'
+
+ _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+ _TEST = {
+ u'url': u'http://www.clipfish.de/special/supertalent/video/4028320/supertalent-2013-ivana-opacak-singt-nobodys-perfect/',
+ u'file': u'4028320.f4v',
+ u'md5': u'5e38bda8c329fbfb42be0386a3f5a382',
+ u'info_dict': {
+ u'title': u'Supertalent 2013: Ivana Opacak singt Nobody\'s Perfect',
+ u'duration': 399,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+
+ info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
+ (video_id, int(time.time())))
+ info_xml = self._download_webpage(
+ info_url, video_id, note=u'Downloading info page')
+ doc = xml.etree.ElementTree.fromstring(info_xml)
+ title = doc.find('title').text
+ video_url = doc.find('filename').text
+ thumbnail = doc.find('imageurl').text
+ duration_str = doc.find('duration').text
+ m = re.match(
+ r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
+ duration_str)
+ if m:
+ duration = (
+ (int(m.group('hours')) * 60 * 60) +
+ (int(m.group('minutes')) * 60) +
+ (int(m.group('seconds')))
+ )
+ else:
+ duration = None
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py
index 8d4c93d6d..b27c1dfc5 100644
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -1,5 +1,4 @@
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
@@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor):
self.report_extraction(video_id)
xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
- metaXml = self._download_webpage(xmlUrl, video_id,
+ mdoc = self._download_xml(xmlUrl, video_id,
u'Downloading info XML',
u'Unable to download video info XML')
- mdoc = xml.etree.ElementTree.fromstring(metaXml)
try:
videoNode = mdoc.findall('./video')[0]
youtubeIdNode = videoNode.find('./youtubeID')
@@ -65,16 +63,13 @@ class CollegeHumorIE(InfoExtractor):
if next_url.endswith(u'manifest.f4m'):
manifest_url = next_url + '?hdcore=2.10.3'
- manifestXml = self._download_webpage(manifest_url, video_id,
+ adoc = self._download_xml(manifest_url, video_id,
u'Downloading XML manifest',
u'Unable to download video info XML')
- adoc = xml.etree.ElementTree.fromstring(manifestXml)
try:
- media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
- node_id = media_node.attrib['url']
video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
- except IndexError as err:
+ except IndexError:
raise ExtractorError(u'Invalid manifest file')
url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 69b2beece..725849d2e 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -2,6 +2,7 @@ import re
import xml.etree.ElementTree
from .common import InfoExtractor
+from .mtv import MTVIE, _media_xml_tag
from ..utils import (
compat_str,
compat_urllib_parse,
@@ -11,7 +12,37 @@ from ..utils import (
)
-class ComedyCentralIE(InfoExtractor):
+class ComedyCentralIE(MTVIE):
+ _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
+ _FEED_URL = u'http://comedycentral.com/feeds/mrss/'
+
+ _TEST = {
+ u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
+ u'md5': u'4167875aae411f903b751a21f357f1ee',
+ u'info_dict': {
+ u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+ u'ext': u'mp4',
+ u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother',
+ u'description': u'After a certain point, breastfeeding becomes c**kblocking.',
+ },
+ }
+ # Overwrite MTVIE properties we don't want
+ _TESTS = []
+
+ def _get_thumbnail_url(self, uri, itemdoc):
+ search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+ return itemdoc.find(search_path).attrib['url']
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ title = mobj.group('title')
+ webpage = self._download_webpage(url, title)
+ mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"',
+ webpage, u'mgid')
+ return self._get_videos_info(mgid)
+
+
+class ComedyCentralShowsIE(InfoExtractor):
IE_DESC = u'The Daily Show / Colbert Report'
# urls can be abbreviations like :thedailyshow or :colbert
# urls for episodes like:
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index f787d0a3c..5656445a3 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -4,11 +4,11 @@ import re
import socket
import sys
import netrc
+import xml.etree.ElementTree
from ..utils import (
compat_http_client,
compat_urllib_error,
- compat_urllib_request,
compat_str,
clean_html,
@@ -19,6 +19,7 @@ from ..utils import (
unescapeHTML,
)
+
class InfoExtractor(object):
"""Information Extractor class.
@@ -75,6 +76,7 @@ class InfoExtractor(object):
* acodec Name of the audio codec in use
* vbr Average video bitrate in KBit/s
* vcodec Name of the video codec in use
+ * filesize The number of bytes, if known in advance
webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
@@ -156,7 +158,7 @@ class InfoExtractor(object):
elif note is not False:
self.to_screen(u'%s: %s' % (video_id, note))
try:
- return compat_urllib_request.urlopen(url_or_request)
+ return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
@@ -208,6 +210,11 @@ class InfoExtractor(object):
""" Returns the data of the page as a string """
return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
+ def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to download XML'):
+ """Return the xml as an xml.etree.ElementTree.Element"""
+ xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+ return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
@@ -229,12 +236,14 @@ class InfoExtractor(object):
self.to_screen(u'Logging in')
#Methods for following #608
- def url_result(self, url, ie=None):
+ def url_result(self, url, ie=None, video_id=None):
"""Returns a url that points to a page that should be processed"""
#TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
+ if video_id is not None:
+ video_info['id'] = video_id
return video_info
def playlist_result(self, entries, playlist_id=None, playlist_title=None):
"""Returns a playlist"""
@@ -350,6 +359,17 @@ class InfoExtractor(object):
if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
+ def _html_search_meta(self, name, html, display_name=None):
+ if display_name is None:
+ display_name = name
+ return self._html_search_regex(
+ r'''(?ix)<meta(?=[^>]+(?:name|property)=["\']%s["\'])
+ [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
+ html, display_name, fatal=False)
+
+ def _dc_search_uploader(self, html):
+ return self._html_search_meta('dc.creator', html, 'uploader')
+
def _rta_search(self, html):
# See http://www.rtalabel.org/index.php?content=howtofaq#single
if re.search(r'(?ix)<meta\s+name="rating"\s+'
@@ -358,6 +378,23 @@ class InfoExtractor(object):
return 18
return 0
+ def _media_rating_search(self, html):
+ # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
+ rating = self._html_search_meta('rating', html)
+
+ if not rating:
+ return None
+
+ RATING_TABLE = {
+ 'safe for kids': 0,
+ 'general': 8,
+ '14 years': 14,
+ 'mature': 17,
+ 'restricted': 19,
+ }
+ return RATING_TABLE.get(rating.lower(), None)
+
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py
new file mode 100644
index 000000000..a56842b16
--- /dev/null
+++ b/youtube_dl/extractor/d8.py
@@ -0,0 +1,22 @@
+# encoding: utf-8
+from .canalplus import CanalplusIE
+
+
+class D8IE(CanalplusIE):
+ _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P<path>.*)'
+ _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s'
+ IE_NAME = u'd8.tv'
+
+ _TEST = {
+ u'url': u'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+ u'file': u'966289.flv',
+ u'info_dict': {
+ u'title': u'Campagne intime - Documentaire exceptionnel',
+ u'description': u'md5:d2643b799fb190846ae09c61e59a859f',
+ u'upload_date': u'20131108',
+ },
+ u'params': {
+ # rtmp
+ u'skip_download': True,
+ },
+ }
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py
index 2cfbcd363..f21ef8853 100644
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -1,4 +1,3 @@
-import itertools
import json
import random
import re
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
index 3aa2da52c..b1242f6bc 100644
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -11,11 +11,11 @@ from ..utils import (
class EscapistIE(InfoExtractor):
- _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
+ _VALID_URL = r'^https?://?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
_TEST = {
u'url': u'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
u'file': u'6618-Breaking-Down-Baldurs-Gate.mp4',
- u'md5': u'c6793dbda81388f4264c1ba18684a74d',
+ u'md5': u'ab3a706c681efca53f0a35f1415cf0d1',
u'info_dict': {
u"description": u"Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
u"uploader": u"the-escapist-presents",
@@ -25,50 +25,60 @@ class EscapistIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
showName = mobj.group('showname')
videoId = mobj.group('episode')
self.report_extraction(videoId)
webpage = self._download_webpage(url, videoId)
- videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
+ videoDesc = self._html_search_regex(
+ r'<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
- playerUrl = self._og_search_video_url(webpage, name='player url')
+ playerUrl = self._og_search_video_url(webpage, name=u'player URL')
- title = self._html_search_regex('<meta name="title" content="([^"]*)"',
- webpage, u'player url').split(' : ')[-1]
+ title = self._html_search_regex(
+ r'<meta name="title" content="([^"]*)"',
+ webpage, u'title').split(' : ')[-1]
- configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+ configUrl = self._search_regex('config=(.*)$', playerUrl, u'config URL')
configUrl = compat_urllib_parse.unquote(configUrl)
- configJSON = self._download_webpage(configUrl, videoId,
- u'Downloading configuration',
- u'unable to download configuration')
-
- # Technically, it's JavaScript, not JSON
- configJSON = configJSON.replace("'", '"')
-
+ formats = []
+
+ def _add_format(name, cfgurl):
+ configJSON = self._download_webpage(
+ cfgurl, videoId,
+ u'Downloading ' + name + ' configuration',
+ u'Unable to download ' + name + ' configuration')
+
+ # Technically, it's JavaScript, not JSON
+ configJSON = configJSON.replace("'", '"')
+
+ try:
+ config = json.loads(configJSON)
+ except (ValueError,) as err:
+ raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+ playlist = config['playlist']
+ formats.append({
+ 'url': playlist[1]['url'],
+ 'format_id': name,
+ })
+
+ _add_format(u'normal', configUrl)
+ hq_url = (configUrl +
+ ('&hq=1' if '?' in configUrl else configUrl + '?hq=1'))
try:
- config = json.loads(configJSON)
- except (ValueError,) as err:
- raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+ _add_format(u'hq', hq_url)
+ except ExtractorError:
+ pass # That's fine, we'll just use normal quality
- playlist = config['playlist']
- videoUrl = playlist[1]['url']
-
- info = {
+ return {
'id': videoId,
- 'url': videoUrl,
+ 'formats': formats,
'uploader': showName,
- 'upload_date': None,
'title': title,
- 'ext': 'mp4',
'thumbnail': self._og_search_thumbnail(webpage),
'description': videoDesc,
'player_url': playerUrl,
}
-
- return [info]
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index f8bdfc2d3..3b210710e 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -1,5 +1,4 @@
import json
-import netrc
import re
import socket
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py
index 9c89362ef..dba1a8dc2 100644
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -39,7 +39,6 @@ class FKTVIE(InfoExtractor):
for i, _ in enumerate(files, 1):
video_id = '%04d%d' % (episode, i)
video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i)
- video_title = 'Fernsehkritik %d.%d' % (episode, i)
videos.append({
'id': video_id,
'url': video_url,
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 098768361..9645b00c3 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -24,7 +24,7 @@ class GameSpotIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- page_id = video_id = mobj.group('page_id')
+ page_id = mobj.group('page_id')
webpage = self._download_webpage(url, page_id)
data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video')
data_video = json.loads(unescapeHTML(data_video_json))
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index c7552fddb..37671430a 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -162,6 +162,16 @@ class GenericIE(InfoExtractor):
raise ExtractorError(u'Failed to download URL: %s' % url)
self.report_extraction(video_id)
+
+ # it's tempting to parse this further, but you would
+ # have to take into account all the variations like
+ # Video Title - Site Name
+ # Site Name | Video Title
+ # Video Title - Tagline | Site Name
+ # and so on and so forth; it's just not practical
+ video_title = self._html_search_regex(r'<title>(.*)</title>',
+ webpage, u'video title', default=u'video', flags=re.DOTALL)
+
# Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage)
if bc_url is not None:
@@ -177,17 +187,20 @@ class GenericIE(InfoExtractor):
return self.url_result(surl, 'Vimeo')
# Look for embedded YouTube player
- mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?youtube.com/embed/.+?)\1', webpage)
- if mobj:
- surl = unescapeHTML(mobj.group(u'url'))
- return self.url_result(surl, 'Youtube')
+ matches = re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+ if matches:
+ urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
+ for tuppl in matches]
+ return self.playlist_result(
+ urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
burl = unescapeHTML(mobj.group(1))
- return self.url_result(burl, 'Bandcamp')
+ # Don't set the extractor, because the url can point to either a track or an album
+ return self.url_result(burl)
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
@@ -196,7 +209,7 @@ class GenericIE(InfoExtractor):
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
# Broaden the search a little bit: JWPlayer JS loader
- mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage)
+ mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage)
if mobj is None:
# Try to find twitter cards info
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -223,27 +236,16 @@ class GenericIE(InfoExtractor):
video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
# here's a fun little line of code for you:
- video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
- # it's tempting to parse this further, but you would
- # have to take into account all the variations like
- # Video Title - Site Name
- # Site Name | Video Title
- # Video Title - Tagline | Site Name
- # and so on and so forth; it's just not practical
- video_title = self._html_search_regex(r'<title>(.*)</title>',
- webpage, u'video title', default=u'video', flags=re.DOTALL)
-
# video uploader is domain name
video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
url, u'video uploader')
- return [{
+ return {
'id': video_id,
'url': video_url,
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
- 'ext': video_extension,
- }]
+ }
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
index 46954337f..bafc5826f 100644
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor):
_TEST = {
u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
u'file': u'390161.mp4',
- u'md5': u'1d7ba54e2c9d7dc6935ef39e00529138',
+ u'md5': u'8b743df908c42f60cf6496586c7f12c3',
u'info_dict': {
u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.",
u"title": u"How to Tie a Square Knot Properly"
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index 6bb54b932..0020c47cf 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -22,7 +22,7 @@ class JeuxVideoIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- title = re.match(self._VALID_URL, url).group(1)
+ title = mobj.group(1)
webpage = self._download_webpage(url, title)
xml_link = self._html_search_regex(
r'<param name="flashvars" value="config=(.*?)" />',
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 1a3e0ae6b..5f548437c 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -6,9 +6,7 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urlparse,
- get_meta_content,
xpath_with_ns,
- ExtractorError,
)
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index a200dcd74..e2baf44d7 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -60,7 +60,7 @@ class MixcloudIE(InfoExtractor):
'title': info['name'],
'url': final_song_url,
'ext': 'mp3',
- 'description': info['description'],
+ 'description': info.get('description'),
'thumbnail': info['pictures'].get('extra_large'),
'uploader': info['user']['name'],
'uploader_id': info['user']['username'],
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 3df7f9b85..04afd6c4c 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -59,7 +59,6 @@ class MTVIE(InfoExtractor):
if '/error_country_block.swf' in metadataXml:
raise ExtractorError(u'This video is not available from your country.', expected=True)
mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
- renditions = mdoc.findall('.//rendition')
formats = []
for rendition in mdoc.findall('.//rendition'):
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 224f56ac8..458fe4063 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -72,7 +72,7 @@ class NHLIE(NHLBaseInfoExtractor):
class NHLVideocenterIE(NHLBaseInfoExtractor):
IE_NAME = u'nhl.com:videocenter'
- IE_DESC = u'Download the first 12 videos from a videocenter category'
+ IE_DESC = u'NHL videocenter category'
_VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?'
@classmethod
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
new file mode 100644
index 000000000..729607ea3
--- /dev/null
+++ b/youtube_dl/extractor/niconico.py
@@ -0,0 +1,131 @@
+# encoding: utf-8
+
+import re
+import socket
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_http_client,
+ compat_urllib_error,
+ compat_urllib_parse,
+ compat_urllib_request,
+ compat_urlparse,
+ compat_str,
+
+ ExtractorError,
+ unified_strdate,
+)
+
+
+class NiconicoIE(InfoExtractor):
+ IE_NAME = u'niconico'
+ IE_DESC = u'ニコニコ動画'
+
+ _TEST = {
+ u'url': u'http://www.nicovideo.jp/watch/sm22312215',
+ u'file': u'sm22312215.mp4',
+ u'md5': u'd1a75c0823e2f629128c43e1212760f9',
+ u'info_dict': {
+ u'title': u'Big Buck Bunny',
+ u'uploader': u'takuya0301',
+ u'uploader_id': u'2698420',
+ u'upload_date': u'20131123',
+ u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+ },
+ u'params': {
+ u'username': u'ydl.niconico@gmail.com',
+ u'password': u'youtube-dl',
+ },
+ }
+
+ _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
+ _NETRC_MACHINE = 'niconico'
+ # If True it will raise an error if no login info is provided
+ _LOGIN_REQUIRED = True
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ # No authentication to be performed
+ if username is None:
+ if self._LOGIN_REQUIRED:
+ raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+ return False
+
+ # Log in
+ login_form_strs = {
+ u'mail': username,
+ u'password': password,
+ }
+ # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+ # chokes on unicode
+ login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+ login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
+ request = compat_urllib_request.Request(
+ u'https://secure.nicovideo.jp/secure/login', login_data)
+ login_results = self._download_webpage(
+ request, u'', note=u'Logging in', errnote=u'Unable to log in')
+ if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
+ self._downloader.report_warning(u'unable to log in: bad username or password')
+ return False
+ return True
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+
+ # Get video webpage. We are not actually interested in it, but need
+ # the cookies in order to be able to download the info webpage
+ self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)
+
+ video_info_webpage = self._download_webpage(
+ 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
+ note=u'Downloading video info page')
+
+ # Get flv info
+ flv_info_webpage = self._download_webpage(
+ u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+ video_id, u'Downloading flv info')
+ video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
+
+ # Start extracting information
+ video_info = xml.etree.ElementTree.fromstring(video_info_webpage)
+ video_title = video_info.find('.//title').text
+ video_extension = video_info.find('.//movie_type').text
+ video_format = video_extension.upper()
+ video_thumbnail = video_info.find('.//thumbnail_url').text
+ video_description = video_info.find('.//description').text
+ video_uploader_id = video_info.find('.//user_id').text
+ video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
+ video_view_count = video_info.find('.//view_counter').text
+ video_webpage_url = video_info.find('.//watch_url').text
+
+ # uploader
+ video_uploader = video_uploader_id
+ url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
+ try:
+ user_info_webpage = self._download_webpage(
+ url, video_id, note=u'Downloading user information')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
+ else:
+ user_info = xml.etree.ElementTree.fromstring(user_info_webpage)
+ video_uploader = user_info.find('.//nickname').text
+
+ return {
+ 'id': video_id,
+ 'url': video_real_url,
+ 'title': video_title,
+ 'ext': video_extension,
+ 'format': video_format,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ 'uploader': video_uploader,
+ 'upload_date': video_upload_date,
+ 'uploader_id': video_uploader_id,
+ 'view_count': video_view_count,
+ 'webpage_url': video_webpage_url,
+ }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 75cf4bb9f..8b3471919 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -6,7 +6,6 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 83e1f055f..67b2dff9c 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -59,6 +59,7 @@ class SoundcloudIE(InfoExtractor):
]
_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+ _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
@classmethod
def suitable(cls, url):
@@ -83,7 +84,6 @@ class SoundcloudIE(InfoExtractor):
thumbnail = thumbnail.replace('-large', '-t500x500')
result = {
'id': track_id,
- 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
'uploader': info['user']['username'],
'upload_date': unified_strdate(info['created_at']),
'title': info['title'],
@@ -92,19 +92,29 @@ class SoundcloudIE(InfoExtractor):
'thumbnail': thumbnail,
}
if info.get('downloadable', False):
+ # We can build a direct link to the song
result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID)
- if not info.get('streamable', False):
- # We have to get the rtmp url
+ else:
+ # We have to retrieve the url
stream_json = self._download_webpage(
- 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._CLIENT_ID),
+ 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID),
track_id, u'Downloading track url')
- rtmp_url = json.loads(stream_json)['rtmp_mp3_128_url']
- # The url doesn't have an rtmp app, we have to extract the playpath
- url, path = rtmp_url.split('mp3:', 1)
- result.update({
- 'url': url,
- 'play_path': 'mp3:' + path,
- })
+ # There should be only one entry in the dictionary
+ key, stream_url = list(json.loads(stream_json).items())[0]
+ if key.startswith(u'http'):
+ result['url'] = stream_url
+ elif key.startswith(u'rtmp'):
+ # The url doesn't have an rtmp app, we have to extract the playpath
+ url, path = stream_url.split('mp3:', 1)
+ result.update({
+ 'url': url,
+ 'play_path': 'mp3:' + path,
+ })
+ else:
+ # We fall back to the stream_url in the original info; this
+ # cannot always be used, as it sometimes gives an HTTP 404 error
+ result['url'] = info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+
return result
def _real_extract(self, url):
@@ -158,7 +168,6 @@ class SoundcloudSetIE(SoundcloudIE):
resolv_url = self._resolv_url(url)
info_json = self._download_webpage(resolv_url, full_title)
- videos = []
info = json.loads(info_json)
if 'errors' in info:
for err in info['errors']:
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index 97f9c268a..9e2ad0d99 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -6,7 +6,6 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
@@ -36,11 +35,12 @@ class SpankwireIE(InfoExtractor):
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, u'title')
- video_uploader = self._html_search_regex(r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
- thumbnail = self._html_search_regex(r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
- description = self._html_search_regex(r'>\s*Description:</div>\s*<[^>]*>([^<]+)', webpage, u'description', fatal=False)
- if len(description) == 0:
- description = None
+ video_uploader = self._html_search_regex(
+ r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
+ thumbnail = self._html_search_regex(
+ r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
+ description = self._html_search_regex(
+ r'<div\s+id="descriptionContent">([^<]+)<', webpage, u'description', fatal=False)
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
if webpage.find('flashvars\.encrypted = "true"') != -1:
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 6dc2eda6d..19ce585cf 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -2,7 +2,6 @@ import re
import xml.etree.ElementTree
from .common import InfoExtractor
-from ..utils import determine_ext
class SpiegelIE(InfoExtractor):
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py
new file mode 100644
index 000000000..9faf3a5e3
--- /dev/null
+++ b/youtube_dl/extractor/streamcloud.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+
+
+class StreamcloudIE(InfoExtractor):
+ IE_NAME = u'streamcloud.eu'
+ _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'
+
+ _TEST = {
+ u'url': u'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
+ u'file': u'skp9j99s4bpz.mp4',
+ u'md5': u'6bea4c7fa5daaacc2a946b7146286686',
+ u'info_dict': {
+ u'title': u'youtube-dl test video \'/\\ ä ↭',
+ u'duration': 9,
+ },
+ u'skip': u'Only available from the EU'
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ orig_webpage = self._download_webpage(url, video_id)
+
+ fields = re.findall(r'''(?x)<input\s+
+ type="(?:hidden|submit)"\s+
+ name="([^"]+)"\s+
+ (?:id="[^"]+"\s+)?
+ value="([^"]*)"
+ ''', orig_webpage)
+ post = compat_urllib_parse.urlencode(fields)
+
+ self.to_screen('%s: Waiting for timeout' % video_id)
+ time.sleep(12)
+ headers = {
+ b'Content-Type': b'application/x-www-form-urlencoded',
+ }
+ req = compat_urllib_request.Request(url, post, headers)
+
+ webpage = self._download_webpage(
+ req, video_id, note=u'Downloading video page ...')
+ title = self._html_search_regex(
+ r'<h1[^>]*>([^<]+)<', webpage, u'title')
+ video_url = self._search_regex(
+ r'file:\s*"([^"]+)"', webpage, u'video URL')
+ duration_str = self._search_regex(
+ r'duration:\s*"?([0-9]+)"?', webpage, u'duration', fatal=False)
+ duration = None if duration_str is None else int(duration_str)
+ thumbnail = self._search_regex(
+ r'image:\s*"([^"]+)"', webpage, u'thumbnail URL', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
index 81fa35c4b..c9359fafb 100644
--- a/youtube_dl/extractor/sztvhu.py
+++ b/youtube_dl/extractor/sztvhu.py
@@ -15,7 +15,8 @@ class SztvHuIE(InfoExtractor):
u'info_dict': {
u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
- }
+ },
+ u'skip': u'Service temporarily disabled as of 2013-11-20'
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index bc48620f0..165d9f88b 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -60,7 +60,7 @@ class TeamcocoIE(InfoExtractor):
return -1
formats.sort(key=sort_key)
if not formats:
- raise RegexNotFoundError(u'Unable to extract video URL')
+ raise ExtractorError(u'Unable to extract video URL')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 2e497c86e..4bca62ba0 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -4,7 +4,6 @@ import re
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
- compat_str,
RegexNotFoundError,
)
@@ -113,6 +112,6 @@ class TEDIE(SubtitlesInfoExtractor):
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
sub_lang_list[l] = url
return sub_lang_list
- except RegexNotFoundError as err:
+ except RegexNotFoundError:
self._downloader.report_warning(u'video doesn\'t have subtitles')
return {}
diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py
new file mode 100644
index 000000000..2f728d3dc
--- /dev/null
+++ b/youtube_dl/extractor/toutv.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+)
+
+
+class TouTvIE(InfoExtractor):
+ IE_NAME = u'tou.tv'
+ _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
+
+ _TEST = {
+ u'url': u'http://www.tou.tv/30-vies/S04E41',
+ u'file': u'30-vies_S04E41.mp4',
+ u'info_dict': {
+ u'title': u'30 vies Saison 4 / Épisode 41',
+ u'description': u'md5:da363002db82ccbe4dafeb9cab039b09',
+ u'age_limit': 8,
+ u'uploader': u'Groupe des Nouveaux Médias',
+ u'duration': 1296,
+ u'upload_date': u'20131118',
+ u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
+ },
+ u'params': {
+ u'skip_download': True, # Requires rtmpdump
+ },
+ u'skip': 'Only available in Canada'
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+
+ mediaId = self._search_regex(
+ r'"idMedia":\s*"([^"]+)"', webpage, u'media ID')
+
+ streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId
+ streams_webpage = self._download_webpage(
+ streams_url, video_id, note=u'Downloading stream list')
+
+ streams_doc = xml.etree.ElementTree.fromstring(
+ streams_webpage.encode('utf-8'))
+ video_url = next(n.text
+ for n in streams_doc.findall('.//choice/url')
+ if u'//ad.doubleclick' not in n.text)
+ if video_url.endswith('/Unavailable.flv'):
+ raise ExtractorError(
+ u'Access to this video is blocked from outside of Canada',
+ expected=True)
+
+ duration_str = self._html_search_meta(
+ 'video:duration', webpage, u'duration')
+ duration = int(duration_str) if duration_str else None
+ upload_date_str = self._html_search_meta(
+ 'video:release_date', webpage, u'upload date')
+ upload_date = unified_strdate(upload_date_str) if upload_date_str else None
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'url': video_url,
+ 'description': self._og_search_description(webpage),
+ 'uploader': self._dc_search_uploader(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'age_limit': self._media_rating_search(webpage),
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'ext': 'mp4',
+ }
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index d4b7603c7..4d9d41db3 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -5,8 +5,6 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py
index 65f39b982..4800415bd 100644
--- a/youtube_dl/extractor/videopremium.py
+++ b/youtube_dl/extractor/videopremium.py
@@ -24,12 +24,16 @@ class VideoPremiumIE(InfoExtractor):
webpage_url = 'http://videopremium.tv/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
- self.report_extraction(video_id)
+ if re.match(r"^<html><head><script[^>]*>window.location\s*=", webpage):
+ # Download again, we need a cookie
+ webpage = self._download_webpage(
+ webpage_url, video_id,
+ note=u'Downloading webpage again (with cookie)')
- video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
- webpage, u'video title')
+ video_title = self._html_search_regex(
+ r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, u'video title')
- return [{
+ return {
'id': video_id,
'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
'play_path': "mp4:%s.f4v" % video_id,
@@ -37,4 +41,4 @@ class VideoPremiumIE(InfoExtractor):
'player_url': "http://videopremium.tv/uplayer/uppod.swf",
'ext': 'f4v',
'title': video_title,
- }]
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
new file mode 100644
index 000000000..2206a06d5
--- /dev/null
+++ b/youtube_dl/extractor/viki.py
@@ -0,0 +1,101 @@
+import re
+
+from ..utils import (
+ ExtractorError,
+ unescapeHTML,
+ unified_strdate,
+)
+from .subtitles import SubtitlesInfoExtractor
+
+
+class VikiIE(SubtitlesInfoExtractor):
+ IE_NAME = u'viki'
+
+ _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+ _TEST = {
+ u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
+ u'file': u'1023585v.mp4',
+ u'md5': u'a21454021c2646f5433514177e2caa5f',
+ u'info_dict': {
+ u'title': u'Heirs Episode 14',
+ u'uploader': u'SBS',
+ u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+ u'upload_date': u'20131121',
+ u'age_limit': 13,
+ },
+ u'skip': u'Blocked in the US',
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group(1)
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ uploader_m = re.search(
+ r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
+ if uploader_m is None:
+ uploader = None
+ else:
+ uploader = uploader_m.group(1).strip()
+
+ rating_str = self._html_search_regex(
+ r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
+ u'rating information', default='').strip()
+ RATINGS = {
+ 'G': 0,
+ 'PG': 10,
+ 'PG-13': 13,
+ 'R': 16,
+ 'NC': 18,
+ }
+ age_limit = RATINGS.get(rating_str)
+
+ info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
+ info_webpage = self._download_webpage(
+ info_url, video_id, note=u'Downloading info page')
+ if re.match(r'\s*<div\s+class="video-error', info_webpage):
+ raise ExtractorError(
+ u'Video %s is blocked from your location.' % video_id,
+ expected=True)
+ video_url = self._html_search_regex(
+ r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
+
+ upload_date_str = self._html_search_regex(
+ r'"created_at":"([^"]+)"', info_webpage, u'upload date')
+ upload_date = (
+ unified_strdate(upload_date_str)
+ if upload_date_str is not None
+ else None
+ )
+
+ # subtitles
+ video_subtitles = self.extract_subtitles(video_id, info_webpage)
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, info_webpage)
+ return
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'age_limit': age_limit,
+ 'uploader': uploader,
+ 'subtitles': video_subtitles,
+ 'upload_date': upload_date,
+ }
+
+ def _get_available_subtitles(self, video_id, info_webpage):
+ res = {}
+ for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage):
+ sturl = unescapeHTML(sturl_html)
+ m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
+ if not m:
+ continue
+ res[m.group('lang')] = sturl
+ return res
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index d465bf20b..7d82c2cfa 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -151,7 +151,7 @@ class VimeoIE(InfoExtractor):
config = json.loads(config_json)
except RegexNotFoundError:
# For pro videos or player.vimeo.com urls
- config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'],
+ config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'],
webpage, u'info section', flags=re.DOTALL)
config = json.loads(config)
except Exception as e:
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 03ad88bed..e3458d2bd 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -5,7 +5,6 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
)
class XTubeIE(InfoExtractor):
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 1aa549740..64d4c2445 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -139,10 +139,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_DESC = u'YouTube.com'
- _VALID_URL = r"""^
+ _VALID_URL = r"""(?x)^
(
- (?:https?://)? # http(s):// (optional)
- (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
+ (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
+ (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
@@ -363,6 +363,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"uploader_id": u"justintimberlakeVEVO"
}
},
+ {
+ u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
+ u"file": u"yZIXLfi8CZQ.mp4",
+ u"note": u"Embed-only video (#1746)",
+ u"info_dict": {
+ u"upload_date": u"20120608",
+ u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
+ u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
+ u"uploader": u"SET India",
+ u"uploader_id": u"setindia"
+ }
+ },
]
@@ -370,7 +382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
if YoutubePlaylistIE.suitable(url): return False
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ return re.match(cls._VALID_URL, url) is not None
def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs)
@@ -1272,7 +1284,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
data = compat_urllib_parse.urlencode({'video_id': video_id,
- 'el': 'embedded',
+ 'el': 'player_embedded',
'gl': 'US',
'hl': 'en',
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
@@ -1498,7 +1510,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
})
return results
-class YoutubePlaylistIE(InfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists'
_VALID_URL = r"""(?:
(?:https?://)?
@@ -1514,8 +1526,9 @@ class YoutubePlaylistIE(InfoExtractor):
|
((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
)"""
- _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
- _MAX_RESULTS = 50
+ _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+ _MORE_PAGES_INDICATOR = r'data-link-type="next"'
+ _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&amp;'
IE_NAME = u'youtube:playlist'
@classmethod
@@ -1523,6 +1536,9 @@ class YoutubePlaylistIE(InfoExtractor):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ def _real_initialize(self):
+ self._login()
+
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -1536,45 +1552,28 @@ class YoutubePlaylistIE(InfoExtractor):
video_id = query_dict['v'][0]
if self._downloader.params.get('noplaylist'):
self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
- return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
+ return self.url_result(video_id, 'Youtube', video_id=video_id)
else:
self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- # Download playlist videos from API
- videos = []
+ # Extract the video ids from the playlist pages
+ ids = []
for page_num in itertools.count(1):
- start_index = self._MAX_RESULTS * (page_num - 1) + 1
- if start_index >= 1000:
- self._downloader.report_warning(u'Max number of results reached')
- break
- url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
+ url = self._TEMPLATE_URL % (playlist_id, page_num)
page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
+ # The ids are duplicated
+ new_ids = orderedSet(re.findall(self._VIDEO_RE, page))
+ ids.extend(new_ids)
- try:
- response = json.loads(page)
- except ValueError as err:
- raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
-
- if 'feed' not in response:
- raise ExtractorError(u'Got a malformed response from YouTube API')
- playlist_title = response['feed']['title']['$t']
- if 'entry' not in response['feed']:
- # Number of videos is a multiple of self._MAX_RESULTS
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
break
- for entry in response['feed']['entry']:
- index = entry['yt$position']['$t']
- if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
- videos.append((
- index,
- 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
- ))
-
- videos = [v[1] for v in sorted(videos)]
+ playlist_title = self._og_search_title(page)
- url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
- return [self.playlist_result(url_results, playlist_id, playlist_title)]
+ url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
+ return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeChannelIE(InfoExtractor):
@@ -1628,9 +1627,9 @@ class YoutubeChannelIE(InfoExtractor):
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
- urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
- url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
- return [self.playlist_result(url_entries, channel_id)]
+ url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(url_entries, channel_id)
class YoutubeUserIE(InfoExtractor):
@@ -1694,9 +1693,11 @@ class YoutubeUserIE(InfoExtractor):
if len(ids_in_page) < self._GDATA_PAGE_SIZE:
break
- urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
- url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
- return [self.playlist_result(url_results, playlist_title = username)]
+ url_results = [
+ self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(url_results, playlist_title=username)
+
class YoutubeSearchIE(SearchInfoExtractor):
IE_DESC = u'YouTube.com searches'
@@ -1737,7 +1738,8 @@ class YoutubeSearchIE(SearchInfoExtractor):
if len(video_ids) > n:
video_ids = video_ids[:n]
- videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
+ videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
@@ -1797,7 +1799,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
feed_html = info['feed_html']
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids)
- feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+ feed_entries.extend(
+ self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in ids)
if info['paging'] is None:
break
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
@@ -1822,6 +1826,20 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
_PAGING_STEP = 100
_PERSONAL_FEED = True
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
+ _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
+ _FEED_NAME = 'history'
+ _PERSONAL_FEED = True
+ _PLAYLIST_TITLE = u'Youtube Watch History'
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History')
+ data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging')
+ # The step is actually a ridiculously big number (like 1374343569725646)
+ self._PAGING_STEP = int(data_paging)
+ return super(YoutubeHistoryIE, self)._real_extract(url)
+
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = u'youtube:favorites'
IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index faed7ff7f..07f830e80 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,75 +1,111 @@
+import operator
import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
- ExtractorError,
+ parse_xml_doc,
+ unified_strdate,
)
class ZDFIE(InfoExtractor):
_VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
- _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('video_id')
- if mobj.group('hash'):
- url = url.replace(u'#', u'', 1)
+ xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+ info_xml = self._download_webpage(
+ xml_url, video_id, note=u'Downloading video info')
+ doc = parse_xml_doc(info_xml)
- html = self._download_webpage(url, video_id)
- streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
- if streams is None:
- raise ExtractorError(u'No media url found.')
+ title = doc.find('.//information/title').text
+ description = doc.find('.//information/detail').text
+ uploader_node = doc.find('.//details/originChannelTitle')
+ uploader = None if uploader_node is None else uploader_node.text
+ duration_str = doc.find('.//details/length').text
+ duration_m = re.match(r'''(?x)^
+ (?P<hours>[0-9]{2})
+ :(?P<minutes>[0-9]{2})
+ :(?P<seconds>[0-9]{2})
+ (?:\.(?P<ms>[0-9]+)?)
+ ''', duration_str)
+ duration = (
+ (
+ (int(duration_m.group('hours')) * 60 * 60) +
+ (int(duration_m.group('minutes')) * 60) +
+ int(duration_m.group('seconds'))
+ )
+ if duration_m
+ else None
+ )
+ upload_date = unified_strdate(doc.find('.//details/airtime').text)
- # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
- # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
- # choose first/default media type and highest quality for now
- def stream_pref(s):
- TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming']
+ def xml_to_format(fnode):
+ video_url = fnode.find('url').text
+ is_available = u'http://www.metafilegenerator' not in video_url
+
+ format_id = fnode.attrib['basetype']
+ format_m = re.match(r'''(?x)
+ (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+ (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+ ''', format_id)
+
+ ext = format_m.group('container')
+ is_supported = ext != 'f4f'
+
+ PROTO_ORDER = ['http', 'rtmp', 'rtsp']
try:
- type_pref = TYPE_ORDER.index(s['media_type'])
+ proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
except ValueError:
- type_pref = 999
+ proto_pref = 999
- QUALITY_ORDER = ['veryhigh', '300']
+ quality = fnode.find('./quality').text
+ QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
try:
- quality_pref = QUALITY_ORDER.index(s['quality'])
+ quality_pref = -QUALITY_ORDER.index(quality)
except ValueError:
quality_pref = 999
- return (type_pref, quality_pref)
-
- sorted_streams = sorted(streams, key=stream_pref)
- if not sorted_streams:
- raise ExtractorError(u'No stream found.')
- stream = sorted_streams[0]
-
- media_link = self._download_webpage(
- stream['video_url'],
- video_id,
- u'Get stream URL')
+ abr = int(fnode.find('./audioBitrate').text) // 1000
+ vbr = int(fnode.find('./videoBitrate').text) // 1000
+ pref = (is_available, is_supported,
+ proto_pref, quality_pref, vbr, abr)
- MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
- RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+ format_note = u''
+ if not is_supported:
+ format_note += u'(unsupported)'
+ if not format_note:
+ format_note = None
- mobj = re.search(self._MEDIA_STREAM, media_link)
- if mobj is None:
- mobj = re.search(RTSP_STREAM, media_link)
- if mobj is None:
- raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
- video_url = mobj.group('video_url')
+ return {
+ 'format_id': format_id + u'-' + quality,
+ 'url': video_url,
+ 'ext': ext,
+ 'acodec': format_m.group('acodec'),
+ 'vcodec': format_m.group('vcodec'),
+ 'abr': abr,
+ 'vbr': vbr,
+ 'width': int(fnode.find('./width').text),
+ 'height': int(fnode.find('./height').text),
+ 'filesize': int(fnode.find('./filesize').text),
+ 'format_note': format_note,
+ '_pref': pref,
+ '_available': is_available,
+ }
- title = self._html_search_regex(
- r'<h1(?: class="beitragHeadline")?>(.*?)</h1>',
- html, u'title')
+ format_nodes = doc.findall('.//formitaeten/formitaet')
+ formats = sorted(filter(lambda f: f['_available'],
+ map(xml_to_format, format_nodes)),
+ key=operator.itemgetter('_pref'))
return {
'id': video_id,
- 'url': video_url,
'title': title,
- 'ext': determine_ext(video_url)
+ 'formats': formats,
+ 'description': description,
+ 'uploader': uploader,
+ 'duration': duration,
+ 'upload_date': upload_date,
}
diff --git a/youtube_dl/update.py b/youtube_dl/update.py
index f41b4785a..cd9670166 100644
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -41,6 +41,7 @@ def rsa_verify(message, signature, key):
if signature != sha256(message).digest(): return False
return True
+
def update_self(to_screen, verbose):
"""Update the program file with the latest version from the repository"""
@@ -82,6 +83,13 @@ def update_self(to_screen, verbose):
return
version_id = versions_info['latest']
+
+ def version_tuple(version_str):
+ return tuple(map(int, version_str.split('.')))
+ if version_tuple(__version__) >= version_tuple(version_id):
+ to_screen(u'youtube-dl is up to date (%s)' % __version__)
+ return
+
to_screen(u'Updating to version ' + version_id + '...')
version = versions_info['versions'][version_id]
@@ -109,7 +117,7 @@ def update_self(to_screen, verbose):
urlh = compat_urllib_request.urlopen(version['exe'][0])
newcontent = urlh.read()
urlh.close()
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to download latest version')
return
@@ -122,7 +130,7 @@ def update_self(to_screen, verbose):
try:
with open(exe + '.new', 'wb') as outf:
outf.write(newcontent)
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to write the new version')
return
@@ -141,7 +149,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
subprocess.Popen([bat]) # Continues to run in the background
return # Do not show premature success messages
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to overwrite current version')
return
@@ -152,7 +160,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
urlh = compat_urllib_request.urlopen(version['bin'][0])
newcontent = urlh.read()
urlh.close()
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to download latest version')
return
@@ -165,7 +173,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
try:
with open(filename, 'wb') as outf:
outf.write(newcontent)
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to overwrite current version')
return
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 1d9785341..946e90e93 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -8,13 +8,16 @@ import gzip
import io
import json
import locale
+import math
import os
import pipes
import platform
import re
+import ssl
import socket
import sys
import traceback
+import xml.etree.ElementTree
import zlib
try:
@@ -535,17 +538,34 @@ def formatSeconds(secs):
else:
return '%d' % secs
-def make_HTTPS_handler(opts):
- if sys.version_info < (3,2):
- # Python's 2.x handler is very simplistic
- return compat_urllib_request.HTTPSHandler()
+def make_HTTPS_handler(opts_no_check_certificate):
+ if sys.version_info < (3, 2):
+ import httplib
+
+ class HTTPSConnectionV3(httplib.HTTPSConnection):
+ def __init__(self, *args, **kwargs):
+ httplib.HTTPSConnection.__init__(self, *args, **kwargs)
+
+ def connect(self):
+ sock = socket.create_connection((self.host, self.port), self.timeout)
+ if self._tunnel_host:
+ self.sock = sock
+ self._tunnel()
+ try:
+ self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
+ except ssl.SSLError:
+ self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
+
+ class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
+ def https_open(self, req):
+ return self.do_open(HTTPSConnectionV3, req)
+ return HTTPSHandlerV3()
else:
- import ssl
- context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+ context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
context.set_default_verify_paths()
context.verify_mode = (ssl.CERT_NONE
- if opts.no_check_certificate
+ if opts_no_check_certificate
else ssl.CERT_REQUIRED)
return compat_urllib_request.HTTPSHandler(context=context)
@@ -734,6 +754,8 @@ def unified_strdate(date_str):
'%Y/%m/%d %H:%M:%S',
'%d.%m.%Y %H:%M',
'%Y-%m-%dT%H:%M:%SZ',
+ '%Y-%m-%dT%H:%M:%S.%fZ',
+ '%Y-%m-%dT%H:%M:%S.%f0Z',
'%Y-%m-%dT%H:%M:%S',
]
for expression in format_expressions:
@@ -949,7 +971,16 @@ class locked_file(object):
def shell_quote(args):
- return ' '.join(map(pipes.quote, args))
+ quoted_args = []
+ encoding = sys.getfilesystemencoding()
+ if encoding is None:
+ encoding = 'utf-8'
+ for a in args:
+ if isinstance(a, bytes):
+ # We may get a filename encoded with 'encodeFilename'
+ a = a.decode(encoding)
+ quoted_args.append(pipes.quote(a))
+ return u' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
@@ -976,3 +1007,22 @@ def unsmuggle_url(smug_url):
jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
data = json.loads(jsond)
return url, data
+
+
+def parse_xml_doc(s):
+ assert isinstance(s, type(u''))
+ return xml.etree.ElementTree.fromstring(s.encode('utf-8'))
+
+
+def format_bytes(bytes):
+ if bytes is None:
+ return u'N/A'
+ if type(bytes) is str:
+ bytes = float(bytes)
+ if bytes == 0.0:
+ exponent = 0
+ else:
+ exponent = int(math.log(bytes, 1024.0))
+ suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
+ converted = float(bytes) / float(1024 ** exponent)
+ return u'%.2f%s' % (converted, suffix)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 110058c79..2af23040f 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2013.11.17'
+__version__ = '2013.11.25.1'