aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rwxr-xr-xyoutube_dl/YoutubeDL.py24
-rw-r--r--youtube_dl/__init__.py494
-rw-r--r--youtube_dl/cache.py94
-rw-r--r--youtube_dl/downloader/hls.py9
-rw-r--r--youtube_dl/downloader/http.py3
-rw-r--r--youtube_dl/downloader/rtmp.py7
-rw-r--r--youtube_dl/extractor/__init__.py28
-rw-r--r--youtube_dl/extractor/academicearth.py9
-rw-r--r--youtube_dl/extractor/adultswim.py8
-rw-r--r--youtube_dl/extractor/anysex.py61
-rw-r--r--youtube_dl/extractor/aol.py11
-rw-r--r--youtube_dl/extractor/ard.py65
-rw-r--r--youtube_dl/extractor/arte.py3
-rw-r--r--youtube_dl/extractor/bambuser.py15
-rw-r--r--youtube_dl/extractor/bandcamp.py10
-rw-r--r--youtube_dl/extractor/beeg.py65
-rw-r--r--youtube_dl/extractor/br.py11
-rw-r--r--youtube_dl/extractor/cbs.py2
-rw-r--r--youtube_dl/extractor/chilloutzone.py2
-rw-r--r--youtube_dl/extractor/cliphunter.py61
-rw-r--r--youtube_dl/extractor/cloudy.py108
-rw-r--r--youtube_dl/extractor/comedycentral.py36
-rw-r--r--youtube_dl/extractor/common.py133
-rw-r--r--youtube_dl/extractor/crunchyroll.py97
-rw-r--r--youtube_dl/extractor/cspan.py7
-rw-r--r--youtube_dl/extractor/dailymotion.py100
-rw-r--r--youtube_dl/extractor/daum.py17
-rw-r--r--youtube_dl/extractor/dbtv.py74
-rw-r--r--youtube_dl/extractor/deezer.py89
-rw-r--r--youtube_dl/extractor/dropbox.py7
-rw-r--r--youtube_dl/extractor/drtuber.py70
-rw-r--r--youtube_dl/extractor/drtv.py2
-rw-r--r--youtube_dl/extractor/eighttracks.py138
-rw-r--r--youtube_dl/extractor/empflix.py47
-rw-r--r--youtube_dl/extractor/eporner.py75
-rw-r--r--youtube_dl/extractor/everyonesmixtape.py14
-rw-r--r--youtube_dl/extractor/facebook.py42
-rw-r--r--youtube_dl/extractor/generic.py45
-rw-r--r--youtube_dl/extractor/hornbunny.py56
-rw-r--r--youtube_dl/extractor/hostingbulk.py84
-rw-r--r--youtube_dl/extractor/ign.py18
-rw-r--r--youtube_dl/extractor/imdb.py8
-rw-r--r--youtube_dl/extractor/instagram.py24
-rw-r--r--youtube_dl/extractor/ivi.py15
-rw-r--r--youtube_dl/extractor/izlesene.py96
-rw-r--r--youtube_dl/extractor/khanacademy.py17
-rw-r--r--youtube_dl/extractor/livestream.py85
-rw-r--r--youtube_dl/extractor/mixcloud.py3
-rw-r--r--youtube_dl/extractor/mlb.py6
-rw-r--r--youtube_dl/extractor/moevideo.py112
-rw-r--r--youtube_dl/extractor/mofosex.py22
-rw-r--r--youtube_dl/extractor/musicvault.py76
-rw-r--r--youtube_dl/extractor/nba.py18
-rw-r--r--youtube_dl/extractor/nhl.py61
-rw-r--r--youtube_dl/extractor/noco.py27
-rw-r--r--youtube_dl/extractor/nosvideo.py76
-rw-r--r--youtube_dl/extractor/npo.py35
-rw-r--r--youtube_dl/extractor/pornhd.py51
-rw-r--r--youtube_dl/extractor/pornoxo.py65
-rw-r--r--youtube_dl/extractor/promptfile.py67
-rw-r--r--youtube_dl/extractor/prosiebensat1.py5
-rw-r--r--youtube_dl/extractor/rtlnl.py34
-rw-r--r--youtube_dl/extractor/rutube.py15
-rw-r--r--youtube_dl/extractor/rutv.py2
-rw-r--r--youtube_dl/extractor/sharesix.py91
-rw-r--r--youtube_dl/extractor/smotri.py16
-rw-r--r--youtube_dl/extractor/sockshare.py5
-rw-r--r--youtube_dl/extractor/soundcloud.py57
-rw-r--r--youtube_dl/extractor/spiegel.py42
-rw-r--r--youtube_dl/extractor/sportdeutschland.py96
-rw-r--r--youtube_dl/extractor/sunporno.py70
-rw-r--r--youtube_dl/extractor/swrmediathek.py14
-rw-r--r--youtube_dl/extractor/teachertube.py7
-rw-r--r--youtube_dl/extractor/techtalks.py76
-rw-r--r--youtube_dl/extractor/ted.py8
-rw-r--r--youtube_dl/extractor/telemb.py78
-rw-r--r--youtube_dl/extractor/tnaflix.py84
-rw-r--r--youtube_dl/extractor/toypics.py7
-rw-r--r--youtube_dl/extractor/tudou.py60
-rw-r--r--youtube_dl/extractor/tumblr.py22
-rw-r--r--youtube_dl/extractor/turbo.py67
-rw-r--r--youtube_dl/extractor/tvigle.py82
-rw-r--r--youtube_dl/extractor/tvplay.py147
-rw-r--r--youtube_dl/extractor/unistra.py78
-rw-r--r--youtube_dl/extractor/ustream.py28
-rw-r--r--youtube_dl/extractor/veehd.py3
-rw-r--r--youtube_dl/extractor/vgtv.py119
-rw-r--r--youtube_dl/extractor/vimeo.py85
-rw-r--r--youtube_dl/extractor/vine.py7
-rw-r--r--youtube_dl/extractor/vporn.py125
-rw-r--r--youtube_dl/extractor/washingtonpost.py3
-rw-r--r--youtube_dl/extractor/xhamster.py2
-rw-r--r--youtube_dl/extractor/xtube.py8
-rw-r--r--youtube_dl/extractor/yahoo.py3
-rw-r--r--youtube_dl/extractor/youjizz.py2
-rw-r--r--youtube_dl/extractor/youporn.py1
-rw-r--r--youtube_dl/extractor/youtube.py425
-rw-r--r--youtube_dl/options.py481
-rw-r--r--youtube_dl/utils.py123
-rw-r--r--youtube_dl/version.py2
100 files changed, 4322 insertions, 1363 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 98639e004..9519594c9 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -28,6 +28,7 @@ from .utils import (
compat_str,
compat_urllib_error,
compat_urllib_request,
+ escape_url,
ContentTooShortError,
date_from_str,
DateRange,
@@ -57,6 +58,7 @@ from .utils import (
YoutubeDLHandler,
prepend_extension,
)
+from .cache import Cache
from .extractor import get_info_extractor, gen_extractors
from .downloader import get_suitable_downloader
from .postprocessor import FFmpegMergerPP
@@ -133,7 +135,7 @@ class YoutubeDL(object):
daterange: A DateRange object, download only if the upload_date is in the range.
skip_download: Skip the actual download of the video file
cachedir: Location of the cache files in the filesystem.
- None to disable filesystem cache.
+ False to disable filesystem cache.
noplaylist: Download single video instead of a playlist if in doubt.
age_limit: An integer representing the user's age in years.
Unsuitable videos for the given age are skipped.
@@ -195,6 +197,7 @@ class YoutubeDL(object):
self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
self._err_file = sys.stderr
self.params = params
+ self.cache = Cache(self)
if params.get('bidi_workaround', False):
try:
@@ -1239,6 +1242,25 @@ class YoutubeDL(object):
def urlopen(self, req):
""" Start an HTTP download """
+
+ # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
+ # always respected by websites, some tend to give out URLs with non percent-encoded
+ # non-ASCII characters (see telemb.py, ard.py [#3412])
+ # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+ # To work around aforementioned issue we will replace request's original URL with
+ # percent-encoded one
+ url = req if isinstance(req, compat_str) else req.get_full_url()
+ url_escaped = escape_url(url)
+
+ # Substitute URL if any change after escaping
+ if url != url_escaped:
+ if isinstance(req, compat_str):
+ req = url_escaped
+ else:
+ req = compat_urllib_request.Request(
+ url_escaped, data=req.data, headers=req.headers,
+ origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+
return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self):
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index b15695053..42d0a0180 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -74,29 +74,28 @@ __authors__ = (
'Keith Beckman',
'Ole Ernst',
'Aaron McDaniel (mcd1992)',
+ 'Magnus Kolstad',
)
__license__ = 'Public Domain'
import codecs
import io
-import optparse
import os
import random
-import shlex
-import shutil
import sys
+from .options import (
+ parseOpts,
+)
from .utils import (
compat_getpass,
compat_print,
DateRange,
DEFAULT_OUTTMPL,
decodeOption,
- get_term_width,
DownloadError,
- get_cachedir,
MaxDownloadsReached,
preferredencoding,
read_batch_urls,
@@ -110,7 +109,6 @@ from .downloader import (
FileDownloader,
)
from .extractor import gen_extractors
-from .version import __version__
from .YoutubeDL import YoutubeDL
from .postprocessor import (
AtomicParsleyPP,
@@ -124,475 +122,6 @@ from .postprocessor import (
)
-def parseOpts(overrideArguments=None):
- def _readOptions(filename_bytes, default=[]):
- try:
- optionf = open(filename_bytes)
- except IOError:
- return default # silently skip if file is not present
- try:
- res = []
- for l in optionf:
- res += shlex.split(l, comments=True)
- finally:
- optionf.close()
- return res
-
- def _readUserConf():
- xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
- if xdg_config_home:
- userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config')
- if not os.path.isfile(userConfFile):
- userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf')
- else:
- userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config')
- if not os.path.isfile(userConfFile):
- userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
- userConf = _readOptions(userConfFile, None)
-
- if userConf is None:
- appdata_dir = os.environ.get('appdata')
- if appdata_dir:
- userConf = _readOptions(
- os.path.join(appdata_dir, 'youtube-dl', 'config'),
- default=None)
- if userConf is None:
- userConf = _readOptions(
- os.path.join(appdata_dir, 'youtube-dl', 'config.txt'),
- default=None)
-
- if userConf is None:
- userConf = _readOptions(
- os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'),
- default=None)
- if userConf is None:
- userConf = _readOptions(
- os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'),
- default=None)
-
- if userConf is None:
- userConf = []
-
- return userConf
-
- def _format_option_string(option):
- ''' ('-o', '--option') -> -o, --format METAVAR'''
-
- opts = []
-
- if option._short_opts:
- opts.append(option._short_opts[0])
- if option._long_opts:
- opts.append(option._long_opts[0])
- if len(opts) > 1:
- opts.insert(1, ', ')
-
- if option.takes_value(): opts.append(' %s' % option.metavar)
-
- return "".join(opts)
-
- def _comma_separated_values_options_callback(option, opt_str, value, parser):
- setattr(parser.values, option.dest, value.split(','))
-
- def _hide_login_info(opts):
- opts = list(opts)
- for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
- try:
- i = opts.index(private_opt)
- opts[i+1] = '<PRIVATE>'
- except ValueError:
- pass
- return opts
-
- max_width = 80
- max_help_position = 80
-
- # No need to wrap help messages if we're on a wide console
- columns = get_term_width()
- if columns: max_width = columns
-
- fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
- fmt.format_option_strings = _format_option_string
-
- kw = {
- 'version' : __version__,
- 'formatter' : fmt,
- 'usage' : '%prog [options] url [url...]',
- 'conflict_handler' : 'resolve',
- }
-
- parser = optparse.OptionParser(**kw)
-
- # option groups
- general = optparse.OptionGroup(parser, 'General Options')
- selection = optparse.OptionGroup(parser, 'Video Selection')
- authentication = optparse.OptionGroup(parser, 'Authentication Options')
- video_format = optparse.OptionGroup(parser, 'Video Format Options')
- subtitles = optparse.OptionGroup(parser, 'Subtitle Options')
- downloader = optparse.OptionGroup(parser, 'Download Options')
- postproc = optparse.OptionGroup(parser, 'Post-processing Options')
- filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
- workarounds = optparse.OptionGroup(parser, 'Workarounds')
- verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
-
- general.add_option('-h', '--help',
- action='help', help='print this help text and exit')
- general.add_option('-v', '--version',
- action='version', help='print program version and exit')
- general.add_option('-U', '--update',
- action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
- general.add_option('-i', '--ignore-errors',
- action='store_true', dest='ignoreerrors', help='continue on download errors, for example to skip unavailable videos in a playlist', default=False)
- general.add_option('--abort-on-error',
- action='store_false', dest='ignoreerrors',
- help='Abort downloading of further videos (in the playlist or the command line) if an error occurs')
- general.add_option('--dump-user-agent',
- action='store_true', dest='dump_user_agent',
- help='display the current browser identification', default=False)
- general.add_option('--list-extractors',
- action='store_true', dest='list_extractors',
- help='List all supported extractors and the URLs they would handle', default=False)
- general.add_option('--extractor-descriptions',
- action='store_true', dest='list_extractor_descriptions',
- help='Output descriptions of all supported extractors', default=False)
- general.add_option(
- '--proxy', dest='proxy', default=None, metavar='URL',
- help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
- general.add_option(
- '--socket-timeout', dest='socket_timeout',
- type=float, default=None, help=u'Time to wait before giving up, in seconds')
- general.add_option(
- '--default-search',
- dest='default_search', metavar='PREFIX',
- help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.')
- general.add_option(
- '--ignore-config',
- action='store_true',
- help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
-
- selection.add_option(
- '--playlist-start',
- dest='playliststart', metavar='NUMBER', default=1, type=int,
- help='playlist video to start at (default is %default)')
- selection.add_option(
- '--playlist-end',
- dest='playlistend', metavar='NUMBER', default=None, type=int,
- help='playlist video to end at (default is last)')
- selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
- selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
- selection.add_option('--max-downloads', metavar='NUMBER',
- dest='max_downloads', type=int, default=None,
- help='Abort after downloading NUMBER files')
- selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None)
- selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 50k or 44.6m)", default=None)
- selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None)
- selection.add_option(
- '--datebefore', metavar='DATE', dest='datebefore', default=None,
- help='download only videos uploaded on or before this date (i.e. inclusive)')
- selection.add_option(
- '--dateafter', metavar='DATE', dest='dateafter', default=None,
- help='download only videos uploaded on or after this date (i.e. inclusive)')
- selection.add_option(
- '--min-views', metavar='COUNT', dest='min_views',
- default=None, type=int,
- help="Do not download any videos with less than COUNT views",)
- selection.add_option(
- '--max-views', metavar='COUNT', dest='max_views',
- default=None, type=int,
- help="Do not download any videos with more than COUNT views",)
- selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False)
- selection.add_option('--age-limit', metavar='YEARS', dest='age_limit',
- help='download only videos suitable for the given age',
- default=None, type=int)
- selection.add_option('--download-archive', metavar='FILE',
- dest='download_archive',
- help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
- selection.add_option(
- '--include-ads', dest='include_ads',
- action='store_true',
- help='Download advertisements as well (experimental)')
- selection.add_option(
- '--youtube-include-dash-manifest', action='store_true',
- dest='youtube_include_dash_manifest', default=False,
- help='Try to download the DASH manifest on YouTube videos (experimental)')
-
- authentication.add_option('-u', '--username',
- dest='username', metavar='USERNAME', help='account username')
- authentication.add_option('-p', '--password',
- dest='password', metavar='PASSWORD', help='account password')
- authentication.add_option('-2', '--twofactor',
- dest='twofactor', metavar='TWOFACTOR', help='two-factor auth code')
- authentication.add_option('-n', '--netrc',
- action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
- authentication.add_option('--video-password',
- dest='videopassword', metavar='PASSWORD', help='video password (vimeo, smotri)')
-
-
- video_format.add_option('-f', '--format',
- action='store', dest='format', metavar='FORMAT', default=None,
- help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality.')
- video_format.add_option('--all-formats',
- action='store_const', dest='format', help='download all available video formats', const='all')
- video_format.add_option('--prefer-free-formats',
- action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
- video_format.add_option('--max-quality',
- action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
- video_format.add_option('-F', '--list-formats',
- action='store_true', dest='listformats', help='list all available formats')
-
- subtitles.add_option('--write-sub', '--write-srt',
- action='store_true', dest='writesubtitles',
- help='write subtitle file', default=False)
- subtitles.add_option('--write-auto-sub', '--write-automatic-sub',
- action='store_true', dest='writeautomaticsub',
- help='write automatic subtitle file (youtube only)', default=False)
- subtitles.add_option('--all-subs',
- action='store_true', dest='allsubtitles',
- help='downloads all the available subtitles of the video', default=False)
- subtitles.add_option('--list-subs',
- action='store_true', dest='listsubtitles',
- help='lists all available subtitles for the video', default=False)
- subtitles.add_option('--sub-format',
- action='store', dest='subtitlesformat', metavar='FORMAT',
- help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt')
- subtitles.add_option('--sub-lang', '--sub-langs', '--srt-lang',
- action='callback', dest='subtitleslangs', metavar='LANGS', type='str',
- default=[], callback=_comma_separated_values_options_callback,
- help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'')
-
- downloader.add_option('-r', '--rate-limit',
- dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)')
- downloader.add_option('-R', '--retries',
- dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10)
- downloader.add_option('--buffer-size',
- dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024")
- downloader.add_option('--no-resize-buffer',
- action='store_true', dest='noresizebuffer',
- help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False)
- downloader.add_option('--test', action='store_true', dest='test', default=False, help=optparse.SUPPRESS_HELP)
-
- workarounds.add_option(
- '--encoding', dest='encoding', metavar='ENCODING',
- help='Force the specified encoding (experimental)')
- workarounds.add_option(
- '--no-check-certificate', action='store_true',
- dest='no_check_certificate', default=False,
- help='Suppress HTTPS certificate validation.')
- workarounds.add_option(
- '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure',
- help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')
- workarounds.add_option(
- '--user-agent', metavar='UA',
- dest='user_agent', help='specify a custom user agent')
- workarounds.add_option(
- '--referer', metavar='REF',
- dest='referer', default=None,
- help='specify a custom referer, use if the video access is restricted to one domain',
- )
- workarounds.add_option(
- '--add-header', metavar='FIELD:VALUE',
- dest='headers', action='append',
- help='specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times',
- )
- workarounds.add_option(
- '--bidi-workaround', dest='bidi_workaround', action='store_true',
- help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
-
- verbosity.add_option('-q', '--quiet',
- action='store_true', dest='quiet', help='activates quiet mode', default=False)
- verbosity.add_option(
- '--no-warnings',
- dest='no_warnings', action='store_true', default=False,
- help='Ignore warnings')
- verbosity.add_option('-s', '--simulate',
- action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
- verbosity.add_option('--skip-download',
- action='store_true', dest='skip_download', help='do not download the video', default=False)
- verbosity.add_option('-g', '--get-url',
- action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
- verbosity.add_option('-e', '--get-title',
- action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
- verbosity.add_option('--get-id',
- action='store_true', dest='getid', help='simulate, quiet but print id', default=False)
- verbosity.add_option('--get-thumbnail',
- action='store_true', dest='getthumbnail',
- help='simulate, quiet but print thumbnail URL', default=False)
- verbosity.add_option('--get-description',
- action='store_true', dest='getdescription',
- help='simulate, quiet but print video description', default=False)
- verbosity.add_option('--get-duration',
- action='store_true', dest='getduration',
- help='simulate, quiet but print video length', default=False)
- verbosity.add_option('--get-filename',
- action='store_true', dest='getfilename',
- help='simulate, quiet but print output filename', default=False)
- verbosity.add_option('--get-format',
- action='store_true', dest='getformat',
- help='simulate, quiet but print output format', default=False)
- verbosity.add_option('-j', '--dump-json',
- action='store_true', dest='dumpjson',
- help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False)
- verbosity.add_option('--newline',
- action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
- verbosity.add_option('--no-progress',
- action='store_true', dest='noprogress', help='do not print progress bar', default=False)
- verbosity.add_option('--console-title',
- action='store_true', dest='consoletitle',
- help='display progress in console titlebar', default=False)
- verbosity.add_option('-v', '--verbose',
- action='store_true', dest='verbose', help='print various debugging information', default=False)
- verbosity.add_option('--dump-intermediate-pages',
- action='store_true', dest='dump_intermediate_pages', default=False,
- help='print downloaded pages to debug problems (very verbose)')
- verbosity.add_option('--write-pages',
- action='store_true', dest='write_pages', default=False,
- help='Write downloaded intermediary pages to files in the current directory to debug problems')
- verbosity.add_option('--youtube-print-sig-code',
- action='store_true', dest='youtube_print_sig_code', default=False,
- help=optparse.SUPPRESS_HELP)
- verbosity.add_option('--print-traffic',
- dest='debug_printtraffic', action='store_true', default=False,
- help='Display sent and read HTTP traffic')
-
-
- filesystem.add_option('-a', '--batch-file',
- dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
- filesystem.add_option('--id',
- action='store_true', dest='useid', help='use only video ID in file name', default=False)
- filesystem.add_option('-A', '--auto-number',
- action='store_true', dest='autonumber',
- help='number downloaded files starting from 00000', default=False)
- filesystem.add_option('-o', '--output',
- dest='outtmpl', metavar='TEMPLATE',
- help=('output filename template. Use %(title)s to get the title, '
- '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, '
- '%(autonumber)s to get an automatically incremented number, '
- '%(ext)s for the filename extension, '
- '%(format)s for the format description (like "22 - 1280x720" or "HD"), '
- '%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"), '
- '%(upload_date)s for the upload date (YYYYMMDD), '
- '%(extractor)s for the provider (youtube, metacafe, etc), '
- '%(id)s for the video id, %(playlist)s for the playlist the video is in, '
- '%(playlist_index)s for the position in the playlist and %% for a literal percent. '
- '%(height)s and %(width)s for the width and height of the video format. '
- '%(resolution)s for a textual description of the resolution of the video format. '
- 'Use - to output to stdout. Can also be used to download to a different directory, '
- 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
- filesystem.add_option('--autonumber-size',
- dest='autonumber_size', metavar='NUMBER',
- help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given')
- filesystem.add_option('--restrict-filenames',
- action='store_true', dest='restrictfilenames',
- help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False)
- filesystem.add_option('-t', '--title',
- action='store_true', dest='usetitle', help='[deprecated] use title in file name (default)', default=False)
- filesystem.add_option('-l', '--literal',
- action='store_true', dest='usetitle', help='[deprecated] alias of --title', default=False)
- filesystem.add_option('-w', '--no-overwrites',
- action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
- filesystem.add_option('-c', '--continue',
- action='store_true', dest='continue_dl', help='force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible.', default=True)
- filesystem.add_option('--no-continue',
- action='store_false', dest='continue_dl',
- help='do not resume partially downloaded files (restart from beginning)')
- filesystem.add_option('--no-part',
- action='store_true', dest='nopart', help='do not use .part files', default=False)
- filesystem.add_option('--no-mtime',
- action='store_false', dest='updatetime',
- help='do not use the Last-modified header to set the file modification time', default=True)
- filesystem.add_option('--write-description',
- action='store_true', dest='writedescription',
- help='write video description to a .description file', default=False)
- filesystem.add_option('--write-info-json',
- action='store_true', dest='writeinfojson',
- help='write video metadata to a .info.json file', default=False)
- filesystem.add_option('--write-annotations',
- action='store_true', dest='writeannotations',
- help='write video annotations to a .annotation file', default=False)
- filesystem.add_option('--write-thumbnail',
- action='store_true', dest='writethumbnail',
- help='write thumbnail image to disk', default=False)
- filesystem.add_option('--load-info',
- dest='load_info_filename', metavar='FILE',
- help='json file containing the video information (created with the "--write-json" option)')
- filesystem.add_option('--cookies',
- dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
- filesystem.add_option(
- '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
- help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.')
- filesystem.add_option(
- '--no-cache-dir', action='store_const', const=None, dest='cachedir',
- help='Disable filesystem caching')
- filesystem.add_option(
- '--rm-cache-dir', action='store_true', dest='rm_cachedir',
- help='Delete all filesystem cache files')
-
-
- postproc.add_option('-x', '--extract-audio', action='store_true', dest='extractaudio', default=False,
- help='convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)')
- postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
- help='"best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; best by default')
- postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='5',
- help='ffmpeg/avconv audio quality specification, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5)')
- postproc.add_option('--recode-video', metavar='FORMAT', dest='recodevideo', default=None,
- help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)')
- postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
- help='keeps the video file on disk after the post-processing; the video is erased by default')
- postproc.add_option('--no-post-overwrites', action='store_true', dest='nopostoverwrites', default=False,
- help='do not overwrite post-processed files; the post-processed files are overwritten by default')
- postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False,
- help='embed subtitles in the video (only for mp4 videos)')
- postproc.add_option('--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False,
- help='embed thumbnail in the audio as cover art')
- postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False,
- help='write metadata to the video file')
- postproc.add_option('--xattrs', action='store_true', dest='xattrs', default=False,
- help='write metadata to the video file\'s xattrs (using dublin core and xdg standards)')
- postproc.add_option('--prefer-avconv', action='store_false', dest='prefer_ffmpeg',
- help='Prefer avconv over ffmpeg for running the postprocessors (default)')
- postproc.add_option('--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg',
- help='Prefer ffmpeg over avconv for running the postprocessors')
- postproc.add_option(
- '--exec', metavar='CMD', dest='exec_cmd',
- help='Execute a command on the file after downloading, similar to find\'s -exec syntax. Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'' )
-
- parser.add_option_group(general)
- parser.add_option_group(selection)
- parser.add_option_group(downloader)
- parser.add_option_group(filesystem)
- parser.add_option_group(verbosity)
- parser.add_option_group(workarounds)
- parser.add_option_group(video_format)
- parser.add_option_group(subtitles)
- parser.add_option_group(authentication)
- parser.add_option_group(postproc)
-
- if overrideArguments is not None:
- opts, args = parser.parse_args(overrideArguments)
- if opts.verbose:
- write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n')
- else:
- commandLineConf = sys.argv[1:]
- if '--ignore-config' in commandLineConf:
- systemConf = []
- userConf = []
- else:
- systemConf = _readOptions('/etc/youtube-dl.conf')
- if '--ignore-config' in systemConf:
- userConf = []
- else:
- userConf = _readUserConf()
- argv = systemConf + userConf + commandLineConf
-
- opts, args = parser.parse_args(argv)
- if opts.verbose:
- write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')
- write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')
- write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')
-
- return parser, opts, args
-
-
def _real_main(argv=None):
# Compatibility fixes for Windows
if sys.platform == 'win32':
@@ -872,20 +401,7 @@ def _real_main(argv=None):
# Remove cache dir
if opts.rm_cachedir:
- if opts.cachedir is None:
- ydl.to_screen(u'No cache dir specified (Did you combine --no-cache-dir and --rm-cache-dir?)')
- else:
- if ('.cache' not in opts.cachedir) or ('youtube-dl' not in opts.cachedir):
- ydl.to_screen(u'Not removing directory %s - this does not look like a cache dir')
- retcode = 141
- else:
- ydl.to_screen(
- u'Removing cache dir %s .' % opts.cachedir,
- skip_eol=True)
- if os.path.exists(opts.cachedir):
- ydl.to_screen(u'.', skip_eol=True)
- shutil.rmtree(opts.cachedir)
- ydl.to_screen(u'.')
+ ydl.cache.remove()
# Maybe do nothing
if (len(all_urls) < 1) and (opts.load_info_filename is None):
diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py
new file mode 100644
index 000000000..79ff09f78
--- /dev/null
+++ b/youtube_dl/cache.py
@@ -0,0 +1,94 @@
+from __future__ import unicode_literals
+
+import errno
+import io
+import json
+import os
+import re
+import shutil
+import traceback
+
+from .utils import (
+ write_json_file,
+)
+
+
+class Cache(object):
+ def __init__(self, ydl):
+ self._ydl = ydl
+
+ def _get_root_dir(self):
+ res = self._ydl.params.get('cachedir')
+ if res is None:
+ cache_root = os.environ.get('XDG_CACHE_HOME', '~/.cache')
+ res = os.path.join(cache_root, 'youtube-dl')
+ return os.path.expanduser(res)
+
+ def _get_cache_fn(self, section, key, dtype):
+ assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \
+ 'invalid section %r' % section
+ assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key
+ return os.path.join(
+ self._get_root_dir(), section, '%s.%s' % (key, dtype))
+
+ @property
+ def enabled(self):
+ return self._ydl.params.get('cachedir') is not False
+
+ def store(self, section, key, data, dtype='json'):
+ assert dtype in ('json',)
+
+ if not self.enabled:
+ return
+
+ fn = self._get_cache_fn(section, key, dtype)
+ try:
+ try:
+ os.makedirs(os.path.dirname(fn))
+ except OSError as ose:
+ if ose.errno != errno.EEXIST:
+ raise
+ write_json_file(data, fn)
+ except Exception:
+ tb = traceback.format_exc()
+ self._ydl.report_warning(
+ 'Writing cache to %r failed: %s' % (fn, tb))
+
+ def load(self, section, key, dtype='json', default=None):
+ assert dtype in ('json',)
+
+ if not self.enabled:
+ return default
+
+ cache_fn = self._get_cache_fn(section, key, dtype)
+ try:
+ try:
+ with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
+ return json.load(cachef)
+ except ValueError:
+ try:
+ file_size = os.path.getsize(cache_fn)
+ except (OSError, IOError) as oe:
+ file_size = str(oe)
+ self._ydl.report_warning(
+ 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size))
+ except IOError:
+ pass # No cache available
+
+ return default
+
+ def remove(self):
+ if not self.enabled:
+ self._ydl.to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)')
+ return
+
+ cachedir = self._get_root_dir()
+ if not any((term in cachedir) for term in ('cache', 'tmp')):
+ raise Exception('Not removing directory %s - this does not look like a cache dir' % cachedir)
+
+ self._ydl.to_screen(
+ 'Removing cache dir %s .' % cachedir, skip_eol=True)
+ if os.path.exists(cachedir):
+ self._ydl.to_screen('.', skip_eol=True)
+ shutil.rmtree(cachedir)
+ self._ydl.to_screen('.')
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 9f29e2f81..32852f333 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -3,6 +3,7 @@ import subprocess
from .common import FileDownloader
from ..utils import (
+ check_executable,
encodeFilename,
)
@@ -19,13 +20,11 @@ class HlsFD(FileDownloader):
encodeFilename(tmpfilename, for_subprocess=True)]
for program in ['avconv', 'ffmpeg']:
- try:
- subprocess.call([program, '-version'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
+ if check_executable(program, ['-version']):
break
- except (OSError, IOError):
- pass
else:
self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found. Please install one.')
+ return False
cmd = [program] + args
retval = subprocess.call(cmd)
@@ -42,5 +41,5 @@ class HlsFD(FileDownloader):
return True
else:
self.to_stderr(u"\n")
- self.report_error(u'ffmpeg exited with code %d' % retval)
+ self.report_error(u'%s exited with code %d' % (program, retval))
return False
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index d01d1897e..6caf7451e 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -193,7 +193,8 @@ class HttpFD(FileDownloader):
self.to_stderr(u"\n")
self.report_error(u'Did not get any data blocks')
return False
- stream.close()
+ if tmpfilename != u'-':
+ stream.close()
self.report_finish(data_len_str, (time.time() - start))
if data_len is not None and byte_counter != data_len:
raise ContentTooShortError(byte_counter, int(data_len))
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
index 68646709a..5eb108302 100644
--- a/youtube_dl/downloader/rtmp.py
+++ b/youtube_dl/downloader/rtmp.py
@@ -8,9 +8,10 @@ import time
from .common import FileDownloader
from ..utils import (
+ check_executable,
+ compat_str,
encodeFilename,
format_bytes,
- compat_str,
)
@@ -103,9 +104,7 @@ class RtmpFD(FileDownloader):
test = self.params.get('test', False)
# Check for rtmpdump first
- try:
- subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
- except (OSError, IOError):
+ if not check_executable('rtmpdump', ['-h']):
self.report_error('RTMP download detected but "rtmpdump" could not be run. Please install it.')
return False
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 7f0736ee8..4b83d8d99 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -4,12 +4,13 @@ from .addanime import AddAnimeIE
from .adultswim import AdultSwimIE
from .aftonbladet import AftonbladetIE
from .anitube import AnitubeIE
+from .anysex import AnySexIE
from .aol import AolIE
from .allocine import AllocineIE
from .aparat import AparatIE
from .appletrailers import AppleTrailersIE
from .archiveorg import ArchiveOrgIE
-from .ard import ARDIE
+from .ard import ARDIE, ARDMediathekIE
from .arte import (
ArteTvIE,
ArteTVPlus7IE,
@@ -23,6 +24,7 @@ from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbccouk import BBCCoUkIE
+from .beeg import BeegIE
from .bilibili import BiliBiliIE
from .blinkx import BlinkxIE
from .bliptv import BlipTVIE, BlipTVUserIE
@@ -44,6 +46,7 @@ from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
+from .cloudy import CloudyIE
from .clubic import ClubicIE
from .cmt import CMTIE
from .cnet import CNETIE
@@ -65,9 +68,12 @@ from .dailymotion import (
DailymotionUserIE,
)
from .daum import DaumIE
+from .dbtv import DBTVIE
+from .deezer import DeezerPlaylistIE
from .dfb import DFBIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
+from .drtuber import DrTuberIE
from .drtv import DRTVIE
from .dump import DumpIE
from .defense import DefenseGouvFrIE
@@ -84,8 +90,9 @@ from .ellentv import (
EllenTVClipsIE,
)
from .elpais import ElPaisIE
-from .empflix import EmpflixIE
+from .empflix import EMPFlixIE
from .engadget import EngadgetIE
+from .eporner import EpornerIE
from .escapist import EscapistIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
@@ -135,6 +142,8 @@ from .grooveshark import GroovesharkIE
from .hark import HarkIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
+from .hornbunny import HornBunnyIE
+from .hostingbulk import HostingBulkIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
@@ -195,6 +204,7 @@ from .mitele import MiTeleIE
from .mixcloud import MixcloudIE
from .mlb import MLBIE
from .mpora import MporaIE
+from .moevideo import MoeVideoIE
from .mofosex import MofosexIE
from .mojvideo import MojvideoIE
from .mooshare import MooshareIE
@@ -210,6 +220,7 @@ from .mtv import (
MTVIggyIE,
)
from .musicplayon import MusicPlayOnIE
+from .musicvault import MusicVaultIE
from .muzu import MuzuTVIE
from .myspace import MySpaceIE
from .myspass import MySpassIE
@@ -230,6 +241,7 @@ from .niconico import NiconicoIE
from .ninegag import NineGagIE
from .noco import NocoIE
from .normalboots import NormalbootsIE
+from .nosvideo import NosVideoIE
from .novamov import NovaMovIE
from .nowness import NownessIE
from .nowvideo import NowVideoIE
@@ -257,6 +269,8 @@ from .podomatic import PodomaticIE
from .pornhd import PornHdIE
from .pornhub import PornHubIE
from .pornotube import PornotubeIE
+from .pornoxo import PornoXOIE
+from .promptfile import PromptFileIE
from .prosiebensat1 import ProSiebenSat1IE
from .pyvideo import PyvideoIE
from .radiofrance import RadioFranceIE
@@ -288,6 +302,7 @@ from .scivee import SciVeeIE
from .screencast import ScreencastIE
from .servingsys import ServingSysIE
from .shared import SharedIE
+from .sharesix import ShareSixIE
from .sina import SinaIE
from .slideshare import SlideshareIE
from .slutload import SlutloadIE
@@ -313,13 +328,15 @@ from .southpark import (
)
from .space import SpaceIE
from .spankwire import SpankwireIE
-from .spiegel import SpiegelIE
+from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
+from .sportdeutschland import SportDeutschlandIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
+from .sunporno import SunPornoIE
from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
@@ -332,6 +349,7 @@ from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
+from .telemb import TeleMBIE
from .tenplay import TenPlayIE
from .testurl import TestURLIE
from .tf1 import TF1IE
@@ -339,6 +357,7 @@ from .theplatform import ThePlatformIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
+from .tnaflix import TNAFlixIE
from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
@@ -347,6 +366,7 @@ from .trutube import TruTubeIE
from .tube8 import Tube8IE
from .tudou import TudouIE
from .tumblr import TumblrIE
+from .turbo import TurboIE
from .tutv import TutvIE
from .tvigle import TvigleIE
from .tvp import TvpIE
@@ -364,6 +384,7 @@ from .veehd import VeeHDIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import VevoIE
+from .vgtv import VGTVIE
from .vh1 import VH1IE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
@@ -391,6 +412,7 @@ from .vine import (
from .viki import VikiIE
from .vk import VKIE
from .vodlocker import VodlockerIE
+from .vporn import VpornIE
from .vube import VubeIE
from .vuclip import VuClipIE
from .vulture import VultureIE
diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py
index 59d3bbba4..c983ef0f5 100644
--- a/youtube_dl/extractor/academicearth.py
+++ b/youtube_dl/extractor/academicearth.py
@@ -7,6 +7,15 @@ from .common import InfoExtractor
class AcademicEarthCourseIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
IE_NAME = 'AcademicEarth:Course'
+ _TEST = {
+ 'url': 'http://academicearth.org/playlists/laws-of-nature/',
+ 'info_dict': {
+ 'id': 'laws-of-nature',
+ 'title': 'Laws of Nature',
+ 'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.',
+ },
+ 'playlist_count': 4,
+ }
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index a00bfcb35..b4b40f2d4 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -75,7 +75,9 @@ class AdultSwimIE(InfoExtractor):
video_path = mobj.group('path')
webpage = self._download_webpage(url, video_path)
- episode_id = self._html_search_regex(r'<link rel="video_src" href="http://i\.adultswim\.com/adultswim/adultswimtv/tools/swf/viralplayer.swf\?id=([0-9a-f]+?)"\s*/?\s*>', webpage, 'episode_id')
+ episode_id = self._html_search_regex(
+ r'<link rel="video_src" href="http://i\.adultswim\.com/adultswim/adultswimtv/tools/swf/viralplayer.swf\?id=([0-9a-f]+?)"\s*/?\s*>',
+ webpage, 'episode_id')
title = self._og_search_title(webpage)
index_url = 'http://asfix.adultswim.com/asfix-svc/episodeSearch/getEpisodesByIDs?networkName=AS&ids=%s' % episode_id
@@ -97,7 +99,9 @@ class AdultSwimIE(InfoExtractor):
duration = segment_el.attrib.get('duration')
segment_url = 'http://asfix.adultswim.com/asfix-svc/episodeservices/getCvpPlaylist?networkName=AS&id=%s' % segment_id
- idoc = self._download_xml(segment_url, segment_title, 'Downloading segment information', 'Unable to download segment information')
+ idoc = self._download_xml(
+ segment_url, segment_title,
+ 'Downloading segment information', 'Unable to download segment information')
formats = []
file_els = idoc.findall('.//files/file')
diff --git a/youtube_dl/extractor/anysex.py b/youtube_dl/extractor/anysex.py
new file mode 100644
index 000000000..bc64423a3
--- /dev/null
+++ b/youtube_dl/extractor/anysex.py
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+)
+
+
+class AnySexIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?anysex\.com/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://anysex.com/156592/',
+ 'md5': '023e9fbb7f7987f5529a394c34ad3d3d',
+ 'info_dict': {
+ 'id': '156592',
+ 'ext': 'mp4',
+ 'title': 'Busty and sexy blondie in her bikini strips for you',
+ 'description': 'md5:de9e418178e2931c10b62966474e1383',
+ 'categories': ['Erotic'],
+ 'duration': 270,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._html_search_regex(r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')
+
+ title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
+ description = self._html_search_regex(
+ r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
+ thumbnail = self._html_search_regex(
+ r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False)
+
+ categories = re.findall(
+ r'<a href="http://anysex\.com/categories/[^"]+" title="[^"]*">([^<]+)</a>', webpage)
+
+ duration = parse_duration(self._search_regex(
+ r'<b>Duration:</b> (\d+:\d+)', webpage, 'duration', fatal=False))
+ view_count = int_or_none(self._html_search_regex(
+ r'<b>Views:</b> (\d+)', webpage, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
index a7bfe5a5c..47f8e4157 100644
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -21,7 +21,7 @@ class AolIE(InfoExtractor):
(?:$|\?)
'''
- _TEST = {
+ _TESTS = [{
'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
'md5': '18ef68f48740e86ae94b98da815eec42',
'info_dict': {
@@ -30,7 +30,14 @@ class AolIE(InfoExtractor):
'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
},
'add_ie': ['FiveMin'],
- }
+ }, {
+ 'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316',
+ 'info_dict': {
+ 'id': '152147',
+ 'title': 'Brace Yourself - Today\'s Weirdest News',
+ },
+ 'playlist_mincount': 10,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 7f0da8ab6..12457f0f9 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -10,10 +10,15 @@ from ..utils import (
qualities,
compat_urllib_parse_urlparse,
compat_urllib_parse,
+ int_or_none,
+ parse_duration,
+ unified_strdate,
+ xpath_text,
)
-class ARDIE(InfoExtractor):
+class ARDMediathekIE(InfoExtractor):
+ IE_NAME = 'ARD:mediathek'
_VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
_TESTS = [{
@@ -128,3 +133,61 @@ class ARDIE(InfoExtractor):
'formats': formats,
'thumbnail': thumbnail,
}
+
+
+class ARDIE(InfoExtractor):
+ _VALID_URL = '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+ _TEST = {
+ 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
+ 'md5': 'd216c3a86493f9322545e045ddc3eb35',
+ 'info_dict': {
+ 'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
+ 'id': '100',
+ 'ext': 'mp4',
+ 'duration': 2600,
+ 'title': 'Die Story im Ersten: Mission unter falscher Flagge',
+ 'upload_date': '20140804',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
+
+ player_url = mobj.group('mainurl') + '~playerXml.xml'
+ doc = self._download_xml(player_url, display_id)
+ video_node = doc.find('./video')
+ upload_date = unified_strdate(xpath_text(
+ video_node, './broadcastDate'))
+ thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
+
+ formats = []
+ for a in video_node.findall('.//asset'):
+ f = {
+ 'format_id': a.attrib['type'],
+ 'width': int_or_none(a.find('./frameWidth').text),
+ 'height': int_or_none(a.find('./frameHeight').text),
+ 'vbr': int_or_none(a.find('./bitrateVideo').text),
+ 'abr': int_or_none(a.find('./bitrateAudio').text),
+ 'vcodec': a.find('./codecVideo').text,
+ 'tbr': int_or_none(a.find('./totalBitrate').text),
+ }
+ if a.find('./serverPrefix').text:
+ f['url'] = a.find('./serverPrefix').text
+ f['playpath'] = a.find('./fileName').text
+ else:
+ f['url'] = a.find('./fileName').text
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': mobj.group('id'),
+ 'formats': formats,
+ 'display_id': display_id,
+ 'title': video_node.find('./title').text,
+ 'duration': parse_duration(video_node.find('./duration').text),
+ 'upload_date': upload_date,
+ 'thumbnail': thumbnail,
+ }
+
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 1c72b2ff6..957d35979 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -78,7 +78,8 @@ class ArteTVPlus7IE(InfoExtractor):
def _extract_from_webpage(self, webpage, video_id, lang):
json_url = self._html_search_regex(
- r'arte_vp_url="(.*?)"', webpage, 'json vp url')
+ [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'],
+ webpage, 'json vp url')
return self._extract_from_json_url(json_url, video_id, lang)
def _extract_from_json_url(self, json_url, video_id, lang):
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py
index ccd31c4c7..de5d4faf3 100644
--- a/youtube_dl/extractor/bambuser.py
+++ b/youtube_dl/extractor/bambuser.py
@@ -59,6 +59,13 @@ class BambuserChannelIE(InfoExtractor):
_VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
# The maximum number we can get with each request
_STEP = 50
+ _TEST = {
+ 'url': 'http://bambuser.com/channel/pixelversity',
+ 'info_dict': {
+ 'title': 'pixelversity',
+ },
+ 'playlist_mincount': 60,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -73,10 +80,10 @@ class BambuserChannelIE(InfoExtractor):
req = compat_urllib_request.Request(req_url)
# Without setting this header, we wouldn't get any result
req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
- info_json = self._download_webpage(req, user,
- 'Downloading page %d' % i)
- results = json.loads(info_json)['result']
- if len(results) == 0:
+ data = self._download_json(
+ req, user, 'Downloading page %d' % i)
+ results = data['result']
+ if not results:
break
last_id = results[-1]['vid']
urls.extend(self.url_result(v['page'], 'Bambuser') for v in results)
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index dcbbdef43..c569aa4d2 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -96,7 +96,7 @@ class BandcampAlbumIE(InfoExtractor):
IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))'
- _TEST = {
+ _TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
'playlist': [
{
@@ -118,7 +118,13 @@ class BandcampAlbumIE(InfoExtractor):
'playlistend': 2
},
'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
- }
+ }, {
+ 'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
+ 'info_dict': {
+ 'title': 'Hierophany of the Open Grave',
+ },
+ 'playlist_mincount': 9,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
new file mode 100644
index 000000000..314e37f8b
--- /dev/null
+++ b/youtube_dl/extractor/beeg.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class BeegIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://beeg.com/5416503',
+ 'md5': '634526ae978711f6b748fe0dd6c11f57',
+ 'info_dict': {
+ 'id': '5416503',
+ 'ext': 'mp4',
+ 'title': 'Sultry Striptease',
+ 'description': 'md5:6db3c6177972822aaba18652ff59c773',
+ 'categories': list, # NSFW
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ quality_arr = self._search_regex(
+ r'(?s)var\s+qualityArr\s*=\s*{\s*(.+?)\s*}', webpage, 'quality formats')
+
+ formats = [{
+ 'url': fmt[1],
+ 'format_id': fmt[0],
+ 'height': int(fmt[0][:-1]),
+ } for fmt in re.findall(r"'([^']+)'\s*:\s*'([^']+)'", quality_arr)]
+
+ self._sort_formats(formats)
+
+ title = self._html_search_regex(
+ r'<title>([^<]+)\s*-\s*beeg\.?</title>', webpage, 'title')
+
+ description = self._html_search_regex(
+ r'<meta name="description" content="([^"]*)"',
+ webpage, 'description', fatal=False)
+ thumbnail = self._html_search_regex(
+ r'\'previewer.url\'\s*:\s*"([^"]*)"',
+ webpage, 'thumbnail', fatal=False)
+
+ categories_str = self._html_search_regex(
+ r'<meta name="keywords" content="([^"]+)"', webpage, 'categories', fatal=False)
+ categories = (
+ None if categories_str is None
+ else categories_str.split(','))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'formats': formats,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index 86f0c2861..4e2960c62 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -29,17 +29,6 @@ class BRIE(InfoExtractor):
}
},
{
- 'url': 'http://www.br.de/mediathek/video/sendungen/unter-unserem-himmel/unter-unserem-himmel-alpen-ueber-den-pass-100.html',
- 'md5': 'ab451b09d861dbed7d7cc9ab0be19ebe',
- 'info_dict': {
- 'id': '2c060e69-3a27-4e13-b0f0-668fac17d812',
- 'ext': 'mp4',
- 'title': 'Über den Pass',
- 'description': 'Die Eroberung der Alpen: Über den Pass',
- 'duration': 2588,
- }
- },
- {
'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html',
'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820',
'info_dict': {
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 822f9a7be..db48dc24f 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -25,7 +25,7 @@ class CBSIE(InfoExtractor):
}, {
'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
'info_dict': {
- 'id': 'P9gjWjelt6iP',
+ 'id': 'WWF_5KqY3PK1',
'ext': 'flv',
'title': 'Live on Letterman - St. Vincent',
'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py
index a62395d4b..c922f6959 100644
--- a/youtube_dl/extractor/chilloutzone.py
+++ b/youtube_dl/extractor/chilloutzone.py
@@ -42,7 +42,7 @@ class ChilloutzoneIE(InfoExtractor):
'id': '85523671',
'ext': 'mp4',
'title': 'The Sunday Times - Icons',
- 'description': 'md5:a5f7ff82e2f7a9ed77473fe666954e84',
+ 'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}',
'uploader': 'Us',
'uploader_id': 'usfilms',
'upload_date': '20140131'
diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py
index 58846e8e7..65c12136a 100644
--- a/youtube_dl/extractor/cliphunter.py
+++ b/youtube_dl/extractor/cliphunter.py
@@ -1,11 +1,13 @@
from __future__ import unicode_literals
+import json
import re
from .common import InfoExtractor
+from ..utils import int_or_none
-translation_table = {
+_translation_table = {
'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n',
'm': 'a', 'n': 'm', 'p': 'u', 'q': 't', 'r': 's', 'v': 'p', 'x': 'r',
'y': 'l', 'z': 'i',
@@ -13,6 +15,10 @@ translation_table = {
}
+def _decode(s):
+ return ''.join(_translation_table.get(c, c) for c in s)
+
+
class CliphunterIE(InfoExtractor):
IE_NAME = 'cliphunter'
@@ -22,10 +28,14 @@ class CliphunterIE(InfoExtractor):
'''
_TEST = {
'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo',
- 'file': '1012420.flv',
- 'md5': '15e7740f30428abf70f4223478dc1225',
+ 'md5': 'a2ba71eebf523859fe527a61018f723e',
'info_dict': {
+ 'id': '1012420',
+ 'ext': 'mp4',
'title': 'Fun Jynx Maze solo',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'age_limit': 18,
+ 'duration': 1317,
}
}
@@ -35,22 +45,55 @@ class CliphunterIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
+ video_title = self._search_regex(
+ r'mediaTitle = "([^"]+)"', webpage, 'title')
+
pl_fiji = self._search_regex(
r'pl_fiji = \'([^\']+)\'', webpage, 'video data')
pl_c_qual = self._search_regex(
r'pl_c_qual = "(.)"', webpage, 'video quality')
- video_title = self._search_regex(
- r'mediaTitle = "([^"]+)"', webpage, 'title')
-
- video_url = ''.join(translation_table.get(c, c) for c in pl_fiji)
-
+ video_url = _decode(pl_fiji)
formats = [{
'url': video_url,
- 'format_id': pl_c_qual,
+ 'format_id': 'default-%s' % pl_c_qual,
}]
+ qualities_json = self._search_regex(
+ r'var pl_qualities\s*=\s*(.*?);\n', webpage, 'quality info')
+ qualities_data = json.loads(qualities_json)
+
+ for i, t in enumerate(
+ re.findall(r"pl_fiji_([a-z0-9]+)\s*=\s*'([^']+')", webpage)):
+ quality_id, crypted_url = t
+ video_url = _decode(crypted_url)
+ f = {
+ 'format_id': quality_id,
+ 'url': video_url,
+ 'quality': i,
+ }
+ if quality_id in qualities_data:
+ qd = qualities_data[quality_id]
+ m = re.match(
+ r'''(?x)<b>(?P<width>[0-9]+)x(?P<height>[0-9]+)<\\/b>
+ \s*\(\s*(?P<tbr>[0-9]+)\s*kb\\/s''', qd)
+ if m:
+ f['width'] = int(m.group('width'))
+ f['height'] = int(m.group('height'))
+ f['tbr'] = int(m.group('tbr'))
+ formats.append(f)
+ self._sort_formats(formats)
+
+ thumbnail = self._search_regex(
+ r"var\s+mov_thumb\s*=\s*'([^']+)';",
+ webpage, 'thumbnail', fatal=False)
+ duration = int_or_none(self._search_regex(
+ r'pl_dur\s*=\s*([0-9]+)', webpage, 'duration', fatal=False))
+
return {
'id': video_id,
'title': video_title,
'formats': formats,
+ 'duration': duration,
+ 'age_limit': self._rta_search(webpage),
+ 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
new file mode 100644
index 000000000..386f080d2
--- /dev/null
+++ b/youtube_dl/extractor/cloudy.py
@@ -0,0 +1,108 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ compat_parse_qs,
+ compat_urllib_parse,
+ remove_end,
+ HEADRequest,
+ compat_HTTPError,
+)
+
+
+class CloudyIE(InfoExtractor):
+ _IE_DESC = 'cloudy.ec and videoraj.ch'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.ch)/
+ (?:v/|embed\.php\?id=)
+ (?P<id>[A-Za-z0-9]+)
+ '''
+ _EMBED_URL = 'http://www.%s/embed.php?id=%s'
+ _API_URL = 'http://www.%s/api/player.api.php?%s'
+ _MAX_TRIES = 2
+ _TESTS = [
+ {
+ 'url': 'https://www.cloudy.ec/v/af511e2527aac',
+ 'md5': '5cb253ace826a42f35b4740539bedf07',
+ 'info_dict': {
+ 'id': 'af511e2527aac',
+ 'ext': 'flv',
+ 'title': 'Funny Cats and Animals Compilation june 2013',
+ }
+ },
+ {
+ 'url': 'http://www.videoraj.ch/v/47f399fd8bb60',
+ 'md5': '7d0f8799d91efd4eda26587421c3c3b0',
+ 'info_dict': {
+ 'id': '47f399fd8bb60',
+ 'ext': 'flv',
+ 'title': 'Burning a New iPhone 5 with Gasoline - Will it Survive?',
+ }
+ }
+ ]
+
+ def _extract_video(self, video_host, video_id, file_key, error_url=None, try_num=0):
+
+ if try_num > self._MAX_TRIES - 1:
+ raise ExtractorError('Unable to extract video URL', expected=True)
+
+ form = {
+ 'file': video_id,
+ 'key': file_key,
+ }
+
+ if error_url:
+ form.update({
+ 'numOfErrors': try_num,
+ 'errorCode': '404',
+ 'errorUrl': error_url,
+ })
+
+ data_url = self._API_URL % (video_host, compat_urllib_parse.urlencode(form))
+ player_data = self._download_webpage(
+ data_url, video_id, 'Downloading player data')
+ data = compat_parse_qs(player_data)
+
+ try_num += 1
+
+ if 'error' in data:
+ raise ExtractorError(
+ '%s error: %s' % (self.IE_NAME, ' '.join(data['error_msg'])),
+ expected=True)
+
+ title = data.get('title', [None])[0]
+ if title:
+ title = remove_end(title, '&asdasdas').strip()
+
+ video_url = data.get('url', [None])[0]
+
+ if video_url:
+ try:
+ self._request_webpage(HEADRequest(video_url), video_id, 'Checking video URL')
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]:
+ self.report_warning('Invalid video URL, requesting another', video_id)
+ return self._extract_video(video_host, video_id, file_key, video_url, try_num)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_host = mobj.group('host')
+ video_id = mobj.group('id')
+
+ url = self._EMBED_URL % (video_host, video_id)
+ webpage = self._download_webpage(url, video_id)
+
+ file_key = self._search_regex(
+ r'filekey\s*=\s*"([^"]+)"', webpage, 'file_key')
+
+ return self._extract_video(video_host, video_id, file_key)
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index c81ce5a96..035046120 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -43,14 +43,14 @@ class ComedyCentralShowsIE(InfoExtractor):
(?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
(?P<clip>
- (?:(?:guests/[^/]+|videos|video-playlists|special-editions)/[^/]+/(?P<videotitle>[^/?#]+))
+ (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
|(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
)|
(?P<interview>
extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
(?:[?#].*|$)'''
- _TEST = {
+ _TESTS = [{
'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
'info_dict': {
@@ -61,7 +61,34 @@ class ComedyCentralShowsIE(InfoExtractor):
'uploader': 'thedailyshow',
'title': 'thedailyshow kristen-stewart part 1',
}
- }
+ }, {
+ 'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel',
+ 'only_matching': True,
+ }]
_available_formats = ['3500', '2200', '1700', '1200', '750', '400']
@@ -185,6 +212,9 @@ class ComedyCentralShowsIE(InfoExtractor):
'ext': self._video_extensions.get(format, 'mp4'),
'height': h,
'width': w,
+
+ 'format_note': 'HTTP 400 at the moment (patches welcome!)',
+ 'preference': -100,
})
formats.append({
'format_id': 'rtmp-%s' % format,
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 69d5f687c..929dd1e97 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import base64
import hashlib
import json
@@ -114,7 +116,7 @@ class InfoExtractor(object):
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
- location: Physical location of the video.
+ location: Physical location where the video was filmed.
subtitles: The subtitle file contents as a dictionary in the format
{language: subtitles}.
duration: Length of the video in seconds, as an integer.
@@ -202,17 +204,17 @@ class InfoExtractor(object):
self.report_download_webpage(video_id)
elif note is not False:
if video_id is None:
- self.to_screen(u'%s' % (note,))
+ self.to_screen('%s' % (note,))
else:
- self.to_screen(u'%s: %s' % (video_id, note))
+ self.to_screen('%s: %s' % (video_id, note))
try:
return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is False:
return False
if errnote is None:
- errnote = u'Unable to download webpage'
- errmsg = u'%s: %s' % (errnote, compat_str(err))
+ errnote = 'Unable to download webpage'
+ errmsg = '%s: %s' % (errnote, compat_str(err))
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
else:
@@ -249,7 +251,7 @@ class InfoExtractor(object):
url = url_or_request.get_full_url()
except AttributeError:
url = url_or_request
- self.to_screen(u'Dumping request to ' + url)
+ self.to_screen('Dumping request to ' + url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self._downloader.params.get('write_pages', False):
@@ -259,11 +261,11 @@ class InfoExtractor(object):
url = url_or_request
basen = '%s_%s' % (video_id, url)
if len(basen) > 240:
- h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+ h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
basen = basen[:240 - len(h)] + h
raw_filename = basen + '.dump'
filename = sanitize_filename(raw_filename, restricted=True)
- self.to_screen(u'Saving request to ' + filename)
+ self.to_screen('Saving request to ' + filename)
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -272,14 +274,14 @@ class InfoExtractor(object):
except LookupError:
content = webpage_bytes.decode('utf-8', 'replace')
- if (u'<title>Access to this site is blocked</title>' in content and
- u'Websense' in content[:512]):
- msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
+ if ('<title>Access to this site is blocked</title>' in content and
+ 'Websense' in content[:512]):
+ msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
blocked_iframe = self._html_search_regex(
r'<iframe src="([^"]+)"', content,
- u'Websense information URL', default=None)
+ 'Websense information URL', default=None)
if blocked_iframe:
- msg += u' Visit %s for more details' % blocked_iframe
+ msg += ' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True)
return (content, urlh)
@@ -294,7 +296,7 @@ class InfoExtractor(object):
return content
def _download_xml(self, url_or_request, video_id,
- note=u'Downloading XML', errnote=u'Unable to download XML',
+ note='Downloading XML', errnote='Unable to download XML',
transform_source=None, fatal=True):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
@@ -306,8 +308,8 @@ class InfoExtractor(object):
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
- note=u'Downloading JSON metadata',
- errnote=u'Unable to download JSON metadata',
+ note='Downloading JSON metadata',
+ errnote='Unable to download JSON metadata',
transform_source=None,
fatal=True):
json_string = self._download_webpage(
@@ -322,29 +324,29 @@ class InfoExtractor(object):
raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
- idstr = u'' if video_id is None else u'%s: ' % video_id
+ idstr = '' if video_id is None else '%s: ' % video_id
self._downloader.report_warning(
- u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
+ '[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
- self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
+ self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
"""Report information extraction."""
- self.to_screen(u'%s: Extracting information' % id_or_name)
+ self.to_screen('%s: Extracting information' % id_or_name)
def report_download_webpage(self, video_id):
"""Report webpage download."""
- self.to_screen(u'%s: Downloading webpage' % video_id)
+ self.to_screen('%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
"""Report attempt to confirm age."""
- self.to_screen(u'Confirming age')
+ self.to_screen('Confirming age')
def report_login(self):
"""Report attempt to log in."""
- self.to_screen(u'Logging in')
+ self.to_screen('Logging in')
#Methods for following #608
@staticmethod
@@ -384,7 +386,7 @@ class InfoExtractor(object):
break
if os.name != 'nt' and sys.stderr.isatty():
- _name = u'\033[0;34m%s\033[0m' % name
+ _name = '\033[0;34m%s\033[0m' % name
else:
_name = name
@@ -394,10 +396,10 @@ class InfoExtractor(object):
elif default is not _NO_DEFAULT:
return default
elif fatal:
- raise RegexNotFoundError(u'Unable to extract %s' % _name)
+ raise RegexNotFoundError('Unable to extract %s' % _name)
else:
- self._downloader.report_warning(u'unable to extract %s; '
- u'please report this issue on http://yt-dl.org/bug' % _name)
+ self._downloader.report_warning('unable to extract %s; '
+ 'please report this issue on http://yt-dl.org/bug' % _name)
return None
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
@@ -436,7 +438,7 @@ class InfoExtractor(object):
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
+ self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
return (username, password)
@@ -476,7 +478,7 @@ class InfoExtractor(object):
return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
- return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
+ return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
@@ -535,7 +537,7 @@ class InfoExtractor(object):
def _sort_formats(self, formats):
if not formats:
- raise ExtractorError(u'No video formats found')
+ raise ExtractorError('No video formats found')
def _formats_key(f):
# TODO remove the following workaround
@@ -555,9 +557,9 @@ class InfoExtractor(object):
if f.get('vcodec') == 'none': # audio only
if self._downloader.params.get('prefer_free_formats'):
- ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
+ ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
else:
- ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
+ ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
ext_preference = 0
try:
audio_ext_preference = ORDER.index(f['ext'])
@@ -565,9 +567,9 @@ class InfoExtractor(object):
audio_ext_preference = -1
else:
if self._downloader.params.get('prefer_free_formats'):
- ORDER = [u'flv', u'mp4', u'webm']
+ ORDER = ['flv', 'mp4', 'webm']
else:
- ORDER = [u'webm', u'flv', u'mp4']
+ ORDER = ['webm', 'flv', 'mp4']
try:
ext_preference = ORDER.index(f['ext'])
except ValueError:
@@ -609,7 +611,7 @@ class InfoExtractor(object):
def _sleep(self, timeout, video_id, msg_template=None):
if msg_template is None:
- msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
+ msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
msg = msg_template % {'video_id': video_id, 'timeout': timeout}
self.to_screen(msg)
time.sleep(timeout)
@@ -636,6 +638,61 @@ class InfoExtractor(object):
return formats
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
    """Return a sorted list of format dicts built from an m3u8 playlist.

    The playlist at m3u8_url is downloaded and scanned line by line.  The
    attributes of each ``#EXT-X-STREAM-INF:`` line are applied to the next
    URI line: BANDWIDTH is converted with ``int_or_none(..., scale=1000)``
    into ``tbr``, CODECS is split into ``vcodec``/``acodec`` and
    RESOLUTION into ``width``/``height``.  A URI line that appears before
    any ``#EXT-X-STREAM-INF:`` line is added with only its URL.

    The returned list always contains an 'm3u8-meta' entry that points at
    m3u8_url itself; the list is passed through _sort_formats before it is
    returned.

    m3u8_url -- URL of the playlist to download.
    video_id -- identifier handed to _download_webpage.
    ext      -- value stored under 'ext' in the generated formats.
    """
    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    m3u8_doc = self._download_webpage(m3u8_url, video_id)
    # None until the first #EXT-X-STREAM-INF line; reset to {} after each
    # URI line so attributes never leak onto a later entry.
    last_info = None
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    # Drop the surrounding quotes of a quoted value
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            continue
        else:
            # Strip once so both branches below store the same clean URL
            media_url = line.strip()
            if last_info is None:
                formats.append({'url': media_url})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

            f = {
                # Fall back to the running index when no bitrate is known
                'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                'url': media_url,
                'tbr': tbr,
                'ext': ext,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: the video codec does not necessarily come first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
            last_info = {}
    self._sort_formats(formats)
    return formats
+
class SearchInfoExtractor(InfoExtractor):
"""
@@ -655,7 +712,7 @@ class SearchInfoExtractor(InfoExtractor):
def _real_extract(self, query):
mobj = re.match(self._make_valid_url(), query)
if mobj is None:
- raise ExtractorError(u'Invalid search query "%s"' % query)
+ raise ExtractorError('Invalid search query "%s"' % query)
prefix = mobj.group('prefix')
query = mobj.group('query')
@@ -666,9 +723,9 @@ class SearchInfoExtractor(InfoExtractor):
else:
n = int(prefix)
if n <= 0:
- raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
+ raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
elif n > self._MAX_RESULTS:
- self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
+ self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
n = self._MAX_RESULTS
return self._get_n_results(query, n)
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 026a9177e..4903764f7 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -5,6 +5,7 @@ import re
import json
import base64
import zlib
+import xml.etree.ElementTree
from hashlib import sha1
from math import pow, sqrt, floor
@@ -17,6 +18,7 @@ from ..utils import (
intlist_to_bytes,
unified_strdate,
clean_html,
+ urlencode_postdata,
)
from ..aes import (
aes_cbc_decrypt,
@@ -51,6 +53,26 @@ class CrunchyrollIE(InfoExtractor):
'1080': ('80', '108'),
}
def _login(self):
    """Post the configured credentials to the Crunchyroll form handler.

    Returns immediately when _get_login_info() yields no username.
    Otherwise the login is reported and a form-encoded request carrying
    the 'RpcApiUser_Login' form name, the username and the password is
    sent through _download_webpage.
    """
    username, password = self._get_login_info()
    if username is None:
        return

    self.report_login()

    form = urlencode_postdata({
        'formname': 'RpcApiUser_Login',
        'name': username,
        'password': password,
    })
    request = compat_urllib_request.Request(
        'https://www.crunchyroll.com/?a=formhandler', form)
    request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    # NOTE(review): the trailing positional arguments are presumably
    # note=False and errnote='Wrong login info' -- confirm against the
    # _download_webpage signature.
    self._download_webpage(request, None, False, 'Wrong login info')
+
+
def _real_initialize(self):
    """Initialisation hook: delegate to _login()."""
    self._login()
+
+
def _decrypt_subtitles(self, data, iv, id):
data = bytes_to_intlist(data)
iv = bytes_to_intlist(iv)
@@ -97,6 +119,75 @@ class CrunchyrollIE(InfoExtractor):
output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
return output
+ def _convert_subtitles_to_ass(self, subtitles):
+ output = ''
+
+ def ass_bool(strvalue):
+ assvalue = '0'
+ if strvalue == '1':
+ assvalue = '-1'
+ return assvalue
+
+ sub_root = xml.etree.ElementTree.fromstring(subtitles)
+ if not sub_root:
+ return output
+
+ output = '[Script Info]\n'
+ output += 'Title: %s\n' % sub_root.attrib["title"]
+ output += 'ScriptType: v4.00+\n'
+ output += 'WrapStyle: %s\n' % sub_root.attrib["wrap_style"]
+ output += 'PlayResX: %s\n' % sub_root.attrib["play_res_x"]
+ output += 'PlayResY: %s\n' % sub_root.attrib["play_res_y"]
+ output += """ScaledBorderAndShadow: yes
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+"""
+ for style in sub_root.findall('./styles/style'):
+ output += 'Style: ' + style.attrib["name"]
+ output += ',' + style.attrib["font_name"]
+ output += ',' + style.attrib["font_size"]
+ output += ',' + style.attrib["primary_colour"]
+ output += ',' + style.attrib["secondary_colour"]
+ output += ',' + style.attrib["outline_colour"]
+ output += ',' + style.attrib["back_colour"]
+ output += ',' + ass_bool(style.attrib["bold"])
+ output += ',' + ass_bool(style.attrib["italic"])
+ output += ',' + ass_bool(style.attrib["underline"])
+ output += ',' + ass_bool(style.attrib["strikeout"])
+ output += ',' + style.attrib["scale_x"]
+ output += ',' + style.attrib["scale_y"]
+ output += ',' + style.attrib["spacing"]
+ output += ',' + style.attrib["angle"]
+ output += ',' + style.attrib["border_style"]
+ output += ',' + style.attrib["outline"]
+ output += ',' + style.attrib["shadow"]
+ output += ',' + style.attrib["alignment"]
+ output += ',' + style.attrib["margin_l"]
+ output += ',' + style.attrib["margin_r"]
+ output += ',' + style.attrib["margin_v"]
+ output += ',' + style.attrib["encoding"]
+ output += '\n'
+
+ output += """
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+"""
+ for event in sub_root.findall('./events/event'):
+ output += 'Dialogue: 0'
+ output += ',' + event.attrib["start"]
+ output += ',' + event.attrib["end"]
+ output += ',' + event.attrib["style"]
+ output += ',' + event.attrib["name"]
+ output += ',' + event.attrib["margin_l"]
+ output += ',' + event.attrib["margin_r"]
+ output += ',' + event.attrib["margin_v"]
+ output += ',' + event.attrib["effect"]
+ output += ',' + event.attrib["text"]
+ output += '\n'
+
+ return output
+
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
@@ -158,6 +249,7 @@ class CrunchyrollIE(InfoExtractor):
})
subtitles = {}
+ sub_format = self._downloader.params.get('subtitlesformat', 'srt')
for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
sub_page = self._download_webpage('http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\
video_id, note='Downloading subtitles for '+sub_name)
@@ -174,7 +266,10 @@ class CrunchyrollIE(InfoExtractor):
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
- subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
+ if sub_format == 'ass':
+ subtitles[lang_code] = self._convert_subtitles_to_ass(subtitle)
+ else:
+ subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index b6552c542..541106684 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -34,6 +34,13 @@ class CSpanIE(InfoExtractor):
'title': 'International Health Care Models',
'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
}
+ }, {
+ 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
+ 'info_dict': {
+ 'id': '342759',
+ 'title': 'General Motors Ignition Switch Recall',
+ },
+ 'playlist_duration_sum': 14855,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 5d0bfe454..66a8f16d9 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -1,3 +1,6 @@
+#coding: utf-8
+from __future__ import unicode_literals
+
import re
import json
import itertools
@@ -28,51 +31,53 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
"""Information Extractor for Dailymotion"""
_VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
- IE_NAME = u'dailymotion'
+ IE_NAME = 'dailymotion'
_FORMATS = [
- (u'stream_h264_ld_url', u'ld'),
- (u'stream_h264_url', u'standard'),
- (u'stream_h264_hq_url', u'hq'),
- (u'stream_h264_hd_url', u'hd'),
- (u'stream_h264_hd1080_url', u'hd180'),
+ ('stream_h264_ld_url', 'ld'),
+ ('stream_h264_url', 'standard'),
+ ('stream_h264_hq_url', 'hq'),
+ ('stream_h264_hd_url', 'hd'),
+ ('stream_h264_hd1080_url', 'hd180'),
]
_TESTS = [
{
- u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
- u'file': u'x33vw9.mp4',
- u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
- u'info_dict': {
- u"uploader": u"Amphora Alex and Van .",
- u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
+ 'url': 'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
+ 'md5': '392c4b85a60a90dc4792da41ce3144eb',
+ 'info_dict': {
+ 'id': 'x33vw9',
+ 'ext': 'mp4',
+ 'uploader': 'Amphora Alex and Van .',
+ 'title': 'Tutoriel de Youtubeur"DL DES VIDEO DE YOUTUBE"',
}
},
# Vevo video
{
- u'url': u'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
- u'file': u'USUV71301934.mp4',
- u'info_dict': {
- u'title': u'Roar (Official)',
- u'uploader': u'Katy Perry',
- u'upload_date': u'20130905',
+ 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
+ 'info_dict': {
+ 'title': 'Roar (Official)',
+ 'id': 'USUV71301934',
+ 'ext': 'mp4',
+ 'uploader': 'Katy Perry',
+ 'upload_date': '20130905',
},
- u'params': {
- u'skip_download': True,
+ 'params': {
+ 'skip_download': True,
},
- u'skip': u'VEVO is only available in some countries',
+ 'skip': 'VEVO is only available in some countries',
},
# age-restricted video
{
- u'url': u'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
- u'file': u'xyh2zz.mp4',
- u'md5': u'0d667a7b9cebecc3c89ee93099c4159d',
- u'info_dict': {
- u'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
- u'uploader': 'HotWaves1012',
- u'age_limit': 18,
+ 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+ 'md5': '0d667a7b9cebecc3c89ee93099c4159d',
+ 'info_dict': {
+ 'id': 'xyh2zz',
+ 'ext': 'mp4',
+ 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+ 'uploader': 'HotWaves1012',
+ 'age_limit': 18,
}
-
}
]
@@ -97,8 +102,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
webpage)
if m_vevo is not None:
vevo_id = m_vevo.group('id')
- self.to_screen(u'Vevo video detected: %s' % vevo_id)
- return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo')
+ self.to_screen('Vevo video detected: %s' % vevo_id)
+ return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
age_limit = self._rta_search(webpage)
@@ -109,7 +114,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
embed_page = self._download_webpage(embed_url, video_id,
- u'Downloading embed page')
+ 'Downloading embed page')
info = self._search_regex(r'var info = ({.*?}),$', embed_page,
'video info', flags=re.MULTILINE)
info = json.loads(info)
@@ -134,7 +139,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
'height': height,
})
if not formats:
- raise ExtractorError(u'Unable to extract video URL')
+ raise ExtractorError('Unable to extract video URL')
# subtitles
video_subtitles = self.extract_subtitles(video_id, webpage)
@@ -143,7 +148,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
return
view_count = self._search_regex(
- r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False)
+ r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, 'view count', fatal=False)
if view_count is not None:
view_count = str_to_int(view_count)
@@ -165,28 +170,35 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
video_id, note=False)
except ExtractorError as err:
- self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
+ self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
return {}
info = json.loads(sub_list)
if (info['total'] > 0):
sub_lang_list = dict((l['language'], l['url']) for l in info['list'])
return sub_lang_list
- self._downloader.report_warning(u'video doesn\'t have subtitles')
+ self._downloader.report_warning('video doesn\'t have subtitles')
return {}
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
- IE_NAME = u'dailymotion:playlist'
+ IE_NAME = 'dailymotion:playlist'
_VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
_MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
_PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
+ _TESTS = [{
+ 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
+ 'info_dict': {
+ 'title': 'SPORT',
+ },
+ 'playlist_mincount': 20,
+ }]
def _extract_entries(self, id):
video_ids = []
for pagenum in itertools.count(1):
request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum))
webpage = self._download_webpage(request,
- id, u'Downloading page %s' % pagenum)
+ id, 'Downloading page %s' % pagenum)
video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage))
@@ -209,9 +221,17 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
class DailymotionUserIE(DailymotionPlaylistIE):
- IE_NAME = u'dailymotion:user'
+ IE_NAME = 'dailymotion:user'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
+ _TESTS = [{
+ 'url': 'https://www.dailymotion.com/user/nqtv',
+ 'info_dict': {
+ 'id': 'nqtv',
+ 'title': 'Rémi Gaillard',
+ },
+ 'playlist_mincount': 100,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -219,7 +239,7 @@ class DailymotionUserIE(DailymotionPlaylistIE):
webpage = self._download_webpage(url, user)
full_user = unescapeHTML(self._html_search_regex(
r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
- webpage, u'user', flags=re.DOTALL))
+ webpage, 'user'))
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index 6033cd94a..45d66e2e6 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -11,10 +11,10 @@ from ..utils import (
class DaumIE(InfoExtractor):
- _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:v/|.*?clipid=)(?P<id>[^?#&]+)'
IE_NAME = 'daum.net'
- _TEST = {
+ _TESTS = [{
'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
'info_dict': {
'id': '52554690',
@@ -24,11 +24,17 @@ class DaumIE(InfoExtractor):
'upload_date': '20130831',
'duration': 3868,
},
- }
+ }, {
+ 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = mobj.group('id')
canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
webpage = self._download_webpage(canonical_url, video_id)
full_id = self._search_regex(
@@ -42,7 +48,6 @@ class DaumIE(InfoExtractor):
'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
video_id, 'Downloading video formats info')
- self.to_screen(u'%s: Getting video urls' % video_id)
formats = []
for format_el in urls.findall('result/output_list/output_list'):
profile = format_el.attrib['profile']
@@ -52,7 +57,7 @@ class DaumIE(InfoExtractor):
})
url_doc = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query,
- video_id, note=False)
+ video_id, note='Downloading video data for %s format' % profile)
format_url = url_doc.find('result/url').text
formats.append({
'url': format_url,
diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py
new file mode 100644
index 000000000..1d3e2ff08
--- /dev/null
+++ b/youtube_dl/extractor/dbtv.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ clean_html,
+)
+
+
class DBTVIE(InfoExtractor):
    """Information extractor for dbtv.no video pages.

    The numeric id in the URL is looked up through the api.dbtv.no
    discovery endpoint; the first playlist item of the JSON answer
    supplies the metadata and the list of renditions.
    """
    _VALID_URL = r'http://dbtv\.no/(?P<id>[0-9]+)#(?P<display_id>.+)'
    _TEST = {
        'url': 'http://dbtv.no/3649835190001#Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen',
        'md5': 'b89953ed25dacb6edb3ef6c6f430f8bc',
        'info_dict': {
            'id': '33100',
            'display_id': 'Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen',
            'ext': 'mp4',
            'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen',
            'description': 'md5:1504a54606c4dde3e4e61fc97aa857e0',
            # Raw string: "\." is not a valid escape in a normal literal
            'thumbnail': r're:https?://.*\.jpg$',
            'timestamp': 1404039863.438,
            'upload_date': '20140629',
            'duration': 69.544,
            'view_count': int,
            'categories': list,
        }
    }

    def _real_extract(self, url):
        """Return the info dict for the video addressed by url.

        Raises KeyError when the API answer lacks 'playlist', 'id',
        'title' or 'desc'; _sort_formats is called on the collected
        formats before they are returned.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        display_id = mobj.group('display_id')

        data = self._download_json(
            'http://api.dbtv.no/discovery/%s' % video_id, display_id)

        video = data['playlist'][0]

        # A missing or empty 'renditions' entry must not abort extraction:
        # the URL/HLSURL fallback below exists for exactly that situation.
        formats = [{
            'url': f['URL'],
            'vcodec': f.get('container'),
            'width': int_or_none(f.get('width')),
            'height': int_or_none(f.get('height')),
            'vbr': float_or_none(f.get('rate'), 1000),
            'filesize': int_or_none(f.get('size')),
        } for f in (video.get('renditions') or []) if 'URL' in f]

        if not formats:
            # No usable rendition: fall back to the top-level URLs
            for url_key, format_id in [('URL', 'mp4'), ('HLSURL', 'hls')]:
                if url_key in video:
                    formats.append({
                        'url': video[url_key],
                        'format_id': format_id,
                    })

        self._sort_formats(formats)

        return {
            'id': video['id'],
            'display_id': display_id,
            'title': video['title'],
            'description': clean_html(video['desc']),
            'thumbnail': video.get('splash') or video.get('thumb'),
            'timestamp': float_or_none(video.get('publishedAt'), 1000),
            'duration': float_or_none(video.get('length'), 1000),
            'view_count': int_or_none(video.get('views')),
            'categories': video.get('tags'),
            'formats': formats,
        }
diff --git a/youtube_dl/extractor/deezer.py b/youtube_dl/extractor/deezer.py
new file mode 100644
index 000000000..c3205ff5f
--- /dev/null
+++ b/youtube_dl/extractor/deezer.py
@@ -0,0 +1,89 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ orderedSet,
+)
+
+
class DeezerPlaylistIE(InfoExtractor):
    """Information extractor for deezer.com playlist pages.

    Only the preview stream of each song is exposed; a warning says so
    unless the downloader parameters contain 'test'.
    """
    _VALID_URL = r'https?://(?:www\.)?deezer\.com/playlist/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.deezer.com/playlist/176747451',
        'info_dict': {
            'id': '176747451',
            'title': 'Best!',
            'uploader': 'Anonymous',
            # Raw string: "\." is not a valid escape in a normal literal
            'thumbnail': r're:^https?://cdn-images.deezer.com/images/cover/.*\.jpg$',
        },
        'playlist_count': 30,
        'skip': 'Only available in .de',
    }

    def _real_extract(self, url):
        """Return a playlist result with one preview entry per song.

        Raises ExtractorError (expected) with Deezer's own message when
        the page contains a ``soon-txt`` paragraph, and ExtractorError
        when songs are listed but the preview URL pattern could not be
        found on the page.
        """
        if 'test' not in self._downloader.params:
            self._downloader.report_warning('For now, this extractor only supports the 30 second previews. Patches welcome!')

        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)
        geoblocking_msg = self._html_search_regex(
            r'<p class="soon-txt">(.*?)</p>', webpage, 'geoblocking message',
            default=None)
        if geoblocking_msg is not None:
            raise ExtractorError(
                'Deezer said: %s' % geoblocking_msg, expected=True)

        data_json = self._search_regex(
            r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n', webpage, 'data JSON')
        data = json.loads(data_json)

        playlist_title = data.get('DATA', {}).get('TITLE')
        playlist_uploader = data.get('DATA', {}).get('PARENT_USERNAME')
        playlist_thumbnail = self._search_regex(
            r'<img id="naboo_playlist_image".*?src="([^"]+)"', webpage,
            'playlist thumbnail')

        preview_pattern = self._search_regex(
            r"var SOUND_PREVIEW_GATEWAY\s*=\s*'([^']+)';", webpage,
            'preview URL pattern', fatal=False)

        songs = data['SONGS']['data']
        # The non-fatal search above may have returned None; without the
        # pattern no song URL can be built, so fail with a clear message
        # instead of an AttributeError on None.replace().
        if songs and preview_pattern is None:
            raise ExtractorError('Unable to extract preview URL pattern')

        entries = []
        for s in songs:
            puid = s['MD5_ORIGIN']
            # Fill the {0}/{1}/{2} placeholders of the gateway pattern
            preview_video_url = (
                preview_pattern
                .replace('{0}', puid[0])
                .replace('{1}', puid)
                .replace('{2}', s['MEDIA_VERSION']))
            formats = [{
                'format_id': 'preview',
                'url': preview_video_url,
                'preference': -100,  # Only the first 30 seconds
                'ext': 'mp3',
            }]
            self._sort_formats(formats)
            artists = ', '.join(
                orderedSet(a['ART_NAME'] for a in s['ARTISTS']))
            entries.append({
                'id': s['SNG_ID'],
                'duration': int_or_none(s.get('DURATION')),
                'title': '%s - %s' % (artists, s['SNG_TITLE']),
                'uploader': s['ART_NAME'],
                'uploader_id': s['ART_ID'],
                'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0,
                'formats': formats,
            })

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist_title,
            'uploader': playlist_uploader,
            'thumbnail': playlist_thumbnail,
            'entries': entries,
        }
diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py
index 9f569aa93..1e1763abf 100644
--- a/youtube_dl/extractor/dropbox.py
+++ b/youtube_dl/extractor/dropbox.py
@@ -11,8 +11,7 @@ from ..utils import compat_urllib_parse_unquote
class DropboxIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)'
_TEST = {
- 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4',
- 'md5': '8a3d905427a6951ccb9eb292f154530b',
+ 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0',
'info_dict': {
'id': 'nelirfsxnmcfbfh',
'ext': 'mp4',
@@ -25,7 +24,9 @@ class DropboxIE(InfoExtractor):
video_id = mobj.group('id')
fn = compat_urllib_parse_unquote(mobj.group('title'))
title = os.path.splitext(fn)[0]
- video_url = url + '?dl=1'
+ video_url = re.sub(r'[?&]dl=0', '', url)
+ video_url += (
+ '&' if '?' in video_url else '?') + 'dl=1'
return {
'id': video_id,
diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py
new file mode 100644
index 000000000..d5bfd7f22
--- /dev/null
+++ b/youtube_dl/extractor/drtuber.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import str_to_int
+
+
+class DrTuberIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?drtuber\.com/video/(?P<id>\d+)/(?P<display_id>[\w-]+)'
+ _TEST = {
+ 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf',
+ 'md5': '93e680cf2536ad0dfb7e74d94a89facd',
+ 'info_dict': {
+ 'id': '1740434',
+ 'display_id': 'hot-perky-blonde-naked-golf',
+ 'ext': 'mp4',
+ 'title': 'Hot Perky Blonde Naked Golf',
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'categories': list, # NSFW
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_url = self._html_search_regex(
+ r'<source src="([^"]+)"', webpage, 'video URL')
+
+ title = self._html_search_regex(
+ r'<title>([^<]+)\s*-\s*Free', webpage, 'title')
+
+ thumbnail = self._html_search_regex(
+ r'poster="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+
+ like_count = str_to_int(self._html_search_regex(
+ r'<span id="rate_likes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
+ webpage, 'like count', fatal=False))
+ dislike_count = str_to_int(self._html_search_regex(
+ r'<span id="rate_dislikes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
+ webpage, 'dislike count', fatal=False))
+ comment_count = str_to_int(self._html_search_regex(
+ r'<span class="comments_count">([\d,\.]+)</span>',
+ webpage, 'comment count', fatal=False))
+
+ cats_str = self._html_search_regex(
+ r'<meta name="keywords" content="([^"]+)"', webpage, 'categories', fatal=False)
+ categories = None if cats_str is None else cats_str.split(' ')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'comment_count': comment_count,
+ 'categories': categories,
+ 'age_limit': self._rta_search(webpage),
+ }
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index cdccfd376..9d6ce1f48 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -8,7 +8,7 @@ from ..utils import parse_iso8601
class DRTVIE(SubtitlesInfoExtractor):
- _VALID_URL = r'http://(?:www\.)?dr\.dk/tv/se/[^/]+/(?P<id>[\da-z-]+)'
+ _VALID_URL = r'http://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)+(?P<id>[\da-z-]+)(?:[/#?]|$)'
_TEST = {
'url': 'http://www.dr.dk/tv/se/partiets-mand/partiets-mand-7-8',
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py
index 88f5526b8..c1b4c729e 100644
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -1,10 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
import json
import random
import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
+ compat_str,
)
@@ -12,86 +15,98 @@ class EightTracksIE(InfoExtractor):
IE_NAME = '8tracks'
_VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
_TEST = {
- u"name": u"EightTracks",
- u"url": u"http://8tracks.com/ytdl/youtube-dl-test-tracks-a",
- u"playlist": [
+ "name": "EightTracks",
+ "url": "http://8tracks.com/ytdl/youtube-dl-test-tracks-a",
+ "info_dict": {
+ 'id': '1336550',
+ 'display_id': 'youtube-dl-test-tracks-a',
+ "description": "test chars: \"'/\\ä↭",
+ "title": "youtube-dl test tracks \"'/\\ä↭<>",
+ },
+ "playlist": [
{
- u"file": u"11885610.m4a",
- u"md5": u"96ce57f24389fc8734ce47f4c1abcc55",
- u"info_dict": {
- u"title": u"youtue-dl project<>\"' - youtube-dl test track 1 \"'/\\\u00e4\u21ad",
- u"uploader_id": u"ytdl"
+ "md5": "96ce57f24389fc8734ce47f4c1abcc55",
+ "info_dict": {
+ "id": "11885610",
+ "ext": "m4a",
+ "title": "youtue-dl project<>\"' - youtube-dl test track 1 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
- u"file": u"11885608.m4a",
- u"md5": u"4ab26f05c1f7291ea460a3920be8021f",
- u"info_dict": {
- u"title": u"youtube-dl project - youtube-dl test track 2 \"'/\\\u00e4\u21ad",
- u"uploader_id": u"ytdl"
+ "md5": "4ab26f05c1f7291ea460a3920be8021f",
+ "info_dict": {
+ "id": "11885608",
+ "ext": "m4a",
+ "title": "youtube-dl project - youtube-dl test track 2 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
- u"file": u"11885679.m4a",
- u"md5": u"d30b5b5f74217410f4689605c35d1fd7",
- u"info_dict": {
- u"title": u"youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad",
- u"uploader_id": u"ytdl"
+ "md5": "d30b5b5f74217410f4689605c35d1fd7",
+ "info_dict": {
+ "id": "11885679",
+ "ext": "m4a",
+ "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
- u"file": u"11885680.m4a",
- u"md5": u"4eb0a669317cd725f6bbd336a29f923a",
- u"info_dict": {
- u"title": u"youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad",
- u"uploader_id": u"ytdl"
+ "md5": "4eb0a669317cd725f6bbd336a29f923a",
+ "info_dict": {
+ "id": "11885680",
+ "ext": "m4a",
+ "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
- u"file": u"11885682.m4a",
- u"md5": u"1893e872e263a2705558d1d319ad19e8",
- u"info_dict": {
- u"title": u"PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad",
- u"uploader_id": u"ytdl"
+ "md5": "1893e872e263a2705558d1d319ad19e8",
+ "info_dict": {
+ "id": "11885682",
+ "ext": "m4a",
+ "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
- u"file": u"11885683.m4a",
- u"md5": u"b673c46f47a216ab1741ae8836af5899",
- u"info_dict": {
- u"title": u"PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad",
- u"uploader_id": u"ytdl"
+ "md5": "b673c46f47a216ab1741ae8836af5899",
+ "info_dict": {
+ "id": "11885683",
+ "ext": "m4a",
+ "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
- u"file": u"11885684.m4a",
- u"md5": u"1d74534e95df54986da7f5abf7d842b7",
- u"info_dict": {
- u"title": u"phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad",
- u"uploader_id": u"ytdl"
+ "md5": "1d74534e95df54986da7f5abf7d842b7",
+ "info_dict": {
+ "id": "11885684",
+ "ext": "m4a",
+ "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
- u"file": u"11885685.m4a",
- u"md5": u"f081f47af8f6ae782ed131d38b9cd1c0",
- u"info_dict": {
- u"title": u"phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad",
- u"uploader_id": u"ytdl"
+ "md5": "f081f47af8f6ae782ed131d38b9cd1c0",
+ "info_dict": {
+ "id": "11885685",
+ "ext": "m4a",
+ "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
}
]
}
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id)
- json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
+ json_like = self._search_regex(
+ r"(?s)PAGE.mix = (.*?);\n", webpage, 'trax information')
data = json.loads(json_like)
session = str(random.randint(0, 1000000000))
@@ -99,21 +114,30 @@ class EightTracksIE(InfoExtractor):
track_count = data['tracks_count']
first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
next_url = first_url
- res = []
+ entries = []
for i in range(track_count):
- api_json = self._download_webpage(next_url, playlist_id,
- note=u'Downloading song information %s/%s' % (str(i+1), track_count),
- errnote=u'Failed to download song information')
+ api_json = self._download_webpage(
+ next_url, playlist_id,
+ note='Downloading song information %d/%d' % (i + 1, track_count),
+ errnote='Failed to download song information')
api_data = json.loads(api_json)
- track_data = api_data[u'set']['track']
+ track_data = api_data['set']['track']
info = {
- 'id': track_data['id'],
+ 'id': compat_str(track_data['id']),
'url': track_data['track_file_stream_url'],
'title': track_data['performer'] + u' - ' + track_data['name'],
'raw_title': track_data['name'],
'uploader_id': data['user']['login'],
'ext': 'm4a',
}
- res.append(info)
- next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
- return res
+ entries.append(info)
+ next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (
+ session, mix_id, track_data['id'])
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'id': compat_str(mix_id),
+ 'display_id': playlist_id,
+ 'title': data.get('name'),
+ 'description': data.get('description'),
+ }
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
index e6952588f..70f8efe27 100644
--- a/youtube_dl/extractor/empflix.py
+++ b/youtube_dl/extractor/empflix.py
@@ -1,54 +1,25 @@
from __future__ import unicode_literals
-import re
+from .tnaflix import TNAFlixIE
-from .common import InfoExtractor
+class EMPFlixIE(TNAFlixIE):
+ _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
+
+ _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
+ _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
+ _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
-class EmpflixIE(InfoExtractor):
- _VALID_URL = r'^https?://www\.empflix\.com/videos/.*?-(?P<id>[0-9]+)\.html'
_TEST = {
'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
'md5': 'b1bc15b6412d33902d6e5952035fcabc',
'info_dict': {
'id': '33051',
+ 'display_id': 'Amateur-Finger-Fuck',
'ext': 'mp4',
'title': 'Amateur Finger Fuck',
'description': 'Amateur solo finger fucking.',
+ 'thumbnail': 're:https?://.*\.jpg$',
'age_limit': 18,
}
}
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- webpage = self._download_webpage(url, video_id)
- age_limit = self._rta_search(webpage)
-
- video_title = self._html_search_regex(
- r'name="title" value="(?P<title>[^"]*)"', webpage, 'title')
- video_description = self._html_search_regex(
- r'name="description" value="([^"]*)"', webpage, 'description', fatal=False)
-
- cfg_url = self._html_search_regex(
- r'flashvars\.config = escape\("([^"]+)"',
- webpage, 'flashvars.config')
-
- cfg_xml = self._download_xml(
- cfg_url, video_id, note='Downloading metadata')
-
- formats = [
- {
- 'url': item.find('videoLink').text,
- 'format_id': item.find('res').text,
- } for item in cfg_xml.findall('./quality/item')
- ]
-
- return {
- 'id': video_id,
- 'title': video_title,
- 'description': video_description,
- 'formats': formats,
- 'age_limit': age_limit,
- }
diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py
new file mode 100644
index 000000000..522aa3d63
--- /dev/null
+++ b/youtube_dl/extractor/eporner.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ str_to_int,
+)
+
+
+class EpornerIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)'
+ _TEST = {
+ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
+ 'md5': '3b427ae4b9d60619106de3185c2987cd',
+ 'info_dict': {
+ 'id': '95008',
+ 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video',
+ 'ext': 'flv',
+ 'title': 'Infamous Tiffany Teen Strip Tease Video',
+ 'duration': 194,
+ 'view_count': int,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+ title = self._html_search_regex(
+ r'<title>(.*?) - EPORNER', webpage, 'title')
+
+ redirect_code = self._html_search_regex(
+ r'<script type="text/javascript" src="/config5/%s/([a-f\d]+)/">' % video_id,
+ webpage, 'redirect_code')
+ redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, redirect_code)
+ player_code = self._download_webpage(
+ redirect_url, display_id, note='Downloading player config')
+
+ sources = self._search_regex(
+ r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', player_code, 'sources')
+
+ formats = []
+ for video_url, format_id in re.findall(r'file\s*:\s*"([^"]+)",\s*label\s*:\s*"([^"]+)"', sources):
+ fmt = {
+ 'url': video_url,
+ 'format_id': format_id,
+ }
+ m = re.search(r'^(\d+)', format_id)
+ if m:
+ fmt['height'] = int(m.group(1))
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ duration = parse_duration(self._search_regex(
+ r'class="mbtim">([0-9:]+)</div>', webpage, 'duration',
+ fatal=False))
+ view_count = str_to_int(self._search_regex(
+ r'id="cinemaviews">\s*([0-9,]+)\s*<small>views',
+ webpage, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ 'age_limit': self._rta_search(webpage),
+ }
diff --git a/youtube_dl/extractor/everyonesmixtape.py b/youtube_dl/extractor/everyonesmixtape.py
index 12829cbcc..d237a8281 100644
--- a/youtube_dl/extractor/everyonesmixtape.py
+++ b/youtube_dl/extractor/everyonesmixtape.py
@@ -12,10 +12,11 @@ from ..utils import (
class EveryonesMixtapeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P<id>[0-9a-zA-Z]+)(?:/(?P<songnr>[0-9]))?$'
- _TEST = {
+ _TESTS = [{
'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5',
- 'file': '5bfseWNmlds.mp4',
"info_dict": {
+ 'id': '5bfseWNmlds',
+ 'ext': 'mp4',
"title": "Passion Pit - \"Sleepyhead\" (Official Music Video)",
"uploader": "FKR.TV",
"uploader_id": "frenchkissrecords",
@@ -25,7 +26,14 @@ class EveryonesMixtapeIE(InfoExtractor):
'params': {
'skip_download': True, # This is simply YouTube
}
- }
+ }, {
+ 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi',
+ 'info_dict': {
+ 'id': 'm7m0jJAbMQi',
+ 'title': 'Driving',
+ },
+ 'playlist_count': 24
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index f7cf700b5..60e68d98a 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -12,8 +12,8 @@ from ..utils import (
compat_urllib_parse,
compat_urllib_request,
urlencode_postdata,
-
ExtractorError,
+ limit_length,
)
@@ -21,23 +21,34 @@ class FacebookIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://(?:\w+\.)?facebook\.com/
(?:[^#]*?\#!/)?
- (?:video/video\.php|photo\.php|video/embed)\?(?:.*?)
+ (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?)
(?:v|video_id)=(?P<id>[0-9]+)
(?:.*)'''
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
IE_NAME = 'facebook'
- _TEST = {
- 'url': 'https://www.facebook.com/photo.php?v=120708114770723',
- 'md5': '48975a41ccc4b7a581abd68651c1a5a8',
+ _TESTS = [{
+ 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
+ 'md5': '6a40d33c0eccbb1af76cf0485a052659',
+ 'info_dict': {
+ 'id': '637842556329505',
+ 'ext': 'mp4',
+ 'duration': 38,
+ 'title': 'Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam fin...',
+ }
+ }, {
+ 'note': 'Video without discernible title',
+ 'url': 'https://www.facebook.com/video.php?v=274175099429670',
'info_dict': {
- 'id': '120708114770723',
+ 'id': '274175099429670',
'ext': 'mp4',
- 'duration': 279,
- 'title': 'PEOPLE ARE AWESOME 2013',
+ 'title': 'Facebook video #274175099429670',
}
- }
+ }, {
+ 'url': 'https://www.facebook.com/video.php?v=10204634152394104',
+ 'only_matching': True,
+ }]
def _login(self):
(useremail, password) = self._get_login_info()
@@ -76,7 +87,8 @@ class FacebookIE(InfoExtractor):
check_form = {
'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'),
- 'h': self._search_regex(r'name="h" value="(\w*?)"', login_results, 'h'),
+ 'h': self._search_regex(
+ r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h'),
'name_action_selected': 'dont_save',
}
check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
@@ -121,7 +133,15 @@ class FacebookIE(InfoExtractor):
raise ExtractorError('Cannot find video URL')
video_title = self._html_search_regex(
- r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title')
+ r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
+ fatal=False)
+ if not video_title:
+ video_title = self._html_search_regex(
+ r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
+ webpage, 'alternative title', default=None)
+ video_title = limit_length(video_title, 80)
+ if not video_title:
+ video_title = 'Facebook video #%s' % video_id
return {
'id': video_id,
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 3a908d01f..2bfa20606 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -12,6 +12,7 @@ from ..utils import (
compat_urlparse,
compat_xml_parse_error,
+ determine_ext,
ExtractorError,
float_or_none,
HEADRequest,
@@ -351,7 +352,36 @@ class GenericIE(InfoExtractor):
'description': 're:'
},
'playlist_mincount': 11,
- }
+ },
+ # Multiple brightcove videos
+ # https://github.com/rg3/youtube-dl/issues/2283
+ {
+ 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
+ 'info_dict': {
+ 'id': 'always-never',
+ 'title': 'Always / Never - The New Yorker',
+ },
+ 'playlist_count': 3,
+ 'params': {
+ 'extract_flat': False,
+ 'skip_download': True,
+ }
+ },
+ # MLB embed
+ {
+ 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
+ 'md5': '96f09a37e44da40dd083e12d9a683327',
+ 'info_dict': {
+ 'id': '33322633',
+ 'ext': 'mp4',
+ 'title': 'Ump changes call to ball',
+ 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
+ 'duration': 48,
+ 'timestamp': 1401537900,
+ 'upload_date': '20140531',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ },
]
def report_download_webpage(self, video_id):
@@ -598,7 +628,7 @@ class GenericIE(InfoExtractor):
embedSWF\(?:\s*
)
(["\'])
- (?P<url>(?:https?:)?//(?:www\.)?youtube\.com/
+ (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
(?:embed|v)/.+?)
\1''', webpage)
if matches:
@@ -794,6 +824,12 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'SBS')
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'MLB')
+
# Start with something easy: JW Player in SWFObject
found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if not found:
@@ -830,13 +866,14 @@ class GenericIE(InfoExtractor):
if m_video_type is not None:
def check_video(vurl):
vpath = compat_urlparse.urlparse(vurl).path
- return '.' in vpath and not vpath.endswith('.swf')
+ vext = determine_ext(vpath)
+ return '.' in vpath and vext not in ('swf', 'png', 'jpg')
found = list(filter(
check_video,
re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)))
if not found:
# HTML5 video
- found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+ found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
if not found:
found = re.search(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
diff --git a/youtube_dl/extractor/hornbunny.py b/youtube_dl/extractor/hornbunny.py
new file mode 100644
index 000000000..7e7714438
--- /dev/null
+++ b/youtube_dl/extractor/hornbunny.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_duration,
+)
+
+
+class HornBunnyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hornbunny\.com/videos/(?P<title_dash>[a-z-]+)-(?P<id>\d+)\.html'
+ _TEST = {
+ 'url': 'http://hornbunny.com/videos/panty-slut-jerk-off-instruction-5227.html',
+ 'md5': '95e40865aedd08eff60272b704852ad7',
+ 'info_dict': {
+ 'id': '5227',
+ 'ext': 'flv',
+ 'title': 'panty slut jerk off instruction',
+ 'duration': 550,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(
+ url, video_id, note='Downloading initial webpage')
+ title = self._html_search_regex(
+ r'class="title">(.*?)</h2>', webpage, 'title')
+ redirect_url = self._html_search_regex(
+ r'pg&settings=(.*?)\|0"\);', webpage, 'redirect URL')
+ webpage2 = self._download_webpage(redirect_url, video_id)
+ video_url = self._html_search_regex(
+ r'flvMask:(.*?);', webpage2, 'video_url')
+
+ duration = parse_duration(self._search_regex(
+ r'<strong>Runtime:</strong>\s*([0-9:]+)</div>',
+ webpage, 'duration', fatal=False))
+ view_count = int_or_none(self._search_regex(
+ r'<strong>Views:</strong>\s*(\d+)</div>',
+ webpage, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'ext': 'flv',
+ 'duration': duration,
+ 'view_count': view_count,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py
new file mode 100644
index 000000000..8e812b669
--- /dev/null
+++ b/youtube_dl/extractor/hostingbulk.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ compat_urllib_request,
+ int_or_none,
+ urlencode_postdata,
+)
+
+
+class HostingBulkIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?hostingbulk\.com/
+ (?:embed-)?(?P<id>[A-Za-z0-9]{12})(?:-\d+x\d+)?\.html'''
+ _FILE_DELETED_REGEX = r'<b>File Not Found</b>'
+ _TEST = {
+ 'url': 'http://hostingbulk.com/n0ulw1hv20fm.html',
+ 'md5': '6c8653c8ecf7ebfa83b76e24b7b2fe3f',
+ 'info_dict': {
+ 'id': 'n0ulw1hv20fm',
+ 'ext': 'mp4',
+ 'title': 'md5:5afeba33f48ec87219c269e054afd622',
+ 'filesize': 6816081,
+ 'thumbnail': 're:^http://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ url = 'http://hostingbulk.com/{0:}.html'.format(video_id)
+
+ # Custom request with cookie to set language to English, so our file
+ # deleted regex would work.
+ request = compat_urllib_request.Request(
+ url, headers={'Cookie': 'lang=english'})
+ webpage = self._download_webpage(request, video_id)
+
+ if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
+ raise ExtractorError('Video %s does not exist' % video_id,
+ expected=True)
+
+ title = self._html_search_regex(r'<h3>(.*?)</h3>', webpage, 'title')
+ filesize = int_or_none(
+ self._search_regex(
+ r'<small>\((\d+)\sbytes?\)</small>',
+ webpage,
+ 'filesize',
+ fatal=False
+ )
+ )
+ thumbnail = self._search_regex(
+ r'<img src="([^"]+)".+?class="pic"',
+ webpage, 'thumbnail', fatal=False)
+
+ fields = dict(re.findall(r'''(?x)<input\s+
+ type="hidden"\s+
+ name="([^"]+)"\s+
+ value="([^"]*)"
+ ''', webpage))
+
+ request = compat_urllib_request.Request(url, urlencode_postdata(fields))
+ request.add_header('Content-type', 'application/x-www-form-urlencoded')
+ response = self._request_webpage(request, video_id,
+ 'Submitting download request')
+ video_url = response.geturl()
+
+ formats = [{
+ 'format_id': 'sd',
+ 'filesize': filesize,
+ 'url': video_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index 1f42c6d3a..9e8b69f57 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -18,6 +18,7 @@ class IGNIE(InfoExtractor):
_DESCRIPTION_RE = [
r'<span class="page-object-description">(.+?)</span>',
r'id="my_show_video">.*?<p>(.*?)</p>',
+ r'<meta name="description" content="(.*?)"',
]
_TESTS = [
@@ -55,6 +56,17 @@ class IGNIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
+ 'md5': '4e9a0bda1e5eebd31ddcf86ec0b9b3c7',
+ 'info_dict': {
+ 'id': '078fdd005f6d3c02f63d795faa1b984f',
+ 'ext': 'mp4',
+ 'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
+ 'description': 'Giant skeletons, bloody hunts, and captivating'
+ ' natural beauty take our breath away.',
+ },
+ },
]
def _find_video_id(self, webpage):
@@ -62,6 +74,7 @@ class IGNIE(InfoExtractor):
r'data-video-id="(.+?)"',
r'<object id="vid_(.+?)"',
r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
+ r'class="hero-poster[^"]*?"[^>]*id="(.+?)"',
]
return self._search_regex(res_id, webpage, 'video id')
@@ -70,10 +83,7 @@ class IGNIE(InfoExtractor):
name_or_id = mobj.group('name_or_id')
page_type = mobj.group('type')
webpage = self._download_webpage(url, name_or_id)
- if page_type == 'articles':
- video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, 'video url')
- return self.url_result(video_url, ie='IGN')
- elif page_type != 'video':
+ if page_type != 'video':
multiple_urls = re.findall(
'<param name="flashvars" value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]',
webpage)
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 7cee505c0..4536db3bf 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -63,6 +63,14 @@ class ImdbListIE(InfoExtractor):
IE_NAME = 'imdb:list'
IE_DESC = 'Internet Movie Database lists'
_VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
+ _TEST = {
+ 'url': 'http://www.imdb.com/list/JFs9NWw6XI0',
+ 'info_dict': {
+ 'id': 'JFs9NWw6XI0',
+ 'title': 'March 23, 2012 Releases',
+ },
+ 'playlist_count': 7,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index b5372bf7a..5109f26ce 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -46,6 +46,30 @@ class InstagramUserIE(InfoExtractor):
_VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
IE_DESC = 'Instagram user profile'
IE_NAME = 'instagram:user'
+ _TEST = {
+ 'url': 'http://instagram.com/porsche',
+ 'info_dict': {
+ 'id': 'porsche',
+ 'title': 'porsche',
+ },
+ 'playlist_mincount': 2,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '614605558512799803_462752227',
+ 'ext': 'mp4',
+ 'title': '#Porsche Intelligent Performance.',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'Porsche',
+ 'uploader_id': 'porsche',
+ 'timestamp': 1387486713,
+ 'upload_date': '20131219',
+ },
+ }],
+ 'params': {
+ 'extract_flat': True,
+ 'skip_download': True,
+ }
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index 4027deb70..75b543b7c 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -127,6 +127,21 @@ class IviCompilationIE(InfoExtractor):
IE_DESC = 'ivi.ru compilations'
IE_NAME = 'ivi:compilation'
_VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'
+ _TESTS = [{
+ 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa',
+ 'info_dict': {
+ 'id': 'dvoe_iz_lartsa',
+ 'title': 'Двое из ларца (2006 - 2008)',
+ },
+ 'playlist_mincount': 24,
+ }, {
+ 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/season1',
+ 'info_dict': {
+ 'id': 'dvoe_iz_lartsa/season1',
+ 'title': 'Двое из ларца (2006 - 2008) 1 сезон',
+ },
+ 'playlist_mincount': 12,
+ }]
def _extract_entries(self, html, compilation_id):
return [self.url_result('http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), 'Ivi')
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index 79e8430b5..a83dd249f 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -9,29 +9,50 @@ from ..utils import (
parse_iso8601,
determine_ext,
int_or_none,
+ float_or_none,
str_to_int,
)
class IzleseneIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|m)\.)?izlesene\.com/(?:video|embedplayer)/(?:[^/]+/)?(?P<id>[0-9]+)'
- _STREAM_URL = 'http://panel.izlesene.com/api/streamurl/{id:}/{format:}'
- _TEST = {
- 'url': 'http://www.izlesene.com/video/sevincten-cildirtan-dogum-gunu-hediyesi/7599694',
- 'md5': '4384f9f0ea65086734b881085ee05ac2',
- 'info_dict': {
- 'id': '7599694',
- 'ext': 'mp4',
- 'title': 'Sevinçten Çıldırtan Doğum Günü Hediyesi',
- 'description': 'Annesi oğluna doğum günü hediyesi olarak minecraft cd si alıyor, ve çocuk hunharca seviniyor',
- 'thumbnail': 're:^http://.*\.jpg',
- 'uploader_id': 'pelikzzle',
- 'timestamp': 1404298698,
- 'upload_date': '20140702',
- 'duration': 95.395,
- 'age_limit': 0,
- }
- }
+ _VALID_URL = r'''(?x)
+ https?://(?:(?:www|m)\.)?izlesene\.com/
+ (?:video|embedplayer)/(?:[^/]+/)?(?P<id>[0-9]+)
+ '''
+ _TESTS = [
+ {
+ 'url': 'http://www.izlesene.com/video/sevincten-cildirtan-dogum-gunu-hediyesi/7599694',
+ 'md5': '4384f9f0ea65086734b881085ee05ac2',
+ 'info_dict': {
+ 'id': '7599694',
+ 'ext': 'mp4',
+ 'title': 'Sevinçten Çıldırtan Doğum Günü Hediyesi',
+ 'description': 'md5:253753e2655dde93f59f74b572454f6d',
+ 'thumbnail': 're:^http://.*\.jpg',
+ 'uploader_id': 'pelikzzle',
+ 'timestamp': 1404298698,
+ 'upload_date': '20140702',
+ 'duration': 95.395,
+ 'age_limit': 0,
+ }
+ },
+ {
+ 'url': 'http://www.izlesene.com/video/tarkan-dortmund-2006-konseri/17997',
+ 'md5': '97f09b6872bffa284cb7fa4f6910cb72',
+ 'info_dict': {
+ 'id': '17997',
+ 'ext': 'mp4',
+ 'title': 'Tarkan Dortmund 2006 Konseri',
+ 'description': 'Tarkan Dortmund 2006 Konseri',
+ 'thumbnail': 're:^http://.*\.jpg',
+ 'uploader_id': 'parlayankiz',
+ 'timestamp': 1163318593,
+ 'upload_date': '20061112',
+ 'duration': 253.666,
+ 'age_limit': 0,
+ }
+ },
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -45,18 +66,19 @@ class IzleseneIE(InfoExtractor):
thumbnail = self._og_search_thumbnail(webpage)
uploader = self._html_search_regex(
- r"adduserUsername\s*=\s*'([^']+)';", webpage, 'uploader', fatal=False, default='')
+ r"adduserUsername\s*=\s*'([^']+)';",
+ webpage, 'uploader', fatal=False, default='')
timestamp = parse_iso8601(self._html_search_meta(
'uploadDate', webpage, 'upload date', fatal=False))
- duration = int_or_none(self._html_search_regex(
- r'"videoduration"\s*:\s*"([^"]+)"', webpage, 'duration', fatal=False))
- if duration:
- duration /= 1000.0
+ duration = float_or_none(self._html_search_regex(
+ r'"videoduration"\s*:\s*"([^"]+)"',
+ webpage, 'duration', fatal=False), scale=1000)
view_count = str_to_int(get_element_by_id('videoViewCount', webpage))
comment_count = self._html_search_regex(
- r'comment_count\s*=\s*\'([^\']+)\';', webpage, 'uploader', fatal=False)
+ r'comment_count\s*=\s*\'([^\']+)\';',
+ webpage, 'comment_count', fatal=False)
family_friendly = self._html_search_meta(
'isFamilyFriendly', webpage, 'age limit', fatal=False)
@@ -66,20 +88,26 @@ class IzleseneIE(InfoExtractor):
ext = determine_ext(content_url, 'mp4')
# Might be empty for some videos.
- qualities = self._html_search_regex(
- r'"quality"\s*:\s*"([^"]+)"', webpage, 'qualities', fatal=False, default='')
+ streams = self._html_search_regex(
+ r'"qualitylevel"\s*:\s*"([^"]+)"',
+ webpage, 'streams', fatal=False, default='')
formats = []
- for quality in qualities.split('|'):
- json = self._download_json(
- self._STREAM_URL.format(id=video_id, format=quality), video_id,
- note='Getting video URL for "%s" quality' % quality,
- errnote='Failed to get video URL for "%s" quality' % quality
- )
+ if streams:
+ for stream in streams.split('|'):
+ quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
+ formats.append({
+ 'format_id': '%sp' % quality if quality else 'sd',
+ 'url': url,
+ 'ext': ext,
+ })
+ else:
+ stream_url = self._search_regex(
+ r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
formats.append({
- 'url': json.get('streamurl'),
+ 'format_id': 'sd',
+ 'url': stream_url,
'ext': ext,
- 'format_id': '%sp' % quality if quality else 'sd',
})
return {
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
index 772bb5671..408d00944 100644
--- a/youtube_dl/extractor/khanacademy.py
+++ b/youtube_dl/extractor/khanacademy.py
@@ -9,21 +9,30 @@ from ..utils import (
class KhanAcademyIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])'
+ _VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])'
IE_NAME = 'KhanAcademy'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.khanacademy.org/video/one-time-pad',
- 'file': 'one-time-pad.mp4',
'md5': '7021db7f2d47d4fff89b13177cb1e8f4',
'info_dict': {
+ 'id': 'one-time-pad',
+ 'ext': 'mp4',
'title': 'The one-time pad',
'description': 'The perfect cipher',
'duration': 176,
'uploader': 'Brit Cruise',
'upload_date': '20120411',
}
- }
+ }, {
+ 'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
+ 'info_dict': {
+ 'id': 'cryptography',
+ 'title': 'Journey into cryptography',
+ 'description': 'How have humans protected their secret messages through history? What has changed today?',
+ },
+ 'playlist_mincount': 3,
+ }]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 281a0ce40..516147417 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -19,7 +19,7 @@ from ..utils import (
class LivestreamIE(InfoExtractor):
IE_NAME = 'livestream'
_VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
- _TEST = {
+ _TESTS = [{
'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
'md5': '53274c76ba7754fb0e8d072716f2292b',
'info_dict': {
@@ -31,7 +31,13 @@ class LivestreamIE(InfoExtractor):
'view_count': int,
'thumbnail': 're:^http://.*\.jpg$'
}
- }
+ }, {
+ 'url': 'http://new.livestream.com/tedx/cityenglish',
+ 'info_dict': {
+ 'title': 'TEDCity2.0 (English)',
+ },
+ 'playlist_mincount': 4,
+ }]
def _parse_smil(self, video_id, smil_url):
formats = []
@@ -111,34 +117,37 @@ class LivestreamIE(InfoExtractor):
event_name = mobj.group('event_name')
webpage = self._download_webpage(url, video_id or event_name)
- og_video = self._og_search_video_url(webpage, 'player url', fatal=False, default=None)
- if og_video is None:
- config_json = self._search_regex(
- r'window.config = ({.*?});', webpage, 'window config')
- info = json.loads(config_json)['event']
-
- def is_relevant(vdata, vid):
- result = vdata['type'] == 'video'
- if video_id is not None:
- result = result and compat_str(vdata['data']['id']) == vid
- return result
-
- videos = [self._extract_video_info(video_data['data'])
- for video_data in info['feed']['data']
- if is_relevant(video_data, video_id)]
- if video_id is None:
- # This is an event page:
- return self.playlist_result(videos, info['id'], info['full_name'])
- else:
- if videos:
- return videos[0]
- else:
+ og_video = self._og_search_video_url(
+ webpage, 'player url', fatal=False, default=None)
+ if og_video is not None:
query_str = compat_urllib_parse_urlparse(og_video).query
query = compat_urlparse.parse_qs(query_str)
- api_url = query['play_url'][0].replace('.smil', '')
- info = json.loads(self._download_webpage(
- api_url, video_id, 'Downloading video info'))
- return self._extract_video_info(info)
+ if 'play_url' in query:
+ api_url = query['play_url'][0].replace('.smil', '')
+ info = json.loads(self._download_webpage(
+ api_url, video_id, 'Downloading video info'))
+ return self._extract_video_info(info)
+
+ config_json = self._search_regex(
+ r'window.config = ({.*?});', webpage, 'window config')
+ info = json.loads(config_json)['event']
+
+ def is_relevant(vdata, vid):
+ result = vdata['type'] == 'video'
+ if video_id is not None:
+ result = result and compat_str(vdata['data']['id']) == vid
+ return result
+
+ videos = [self._extract_video_info(video_data['data'])
+ for video_data in info['feed']['data']
+ if is_relevant(video_data, video_id)]
+ if video_id is None:
+ # This is an event page:
+ return self.playlist_result(videos, info['id'], info['full_name'])
+ else:
+ if not videos:
+ raise ExtractorError('Cannot find video %s' % video_id)
+ return videos[0]
# The original version of Livestream uses a different system
@@ -148,7 +157,7 @@ class LivestreamOriginalIE(InfoExtractor):
(?P<user>[^/]+)/(?P<type>video|folder)
(?:\?.*?Id=|/)(?P<id>.*?)(&|$)
'''
- _TEST = {
+ _TESTS = [{
'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
'info_dict': {
'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
@@ -159,7 +168,13 @@ class LivestreamOriginalIE(InfoExtractor):
# rtmp
'skip_download': True,
},
- }
+ }, {
+ 'url': 'https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3',
+ 'info_dict': {
+ 'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3',
+ },
+ 'playlist_mincount': 4,
+ }]
def _extract_video(self, user, video_id):
api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
@@ -182,15 +197,19 @@ class LivestreamOriginalIE(InfoExtractor):
def _extract_folder(self, url, folder_id):
webpage = self._download_webpage(url, folder_id)
- urls = orderedSet(re.findall(r'<a href="(https?://livestre\.am/.*?)"', webpage))
+ paths = orderedSet(re.findall(
+ r'''(?x)(?:
+ <li\s+class="folder">\s*<a\s+href="|
+ <a\s+href="(?=https?://livestre\.am/)
+ )([^"]+)"''', webpage))
return {
'_type': 'playlist',
'id': folder_id,
'entries': [{
'_type': 'url',
- 'url': video_url,
- } for video_url in urls],
+ 'url': compat_urlparse.urljoin(url, p),
+ } for p in paths],
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 5f64e7bd0..520f27fca 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
ExtractorError,
+ HEADRequest,
int_or_none,
parse_iso8601,
)
@@ -38,7 +39,7 @@ class MixcloudIE(InfoExtractor):
try:
# We only want to know if the request succeed
# don't download the whole file
- self._request_webpage(url, None, False)
+ self._request_webpage(HEADRequest(url), None, False)
return url
except ExtractorError:
url = None
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index 37c72bc53..bfdb462eb 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -11,7 +11,7 @@ from ..utils import (
class MLBIE(InfoExtractor):
- _VALID_URL = r'https?://m\.mlb\.com/(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v(?P<id>n?\d+)'
+ _VALID_URL = r'https?://m\.mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|shared/video/embed/embed\.html\?.*?\bcontent_id=)(?P<id>n?\d+)'
_TESTS = [
{
'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea',
@@ -69,6 +69,10 @@ class MLBIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
},
},
+ {
+ 'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py
new file mode 100644
index 000000000..2ff79b9b8
--- /dev/null
+++ b/youtube_dl/extractor/moevideo.py
@@ -0,0 +1,112 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ compat_urllib_parse,
+ compat_urllib_request,
+ int_or_none,
+)
+
+
+class MoeVideoIE(InfoExtractor):
+ IE_DESC = 'LetitBit video services: moevideo.net, playreplay.net and videochart.net'
+ _VALID_URL = r'''(?x)
+ https?://(?P<host>(?:www\.)?
+ (?:(?:moevideo|playreplay|videochart)\.net))/
+ (?:video|framevideo)/(?P<id>[0-9]+\.[0-9A-Za-z]+)'''
+ _API_URL = 'http://api.letitbit.net/'
+ _API_KEY = 'tVL0gjqo5'
+ _TESTS = [
+ {
+ 'url': 'http://moevideo.net/video/00297.0036103fe3d513ef27915216fd29',
+ 'md5': '129f5ae1f6585d0e9bb4f38e774ffb3a',
+ 'info_dict': {
+ 'id': '00297.0036103fe3d513ef27915216fd29',
+ 'ext': 'flv',
+ 'title': 'Sink cut out machine',
+ 'description': 'md5:f29ff97b663aefa760bf7ca63c8ca8a8',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'width': 540,
+ 'height': 360,
+ 'duration': 179,
+ 'filesize': 17822500,
+ }
+ },
+ {
+ 'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a',
+ 'md5': '74f0a014d5b661f0f0e2361300d1620e',
+ 'info_dict': {
+ 'id': '77107.7f325710a627383d40540d8e991a',
+ 'ext': 'flv',
+ 'title': 'Operacion Condor.',
+ 'description': 'md5:7e68cb2fcda66833d5081c542491a9a3',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'width': 480,
+ 'height': 296,
+ 'duration': 6027,
+ 'filesize': 588257923,
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(
+ 'http://%s/video/%s' % (mobj.group('host'), video_id),
+ video_id, 'Downloading webpage')
+
+ title = self._og_search_title(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ description = self._og_search_description(webpage)
+
+ r = [
+ self._API_KEY,
+ [
+ 'preview/flv_link',
+ {
+ 'uid': video_id,
+ },
+ ],
+ ]
+ r_json = json.dumps(r)
+ post = compat_urllib_parse.urlencode({'r': r_json})
+ req = compat_urllib_request.Request(self._API_URL, post)
+ req.add_header('Content-type', 'application/x-www-form-urlencoded')
+
+ response = self._download_json(req, video_id)
+ if response['status'] != 'OK':
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, response['data']),
+ expected=True
+ )
+ item = response['data'][0]
+ video_url = item['link']
+ duration = int_or_none(item['length'])
+ width = int_or_none(item['width'])
+ height = int_or_none(item['height'])
+ filesize = int_or_none(item['convert_size'])
+
+ formats = [{
+ 'format_id': 'sd',
+ 'http_headers': {'Range': 'bytes=0-'}, # Required to download
+ 'url': video_url,
+ 'width': width,
+ 'height': height,
+ 'filesize': filesize,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py
index b9430b09b..d658647e6 100644
--- a/youtube_dl/extractor/mofosex.py
+++ b/youtube_dl/extractor/mofosex.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import os
import re
@@ -8,15 +10,17 @@ from ..utils import (
compat_urllib_parse,
)
+
class MofosexIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)'
+ _VALID_URL = r'^https?://(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)'
_TEST = {
- u'url': u'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
- u'file': u'5018.mp4',
- u'md5': u'1b2eb47ac33cc75d4a80e3026b613c5a',
- u'info_dict': {
- u"title": u"Japanese Teen Music Video",
- u"age_limit": 18,
+ 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
+ 'md5': '1b2eb47ac33cc75d4a80e3026b613c5a',
+ 'info_dict': {
+ 'id': '5018',
+ 'ext': 'mp4',
+ 'title': 'Japanese Teen Music Video',
+ 'age_limit': 18,
}
}
@@ -29,8 +33,8 @@ class MofosexIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, u'title')
- video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, u'video_url'))
+ video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, 'title')
+ video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
diff --git a/youtube_dl/extractor/musicvault.py b/youtube_dl/extractor/musicvault.py
new file mode 100644
index 000000000..ebb1eb8e9
--- /dev/null
+++ b/youtube_dl/extractor/musicvault.py
@@ -0,0 +1,76 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ unified_strdate,
+)
+
+
+class MusicVaultIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.musicvault\.com/(?P<uploader_id>[^/?#]*)/video/(?P<display_id>[^/?#]*)_(?P<id>[0-9]+)\.html'
+ _TEST = {
+ 'url': 'http://www.musicvault.com/the-allman-brothers-band/video/straight-from-the-heart_1010863.html',
+ 'md5': '2cdbb3ae75f7fb3519821507d2fb3c15',
+ 'info_dict': {
+ 'id': '1010863',
+ 'ext': 'mp4',
+ 'uploader_id': 'the-allman-brothers-band',
+ 'title': 'Straight from the Heart',
+ 'duration': 244,
+ 'uploader': 'The Allman Brothers Band',
+ 'thumbnail': 're:^https?://.*/thumbnail/.*',
+ 'upload_date': '19811216',
+ 'location': 'Capitol Theatre (Passaic, NJ)',
+ 'description': 'Listen to The Allman Brothers Band perform Straight from the Heart at Capitol Theatre (Passaic, NJ) on Dec 16, 1981',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
+ webpage = self._download_webpage(url, display_id)
+
+ thumbnail = self._search_regex(
+ r'<meta itemprop="thumbnail" content="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+
+ data_div = self._search_regex(
+ r'(?s)<div class="data">(.*?)</div>', webpage, 'data fields')
+ uploader = self._html_search_regex(
+ r'<h1.*?>(.*?)</h1>', data_div, 'uploader', fatal=False)
+ title = self._html_search_regex(
+ r'<h2.*?>(.*?)</h2>', data_div, 'title')
+ upload_date = unified_strdate(self._html_search_regex(
+ r'<h3.*?>(.*?)</h3>', data_div, 'uploader', fatal=False))
+ location = self._html_search_regex(
+ r'<h4.*?>(.*?)</h4>', data_div, 'location', fatal=False)
+
+ duration = parse_duration(self._html_search_meta('duration', webpage))
+
+ VIDEO_URL_TEMPLATE = 'http://cdnapi.kaltura.com/p/%(uid)s/sp/%(wid)s/playManifest/entryId/%(entry_id)s/format/url/protocol/http'
+ kaltura_id = self._search_regex(
+ r'<div id="video-detail-player" data-kaltura-id="([^"]+)"',
+ webpage, 'kaltura ID')
+ video_url = VIDEO_URL_TEMPLATE % {
+ 'entry_id': kaltura_id,
+ 'wid': self._search_regex(r'/wid/_([0-9]+)/', webpage, 'wid'),
+ 'uid': self._search_regex(r'uiconf_id/([0-9]+)/', webpage, 'uid'),
+ }
+
+ return {
+ 'id': mobj.group('id'),
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'display_id': display_id,
+ 'uploader_id': mobj.group('uploader_id'),
+ 'thumbnail': thumbnail,
+ 'description': self._html_search_meta('description', webpage),
+ 'upload_date': upload_date,
+ 'location': location,
+ 'title': title,
+ 'uploader': uploader,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index 633b42f72..78e650b2d 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -3,18 +3,23 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ remove_end,
+ parse_duration,
+)
class NBAIE(InfoExtractor):
_VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
_TEST = {
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
- 'md5': u'c0edcfc37607344e2ff8f13c378c88a4',
+ 'md5': 'c0edcfc37607344e2ff8f13c378c88a4',
'info_dict': {
'id': '0021200253-okc-bkn-recap.nba',
'ext': 'mp4',
- 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
'title': 'Thunder vs. Nets',
+ 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
+ 'duration': 181,
},
}
@@ -27,13 +32,18 @@ class NBAIE(InfoExtractor):
video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
shortened_video_id = video_id.rpartition('/')[2]
- title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '')
+ title = remove_end(
+ self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')
+
+ description = self._og_search_description(webpage)
+ duration = parse_duration(
+ self._html_search_meta('duration', webpage, 'duration', fatal=False))
- description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
return {
'id': shortened_video_id,
'url': video_url,
'title': title,
'description': description,
+ 'duration': duration,
}
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 2edd806a3..ceda1dcc0 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
import json
@@ -25,8 +27,8 @@ class NHLBaseInfoExtractor(InfoExtractor):
'path': initial_video_url.replace('.mp4', '_sd.mp4'),
})
path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
- path_doc = self._download_xml(path_url, video_id,
- u'Downloading final video url')
+ path_doc = self._download_xml(
+ path_url, video_id, 'Downloading final video url')
video_url = path_doc.find('path').text
join = compat_urlparse.urljoin
@@ -43,17 +45,18 @@ class NHLBaseInfoExtractor(InfoExtractor):
class NHLIE(NHLBaseInfoExtractor):
- IE_NAME = u'nhl.com'
- _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console\?.*?(?<=[?&])id=(?P<id>\d+)'
+ IE_NAME = 'nhl.com'
+ _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console\?.*?(?:[?&])id=(?P<id>[0-9]+)'
_TEST = {
- u'url': u'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
- u'file': u'453614.mp4',
- u'info_dict': {
- u'title': u'Quick clip: Weise 4-3 goal vs Flames',
- u'description': u'Dale Weise scores his first of the season to put the Canucks up 4-3.',
- u'duration': 18,
- u'upload_date': u'20131006',
+ 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
+ 'info_dict': {
+ 'id': '453614',
+ 'ext': 'mp4',
+ 'title': 'Quick clip: Weise 4-3 goal vs Flames',
+ 'description': 'Dale Weise scores his first of the season to put the Canucks up 4-3.',
+ 'duration': 18,
+ 'upload_date': '20131006',
},
}
@@ -61,23 +64,23 @@ class NHLIE(NHLBaseInfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id
- info_json = self._download_webpage(json_url, video_id,
- u'Downloading info json')
- info_json = self._fix_json(info_json)
- info = json.loads(info_json)[0]
- return self._extract_video(info)
+ data = self._download_json(
+ json_url, video_id, transform_source=self._fix_json)
+ return self._extract_video(data[0])
class NHLVideocenterIE(NHLBaseInfoExtractor):
- IE_NAME = u'nhl.com:videocenter'
- IE_DESC = u'NHL videocenter category'
- _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?'
-
- @classmethod
- def suitable(cls, url):
- if NHLIE.suitable(url):
- return False
- return super(NHLVideocenterIE, cls).suitable(url)
+ IE_NAME = 'nhl.com:videocenter'
+ IE_DESC = 'NHL videocenter category'
+ _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[0-9]+)(?![&?]id=).*?)?$'
+ _TEST = {
+ 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999',
+ 'info_dict': {
+ 'id': '999',
+ 'title': 'Highlights',
+ },
+ 'playlist_count': 12,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -86,10 +89,10 @@ class NHLVideocenterIE(NHLBaseInfoExtractor):
cat_id = self._search_regex(
[r'var defaultCatId = "(.+?)";',
r'{statusIndex:0,index:0,.*?id:(.*?),'],
- webpage, u'category id')
+ webpage, 'category id')
playlist_title = self._html_search_regex(
r'tab0"[^>]*?>(.*?)</td>',
- webpage, u'playlist title', flags=re.DOTALL).lower().capitalize()
+ webpage, 'playlist title', flags=re.DOTALL).lower().capitalize()
data = compat_urllib_parse.urlencode({
'cid': cat_id,
@@ -104,7 +107,7 @@ class NHLVideocenterIE(NHLBaseInfoExtractor):
response = self._fix_json(response)
if not response.strip():
self._downloader.report_warning(u'Got an empty reponse, trying '
- u'adding the "newvideos" parameter')
+ 'adding the "newvideos" parameter')
response = self._download_webpage(request_url + '&newvideos=true',
playlist_title)
response = self._fix_json(response)
@@ -114,5 +117,5 @@ class NHLVideocenterIE(NHLBaseInfoExtractor):
'_type': 'playlist',
'title': playlist_title,
'id': cat_id,
- 'entries': [self._extract_video(i) for i in videos],
+ 'entries': [self._extract_video(v) for v in videos],
}
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index da203538d..959fdf590 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -5,7 +5,10 @@ import re
from .common import InfoExtractor
from ..utils import (
+ compat_urllib_request,
+ compat_urllib_parse,
ExtractorError,
+ clean_html,
unified_strdate,
compat_str,
)
@@ -13,6 +16,8 @@ from ..utils import (
class NocoIE(InfoExtractor):
_VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
+ _LOGIN_URL = 'http://noco.tv/do.php'
+ _NETRC_MACHINE = 'noco'
_TEST = {
'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
@@ -30,6 +35,28 @@ class NocoIE(InfoExtractor):
'skip': 'Requires noco account',
}
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_form = {
+ 'a': 'login',
+ 'cookie': '1',
+ 'username': username,
+ 'password': password,
+ }
+ request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
+
+ login = self._download_json(request, None, 'Logging in as %s' % username)
+
+ if 'erreur' in login:
+ raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True)
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
diff --git a/youtube_dl/extractor/nosvideo.py b/youtube_dl/extractor/nosvideo.py
new file mode 100644
index 000000000..f3be8f552
--- /dev/null
+++ b/youtube_dl/extractor/nosvideo.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ compat_urllib_request,
+ urlencode_postdata,
+ xpath_text,
+ xpath_with_ns,
+)
+
+_x = lambda p: xpath_with_ns(p, {'xspf': 'http://xspf.org/ns/0/'})
+
+
+class NosVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nosvideo\.com/' + \
+ '(?:embed/|\?v=)(?P<id>[A-Za-z0-9]{12})/?'
+ _PLAYLIST_URL = 'http://nosvideo.com/xml/{xml_id:s}.xml'
+ _FILE_DELETED_REGEX = r'<b>File Not Found</b>'
+ _TEST = {
+ 'url': 'http://nosvideo.com/?v=mu8fle7g7rpq',
+ 'md5': '6124ed47130d8be3eacae635b071e6b6',
+ 'info_dict': {
+ 'id': 'mu8fle7g7rpq',
+ 'ext': 'mp4',
+ 'title': 'big_buck_bunny_480p_surround-fix.avi.mp4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ fields = {
+ 'id': video_id,
+ 'op': 'download1',
+ 'method_free': 'Continue to Video',
+ }
+ req = compat_urllib_request.Request(url, urlencode_postdata(fields))
+ req.add_header('Content-type', 'application/x-www-form-urlencoded')
+ webpage = self._download_webpage(req, video_id,
+ 'Downloading download page')
+ if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
+ raise ExtractorError('Video %s does not exist' % video_id,
+ expected=True)
+
+ xml_id = self._search_regex(r'php\|([^\|]+)\|', webpage, 'XML ID')
+ playlist_url = self._PLAYLIST_URL.format(xml_id=xml_id)
+ playlist = self._download_xml(playlist_url, video_id)
+
+ track = playlist.find(_x('.//xspf:track'))
+ if track is None:
+ raise ExtractorError(
+ 'XML playlist is missing the \'track\' element',
+ expected=True)
+ title = xpath_text(track, _x('./xspf:title'), 'title')
+ url = xpath_text(track, _x('./xspf:file'), 'URL', fatal=True)
+ thumbnail = xpath_text(track, _x('./xspf:image'), 'thumbnail')
+ if title is not None:
+ title = title.strip()
+
+ formats = [{
+ 'format_id': 'sd',
+ 'url': url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 12e85a716..902d62944 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
unified_strdate,
+ qualities,
)
@@ -17,7 +18,7 @@ class NPOIE(InfoExtractor):
'md5': '4b3f9c429157ec4775f2c9cb7b911016',
'info_dict': {
'id': 'VPWON_1220719',
- 'ext': 'mp4',
+ 'ext': 'm4v',
'title': 'Nieuwsuur',
'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
'upload_date': '20140622',
@@ -39,24 +40,32 @@ class NPOIE(InfoExtractor):
video_id,
note='Downloading token'
)
- token = self._search_regex(r'npoplayer.token = "(.+?)"', token_page, 'token')
- streams_info = self._download_json(
- 'http://ida.omroep.nl/odi/?prid=%s&puboptions=h264_std&adaptive=yes&token=%s' % (video_id, token),
- video_id
- )
+ token = self._search_regex(r'npoplayer\.token = "(.+?)"', token_page, 'token')
- stream_info = self._download_json(
- streams_info['streams'][0] + '&type=json',
- video_id,
- 'Downloading stream info'
- )
+ formats = []
+ quality = qualities(['adaptive', 'h264_sb', 'h264_bb', 'h264_std'])
+ for format_id in metadata['pubopties']:
+ streams_info = self._download_json(
+ 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s' % (video_id, format_id, token),
+ video_id, 'Downloading %s streams info' % format_id)
+ stream_info = self._download_json(
+ streams_info['streams'][0] + '&type=json',
+ video_id, 'Downloading %s stream info' % format_id)
+ if format_id == 'adaptive':
+ formats.extend(self._extract_m3u8_formats(stream_info['url'], video_id))
+ else:
+ formats.append({
+ 'url': stream_info['url'],
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ })
+ self._sort_formats(formats)
return {
'id': video_id,
'title': metadata['titel'],
- 'ext': 'mp4',
- 'url': stream_info['url'],
'description': metadata['info'],
'thumbnail': metadata['images'][-1]['url'],
'upload_date': unified_strdate(metadata['gidsdatum']),
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 718fe9aba..48ce6e730 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -27,47 +27,40 @@ class PornHdIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- title = self._og_search_title(webpage)
- TITLE_SUFFIX = ' porn HD Video | PornHD.com '
- if title.endswith(TITLE_SUFFIX):
- title = title[:-len(TITLE_SUFFIX)]
-
+ title = self._html_search_regex(
+ r'<title>(.+) porn HD.+?</title>', webpage, 'title')
description = self._html_search_regex(
r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
view_count = int_or_none(self._html_search_regex(
- r'(\d+) views </span>', webpage, 'view count', fatal=False))
+ r'(\d+) views\s*</span>', webpage, 'view count', fatal=False))
- formats = [
- {
- 'url': format_url,
- 'ext': format.lower(),
- 'format_id': '%s-%s' % (format.lower(), quality.lower()),
- 'quality': 1 if quality.lower() == 'high' else 0,
- } for format, quality, format_url in re.findall(
- r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
- ]
+ videos = re.findall(
+ r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage)
if mobj:
flashvars = json.loads(mobj.group('flashvars'))
- formats.extend([
- {
- 'url': flashvars['hashlink'].replace('?noProxy=1', ''),
- 'ext': 'flv',
- 'format_id': 'flv-low',
- 'quality': 0,
- },
- {
- 'url': flashvars['hd'].replace('?noProxy=1', ''),
- 'ext': 'flv',
- 'format_id': 'flv-high',
- 'quality': 1,
- }
- ])
+ for key, quality in [('hashlink', 'low'), ('hd', 'high')]:
+ redirect_url = flashvars.get(key)
+ if redirect_url:
+ videos.append(('flv', quality, redirect_url))
thumbnail = flashvars['urlWallpaper']
else:
thumbnail = self._og_search_thumbnail(webpage)
+ formats = []
+ for format_, quality, redirect_url in videos:
+ format_id = '%s-%s' % (format_.lower(), quality.lower())
+ video_url = self._download_webpage(
+ redirect_url, video_id, 'Downloading %s video link' % format_id, fatal=False)
+ if not video_url:
+ continue
+ formats.append({
+ 'url': video_url,
+ 'ext': format_.lower(),
+ 'format_id': format_id,
+ 'quality': 1 if quality.lower() == 'high' else 0,
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/pornoxo.py b/youtube_dl/extractor/pornoxo.py
new file mode 100644
index 000000000..202f58673
--- /dev/null
+++ b/youtube_dl/extractor/pornoxo.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ str_to_int,
+)
+
+
+class PornoXOIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html'
+ _TEST = {
+ 'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html',
+ 'md5': '582f28ecbaa9e6e24cb90f50f524ce87',
+ 'info_dict': {
+ 'id': '7564',
+ 'ext': 'flv',
+ 'title': 'Striptease From Sexy Secretary!',
+ 'description': 'Striptease From Sexy Secretary!',
+ 'categories': list, # NSFW
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._html_search_regex(
+ r'\'file\'\s*:\s*"([^"]+)"', webpage, 'video_url')
+
+ title = self._html_search_regex(
+ r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title')
+
+ description = self._html_search_regex(
+ r'<meta name="description" content="([^"]+)\s*featuring',
+ webpage, 'description', fatal=False)
+
+ thumbnail = self._html_search_regex(
+ r'\'image\'\s*:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False)
+
+ view_count = str_to_int(self._html_search_regex(
+ r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False))
+
+ categories_str = self._html_search_regex(
+ r'<meta name="description" content=".*featuring\s*([^"]+)"',
+ webpage, 'categories', fatal=False)
+ categories = (
+ None if categories_str is None
+ else categories_str.split(','))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'view_count': view_count,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py
new file mode 100644
index 000000000..463e85501
--- /dev/null
+++ b/youtube_dl/extractor/promptfile.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+
+
+class PromptFileIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?promptfile\.com/l/(?P<id>[0-9A-Z\-]+)'
+ _FILE_NOT_FOUND_REGEX = r'<div.+id="not_found_msg".+>.+</div>[^-]'
+ _TEST = {
+ 'url': 'http://www.promptfile.com/l/D21B4746E9-F01462F0FF',
+ 'md5': 'd1451b6302da7215485837aaea882c4c',
+ 'info_dict': {
+ 'id': 'D21B4746E9-F01462F0FF',
+ 'ext': 'mp4',
+ 'title': 'Birds.mp4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+
+ if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
+ raise ExtractorError('Video %s does not exist' % video_id,
+ expected=True)
+
+ fields = dict(re.findall(r'''(?x)type="hidden"\s+
+ name="(.+?)"\s+
+ value="(.*?)"
+ ''', webpage))
+ post = compat_urllib_parse.urlencode(fields)
+ req = compat_urllib_request.Request(url, post)
+ req.add_header('Content-type', 'application/x-www-form-urlencoded')
+ webpage = self._download_webpage(
+ req, video_id, 'Downloading video page')
+
+ url = self._html_search_regex(r'url:\s*\'([^\']+)\'', webpage, 'URL')
+ title = self._html_search_regex(
+ r'<span.+title="([^"]+)">', webpage, 'title')
+ thumbnail = self._html_search_regex(
+ r'<div id="player_overlay">.*button>.*?<img src="([^"]+)"',
+ webpage, 'thumbnail', fatal=False, flags=re.DOTALL)
+
+ formats = [{
+ 'format_id': 'sd',
+ 'url': url,
+ 'ext': determine_ext(title),
+ }]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index da64a1a7b..5b2a723c1 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -145,7 +145,6 @@ class ProSiebenSat1IE(InfoExtractor):
'ext': 'mp4',
'title': 'Kurztrips zum Valentinstag',
'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
- 'upload_date': '20130206',
'duration': 307.24,
},
'params': {
@@ -240,7 +239,7 @@ class ProSiebenSat1IE(InfoExtractor):
thumbnail = self._og_search_thumbnail(page)
upload_date = unified_strdate(self._html_search_regex(
- self._UPLOAD_DATE_REGEXES, page, 'upload date', fatal=False))
+ self._UPLOAD_DATE_REGEXES, page, 'upload date', default=None))
formats = []
@@ -249,7 +248,7 @@ class ProSiebenSat1IE(InfoExtractor):
urls_sources = urls_sources.values()
def fix_bitrate(bitrate):
- return bitrate / 1000 if bitrate % 1000 == 0 else bitrate
+ return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
for source in urls_sources:
protocol = source['protocol']
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index 2d9511d5e..0ab1eb69c 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -12,22 +12,16 @@ class RtlXlIE(InfoExtractor):
_TEST = {
'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
+ 'md5': 'cc16baa36a6c169391f0764fa6b16654',
'info_dict': {
'id': '6e4203a6-0a5e-3596-8424-c599a59e0677',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'RTL Nieuws - Laat',
- 'description': 'Dagelijks het laatste nieuws uit binnen- en '
- 'buitenland. Voor nog meer nieuws kunt u ook gebruikmaken van '
- 'onze mobiele apps.',
+ 'description': 'md5:6b61f66510c8889923b11f2778c72dc5',
'timestamp': 1408051800,
'upload_date': '20140814',
'duration': 576.880,
},
- 'params': {
- # We download the first bytes of the first fragment, it can't be
- # processed by the f4m downloader beacuse it isn't complete
- 'skip_download': True,
- },
}
def _real_extract(self, url):
@@ -41,14 +35,32 @@ class RtlXlIE(InfoExtractor):
material = info['material'][0]
episode_info = info['episodes'][0]
- f4m_url = 'http://manifest.us.rtl.nl' + material['videopath']
progname = info['abstracts'][0]['name']
subtitle = material['title'] or info['episodes'][0]['name']
+ videopath = material['videopath']
+ f4m_url = 'http://manifest.us.rtl.nl' + videopath
+
+ formats = self._extract_f4m_formats(f4m_url, uuid)
+
+ video_urlpart = videopath.split('/flash/')[1][:-4]
+ PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
+
+ formats.extend([
+ {
+ 'url': PG_URL_TEMPLATE % ('a2m', video_urlpart),
+ 'format_id': 'pg-sd',
+ },
+ {
+ 'url': PG_URL_TEMPLATE % ('a3m', video_urlpart),
+ 'format_id': 'pg-hd',
+ }
+ ])
+
return {
'id': uuid,
'title': '%s - %s' % (progname, subtitle),
- 'formats': self._extract_f4m_formats(f4m_url, uuid),
+ 'formats': formats,
'timestamp': material['original_date'],
'description': episode_info['synopsis'],
'duration': parse_duration(material.get('duration')),
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 357edbbda..0c8790da2 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -74,6 +74,13 @@ class RutubeChannelIE(InfoExtractor):
IE_NAME = 'rutube:channel'
IE_DESC = 'Rutube channels'
_VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://rutube.ru/tags/video/1800/',
+ 'info_dict': {
+ 'id': '1800',
+ },
+ 'playlist_mincount': 68,
+ }]
_PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json'
@@ -101,6 +108,7 @@ class RutubeMovieIE(RutubeChannelIE):
IE_NAME = 'rutube:movie'
IE_DESC = 'Rutube movies'
_VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)'
+ _TESTS = []
_MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json'
_PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json'
@@ -119,5 +127,12 @@ class RutubePersonIE(RutubeChannelIE):
IE_NAME = 'rutube:person'
IE_DESC = 'Rutube person videos'
_VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://rutube.ru/video/person/313878/',
+ 'info_dict': {
+ 'id': '313878',
+ },
+ 'playlist_mincount': 37,
+ }]
_PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
index 6c5f5a680..f737b4e5f 100644
--- a/youtube_dl/extractor/rutv.py
+++ b/youtube_dl/extractor/rutv.py
@@ -100,7 +100,7 @@ class RUTVIE(InfoExtractor):
return mobj.group('url')
mobj = re.search(
- r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>http://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)',
+ r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)',
webpage)
if mobj:
return mobj.group('url')
diff --git a/youtube_dl/extractor/sharesix.py b/youtube_dl/extractor/sharesix.py
new file mode 100644
index 000000000..7531e8325
--- /dev/null
+++ b/youtube_dl/extractor/sharesix.py
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ compat_urllib_request,
+ parse_duration,
+)
+
+
+class ShareSixIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?sharesix\.com/(?:f/)?(?P<id>[0-9a-zA-Z]+)'
+ _TESTS = [
+ {
+ 'url': 'http://sharesix.com/f/OXjQ7Y6',
+ 'md5': '9e8e95d8823942815a7d7c773110cc93',
+ 'info_dict': {
+ 'id': 'OXjQ7Y6',
+ 'ext': 'mp4',
+ 'title': 'big_buck_bunny_480p_surround-fix.avi',
+ 'duration': 596,
+ 'width': 854,
+ 'height': 480,
+ },
+ },
+ {
+ 'url': 'http://sharesix.com/lfrwoxp35zdd',
+ 'md5': 'dd19f1435b7cec2d7912c64beeee8185',
+ 'info_dict': {
+ 'id': 'lfrwoxp35zdd',
+ 'ext': 'flv',
+ 'title': 'WhiteBoard___a_Mac_vs_PC_Parody_Cartoon.mp4.flv',
+ 'duration': 65,
+ 'width': 1280,
+ 'height': 720,
+ },
+ }
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ fields = {
+ 'method_free': 'Free'
+ }
+ post = compat_urllib_parse.urlencode(fields)
+ req = compat_urllib_request.Request(url, post)
+ req.add_header('Content-type', 'application/x-www-form-urlencoded')
+
+ webpage = self._download_webpage(req, video_id,
+ 'Downloading video page')
+
+ video_url = self._search_regex(
+ r"var\slnk1\s=\s'([^']+)'", webpage, 'video URL')
+ title = self._html_search_regex(
+ r'(?s)<dt>Filename:</dt>.+?<dd>(.+?)</dd>', webpage, 'title')
+ duration = parse_duration(
+ self._search_regex(
+ r'(?s)<dt>Length:</dt>.+?<dd>(.+?)</dd>',
+ webpage,
+ 'duration',
+ fatal=False
+ )
+ )
+
+ m = re.search(
+ r'''(?xs)<dt>Width\sx\sHeight</dt>.+?
+ <dd>(?P<width>\d+)\sx\s(?P<height>\d+)</dd>''',
+ webpage
+ )
+ width = height = None
+ if m:
+ width, height = int(m.group('width')), int(m.group('height'))
+
+ formats = [{
+ 'format_id': 'sd',
+ 'url': video_url,
+ 'width': width,
+ 'height': height,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 13e7e71cb..9bd5defa7 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -267,6 +267,14 @@ class SmotriCommunityIE(InfoExtractor):
IE_DESC = 'Smotri.com community videos'
IE_NAME = 'smotri:community'
_VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)'
+ _TEST = {
+ 'url': 'http://smotri.com/community/video/kommuna',
+ 'info_dict': {
+ 'id': 'kommuna',
+ 'title': 'КПРФ',
+ },
+ 'playlist_mincount': 4,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -289,6 +297,14 @@ class SmotriUserIE(InfoExtractor):
IE_DESC = 'Smotri.com user videos'
IE_NAME = 'smotri:user'
_VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)'
+ _TESTS = [{
+ 'url': 'http://smotri.com/user/inspector',
+ 'info_dict': {
+ 'id': 'inspector',
+ 'title': 'Inspector',
+ },
+ 'playlist_mincount': 9,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
index dc9f80550..c663e56d4 100644
--- a/youtube_dl/extractor/sockshare.py
+++ b/youtube_dl/extractor/sockshare.py
@@ -61,7 +61,10 @@ class SockshareIE(InfoExtractor):
r'<a href="([^"]*)".+class="download_file_link"',
webpage, 'file url')
video_url = "http://www.sockshare.com" + video_url
- title = self._html_search_regex(r'<h1>(.+)<strong>', webpage, 'title')
+ title = self._html_search_regex((
+ r'<h1>(.+)<strong>',
+ r'var name = "([^"]+)";'),
+ webpage, 'title', default=None)
thumbnail = self._html_search_regex(
r'<img\s+src="([^"]*)".+?name="bg"',
webpage, 'thumbnail')
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 097d0e418..b78aed7f0 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -28,7 +28,8 @@ class SoundcloudIE(InfoExtractor):
_VALID_URL = r'''(?x)^(?:https?://)?
(?:(?:(?:www\.|m\.)?soundcloud\.com/
(?P<uploader>[\w\d-]+)/
- (?!sets/)(?P<title>[\w\d-]+)/?
+ (?!sets/|likes/?(?:$|[?#]))
+ (?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
|(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
@@ -221,13 +222,16 @@ class SoundcloudIE(InfoExtractor):
class SoundcloudSetIE(SoundcloudIE):
_VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
IE_NAME = 'soundcloud:set'
- # it's in tests/test_playlists.py
- _TESTS = []
+ _TESTS = [{
+ 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
+ 'info_dict': {
+ 'title': 'The Royal Concept EP',
+ },
+ 'playlist_mincount': 6,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
# extract uploader (which is in the url)
uploader = mobj.group(1)
@@ -246,20 +250,32 @@ class SoundcloudSetIE(SoundcloudIE):
self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
return
- self.report_extraction(full_title)
- return {'_type': 'playlist',
- 'entries': [self._extract_info_dict(track) for track in info['tracks']],
- 'id': info['id'],
- 'title': info['title'],
- }
+ return {
+ '_type': 'playlist',
+ 'entries': [self._extract_info_dict(track) for track in info['tracks']],
+ 'id': info['id'],
+ 'title': info['title'],
+ }
class SoundcloudUserIE(SoundcloudIE):
_VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
IE_NAME = 'soundcloud:user'
-
- # it's in tests/test_playlists.py
- _TESTS = []
+ _TESTS = [{
+ 'url': 'https://soundcloud.com/the-concept-band',
+ 'info_dict': {
+ 'id': '9615865',
+ 'title': 'The Royal Concept',
+ },
+ 'playlist_mincount': 12
+ }, {
+ 'url': 'https://soundcloud.com/the-concept-band/likes',
+ 'info_dict': {
+ 'id': '9615865',
+ 'title': 'The Royal Concept',
+ },
+ 'playlist_mincount': 1,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -301,9 +317,18 @@ class SoundcloudUserIE(SoundcloudIE):
class SoundcloudPlaylistIE(SoundcloudIE):
_VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)'
IE_NAME = 'soundcloud:playlist'
+ _TESTS = [
- # it's in tests/test_playlists.py
- _TESTS = []
+ {
+ 'url': 'http://api.soundcloud.com/playlists/4110309',
+ 'info_dict': {
+ 'id': '4110309',
+ 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
+ 'description': 're:.*?TILT Brass - Bowery Poetry Club',
+ },
+ 'playlist_count': 6,
+ }
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 340a38440..9ed7d3b39 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import compat_urlparse
class SpiegelIE(InfoExtractor):
@@ -28,16 +29,6 @@ class SpiegelIE(InfoExtractor):
'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
'duration': 983,
},
- }, {
- 'url': 'http://www.spiegel.de/video/johann-westhauser-videobotschaft-des-hoehlenforschers-video-1502367.html',
- 'md5': '54f58ba0e752e3c07bc2a26222dd0acf',
- 'info_dict': {
- 'id': '1502367',
- 'ext': 'mp4',
- 'title': 'Videobotschaft: Höhlenforscher Westhauser dankt seinen Rettern',
- 'description': 'md5:c6f1ec11413ebd1088b6813943e5fc91',
- 'duration': 42,
- },
}]
def _real_extract(self, url):
@@ -82,3 +73,34 @@ class SpiegelIE(InfoExtractor):
'duration': duration,
'formats': formats,
}
+
+
+class SpiegelArticleIE(InfoExtractor):
+ _VALID_URL = 'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'
+ IE_NAME = 'Spiegel:Article'
+ IE_DESC = 'Articles on spiegel.de'
+ _TEST = {
+ 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
+ 'info_dict': {
+ 'id': '1516455',
+ 'ext': 'mp4',
+ 'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
+ 'description': 're:^Patrick Kämnitz gehört.{100,}',
+ },
+ }
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ video_id = m.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ video_link = self._search_regex(
+ r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage,
+ 'video page URL')
+ video_url = compat_urlparse.urljoin(
+ self.http_scheme() + '//spiegel.de/', video_link)
+
+ return {
+ '_type': 'url',
+ 'url': video_url,
+ }
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
new file mode 100644
index 000000000..185353bef
--- /dev/null
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_request,
+ parse_iso8601,
+)
+
+
+class SportDeutschlandIE(InfoExtractor):
+ _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])'
+ _TESTS = [{
+ 'url': 'http://sportdeutschland.tv/badminton/live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen',
+ 'info_dict': {
+ 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen',
+ 'ext': 'mp4',
+ 'title': 'LIVE: Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen',
+ 'categories': ['Badminton'],
+ 'view_count': int,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 're:^Die Badminton-WM 2014 aus Kopenhagen LIVE',
+ 'timestamp': int,
+ 'upload_date': 're:^201408[23][0-9]$',
+ },
+ 'params': {
+ 'skip_download': 'Live stream',
+ },
+ }, {
+ 'url': 'http://sportdeutschland.tv/li-ning-badminton-wm-2014/lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs',
+ 'info_dict': {
+ 'id': 'lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs',
+ 'ext': 'mp4',
+ 'upload_date': '20140825',
+ 'description': 'md5:60a20536b57cee7d9a4ec005e8687504',
+ 'timestamp': 1408976060,
+ 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'view_count': int,
+ 'categories': ['Li-Ning Badminton WM 2014'],
+ }
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ sport_id = mobj.group('sport')
+
+ api_url = 'http://splink.tv/api/permalinks/%s/%s' % (
+ sport_id, video_id)
+ req = compat_urllib_request.Request(api_url, headers={
+ 'Accept': 'application/vnd.vidibus.v2.html+json',
+ 'Referer': url,
+ })
+ data = self._download_json(req, video_id)
+
+ categories = list(data.get('section', {}).get('tags', {}).values())
+ asset = data['asset']
+
+ formats = []
+ smil_url = asset['video']
+ if '.smil' in smil_url:
+ m3u8_url = smil_url.replace('.smil', '.m3u8')
+ formats.extend(
+ self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'))
+
+ smil_doc = self._download_xml(
+ smil_url, video_id, note='Downloading SMIL metadata')
+ base_url = smil_doc.find('./head/meta').attrib['base']
+ formats.extend([{
+ 'format_id': 'rmtp',
+ 'url': base_url,
+ 'play_path': n.attrib['src'],
+ 'ext': 'flv',
+ 'preference': -100,
+ 'format_note': 'Seems to fail at example stream',
+ } for n in smil_doc.findall('./body/video')])
+ else:
+ formats.append({'url': smil_url})
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': asset['title'],
+ 'thumbnail': asset.get('image'),
+ 'description': asset.get('teaser'),
+ 'categories': categories,
+ 'view_count': asset.get('views'),
+ 'rtmp_live': asset.get('live'),
+ 'timestamp': parse_iso8601(asset.get('date')),
+ }
+
diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py
new file mode 100644
index 000000000..7de3c9dd5
--- /dev/null
+++ b/youtube_dl/extractor/sunporno.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+ qualities,
+ determine_ext,
+)
+
+
+class SunPornoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.sunporno.com/videos/807778/',
+ 'md5': '6457d3c165fd6de062b99ef6c2ff4c86',
+ 'info_dict': {
+ 'id': '807778',
+ 'ext': 'flv',
+ 'title': 'md5:0a400058e8105d39e35c35e7c5184164',
+ 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 302,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ description = self._html_search_meta('description', webpage, 'description')
+ thumbnail = self._html_search_regex(
+ r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+
+ duration = parse_duration(self._search_regex(
+ r'<span>Duration: (\d+:\d+)</span>', webpage, 'duration', fatal=False))
+
+ view_count = int_or_none(self._html_search_regex(
+ r'<span class="views">(\d+)</span>', webpage, 'view count', fatal=False))
+ comment_count = int_or_none(self._html_search_regex(
+ r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False))
+
+ formats = []
+ quality = qualities(['mp4', 'flv'])
+ for video_url in re.findall(r'<source src="([^"]+)"', webpage):
+ video_ext = determine_ext(video_url)
+ formats.append({
+ 'url': video_url,
+ 'format_id': video_ext,
+ 'quality': quality(video_ext),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/swrmediathek.py b/youtube_dl/extractor/swrmediathek.py
index 5d9d70367..13c6ea677 100644
--- a/youtube_dl/extractor/swrmediathek.py
+++ b/youtube_dl/extractor/swrmediathek.py
@@ -52,20 +52,6 @@ class SWRMediathekIE(InfoExtractor):
'uploader': 'SWR 2',
'uploader_id': '284670',
}
- }, {
- 'url': 'http://swrmediathek.de/content/player.htm?show=52dc7e00-15c5-11e4-84bc-0026b975f2e6',
- 'md5': '881531487d0633080a8cc88d31ef896f',
- 'info_dict': {
- 'id': '52dc7e00-15c5-11e4-84bc-0026b975f2e6',
- 'ext': 'mp4',
- 'title': 'Familienspaß am Bodensee',
- 'description': 'md5:0b591225a32cfde7be1629ed49fe4315',
- 'thumbnail': 're:http://.*\.jpg',
- 'duration': 1784,
- 'upload_date': '20140727',
- 'uploader': 'SWR Fernsehen BW',
- 'uploader_id': '281130',
- }
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py
index 46d727d1d..8a95fd656 100644
--- a/youtube_dl/extractor/teachertube.py
+++ b/youtube_dl/extractor/teachertube.py
@@ -106,6 +106,13 @@ class TeacherTubeUserIE(InfoExtractor):
\s*
<a\s+href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)"
'''
+ _TEST = {
+ 'url': 'http://www.teachertube.com/user/profile/rbhagwati2',
+ 'info_dict': {
+ 'id': 'rbhagwati2'
+ },
+ 'playlist_mincount': 179,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py
index a55f236cb..16e945d8e 100644
--- a/youtube_dl/extractor/techtalks.py
+++ b/youtube_dl/extractor/techtalks.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -11,24 +13,30 @@ class TechTalksIE(InfoExtractor):
_VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/'
_TEST = {
- u'url': u'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
- u'playlist': [
+ 'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
+ 'info_dict': {
+ 'id': '57758',
+ 'title': 'Learning Topic Models --- Going beyond SVD',
+ },
+ 'playlist': [
{
- u'file': u'57758.flv',
- u'info_dict': {
- u'title': u'Learning Topic Models --- Going beyond SVD',
+ 'info_dict': {
+ 'id': '57758',
+ 'ext': 'flv',
+ 'title': 'Learning Topic Models --- Going beyond SVD',
},
},
{
- u'file': u'57758-slides.flv',
- u'info_dict': {
- u'title': u'Learning Topic Models --- Going beyond SVD',
+ 'info_dict': {
+ 'id': '57758-slides',
+ 'ext': 'flv',
+ 'title': 'Learning Topic Models --- Going beyond SVD',
},
},
],
- u'params': {
+ 'params': {
# rtmp download
- u'skip_download': True,
+ 'skip_download': True,
},
}
@@ -36,30 +44,36 @@ class TechTalksIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
talk_id = mobj.group('id')
webpage = self._download_webpage(url, talk_id)
- rtmp_url = self._search_regex(r'netConnectionUrl: \'(.*?)\'', webpage,
- u'rtmp url')
- play_path = self._search_regex(r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
- webpage, u'presenter play path')
+ rtmp_url = self._search_regex(
+ r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url')
+ play_path = self._search_regex(
+ r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
+ webpage, 'presenter play path')
title = clean_html(get_element_by_attribute('class', 'title', webpage))
video_info = {
- 'id': talk_id,
- 'title': title,
- 'url': rtmp_url,
- 'play_path': play_path,
- 'ext': 'flv',
- }
+ 'id': talk_id,
+ 'title': title,
+ 'url': rtmp_url,
+ 'play_path': play_path,
+ 'ext': 'flv',
+ }
m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
if m_slides is None:
return video_info
else:
- return [
- video_info,
- # The slides video
- {
- 'id': talk_id + '-slides',
- 'title': title,
- 'url': rtmp_url,
- 'play_path': m_slides.group(1),
- 'ext': 'flv',
- },
- ]
+ return {
+ '_type': 'playlist',
+ 'id': talk_id,
+ 'title': title,
+ 'entries': [
+ video_info,
+ # The slides video
+ {
+ 'id': talk_id + '-slides',
+ 'title': title,
+ 'url': rtmp_url,
+ 'play_path': m_slides.group(1),
+ 'ext': 'flv',
+ },
+ ],
+ }
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index bce32a873..1cca47771 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -51,7 +51,6 @@ class TEDIE(SubtitlesInfoExtractor):
}
}, {
'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
- 'md5': '49144e345a899b8cb34d315f3b9cfeeb',
'info_dict': {
'id': '1972',
'ext': 'mp4',
@@ -59,6 +58,13 @@ class TEDIE(SubtitlesInfoExtractor):
'uploader': 'Gabby Giffords and Mark Kelly',
'description': 'md5:5174aed4d0f16021b704120360f72b92',
},
+ }, {
+ 'url': 'http://www.ted.com/playlists/who_are_the_hackers',
+ 'info_dict': {
+ 'id': '10',
+ 'title': 'Who are the hackers?',
+ },
+ 'playlist_mincount': 6,
}]
_NATIVE_FORMATS = {
diff --git a/youtube_dl/extractor/telemb.py b/youtube_dl/extractor/telemb.py
new file mode 100644
index 000000000..1bbd0e7bd
--- /dev/null
+++ b/youtube_dl/extractor/telemb.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import remove_start
+
+
+class TeleMBIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?telemb\.be/(?P<display_id>.+?)_d_(?P<id>\d+)\.html'
+ _TESTS = [
+ {
+ 'url': 'http://www.telemb.be/mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-_d_13466.html',
+ 'md5': 'f45ea69878516ba039835794e0f8f783',
+ 'info_dict': {
+ 'id': '13466',
+ 'display_id': 'mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-',
+ 'ext': 'mp4',
+ 'title': 'Mons - Cook with Danielle : des cours de cuisine en anglais ! - Les reportages',
+ 'description': 'md5:bc5225f47b17c309761c856ad4776265',
+ 'thumbnail': 're:^http://.*\.(?:jpg|png)$',
+ }
+ },
+ {
+ # non-ASCII characters in download URL
+ 'url': 'http://telemb.be/les-reportages-havre-incendie-mortel_d_13514.html',
+ 'md5': '6e9682736e5ccd4eab7f21e855350733',
+ 'info_dict': {
+ 'id': '13514',
+ 'display_id': 'les-reportages-havre-incendie-mortel',
+ 'ext': 'mp4',
+ 'title': 'Havré - Incendie mortel - Les reportages',
+ 'description': 'md5:5e54cb449acb029c2b7734e2d946bd4a',
+ 'thumbnail': 're:^http://.*\.(?:jpg|png)$',
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ formats = []
+ for video_url in re.findall(r'file\s*:\s*"([^"]+)"', webpage):
+ fmt = {
+ 'url': video_url,
+ 'format_id': video_url.split(':')[0]
+ }
+ rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url)
+ if rtmp:
+ fmt.update({
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'player_url': 'http://p.jwpcdn.com/6/10/jwplayer.flash.swf',
+ 'page_url': 'http://www.telemb.be',
+ 'preference': -1,
+ })
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ title = remove_start(self._og_search_title(webpage), 'TéléMB : ')
+ description = self._html_search_regex(
+ r'<meta property="og:description" content="(.+?)" />',
+ webpage, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
new file mode 100644
index 000000000..4956f8577
--- /dev/null
+++ b/youtube_dl/extractor/tnaflix.py
@@ -0,0 +1,84 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ fix_xml_ampersands,
+)
+
+
+class TNAFlixIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+
+ _TITLE_REGEX = None
+ _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
+ _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
+
+ _TEST = {
+ 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+ 'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+ 'info_dict': {
+ 'id': '553878',
+ 'display_id': 'Carmella-Decesare-striptease',
+ 'ext': 'mp4',
+ 'title': 'Carmella Decesare - striptease',
+ 'description': '',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'duration': 91,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._html_search_regex(
+ self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
+ description = self._html_search_regex(
+ self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='')
+
+ age_limit = self._rta_search(webpage)
+
+ duration = self._html_search_meta('duration', webpage, 'duration', default=None)
+ if duration:
+ duration = parse_duration(duration[1:])
+
+ cfg_url = self._html_search_regex(
+ self._CONFIG_REGEX, webpage, 'flashvars.config')
+
+ cfg_xml = self._download_xml(
+ cfg_url, display_id, note='Downloading metadata',
+ transform_source=fix_xml_ampersands)
+
+ thumbnail = cfg_xml.find('./startThumb').text
+
+ formats = []
+ for item in cfg_xml.findall('./quality/item'):
+ video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
+ format_id = item.find('res').text
+ fmt = {
+ 'url': video_url,
+ 'format_id': format_id,
+ }
+ m = re.search(r'^(\d+)', format_id)
+ if m:
+ fmt['height'] = int(m.group(1))
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'age_limit': age_limit,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py
index 0f389bd93..2756f56d3 100644
--- a/youtube_dl/extractor/toypics.py
+++ b/youtube_dl/extractor/toypics.py
@@ -42,6 +42,13 @@ class ToypicsIE(InfoExtractor):
class ToypicsUserIE(InfoExtractor):
IE_DESC = 'Toypics user profile'
_VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
+ _TEST = {
+ 'url': 'http://videos.toypics.net/Mikey',
+ 'info_dict': {
+ 'id': 'Mikey',
+ },
+ 'playlist_mincount': 19,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index 7a3891b89..dcd823d08 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -1,5 +1,7 @@
# coding: utf-8
+from __future__ import unicode_literals
+
import re
import json
@@ -9,22 +11,29 @@ from .common import InfoExtractor
class TudouIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
_TESTS = [{
- u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
- u'file': u'159448201.f4v',
- u'md5': u'140a49ed444bd22f93330985d8475fcb',
- u'info_dict': {
- u"title": u"卡马乔国足开大脚长传冲吊集锦"
+ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
+ 'md5': '140a49ed444bd22f93330985d8475fcb',
+ 'info_dict': {
+ 'id': '159448201',
+ 'ext': 'f4v',
+ 'title': '卡马乔国足开大脚长传冲吊集锦',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
+ 'info_dict': {
+ 'id': '117049447',
+ 'ext': 'f4v',
+ 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
+ 'thumbnail': 're:^https?://.*\.jpg$',
}
- },
- {
- u'url': u'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html',
- u'file': u'todo.mp4',
- u'md5': u'todo.mp4',
- u'info_dict': {
- u'title': u'todo.mp4',
+ }, {
+ 'url': 'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html',
+ 'info_dict': {
+ 'title': 'todo.mp4',
},
- u'add_ie': [u'Youku'],
- u'skip': u'Only works from China'
+ 'add_ie': ['Youku'],
+ 'skip': 'Only works from China'
}]
def _url_for_id(self, id, quality = None):
@@ -44,20 +53,22 @@ class TudouIE(InfoExtractor):
if m and m.group(1):
return {
'_type': 'url',
- 'url': u'youku:' + m.group(1),
+ 'url': 'youku:' + m.group(1),
'ie_key': 'Youku'
}
title = self._search_regex(
- r",kw:\s*['\"](.+?)[\"']", webpage, u'title')
+ r",kw:\s*['\"](.+?)[\"']", webpage, 'title')
thumbnail_url = self._search_regex(
- r",pic:\s*[\"'](.+?)[\"']", webpage, u'thumbnail URL', fatal=False)
+ r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False)
segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
segments = json.loads(segs_json)
# It looks like the keys are the arguments that have to be passed as
# the hd field in the request url, we pick the higher
- quality = sorted(segments.keys())[-1]
+ # Also, filter non-number qualities (see issue #3643).
+ quality = sorted(filter(lambda k: k.isdigit(), segments.keys()),
+ key=lambda k: int(k))[-1]
parts = segments[quality]
result = []
len_parts = len(parts)
@@ -67,12 +78,13 @@ class TudouIE(InfoExtractor):
part_id = part['k']
final_url = self._url_for_id(part_id, quality)
ext = (final_url.split('?')[0]).split('.')[-1]
- part_info = {'id': part_id,
- 'url': final_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': thumbnail_url,
- }
+ part_info = {
+ 'id': '%s' % part_id,
+ 'url': final_url,
+ 'ext': ext,
+ 'title': title,
+ 'thumbnail': thumbnail_url,
+ }
result.append(part_info)
return result
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 2882c1809..306fe8974 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -10,7 +10,7 @@ from ..utils import (
class TumblrIE(InfoExtractor):
- _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)($|/)'
+ _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
_TESTS = [{
'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
'md5': '479bb068e5b16462f5176a6828829767',
@@ -56,13 +56,15 @@ class TumblrIE(InfoExtractor):
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
- video_title = self._html_search_regex(r'<title>(?P<title>.*?)(?: \| Tumblr)?</title>',
- webpage, 'title', flags=re.DOTALL)
+ video_title = self._html_search_regex(
+ r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>',
+ webpage, 'title')
- return [{'id': video_id,
- 'url': video_url,
- 'title': video_title,
- 'description': self._html_search_meta('description', webpage),
- 'thumbnail': video_thumbnail,
- 'ext': ext
- }]
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'description': self._html_search_meta('description', webpage),
+ 'thumbnail': video_thumbnail,
+ 'ext': ext,
+ }
diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py
new file mode 100644
index 000000000..29703a8a9
--- /dev/null
+++ b/youtube_dl/extractor/turbo.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ qualities,
+ xpath_text,
+)
+
+
+class TurboIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?turbo\.fr/videos-voiture/(?P<id>[0-9]+)-'
+ _API_URL = 'http://www.turbo.fr/api/tv/xml.php?player_generique=player_generique&id={0:}'
+ _TEST = {
+ 'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html',
+ 'md5': '33f4b91099b36b5d5a91f84b5bcba600',
+ 'info_dict': {
+ 'id': '454443',
+ 'ext': 'mp4',
+ 'duration': 3715,
+ 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+ 'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ playlist = self._download_xml(self._API_URL.format(video_id), video_id)
+ item = playlist.find('./channel/item')
+ if item is None:
+ raise ExtractorError('Playlist item was not found', expected=True)
+
+ title = xpath_text(item, './title', 'title')
+ duration = int_or_none(xpath_text(item, './durate', 'duration'))
+ thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
+ description = self._og_search_description(webpage)
+
+ formats = []
+ get_quality = qualities(['3g', 'sd', 'hq'])
+ for child in item:
+ m = re.search(r'url_video_(?P<quality>.+)', child.tag)
+ if m:
+ quality = m.group('quality')
+ formats.append({
+ 'format_id': quality,
+ 'url': child.text,
+ 'quality': get_quality(quality),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index 0921cc5f8..dc8697850 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -5,80 +5,82 @@ import re
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
- clean_html,
- int_or_none,
+ float_or_none,
+ str_to_int,
)
class TvigleIE(InfoExtractor):
IE_NAME = 'tvigle'
IE_DESC = 'Интернет-телевидение Tvigle.ru'
- _VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)'
+ _VALID_URL = r'http://(?:www\.)?tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$'
_TESTS = [
{
- 'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
- 'md5': '09afba4616666249f087efc6dcf83cb3',
+ 'url': 'http://www.tvigle.ru/video/brat-2/',
+ 'md5': '72cb7eab33e54314e1790da402d3c9c3',
'info_dict': {
- 'id': '503081',
- 'ext': 'flv',
+ 'id': '5119390',
+ 'display_id': 'brat-2',
+ 'ext': 'mp4',
'title': 'Брат 2 ',
- 'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
- 'upload_date': '20110919',
+ 'description': 'md5:5751f4fe345a58e1692585c361294bd8',
+ 'duration': 7356.369,
+ 'age_limit': 0,
},
},
{
- 'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433',
- 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
+ 'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
+ 'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574',
'info_dict': {
- 'id': '676433',
- 'ext': 'flv',
+ 'id': '5142516',
+ 'ext': 'mp4',
'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
'description': 'md5:027f7dc872948f14c96d19b4178428a4',
- 'upload_date': '20121218',
+ 'duration': 186.080,
+ 'age_limit': 0,
},
},
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
- video_data = self._download_xml(
- 'http://www.tvigle.ru/xml/single.php?obj=%s' % video_id, video_id, 'Downloading video XML')
+ webpage = self._download_webpage(url, display_id)
- video = video_data.find('./video')
+ video_id = self._html_search_regex(
+ r'<li class="video-preview current_playing" id="(\d+)">', webpage, 'video id')
- title = video.get('name')
- description = video.get('anons')
- if description:
- description = clean_html(description)
- thumbnail = video_data.get('img')
- upload_date = unified_strdate(video.get('date'))
- like_count = int_or_none(video.get('vtp'))
+ video_data = self._download_json(
+ 'http://cloud.tvigle.ru/api/play/video/%s/' % video_id, display_id)
- formats = []
- for num, (format_id, format_note) in enumerate([['low_file', 'SQ'], ['file', 'HQ'], ['hd', 'HD 720']]):
- video_url = video.get(format_id)
- if not video_url:
- continue
- formats.append({
- 'url': video_url,
- 'format_id': format_id,
- 'format_note': format_note,
- 'quality': num,
- })
+ item = video_data['playlist']['items'][0]
+
+ title = item['title']
+ description = item['description']
+ thumbnail = item['thumbnail']
+ duration = float_or_none(item['durationMilliseconds'], 1000)
+ age_limit = str_to_int(item['ageRestrictions'])
+ formats = []
+ for vcodec, fmts in item['videos'].items():
+ for quality, video_url in fmts.items():
+ formats.append({
+ 'url': video_url,
+ 'format_id': '%s-%s' % (vcodec, quality),
+ 'vcodec': vcodec,
+ 'height': int(quality[:-1]),
+ })
self._sort_formats(formats)
return {
'id': video_id,
+ 'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'like_count': like_count,
- 'age_limit': 18,
+ 'duration': duration,
+ 'age_limit': age_limit,
'formats': formats,
} \ No newline at end of file
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index a56a7ab5f..445e0ec41 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -6,13 +6,28 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ compat_str,
parse_iso8601,
qualities,
)
class TVPlayIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?tvplay\.lv/parraides/[^/]+/(?P<id>\d+)'
+ IE_DESC = 'TV3Play and related services'
+ _VALID_URL = r'''(?x)http://(?:www\.)?
+ (?:tvplay\.lv/parraides|
+ tv3play\.lt/programos|
+ tv3play\.ee/sisu|
+ tv3play\.se/program|
+ tv6play\.se/program|
+ tv8play\.se/program|
+ tv10play\.se/program|
+ tv3play\.no/programmer|
+ viasat4play\.no/programmer|
+ tv6play\.no/programmer|
+ tv3play\.dk/programmer|
+ )/[^/]+/(?P<id>\d+)
+ '''
_TESTS = [
{
'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true',
@@ -30,6 +45,134 @@ class TVPlayIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://www.tv3play.lt/programos/moterys-meluoja-geriau/409229?autostart=true',
+ 'info_dict': {
+ 'id': '409229',
+ 'ext': 'flv',
+ 'title': 'Moterys meluoja geriau',
+ 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e',
+ 'duration': 1330,
+ 'timestamp': 1403769181,
+ 'upload_date': '20140626',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.tv3play.ee/sisu/kodu-keset-linna/238551?autostart=true',
+ 'info_dict': {
+ 'id': '238551',
+ 'ext': 'flv',
+ 'title': 'Kodu keset linna 398537',
+ 'description': 'md5:7df175e3c94db9e47c0d81ffa5d68701',
+ 'duration': 1257,
+ 'timestamp': 1292449761,
+ 'upload_date': '20101215',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true',
+ 'info_dict': {
+ 'id': '395385',
+ 'ext': 'flv',
+ 'title': 'Husräddarna S02E07',
+ 'description': 'md5:f210c6c89f42d4fc39faa551be813777',
+ 'duration': 2574,
+ 'timestamp': 1400596321,
+ 'upload_date': '20140520',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true',
+ 'info_dict': {
+ 'id': '266636',
+ 'ext': 'flv',
+ 'title': 'Den sista dokusåpan S01E08',
+ 'description': 'md5:295be39c872520221b933830f660b110',
+ 'duration': 1492,
+ 'timestamp': 1330522854,
+ 'upload_date': '20120229',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true',
+ 'info_dict': {
+ 'id': '282756',
+ 'ext': 'flv',
+ 'title': 'Antikjakten S01E10',
+ 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8',
+ 'duration': 2646,
+ 'timestamp': 1348575868,
+ 'upload_date': '20120925',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true',
+ 'info_dict': {
+ 'id': '230898',
+ 'ext': 'flv',
+ 'title': 'Anna Anka søker assistent - Ep. 8',
+ 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474',
+ 'duration': 2656,
+ 'timestamp': 1277720005,
+ 'upload_date': '20100628',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true',
+ 'info_dict': {
+ 'id': '21873',
+ 'ext': 'flv',
+ 'title': 'Budbringerne program 10',
+ 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d',
+ 'duration': 1297,
+ 'timestamp': 1254205102,
+ 'upload_date': '20090929',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true',
+ 'info_dict': {
+ 'id': '361883',
+ 'ext': 'flv',
+ 'title': 'Hotelinspektør Alex Polizzi - Ep. 10',
+ 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81',
+ 'duration': 2594,
+ 'timestamp': 1393236292,
+ 'upload_date': '20140224',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
]
def _real_extract(self, url):
@@ -49,7 +192,7 @@ class TVPlayIE(InfoExtractor):
quality = qualities(['hls', 'medium', 'high'])
formats = []
for format_id, video_url in streams['streams'].items():
- if not video_url:
+ if not video_url or not isinstance(video_url, compat_str):
continue
fmt = {
'format_id': format_id,
diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py
index 474610eec..f70978299 100644
--- a/youtube_dl/extractor/unistra.py
+++ b/youtube_dl/extractor/unistra.py
@@ -1,32 +1,66 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
+from ..utils import qualities
+
class UnistraIE(InfoExtractor):
- _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(\d+)'
-
- _TEST = {
- u'url': u'http://utv.unistra.fr/video.php?id_video=154',
- u'file': u'154.mp4',
- u'md5': u'736f605cfdc96724d55bb543ab3ced24',
- u'info_dict': {
- u'title': u'M!ss Yella',
- u'description': u'md5:104892c71bd48e55d70b902736b81bbf',
+ _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://utv.unistra.fr/video.php?id_video=154',
+ 'md5': '736f605cfdc96724d55bb543ab3ced24',
+ 'info_dict': {
+ 'id': '154',
+ 'ext': 'mp4',
+ 'title': 'M!ss Yella',
+ 'description': 'md5:104892c71bd48e55d70b902736b81bbf',
+ },
},
- }
+ {
+ 'url': 'http://utv.unistra.fr/index.php?id_video=437',
+ 'md5': '1ddddd6cccaae76f622ce29b8779636d',
+ 'info_dict': {
+ 'id': '437',
+ 'ext': 'mp4',
+ 'title': 'Prix Louise Weiss 2014',
+ 'description': 'md5:cc3a8735f079f4fb6b0b570fc10c135a',
+ },
+ }
+ ]
def _real_extract(self, url):
- id = re.match(self._VALID_URL, url).group(1)
- webpage = self._download_webpage(url, id)
- file = re.search(r'file: "(.*?)",', webpage).group(1)
- title = self._html_search_regex(r'<title>UTV - (.*?)</', webpage, u'title')
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
- video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file
+ webpage = self._download_webpage(url, video_id)
- return {'id': id,
- 'title': title,
- 'ext': 'mp4',
- 'url': video_url,
- 'description': self._html_search_regex(r'<meta name="Description" content="(.*?)"', webpage, u'description', flags=re.DOTALL),
- 'thumbnail': self._search_regex(r'image: "(.*?)"', webpage, u'thumbnail'),
- }
+ files = set(re.findall(r'file\s*:\s*"([^"]+)"', webpage))
+
+ quality = qualities(['SD', 'HD'])
+ formats = []
+ for file_path in files:
+ format_id = 'HD' if file_path.endswith('-HD.mp4') else 'SD'
+ formats.append({
+ 'url': 'http://vod-flash.u-strasbg.fr:8080%s' % file_path,
+ 'format_id': format_id,
+ 'quality': quality(format_id)
+ })
+
+ title = self._html_search_regex(
+ r'<title>UTV - (.*?)</', webpage, 'title')
+ description = self._html_search_regex(
+ r'<meta name="Description" content="(.*?)"', webpage, 'description', flags=re.DOTALL)
+ thumbnail = self._search_regex(
+ r'image: "(.*?)"', webpage, 'thumbnail')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index 488b10df9..994b60a76 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
@@ -68,21 +67,36 @@ class UstreamIE(InfoExtractor):
class UstreamChannelIE(InfoExtractor):
_VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)'
IE_NAME = 'ustream:channel'
+ _TEST = {
+ 'url': 'http://www.ustream.tv/channel/channeljapan',
+ 'info_dict': {
+ 'id': '10874166',
+ },
+ 'playlist_mincount': 54,
+ }
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
- slug = m.group('slug')
- webpage = self._download_webpage(url, slug)
+ display_id = m.group('slug')
+ webpage = self._download_webpage(url, display_id)
channel_id = get_meta_content('ustream:channel_id', webpage)
BASE = 'http://www.ustream.tv'
next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
video_ids = []
while next_url:
- reply = json.loads(self._download_webpage(compat_urlparse.urljoin(BASE, next_url), channel_id))
+ reply = self._download_json(
+ compat_urlparse.urljoin(BASE, next_url), display_id,
+ note='Downloading video information (next: %d)' % (len(video_ids) + 1))
video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data']))
next_url = reply['nextUrl']
- urls = ['http://www.ustream.tv/recorded/' + vid for vid in video_ids]
- url_entries = [self.url_result(eurl, 'Ustream') for eurl in urls]
- return self.playlist_result(url_entries, channel_id)
+ entries = [
+ self.url_result('http://www.ustream.tv/recorded/' + vid, 'Ustream')
+ for vid in video_ids]
+ return {
+ '_type': 'playlist',
+ 'id': channel_id,
+ 'display_id': display_id,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
index b1c854a64..77b1f91ce 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/youtube_dl/extractor/veehd.py
@@ -16,8 +16,9 @@ class VeeHDIE(InfoExtractor):
_TEST = {
'url': 'http://veehd.com/video/4686958',
- 'file': '4686958.mp4',
'info_dict': {
+ 'id': '4686958',
+ 'ext': 'mp4',
'title': 'Time Lapse View from Space ( ISS)',
'uploader_id': 'spotted',
'description': 'md5:f0094c4cf3a72e22bc4e4239ef767ad7',
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
new file mode 100644
index 000000000..7d27d6c57
--- /dev/null
+++ b/youtube_dl/extractor/vgtv.py
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class VGTVIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?vgtv\.no/#!/(?:.*)/(?P<id>[0-9]+)'
+ _TESTS = [
+ {
+ # streamType: vod
+ 'url': 'http://www.vgtv.no/#!/video/84196/hevnen-er-soet-episode-10-abu',
+ 'md5': 'b8be7a234cebb840c0d512c78013e02f',
+ 'info_dict': {
+ 'id': '84196',
+ 'ext': 'mp4',
+ 'title': 'Hevnen er søt episode 10: Abu',
+ 'description': 'md5:e25e4badb5f544b04341e14abdc72234',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 648.000,
+ 'timestamp': 1404626400,
+ 'upload_date': '20140706',
+ 'view_count': int,
+ },
+ },
+ {
+ # streamType: wasLive
+ 'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen',
+ 'info_dict': {
+ 'id': '100764',
+ 'ext': 'mp4',
+ 'title': 'OPPTAK: VGTV følger EM-kvalifiseringen',
+ 'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 9056.000,
+ 'timestamp': 1410113864,
+ 'upload_date': '20140907',
+ 'view_count': int,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ # streamType: live
+ 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+ 'info_dict': {
+ 'id': '100015',
+ 'ext': 'mp4',
+ 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
+ 'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 0,
+ 'timestamp': 1407423348,
+ 'upload_date': '20140807',
+ 'view_count': int,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ data = self._download_json(
+ 'http://svp.vg.no/svp/api/v1/vgtv/assets/%s?appName=vgtv-website' % video_id,
+ video_id, 'Downloading media JSON')
+
+ streams = data['streamUrls']
+
+ formats = []
+
+ hls_url = streams.get('hls')
+ if hls_url:
+ formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4'))
+
+ hds_url = streams.get('hds')
+ if hds_url:
+ formats.extend(self._extract_f4m_formats(hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id))
+
+ mp4_url = streams.get('mp4')
+ if mp4_url:
+ _url = hls_url or hds_url
+ MP4_URL_TEMPLATE = '%s/%%s.%s' % (mp4_url.rpartition('/')[0], mp4_url.rpartition('.')[-1])
+ for mp4_format in _url.split(','):
+ m = re.search('(?P<width>\d+)_(?P<height>\d+)_(?P<vbr>\d+)', mp4_format)
+ if not m:
+ continue
+ width = int(m.group('width'))
+ height = int(m.group('height'))
+ vbr = int(m.group('vbr'))
+ formats.append({
+ 'url': MP4_URL_TEMPLATE % mp4_format,
+ 'format_id': 'mp4-%s' % vbr,
+ 'width': width,
+ 'height': height,
+ 'vbr': vbr,
+ 'preference': 1,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': data['title'],
+ 'description': data['description'],
+ 'thumbnail': data['images']['main'] + '?t[]=900x506q80',
+ 'timestamp': data['published'],
+ 'duration': float_or_none(data['duration'], 1000),
+ 'view_count': data['displays'],
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 55f6cd0d8..bc01d7fbf 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -57,6 +57,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
(?P<proto>(?:https?:)?//)?
(?:(?:www|(?P<player>player))\.)?
vimeo(?P<pro>pro)?\.com/
+ (?!channels/[^/?#]+/?(?:$|[?#])|album/)
(?:.*?/)?
(?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)?
(?:videos?/)?
@@ -151,30 +152,8 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'duration': 62,
}
},
- {
- 'note': 'video player needs Referer',
- 'url': 'http://vimeo.com/user22258446/review/91613211/13f927e053',
- 'md5': '6295fdab8f4bf6a002d058b2c6dce276',
- 'info_dict': {
- 'id': '91613211',
- 'ext': 'mp4',
- 'title': 'Death by dogma versus assembling agile - Sander Hoogendoorn',
- 'uploader': 'DevWeek Events',
- 'duration': 2773,
- 'thumbnail': 're:^https?://.*\.jpg$',
- }
- }
]
- @classmethod
- def suitable(cls, url):
- if VimeoChannelIE.suitable(url):
- # Otherwise channel urls like http://vimeo.com/channels/31259 would
- # match
- return False
- else:
- return super(VimeoIE, cls).suitable(url)
-
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
@@ -393,9 +372,16 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
class VimeoChannelIE(InfoExtractor):
IE_NAME = 'vimeo:channel'
- _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)/?(\?.*)?$'
+ _VALID_URL = r'https?://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
_TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
+ _TESTS = [{
+ 'url': 'http://vimeo.com/channels/tributes',
+ 'info_dict': {
+ 'title': 'Vimeo Tributes',
+ },
+ 'playlist_mincount': 25,
+ }]
def _page_url(self, base_url, pagenum):
return '%s/videos/page:%d/' % (base_url, pagenum)
@@ -429,14 +415,15 @@ class VimeoChannelIE(InfoExtractor):
class VimeoUserIE(VimeoChannelIE):
IE_NAME = 'vimeo:user'
- _VALID_URL = r'(?:https?://)?vimeo\.com/(?P<name>[^/]+)(?:/videos|[#?]|$)'
+ _VALID_URL = r'https?://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
_TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
-
- @classmethod
- def suitable(cls, url):
- if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url) or VimeoAlbumIE.suitable(url) or VimeoGroupsIE.suitable(url):
- return False
- return super(VimeoUserIE, cls).suitable(url)
+ _TESTS = [{
+ 'url': 'http://vimeo.com/nkistudio/videos',
+ 'info_dict': {
+ 'title': 'Nki',
+ },
+ 'playlist_mincount': 66,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -446,8 +433,15 @@ class VimeoUserIE(VimeoChannelIE):
class VimeoAlbumIE(VimeoChannelIE):
IE_NAME = 'vimeo:album'
- _VALID_URL = r'(?:https?://)?vimeo\.com/album/(?P<id>\d+)'
+ _VALID_URL = r'https?://vimeo\.com/album/(?P<id>\d+)'
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
+ _TESTS = [{
+ 'url': 'http://vimeo.com/album/2632481',
+ 'info_dict': {
+ 'title': 'Staff Favorites: November 2013',
+ },
+ 'playlist_mincount': 13,
+ }]
def _page_url(self, base_url, pagenum):
return '%s/page:%d/' % (base_url, pagenum)
@@ -461,6 +455,13 @@ class VimeoAlbumIE(VimeoChannelIE):
class VimeoGroupsIE(VimeoAlbumIE):
IE_NAME = 'vimeo:group'
_VALID_URL = r'(?:https?://)?vimeo\.com/groups/(?P<name>[^/]+)'
+ _TESTS = [{
+ 'url': 'http://vimeo.com/groups/rolexawards',
+ 'info_dict': {
+ 'title': 'Rolex Awards for Enterprise',
+ },
+ 'playlist_mincount': 73,
+ }]
def _extract_list_title(self, webpage):
return self._og_search_title(webpage)
@@ -474,8 +475,8 @@ class VimeoGroupsIE(VimeoAlbumIE):
class VimeoReviewIE(InfoExtractor):
IE_NAME = 'vimeo:review'
IE_DESC = 'Review pages on vimeo'
- _VALID_URL = r'(?:https?://)?vimeo\.com/[^/]+/review/(?P<id>[^/]+)'
- _TEST = {
+ _VALID_URL = r'https?://vimeo\.com/[^/]+/review/(?P<id>[^/]+)'
+ _TESTS = [{
'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',
'file': '75524534.mp4',
'md5': 'c507a72f780cacc12b2248bb4006d253',
@@ -483,7 +484,19 @@ class VimeoReviewIE(InfoExtractor):
'title': "DICK HARDWICK 'Comedian'",
'uploader': 'Richard Hardwick',
}
- }
+ }, {
+ 'note': 'video player needs Referer',
+ 'url': 'http://vimeo.com/user22258446/review/91613211/13f927e053',
+ 'md5': '6295fdab8f4bf6a002d058b2c6dce276',
+ 'info_dict': {
+ 'id': '91613211',
+ 'ext': 'mp4',
+ 'title': 'Death by dogma versus assembling agile - Sander Hoogendoorn',
+ 'uploader': 'DevWeek Events',
+ 'duration': 2773,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -498,6 +511,10 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
_VALID_URL = r'https?://vimeo\.com/home/watchlater|:vimeowatchlater'
_LOGIN_REQUIRED = True
_TITLE_RE = r'href="/home/watchlater".*?>(.*?)<'
+ _TESTS = [{
+ 'url': 'http://vimeo.com/home/watchlater',
+ 'only_matching': True,
+ }]
def _real_initialize(self):
self._login()
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index 076c87119..e7754158d 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -65,6 +65,13 @@ class VineUserIE(InfoExtractor):
IE_NAME = 'vine:user'
_VALID_URL = r'(?:https?://)?vine\.co/(?P<user>[^/]+)/?(\?.*)?$'
_VINE_BASE_URL = "https://vine.co/"
+ _TEST = {
+ 'url': 'https://vine.co/Visa',
+ 'info_dict': {
+ 'id': 'Visa',
+ },
+ 'playlist_mincount': 47,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py
new file mode 100644
index 000000000..2d23effcc
--- /dev/null
+++ b/youtube_dl/extractor/vporn.py
@@ -0,0 +1,125 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ str_to_int,
+)
+
+
+class VpornIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P<display_id>[^/]+)/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/',
+ 'md5': 'facf37c1b86546fa0208058546842c55',
+ 'info_dict': {
+ 'id': '497944',
+ 'display_id': 'violet-on-her-th-birthday',
+ 'ext': 'mp4',
+ 'title': 'Violet on her 19th birthday',
+ 'description': 'Violet dances in front of the camera which is sure to get you horny.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'kileyGrope',
+ 'categories': ['Masturbation', 'Teen'],
+ 'duration': 393,
+ 'age_limit': 18,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ }
+ },
+ {
+ 'url': 'http://www.vporn.com/female/hana-shower/523564/',
+ 'md5': 'ced35a4656198a1664cf2cda1575a25f',
+ 'info_dict': {
+ 'id': '523564',
+ 'display_id': 'hana-shower',
+ 'ext': 'mp4',
+ 'title': 'Hana Shower',
+ 'description': 'Hana showers at the bathroom.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Hmmmmm',
+ 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female'],
+ 'duration': 588,
+ 'age_limit': 18,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._html_search_regex(
+ r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip()
+ description = self._html_search_regex(
+ r'<div class="description_txt">(.*?)</div>', webpage, 'description', fatal=False)
+ thumbnail = self._html_search_regex(
+ r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None)
+ if thumbnail:
+ thumbnail = 'http://www.vporn.com' + thumbnail
+
+ uploader = self._html_search_regex(
+ r'(?s)UPLOADED BY.*?<a href="/user/[^"]+">([^<]+)</a>',
+ webpage, 'uploader', fatal=False)
+
+ categories = re.findall(r'<a href="/cat/[^"]+">([^<]+)</a>', webpage)
+
+ duration = parse_duration(self._search_regex(
+ r'duration (\d+ min \d+ sec)', webpage, 'duration', fatal=False))
+
+ view_count = str_to_int(self._html_search_regex(
+ r'<span>([\d,\.]+) VIEWS</span>', webpage, 'view count', fatal=False))
+ like_count = str_to_int(self._html_search_regex(
+ r'<span id="like" class="n">([\d,\.]+)</span>', webpage, 'like count', fatal=False))
+ dislike_count = str_to_int(self._html_search_regex(
+ r'<span id="dislike" class="n">([\d,\.]+)</span>', webpage, 'dislike count', fatal=False))
+ comment_count = str_to_int(self._html_search_regex(
+ r'<h4>Comments \(<b>([\d,\.]+)</b>\)</h4>', webpage, 'comment count', fatal=False))
+
+ formats = []
+
+ for video in re.findall(r'flashvars\.videoUrl([^=]+?)\s*=\s*"(https?://[^"]+)"', webpage):
+ video_url = video[1]
+ fmt = {
+ 'url': video_url,
+ 'format_id': video[0],
+ }
+ m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)_(?P<vbr>\d+)k\.mp4$', video_url)
+ if m:
+ fmt.update({
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ 'vbr': int(m.group('vbr')),
+ })
+ formats.append(fmt)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'categories': categories,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'comment_count': comment_count,
+ 'age_limit': 18,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index cb8f0887d..88bbbb219 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -13,6 +13,9 @@ class WashingtonPostIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
_TEST = {
'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
+ 'info_dict': {
+ 'title': 'Sinkhole of bureaucracy',
+ },
'playlist': [{
'md5': 'c3f4b4922ffa259243f68e928db2db8c',
'info_dict': {
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 00b6d1eba..4e8fbde8d 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -18,7 +18,6 @@ class XHamsterIE(InfoExtractor):
_TESTS = [
{
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
- 'md5': '8281348b8d3c53d39fffb377d24eac4e',
'info_dict': {
'id': '1509445',
'ext': 'mp4',
@@ -31,7 +30,6 @@ class XHamsterIE(InfoExtractor):
},
{
'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
- 'md5': '4cbd8d56708ecb4fb4124c23e4acb81a',
'info_dict': {
'id': '2221348',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index b293e2665..273d93d9e 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -77,9 +77,17 @@ class XTubeIE(InfoExtractor):
'age_limit': 18,
}
+
class XTubeUserIE(InfoExtractor):
IE_DESC = 'XTube user profile'
_VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])'
+ _TEST = {
+ 'url': 'http://www.xtube.com/community/profile.php?user=greenshowers',
+ 'info_dict': {
+ 'id': 'greenshowers',
+ },
+ 'playlist_mincount': 155,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 0e3b33b16..3ab6017cd 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -71,7 +71,8 @@ class YahooIE(InfoExtractor):
if items_json is None:
CONTENT_ID_REGEXES = [
r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
- r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"'
+ r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
+ r'"first_videoid"\s*:\s*"([^"]+)"',
]
long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
video_id = long_id
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index fcb5ff758..b86331e3c 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -9,7 +9,7 @@ from ..utils import (
class YouJizzIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
+ _VALID_URL = r'^https?://(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
_TEST = {
'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
'file': '2189178.flv',
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index d456c4da5..7bfda45e7 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -23,7 +23,6 @@ class YouPornIE(InfoExtractor):
_VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
_TEST = {
'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
- 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
'info_dict': {
'id': '505835',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 75044d71a..b54c69122 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1,7 +1,8 @@
# coding: utf-8
-import errno
-import io
+from __future__ import unicode_literals
+
+
import itertools
import json
import os.path
@@ -21,7 +22,6 @@ from ..utils import (
compat_str,
clean_html,
- get_cachedir,
get_element_by_id,
get_element_by_attribute,
ExtractorError,
@@ -30,7 +30,6 @@ from ..utils import (
unescapeHTML,
unified_strdate,
orderedSet,
- write_json_file,
uppercase_escape,
)
@@ -73,29 +72,29 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return
galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
- login_page, u'Login GALX parameter')
+ login_page, 'Login GALX parameter')
# Log in
login_form_strs = {
- u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- u'Email': username,
- u'GALX': galx,
- u'Passwd': password,
-
- u'PersistentCookie': u'yes',
- u'_utf8': u'霱',
- u'bgresponse': u'js_disabled',
- u'checkConnection': u'',
- u'checkedDomains': u'youtube',
- u'dnConn': u'',
- u'pstMsg': u'0',
- u'rmShown': u'1',
- u'secTok': u'',
- u'signIn': u'Sign in',
- u'timeStmp': u'',
- u'service': u'youtube',
- u'uilel': u'3',
- u'hl': u'en_US',
+ 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+ 'Email': username,
+ 'GALX': galx,
+ 'Passwd': password,
+
+ 'PersistentCookie': 'yes',
+ '_utf8': '霱',
+ 'bgresponse': 'js_disabled',
+ 'checkConnection': '',
+ 'checkedDomains': 'youtube',
+ 'dnConn': '',
+ 'pstMsg': '0',
+ 'rmShown': '1',
+ 'secTok': '',
+ 'signIn': 'Sign in',
+ 'timeStmp': '',
+ 'service': 'youtube',
+ 'uilel': '3',
+ 'hl': 'en_US',
}
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
@@ -136,19 +135,19 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
timeStmp = match.group(1)
tfa_form_strs = {
- u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- u'smsToken': u'',
- u'smsUserPin': tfa_code,
- u'smsVerifyPin': u'Verify',
-
- u'PersistentCookie': u'yes',
- u'checkConnection': u'',
- u'checkedDomains': u'youtube',
- u'pstMsg': u'1',
- u'secTok': secTok,
- u'timeStmp': timeStmp,
- u'service': u'youtube',
- u'hl': u'en_US',
+ 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+ 'smsToken': '',
+ 'smsUserPin': tfa_code,
+ 'smsVerifyPin': 'Verify',
+
+ 'PersistentCookie': 'yes',
+ 'checkConnection': '',
+ 'checkedDomains': 'youtube',
+ 'pstMsg': '1',
+ 'secTok': secTok,
+ 'timeStmp': timeStmp,
+ 'service': 'youtube',
+ 'hl': 'en_US',
}
tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
@@ -200,10 +199,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
- IE_DESC = u'YouTube.com'
+ IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^
(
- (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
+ (?:https?://|//) # http(s):// or protocol-independent URL
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com/|
@@ -221,10 +220,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
)
))
|youtu\.be/ # just youtu.be/xxxx
- |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
+ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
+ (?!.*?&list=) # combined list/video URLs are handled by the playlist IE
(?(1).+)? # if we found the ID, everything can follow
$"""
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
@@ -304,7 +304,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'_rtmp': {'protocol': 'rtmp'},
}
- IE_NAME = u'youtube'
+ IE_NAME = 'youtube'
_TESTS = [
{
u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
@@ -316,6 +316,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"upload_date": u"20121002",
u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
u"categories": [u'Science & Technology'],
+ 'like_count': int,
+ 'dislike_count': int,
}
},
{
@@ -361,7 +363,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"info_dict": {
u"upload_date": "20121002",
u"uploader_id": "8KVIDEO",
- u"description": "No description available.",
+ u"description": '',
u"uploader": "8KVIDEO",
u"title": "UHDTV TEST 8K VIDEO.mp4"
},
@@ -372,30 +374,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
},
# DASH manifest with encrypted signature
{
- u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
- u'info_dict': {
- u'id': u'IB3lcPjvWLA',
- u'ext': u'm4a',
- u'title': u'Afrojack - The Spark ft. Spree Wilson',
- u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
- u'uploader': u'AfrojackVEVO',
- u'uploader_id': u'AfrojackVEVO',
- u'upload_date': u'20131011',
+ 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+ 'info_dict': {
+ 'id': 'IB3lcPjvWLA',
+ 'ext': 'm4a',
+ 'title': 'Afrojack - The Spark ft. Spree Wilson',
+ 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
+ 'uploader': 'AfrojackVEVO',
+ 'uploader_id': 'AfrojackVEVO',
+ 'upload_date': '20131011',
},
u"params": {
- u'youtube_include_dash_manifest': True,
- u'format': '141',
+ 'youtube_include_dash_manifest': True,
+ 'format': '141',
},
},
]
-
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- if YoutubePlaylistIE.suitable(url): return False
- return re.match(cls._VALID_URL, url) is not None
-
def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs)
self._player_cache = {}
@@ -418,7 +413,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _signature_cache_id(self, example_sig):
""" Return a string representation of a signature """
- return u'.'.join(compat_str(len(part)) for part in example_sig.split('.'))
+ return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
@@ -433,26 +428,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
func_id = '%s_%s_%s' % (
player_type, player_id, self._signature_cache_id(example_sig))
assert os.path.basename(func_id) == func_id
- cache_dir = get_cachedir(self._downloader.params)
- cache_enabled = cache_dir is not None
- if cache_enabled:
- cache_fn = os.path.join(os.path.expanduser(cache_dir),
- u'youtube-sigfuncs',
- func_id + '.json')
- try:
- with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
- cache_spec = json.load(cachef)
- return lambda s: u''.join(s[i] for i in cache_spec)
- except IOError:
- pass # No cache available
- except ValueError:
- try:
- file_size = os.path.getsize(cache_fn)
- except (OSError, IOError) as oe:
- file_size = str(oe)
- self._downloader.report_warning(
- u'Cache %s failed (%s)' % (cache_fn, file_size))
+ cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id)
+ if cache_spec is not None:
+ return lambda s: ''.join(s[i] for i in cache_spec)
if player_type == 'js':
code = self._download_webpage(
@@ -470,31 +449,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
assert False, 'Invalid player type %r' % player_type
- if cache_enabled:
- try:
- test_string = u''.join(map(compat_chr, range(len(example_sig))))
- cache_res = res(test_string)
- cache_spec = [ord(c) for c in cache_res]
- try:
- os.makedirs(os.path.dirname(cache_fn))
- except OSError as ose:
- if ose.errno != errno.EEXIST:
- raise
- write_json_file(cache_spec, cache_fn)
- except Exception:
- tb = traceback.format_exc()
- self._downloader.report_warning(
- u'Writing cache to %r failed: %s' % (cache_fn, tb))
+ if cache_spec is None:
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
+ cache_res = res(test_string)
+ cache_spec = [ord(c) for c in cache_res]
+ self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec)
return res
def _print_sig_code(self, func, example_sig):
def gen_sig_code(idxs):
def _genslice(start, end, step):
- starts = u'' if start == 0 else str(start)
- ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
- steps = u'' if step == 1 else (u':%d' % step)
- return u's[%s%s%s]' % (starts, ends, steps)
+ starts = '' if start == 0 else str(start)
+ ends = (u':%d' % (end+step)) if end + step >= 0 else ':'
+ steps = '' if step == 1 else (u':%d' % step)
+ return 's[%s%s%s]' % (starts, ends, steps)
step = None
start = '(Never used)' # Quelch pyflakes warnings - start will be
@@ -511,26 +480,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
start = prev
continue
else:
- yield u's[%d]' % prev
+ yield 's[%d]' % prev
if step is None:
- yield u's[%d]' % i
+ yield 's[%d]' % i
else:
yield _genslice(start, i, step)
- test_string = u''.join(map(compat_chr, range(len(example_sig))))
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
cache_res = func(test_string)
cache_spec = [ord(c) for c in cache_res]
- expr_code = u' + '.join(gen_sig_code(cache_spec))
+ expr_code = ' + '.join(gen_sig_code(cache_spec))
signature_id_tuple = '(%s)' % (
', '.join(compat_str(len(p)) for p in example_sig.split('.')))
code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
- u' return %s\n') % (signature_id_tuple, expr_code)
+ ' return %s\n') % (signature_id_tuple, expr_code)
self.to_screen(u'Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
r'signature=([$a-zA-Z]+)', jscode,
- u'Initial JS player signature function name')
+ 'Initial JS player signature function name')
jsi = JSInterpreter(jscode)
initial_function = jsi.extract_function(funcname)
@@ -538,9 +507,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _parse_sig_swf(self, file_contents):
swfi = SWFInterpreter(file_contents)
- TARGET_CLASSNAME = u'SignatureDecipher'
+ TARGET_CLASSNAME = 'SignatureDecipher'
searched_class = swfi.extract_class(TARGET_CLASSNAME)
- initial_function = swfi.extract_function(searched_class, u'decipher')
+ initial_function = swfi.extract_function(searched_class, 'decipher')
return lambda s: initial_function([s])
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
@@ -550,7 +519,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
raise ExtractorError(u'Cannot decrypt signature without player_url')
if player_url.startswith(u'//'):
- player_url = u'https:' + player_url
+ player_url = 'https:' + player_url
try:
player_id = (player_url, self._signature_cache_id(s))
if player_id not in self._player_cache:
@@ -565,7 +534,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
except Exception as e:
tb = traceback.format_exc()
raise ExtractorError(
- u'Signature extraction failed: ' + tb, cause=e)
+ 'Signature extraction failed: ' + tb, cause=e)
def _get_available_subtitles(self, video_id, webpage):
try:
@@ -588,7 +557,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
'name': unescapeHTML(l[0]).encode('utf-8'),
})
- url = u'https://www.youtube.com/api/timedtext?' + params
+ url = 'https://www.youtube.com/api/timedtext?' + params
sub_lang_list[lang] = url
if not sub_lang_list:
self._downloader.report_warning(u'video doesn\'t have subtitles')
@@ -601,7 +570,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
sub_format = self._downloader.params.get('subtitlesformat', 'srt')
self.to_screen(u'%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
- err_msg = u'Couldn\'t find automatic captions for %s' % video_id
+ err_msg = 'Couldn\'t find automatic captions for %s' % video_id
if mobj is None:
self._downloader.report_warning(err_msg)
return {}
@@ -657,7 +626,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
urls = filter(lambda l: l and not l.startswith('#'),
lines)
return urls
- manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
+ manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
formats_urls = _get_urls(manifest)
for format_url in formats_urls:
itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
@@ -670,8 +639,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _real_extract(self, url):
proto = (
- u'http' if self._downloader.params.get('prefer_insecure', False)
- else u'https')
+ 'http' if self._downloader.params.get('prefer_insecure', False)
+ else 'https')
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
@@ -722,11 +691,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if 'token' not in video_info:
if 'reason' in video_info:
raise ExtractorError(
- u'YouTube said: %s' % video_info['reason'][0],
+ 'YouTube said: %s' % video_info['reason'][0],
expected=True, video_id=video_id)
else:
raise ExtractorError(
- u'"token" parameter not in video info for unknown reason',
+ '"token" parameter not in video info for unknown reason',
video_id=video_id)
if 'view_count' in video_info:
@@ -759,7 +728,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
video_title = video_info['title'][0]
else:
self._downloader.report_warning(u'Unable to extract video title')
- video_title = u'_'
+ video_title = '_'
# thumbnail image
# We try first to get a high quality image:
@@ -784,7 +753,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
- m_cat_container = get_element_by_id("eow-category", video_webpage)
+ m_cat_container = self._search_regex(
+ r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
+ video_webpage, 'categories', fatal=False)
if m_cat_container:
category = self._html_search_regex(
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
@@ -811,17 +782,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if fd_mobj:
video_description = unescapeHTML(fd_mobj.group(1))
else:
- video_description = u''
+ video_description = ''
- def _extract_count(klass):
+ def _extract_count(count_name):
count = self._search_regex(
- r'class="%s">([\d,]+)</span>' % re.escape(klass),
- video_webpage, klass, default=None)
+ r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
+ video_webpage, count_name, default=None)
if count is not None:
return int(count.replace(',', ''))
return None
- like_count = _extract_count(u'likes-count')
- dislike_count = _extract_count(u'dislikes-count')
+ like_count = _extract_count(u'like')
+ dislike_count = _extract_count(u'dislike')
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
@@ -858,7 +829,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if m_s is not None:
self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
- m_s = re_signature.search(args.get('adaptive_fmts', u''))
+ m_s = re_signature.search(args.get('adaptive_fmts', ''))
if m_s is not None:
if 'adaptive_fmts' in video_info:
video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
@@ -908,12 +879,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if not age_gate:
jsplayer_url_json = self._search_regex(
r'"assets":.+?"js":\s*("[^"]+")',
- video_webpage, u'JS player URL')
+ video_webpage, 'JS player URL')
player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_url_json = self._search_regex(
r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
- video_webpage, u'age gate player URL')
+ video_webpage, 'age gate player URL')
player_url = json.loads(player_url_json)
if self._downloader.params.get('verbose'):
@@ -924,14 +895,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if player_url.endswith('swf'):
player_version = self._search_regex(
r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
- u'flash player', fatal=False)
+ 'flash player', fatal=False)
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
r'html5player-([^/]+?)(?:/html5player)?\.js',
player_url,
'html5 player', fatal=False)
- player_desc = u'html5 player %s' % player_version
+ player_desc = 'html5 player %s' % player_version
parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen(u'{%s} signature length %s, %s' %
@@ -1023,7 +994,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
}
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
- IE_DESC = u'YouTube.com playlists'
+ IE_DESC = 'YouTube.com playlists'
_VALID_URL = r"""(?x)(?:
(?:https?://)?
(?:\w+\.)?
@@ -1045,27 +1016,72 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_MORE_PAGES_INDICATOR = r'data-link-type="next"'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
- IE_NAME = u'youtube:playlist'
+ IE_NAME = 'youtube:playlist'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
+ 'info_dict': {
+ 'title': 'ytdl test PL',
+ },
+ 'playlist_count': 3,
+ }, {
+ 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
+ 'info_dict': {
+ 'title': 'YDL_Empty_List',
+ },
+ 'playlist_count': 0,
+ }, {
+ 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
+ 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ 'info_dict': {
+ 'title': '29C3: Not my department',
+ },
+ 'playlist_count': 95,
+ }, {
+ 'note': 'issue #673',
+ 'url': 'PLBB231211A4F62143',
+ 'info_dict': {
+ 'title': 'Team Fortress 2 (Class-based LP)',
+ },
+ 'playlist_mincount': 26,
+ }, {
+ 'note': 'Large playlist',
+ 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
+ 'info_dict': {
+ 'title': 'Uploads from Cauchemar',
+ },
+ 'playlist_mincount': 799,
+ }, {
+ 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ 'info_dict': {
+ 'title': 'YDL_safe_search',
+ },
+ 'playlist_count': 2,
+ }]
def _real_initialize(self):
self._login()
def _ids_to_results(self, ids):
- return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
+ return [
+ self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
def _extract_mix(self, playlist_id):
# The mixes are generated from a a single video
# the id of the playlist is just 'RD' + video_id
url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
- webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
+ webpage = self._download_webpage(
+ url, playlist_id, 'Downloading Youtube mix')
search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
- title_span = (search_title('playlist-title') or
- search_title('title long-title') or search_title('title'))
+ title_span = (
+ search_title('playlist-title') or
+ search_title('title long-title') or
+ search_title('title'))
title = clean_html(title_span)
- video_re = r'''(?x)data-video-username=".*?".*?
- href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
- ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
+ ids = orderedSet(re.findall(
+ r'''(?xs)data-video-username=".*?".*?
+ href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
+ webpage))
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, title)
@@ -1092,7 +1108,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
return self._extract_mix(playlist_id)
if playlist_id.startswith('TL'):
raise ExtractorError(u'For downloading YouTube.com top lists, use '
- u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
+ 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
@@ -1101,7 +1117,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
# Check if the playlist exists or is private
if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
raise ExtractorError(
- u'The playlist doesn\'t exist or is private, use --username or '
+ 'The playlist doesn\'t exist or is private, use --username or '
'--netrc to access it.',
expected=True)
@@ -1128,17 +1144,18 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
playlist_title = self._html_search_regex(
r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
- page, u'title')
+ page, 'title')
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeTopListIE(YoutubePlaylistIE):
- IE_NAME = u'youtube:toplist'
+ IE_NAME = 'youtube:toplist'
IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
- u' (Example: "yttoplist:music:Top Tracks")')
+ ' (Example: "yttoplist:music:Top Tracks")')
_VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
+ _TESTS = []
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -1147,7 +1164,7 @@ class YoutubeTopListIE(YoutubePlaylistIE):
query = compat_urllib_parse.urlencode({'title': title})
playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query)
channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
- link = self._html_search_regex(playlist_re, channel_page, u'list')
+ link = self._html_search_regex(playlist_re, channel_page, 'list')
url = compat_urlparse.urljoin('https://www.youtube.com/', link)
video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
@@ -1155,9 +1172,10 @@ class YoutubeTopListIE(YoutubePlaylistIE):
# sometimes the webpage doesn't contain the videos
# retry until we get them
for i in itertools.count(0):
- msg = u'Downloading Youtube mix'
+ msg = 'Downloading Youtube mix'
if i > 0:
msg += ', retry #%d' % i
+
webpage = self._download_webpage(url, title, msg)
ids = orderedSet(re.findall(video_re, webpage))
if ids:
@@ -1167,11 +1185,11 @@ class YoutubeTopListIE(YoutubePlaylistIE):
class YoutubeChannelIE(InfoExtractor):
- IE_DESC = u'YouTube.com channels'
+ IE_DESC = 'YouTube.com channels'
_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
_MORE_PAGES_INDICATOR = 'yt-uix-load-more'
_MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
- IE_NAME = u'youtube:channel'
+ IE_NAME = 'youtube:channel'
def extract_videos_from_page(self, page):
ids_in_page = []
@@ -1223,12 +1241,12 @@ class YoutubeChannelIE(InfoExtractor):
class YoutubeUserIE(InfoExtractor):
- IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
+ IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
- IE_NAME = u'youtube:user'
+ IE_NAME = 'youtube:user'
@classmethod
def suitable(cls, url):
@@ -1257,7 +1275,7 @@ class YoutubeUserIE(InfoExtractor):
gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
page = self._download_webpage(
gdata_url, username,
- u'Downloading video ids from %d to %d' % (
+ 'Downloading video ids from %d to %d' % (
start_index, start_index + self._GDATA_PAGE_SIZE))
try:
@@ -1285,10 +1303,10 @@ class YoutubeUserIE(InfoExtractor):
class YoutubeSearchIE(SearchInfoExtractor):
- IE_DESC = u'YouTube.com searches'
- _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
+ IE_DESC = 'YouTube.com searches'
+ _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
_MAX_RESULTS = 1000
- IE_NAME = u'youtube:search'
+ IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
def _get_n_results(self, query, n):
@@ -1312,7 +1330,7 @@ class YoutubeSearchIE(SearchInfoExtractor):
if 'items' not in api_response:
raise ExtractorError(
- u'[youtube] No video results', expected=True)
+ '[youtube] No video results', expected=True)
new_ids = list(video['id'] for video in api_response['items'])
video_ids += new_ids
@@ -1331,12 +1349,12 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
_SEARCH_KEY = 'ytsearchdate'
- IE_DESC = u'YouTube.com searches, newest videos first'
+ IE_DESC = 'YouTube.com searches, newest videos first'
class YoutubeSearchURLIE(InfoExtractor):
- IE_DESC = u'YouTube.com search URLs'
- IE_NAME = u'youtube:search_url'
+ IE_DESC = 'YouTube.com search URLs'
+ IE_NAME = 'youtube:search_url'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
def _real_extract(self, url):
@@ -1345,7 +1363,7 @@ class YoutubeSearchURLIE(InfoExtractor):
webpage = self._download_webpage(url, query)
result_code = self._search_regex(
- r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')
+ r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')
part_codes = re.findall(
r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
@@ -1371,14 +1389,14 @@ class YoutubeSearchURLIE(InfoExtractor):
class YoutubeShowIE(InfoExtractor):
- IE_DESC = u'YouTube.com (multi-season) shows'
+ IE_DESC = 'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
- IE_NAME = u'youtube:show'
+ IE_NAME = 'youtube:show'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
show_name = mobj.group(1)
- webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
+ webpage = self._download_webpage(url, show_name, 'Downloading show webpage')
# There's one playlist for each season of the show
m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
@@ -1404,7 +1422,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
@property
def IE_NAME(self):
- return u'youtube:%s' % self._FEED_NAME
+ return 'youtube:%s' % self._FEED_NAME
def _real_initialize(self):
self._login()
@@ -1414,9 +1432,10 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
paging = 0
for i in itertools.count(1):
info = self._download_json(self._FEED_TEMPLATE % paging,
- u'%s feed' % self._FEED_NAME,
- u'Downloading page %s' % i)
+ '%s feed' % self._FEED_NAME,
+ 'Downloading page %s' % i)
feed_html = info.get('feed_html') or info.get('content_html')
+ load_more_widget_html = info.get('load_more_widget_html') or feed_html
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids)
feed_entries.extend(
@@ -1424,50 +1443,82 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
for video_id in ids)
mobj = re.search(
r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
- feed_html)
+ load_more_widget_html)
if mobj is None:
break
paging = mobj.group('paging')
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
-class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
- _FEED_NAME = 'subscriptions'
- _PLAYLIST_TITLE = u'Youtube Subscriptions'
-
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
+ IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended'
- _PLAYLIST_TITLE = u'Youtube Recommended videos'
+ _PLAYLIST_TITLE = 'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
+ IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
_FEED_NAME = 'watch_later'
- _PLAYLIST_TITLE = u'Youtube Watch Later'
+ _PLAYLIST_TITLE = 'Youtube Watch Later'
_PERSONAL_FEED = True
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
- _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory'
+ IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
+ _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
_FEED_NAME = 'history'
_PERSONAL_FEED = True
- _PLAYLIST_TITLE = u'Youtube Watch History'
+ _PLAYLIST_TITLE = 'Youtube Watch History'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
- IE_NAME = u'youtube:favorites'
- IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
+ IE_NAME = 'youtube:favorites'
+ IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
_LOGIN_REQUIRED = True
def _real_extract(self, url):
webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
- playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
+ playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
return self.url_result(playlist_id, 'YoutubePlaylist')
+# Extractor for the subscriptions feed ("ytsubs" keyword). It scrapes the
+# feed page for video ids and follows the "load more" links until none is
+# left, then returns everything as one playlist.
+class YoutubeSubscriptionsIE(YoutubePlaylistIE):
+    IE_NAME = 'youtube:subscriptions'
+    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _TESTS = []
+
+    def _real_extract(self, url):
+        """Return a playlist result holding every video found in the feed."""
+        title = 'Youtube Subscriptions'
+        feed_page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
+
+        # The extraction process is the same as for playlists, but the regex
+        # for the video ids doesn't contain an index
+        video_ids = []
+        # The initial page is searched both for video links and for the
+        # "load more" link; follow-up responses carry them in two JSON fields.
+        content_html = feed_page
+        more_widget_html = feed_page
+
+        page_num = 1
+        while True:
+            found = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+            video_ids.extend(orderedSet(found))
+
+            next_page = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if next_page is None:
+                break
+
+            more = self._download_json(
+                'https://youtube.com/%s' % next_page.group('more'), title,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
+            content_html = more['content_html']
+            more_widget_html = more['load_more_widget_html']
+            page_num += 1
+
+        return {
+            '_type': 'playlist',
+            'title': title,
+            'entries': self._ids_to_results(video_ids),
+        }
+
+
class YoutubeTruncatedURLIE(InfoExtractor):
IE_NAME = 'youtube:truncated_url'
IE_DESC = False # Do not list
@@ -1489,9 +1540,9 @@ class YoutubeTruncatedURLIE(InfoExtractor):
def _real_extract(self, url):
raise ExtractorError(
- u'Did you forget to quote the URL? Remember that & is a meta '
- u'character in most shells, so you want to put the URL in quotes, '
- u'like youtube-dl '
- u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
- u' or simply youtube-dl BaW_jenozKc .',
+ 'Did you forget to quote the URL? Remember that & is a meta '
+ 'character in most shells, so you want to put the URL in quotes, '
+ 'like youtube-dl '
+ '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
+ ' or simply youtube-dl BaW_jenozKc .',
expected=True)
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
new file mode 100644
index 000000000..31baab469
--- /dev/null
+++ b/youtube_dl/options.py
@@ -0,0 +1,481 @@
+from __future__ import unicode_literals
+
+import os.path
+import optparse
+import shlex
+import sys
+
+from .utils import (
+ get_term_width,
+ write_string,
+)
+from .version import __version__
+
+
+def parseOpts(overrideArguments=None):
+ def _readOptions(filename_bytes, default=[]):
+     """Read command-line options from a configuration file.
+
+     Every line is tokenized with shlex.split (comments enabled) and the
+     tokens of all lines are returned as one flat list. If the file cannot
+     be opened, ``default`` is returned unchanged.
+     """
+     try:
+         conf_file = open(filename_bytes)
+     except IOError:
+         return default  # silently skip if file is not present
+     with conf_file:
+         tokens = []
+         for line in conf_file:
+             tokens.extend(shlex.split(line, comments=True))
+     return tokens
+
+ def _readUserConf():
+     """Return the options from the first user configuration file found.
+
+     Candidates are tried in this order, and the first one that can be
+     opened wins:
+       1. <config root>/youtube-dl/config if it is a file, otherwise
+          <config root>/youtube-dl.conf, where <config root> is
+          $XDG_CONFIG_HOME when set and ~/.config otherwise
+       2. %appdata%/youtube-dl/config, then config.txt (only when the
+          'appdata' environment variable is set)
+       3. ~/youtube-dl.conf, then ~/youtube-dl.conf.txt
+     An empty list is returned when none of them can be read.
+     """
+     home = os.path.expanduser('~')
+     xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
+     config_root = xdg_config_home if xdg_config_home else os.path.join(home, '.config')
+
+     primary = os.path.join(config_root, 'youtube-dl', 'config')
+     if not os.path.isfile(primary):
+         primary = os.path.join(config_root, 'youtube-dl.conf')
+     candidates = [primary]
+
+     appdata_dir = os.environ.get('appdata')
+     if appdata_dir:
+         candidates.append(os.path.join(appdata_dir, 'youtube-dl', 'config'))
+         candidates.append(os.path.join(appdata_dir, 'youtube-dl', 'config.txt'))
+
+     candidates.append(os.path.join(home, 'youtube-dl.conf'))
+     candidates.append(os.path.join(home, 'youtube-dl.conf.txt'))
+
+     for candidate in candidates:
+         conf = _readOptions(candidate, None)
+         # None means "could not be opened"; an empty list is a valid result
+         if conf is not None:
+             return conf
+     return []
+
+ def _format_option_string(option):
+     """Render an option for the help output.
+
+     The first short and the first long spelling are joined with ", " and,
+     if the option takes a value, followed by its metavar, e.g.
+     ('-o', '--output') -> "-o, --output TEMPLATE".
+     """
+     names = []
+     for spellings in (option._short_opts, option._long_opts):
+         if spellings:
+             names.append(spellings[0])
+
+     text = ', '.join(names)
+     if option.takes_value():
+         text += ' %s' % option.metavar
+     return text
+
+ def _comma_separated_values_options_callback(option, opt_str, value, parser):
+     """optparse callback: store ``value`` split on commas under ``option.dest``."""
+     items = value.split(',')
+     setattr(parser.values, option.dest, items)
+
+ def _hide_login_info(opts):
+     """Return a copy of ``opts`` with credential values masked.
+
+     Used before echoing configuration and command-line arguments in the
+     debug output. Both spellings are handled:
+       * separate value:  ['-p', 'secret']     -> ['-p', '<PRIVATE>']
+       * attached value:  ['--password=secret'] -> ['--password=<PRIVATE>']
+     Every occurrence is masked, and a private option that is the very last
+     token (no value follows) is left alone instead of raising IndexError.
+     The input sequence is not modified.
+     """
+     private_opts = ('-p', '--password', '-u', '--username', '--video-password')
+
+     scrubbed = []
+     hide_next = False
+     for opt in opts:
+         if hide_next:
+             # This token is the value of the private option before it
+             scrubbed.append('<PRIVATE>')
+             hide_next = False
+             continue
+
+         key, sep, _ = opt.partition('=')
+         if sep and key.startswith('--') and key in private_opts:
+             # --option=value form: keep the option name, drop the value
+             scrubbed.append(key + '=<PRIVATE>')
+             continue
+
+         scrubbed.append(opt)
+         hide_next = opt in private_opts
+     return scrubbed
+
+ max_width = 80
+ max_help_position = 80
+
+ # No need to wrap help messages if we're on a wide console
+ columns = get_term_width()
+ if columns: max_width = columns
+
+ fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
+ fmt.format_option_strings = _format_option_string
+
+ kw = {
+ 'version' : __version__,
+ 'formatter' : fmt,
+ 'usage' : '%prog [options] url [url...]',
+ 'conflict_handler' : 'resolve',
+ }
+
+ parser = optparse.OptionParser(**kw)
+
+ # option groups
+ general = optparse.OptionGroup(parser, 'General Options')
+ selection = optparse.OptionGroup(parser, 'Video Selection')
+ authentication = optparse.OptionGroup(parser, 'Authentication Options')
+ video_format = optparse.OptionGroup(parser, 'Video Format Options')
+ subtitles = optparse.OptionGroup(parser, 'Subtitle Options')
+ downloader = optparse.OptionGroup(parser, 'Download Options')
+ postproc = optparse.OptionGroup(parser, 'Post-processing Options')
+ filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
+ workarounds = optparse.OptionGroup(parser, 'Workarounds')
+ verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
+
+ general.add_option('-h', '--help',
+ action='help', help='print this help text and exit')
+ general.add_option('-v', '--version',
+ action='version', help='print program version and exit')
+ general.add_option('-U', '--update',
+ action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
+ general.add_option('-i', '--ignore-errors',
+ action='store_true', dest='ignoreerrors', help='continue on download errors, for example to skip unavailable videos in a playlist', default=False)
+ general.add_option('--abort-on-error',
+ action='store_false', dest='ignoreerrors',
+ help='Abort downloading of further videos (in the playlist or the command line) if an error occurs')
+ general.add_option('--dump-user-agent',
+ action='store_true', dest='dump_user_agent',
+ help='display the current browser identification', default=False)
+ general.add_option('--list-extractors',
+ action='store_true', dest='list_extractors',
+ help='List all supported extractors and the URLs they would handle', default=False)
+ general.add_option('--extractor-descriptions',
+ action='store_true', dest='list_extractor_descriptions',
+ help='Output descriptions of all supported extractors', default=False)
+ general.add_option(
+ '--proxy', dest='proxy', default=None, metavar='URL',
+ help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
+ general.add_option(
+ '--socket-timeout', dest='socket_timeout',
+ type=float, default=None, help=u'Time to wait before giving up, in seconds')
+ general.add_option(
+ '--default-search',
+ dest='default_search', metavar='PREFIX',
+ help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.')
+ general.add_option(
+ '--ignore-config',
+ action='store_true',
+ help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
+
+ selection.add_option(
+ '--playlist-start',
+ dest='playliststart', metavar='NUMBER', default=1, type=int,
+ help='playlist video to start at (default is %default)')
+ selection.add_option(
+ '--playlist-end',
+ dest='playlistend', metavar='NUMBER', default=None, type=int,
+ help='playlist video to end at (default is last)')
+ selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
+ selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
+ selection.add_option('--max-downloads', metavar='NUMBER',
+ dest='max_downloads', type=int, default=None,
+ help='Abort after downloading NUMBER files')
+ selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None)
+ selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 50k or 44.6m)", default=None)
+ selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None)
+ selection.add_option(
+ '--datebefore', metavar='DATE', dest='datebefore', default=None,
+ help='download only videos uploaded on or before this date (i.e. inclusive)')
+ selection.add_option(
+ '--dateafter', metavar='DATE', dest='dateafter', default=None,
+ help='download only videos uploaded on or after this date (i.e. inclusive)')
+ selection.add_option(
+ '--min-views', metavar='COUNT', dest='min_views',
+ default=None, type=int,
+ help="Do not download any videos with less than COUNT views",)
+ selection.add_option(
+ '--max-views', metavar='COUNT', dest='max_views',
+ default=None, type=int,
+ help="Do not download any videos with more than COUNT views",)
+ selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing video', default=False)
+ selection.add_option('--age-limit', metavar='YEARS', dest='age_limit',
+ help='download only videos suitable for the given age',
+ default=None, type=int)
+ selection.add_option('--download-archive', metavar='FILE',
+ dest='download_archive',
+ help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
+ selection.add_option(
+ '--include-ads', dest='include_ads',
+ action='store_true',
+ help='Download advertisements as well (experimental)')
+ selection.add_option(
+ '--youtube-include-dash-manifest', action='store_true',
+ dest='youtube_include_dash_manifest', default=False,
+ help='Try to download the DASH manifest on YouTube videos (experimental)')
+
+ authentication.add_option('-u', '--username',
+ dest='username', metavar='USERNAME', help='account username')
+ authentication.add_option('-p', '--password',
+ dest='password', metavar='PASSWORD', help='account password')
+ authentication.add_option('-2', '--twofactor',
+ dest='twofactor', metavar='TWOFACTOR', help='two-factor auth code')
+ authentication.add_option('-n', '--netrc',
+ action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
+ authentication.add_option('--video-password',
+ dest='videopassword', metavar='PASSWORD', help='video password (vimeo, smotri)')
+
+
+ video_format.add_option('-f', '--format',
+ action='store', dest='format', metavar='FORMAT', default=None,
+ help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality.')
+ video_format.add_option('--all-formats',
+ action='store_const', dest='format', help='download all available video formats', const='all')
+ video_format.add_option('--prefer-free-formats',
+ action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
+ video_format.add_option('--max-quality',
+ action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
+ video_format.add_option('-F', '--list-formats',
+ action='store_true', dest='listformats', help='list all available formats')
+
+ subtitles.add_option('--write-sub', '--write-srt',
+ action='store_true', dest='writesubtitles',
+ help='write subtitle file', default=False)
+ subtitles.add_option('--write-auto-sub', '--write-automatic-sub',
+ action='store_true', dest='writeautomaticsub',
+ help='write automatic subtitle file (youtube only)', default=False)
+ subtitles.add_option('--all-subs',
+ action='store_true', dest='allsubtitles',
+ help='downloads all the available subtitles of the video', default=False)
+ subtitles.add_option('--list-subs',
+ action='store_true', dest='listsubtitles',
+ help='lists all available subtitles for the video', default=False)
+ subtitles.add_option('--sub-format',
+ action='store', dest='subtitlesformat', metavar='FORMAT',
+ help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt')
+ subtitles.add_option('--sub-lang', '--sub-langs', '--srt-lang',
+ action='callback', dest='subtitleslangs', metavar='LANGS', type='str',
+ default=[], callback=_comma_separated_values_options_callback,
+ help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'')
+
+ downloader.add_option('-r', '--rate-limit',
+ dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)')
+ downloader.add_option('-R', '--retries',
+ dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10)
+ downloader.add_option('--buffer-size',
+ dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024")
+ downloader.add_option('--no-resize-buffer',
+ action='store_true', dest='noresizebuffer',
+ help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False)
+ downloader.add_option('--test', action='store_true', dest='test', default=False, help=optparse.SUPPRESS_HELP)
+
+ workarounds.add_option(
+ '--encoding', dest='encoding', metavar='ENCODING',
+ help='Force the specified encoding (experimental)')
+ workarounds.add_option(
+ '--no-check-certificate', action='store_true',
+ dest='no_check_certificate', default=False,
+ help='Suppress HTTPS certificate validation.')
+ workarounds.add_option(
+ '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure',
+ help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')
+ workarounds.add_option(
+ '--user-agent', metavar='UA',
+ dest='user_agent', help='specify a custom user agent')
+ workarounds.add_option(
+ '--referer', metavar='REF',
+ dest='referer', default=None,
+ help='specify a custom referer, use if the video access is restricted to one domain',
+ )
+ workarounds.add_option(
+ '--add-header', metavar='FIELD:VALUE',
+ dest='headers', action='append',
+ help='specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times',
+ )
+ workarounds.add_option(
+ '--bidi-workaround', dest='bidi_workaround', action='store_true',
+ help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
+
+ verbosity.add_option('-q', '--quiet',
+ action='store_true', dest='quiet', help='activates quiet mode', default=False)
+ verbosity.add_option(
+ '--no-warnings',
+ dest='no_warnings', action='store_true', default=False,
+ help='Ignore warnings')
+ verbosity.add_option('-s', '--simulate',
+ action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
+ verbosity.add_option('--skip-download',
+ action='store_true', dest='skip_download', help='do not download the video', default=False)
+ verbosity.add_option('-g', '--get-url',
+ action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
+ verbosity.add_option('-e', '--get-title',
+ action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
+ verbosity.add_option('--get-id',
+ action='store_true', dest='getid', help='simulate, quiet but print id', default=False)
+ verbosity.add_option('--get-thumbnail',
+ action='store_true', dest='getthumbnail',
+ help='simulate, quiet but print thumbnail URL', default=False)
+ verbosity.add_option('--get-description',
+ action='store_true', dest='getdescription',
+ help='simulate, quiet but print video description', default=False)
+ verbosity.add_option('--get-duration',
+ action='store_true', dest='getduration',
+ help='simulate, quiet but print video length', default=False)
+ verbosity.add_option('--get-filename',
+ action='store_true', dest='getfilename',
+ help='simulate, quiet but print output filename', default=False)
+ verbosity.add_option('--get-format',
+ action='store_true', dest='getformat',
+ help='simulate, quiet but print output format', default=False)
+ verbosity.add_option('-j', '--dump-json',
+ action='store_true', dest='dumpjson',
+ help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False)
+ verbosity.add_option('--newline',
+ action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
+ verbosity.add_option('--no-progress',
+ action='store_true', dest='noprogress', help='do not print progress bar', default=False)
+ verbosity.add_option('--console-title',
+ action='store_true', dest='consoletitle',
+ help='display progress in console titlebar', default=False)
+ verbosity.add_option('-v', '--verbose',
+ action='store_true', dest='verbose', help='print various debugging information', default=False)
+ verbosity.add_option('--dump-intermediate-pages',
+ action='store_true', dest='dump_intermediate_pages', default=False,
+ help='print downloaded pages to debug problems (very verbose)')
+ verbosity.add_option('--write-pages',
+ action='store_true', dest='write_pages', default=False,
+ help='Write downloaded intermediary pages to files in the current directory to debug problems')
+ verbosity.add_option('--youtube-print-sig-code',
+ action='store_true', dest='youtube_print_sig_code', default=False,
+ help=optparse.SUPPRESS_HELP)
+ verbosity.add_option('--print-traffic',
+ dest='debug_printtraffic', action='store_true', default=False,
+ help='Display sent and read HTTP traffic')
+
+
+ filesystem.add_option('-a', '--batch-file',
+ dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
+ filesystem.add_option('--id',
+ action='store_true', dest='useid', help='use only video ID in file name', default=False)
+ filesystem.add_option('-A', '--auto-number',
+ action='store_true', dest='autonumber',
+ help='number downloaded files starting from 00000', default=False)
+ filesystem.add_option('-o', '--output',
+ dest='outtmpl', metavar='TEMPLATE',
+ help=('output filename template. Use %(title)s to get the title, '
+ '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, '
+ '%(autonumber)s to get an automatically incremented number, '
+ '%(ext)s for the filename extension, '
+ '%(format)s for the format description (like "22 - 1280x720" or "HD"), '
+ '%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"), '
+ '%(upload_date)s for the upload date (YYYYMMDD), '
+ '%(extractor)s for the provider (youtube, metacafe, etc), '
+ '%(id)s for the video id, %(playlist)s for the playlist the video is in, '
+ '%(playlist_index)s for the position in the playlist and %% for a literal percent. '
+ '%(height)s and %(width)s for the width and height of the video format. '
+ '%(resolution)s for a textual description of the resolution of the video format. '
+ 'Use - to output to stdout. Can also be used to download to a different directory, '
+ 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
+ filesystem.add_option('--autonumber-size',
+ dest='autonumber_size', metavar='NUMBER',
+ help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given')
+ filesystem.add_option('--restrict-filenames',
+ action='store_true', dest='restrictfilenames',
+ help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False)
+ filesystem.add_option('-t', '--title',
+ action='store_true', dest='usetitle', help='[deprecated] use title in file name (default)', default=False)
+ filesystem.add_option('-l', '--literal',
+ action='store_true', dest='usetitle', help='[deprecated] alias of --title', default=False)
+ filesystem.add_option('-w', '--no-overwrites',
+ action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
+ filesystem.add_option('-c', '--continue',
+ action='store_true', dest='continue_dl', help='force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible.', default=True)
+ filesystem.add_option('--no-continue',
+ action='store_false', dest='continue_dl',
+ help='do not resume partially downloaded files (restart from beginning)')
+ filesystem.add_option('--no-part',
+ action='store_true', dest='nopart', help='do not use .part files', default=False)
+ filesystem.add_option('--no-mtime',
+ action='store_false', dest='updatetime',
+ help='do not use the Last-modified header to set the file modification time', default=True)
+ filesystem.add_option('--write-description',
+ action='store_true', dest='writedescription',
+ help='write video description to a .description file', default=False)
+ filesystem.add_option('--write-info-json',
+ action='store_true', dest='writeinfojson',
+ help='write video metadata to a .info.json file', default=False)
+ filesystem.add_option('--write-annotations',
+ action='store_true', dest='writeannotations',
+ help='write video annotations to a .annotation file', default=False)
+ filesystem.add_option('--write-thumbnail',
+ action='store_true', dest='writethumbnail',
+ help='write thumbnail image to disk', default=False)
+ # The help text must name the option that actually exists in this parser
+ # (--write-info-json); there is no --write-json option.
+ filesystem.add_option('--load-info',
+     dest='load_info_filename', metavar='FILE',
+     help='json file containing the video information (created with the "--write-info-json" option)')
+ filesystem.add_option('--cookies',
+ dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
+ filesystem.add_option(
+ '--cache-dir', dest='cachedir', default=None, metavar='DIR',
+ help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.')
+ filesystem.add_option(
+ '--no-cache-dir', action='store_const', const=False, dest='cachedir',
+ help='Disable filesystem caching')
+ filesystem.add_option(
+ '--rm-cache-dir', action='store_true', dest='rm_cachedir',
+ help='Delete all filesystem cache files')
+
+
+ postproc.add_option('-x', '--extract-audio', action='store_true', dest='extractaudio', default=False,
+ help='convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)')
+ postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
+ help='"best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; best by default')
+ postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='5',
+ help='ffmpeg/avconv audio quality specification, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5)')
+ postproc.add_option('--recode-video', metavar='FORMAT', dest='recodevideo', default=None,
+ help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)')
+ postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
+ help='keeps the video file on disk after the post-processing; the video is erased by default')
+ postproc.add_option('--no-post-overwrites', action='store_true', dest='nopostoverwrites', default=False,
+ help='do not overwrite post-processed files; the post-processed files are overwritten by default')
+ postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False,
+ help='embed subtitles in the video (only for mp4 videos)')
+ postproc.add_option('--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False,
+ help='embed thumbnail in the audio as cover art')
+ postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False,
+ help='write metadata to the video file')
+ postproc.add_option('--xattrs', action='store_true', dest='xattrs', default=False,
+ help='write metadata to the video file\'s xattrs (using dublin core and xdg standards)')
+ postproc.add_option('--prefer-avconv', action='store_false', dest='prefer_ffmpeg',
+ help='Prefer avconv over ffmpeg for running the postprocessors (default)')
+ postproc.add_option('--prefer-ffmpeg', action='store_true', dest='prefer_ffmpeg',
+ help='Prefer ffmpeg over avconv for running the postprocessors')
+ postproc.add_option(
+ '--exec', metavar='CMD', dest='exec_cmd',
+ help='Execute a command on the file after downloading, similar to find\'s -exec syntax. Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'' )
+
+ parser.add_option_group(general)
+ parser.add_option_group(selection)
+ parser.add_option_group(downloader)
+ parser.add_option_group(filesystem)
+ parser.add_option_group(verbosity)
+ parser.add_option_group(workarounds)
+ parser.add_option_group(video_format)
+ parser.add_option_group(subtitles)
+ parser.add_option_group(authentication)
+ parser.add_option_group(postproc)
+
+ if overrideArguments is not None:
+ opts, args = parser.parse_args(overrideArguments)
+ if opts.verbose:
+ write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n')
+ else:
+ commandLineConf = sys.argv[1:]
+ if '--ignore-config' in commandLineConf:
+ systemConf = []
+ userConf = []
+ else:
+ systemConf = _readOptions('/etc/youtube-dl.conf')
+ if '--ignore-config' in systemConf:
+ userConf = []
+ else:
+ userConf = _readUserConf()
+ argv = systemConf + userConf + commandLineConf
+
+ opts, args = parser.parse_args(argv)
+ if opts.verbose:
+ write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')
+ write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')
+ write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')
+
+ return parser, opts, args
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 16bc7408a..b644f4e92 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -280,6 +280,11 @@ if sys.version_info >= (2, 7):
return node.find(expr)
else:
def find_xpath_attr(node, xpath, key, val):
+ # Here comes the crazy part: In 2.6, if the xpath is a unicode,
+ # .//node does not match if a node is a direct child of . !
+ if isinstance(xpath, unicode):
+ xpath = xpath.encode('ascii')
+
for f in node.findall(xpath):
if f.attrib.get(key) == val:
return f
@@ -298,30 +303,20 @@ def xpath_with_ns(path, ns_map):
replaced.append('{%s}%s' % (ns_map[ns], tag))
return '/'.join(replaced)
-def htmlentity_transform(matchobj):
- """Transforms an HTML entity to a character.
- This function receives a match object and is intended to be used with
- the re.sub() function.
- """
- entity = matchobj.group(1)
+def xpath_text(node, xpath, name=None, fatal=False):
+ if sys.version_info < (2, 7): # Crazy 2.6
+ xpath = xpath.encode('ascii')
- # Known non-numeric HTML entity
- if entity in compat_html_entities.name2codepoint:
- return compat_chr(compat_html_entities.name2codepoint[entity])
-
- mobj = re.match(u'(?u)#(x?\\d+)', entity)
- if mobj is not None:
- numstr = mobj.group(1)
- if numstr.startswith(u'x'):
- base = 16
- numstr = u'0%s' % numstr
+ n = node.find(xpath)
+ if n is None:
+ if fatal:
+ name = xpath if name is None else name
+ raise ExtractorError('Could not find XML element %s' % name)
else:
- base = 10
- return compat_chr(int(numstr, base))
+ return None
+ return n.text
- # Unknown entity in name, return its literal representation
- return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
@@ -543,13 +538,33 @@ def orderedSet(iterable):
return res
+def _htmlentity_transform(entity):
+    """Transforms an HTML entity to a character.
+
+    ``entity`` is the text between "&" and ";" (for example "amp", "#47"
+    or "#x2F"). Returns the decoded character, or the literal "&entity;"
+    when the entity is unknown or cannot be decoded.
+    """
+    # Known non-numeric HTML entity
+    if entity in compat_html_entities.name2codepoint:
+        return compat_chr(compat_html_entities.name2codepoint[entity])
+
+    # Numeric entity, decimal (#47) or hexadecimal (#x2F). The hexadecimal
+    # branch has to accept the digits a-f: with x?[0-9]+ "#x2F" was cut off
+    # after "x2" and "#xff" was not recognized at all.
+    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
+    if mobj is not None:
+        numstr = mobj.group(1)
+        if numstr.startswith(u'x'):
+            base = 16
+            numstr = u'0%s' % numstr
+        else:
+            base = 10
+        try:
+            return compat_chr(int(numstr, base))
+        except (ValueError, OverflowError):
+            # NOTE(review): assumes compat_chr behaves like chr()/unichr(),
+            # which reject code points outside the valid range; fall through
+            # and keep such an entity verbatim instead of crashing.
+            pass
+
+    # Unknown entity in name, return its literal representation
+    return (u'&%s;' % entity)
+
+
def unescapeHTML(s):
if s is None:
return None
assert type(s) == compat_str
- result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
- return result
+ return re.sub(
+ r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def encodeFilename(s, for_subprocess=False):
@@ -621,7 +636,7 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
self.sock = sock
self._tunnel()
try:
- self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
+ self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
except ssl.SSLError:
self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
@@ -629,8 +644,14 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
def https_open(self, req):
return self.do_open(HTTPSConnectionV3, req)
return HTTPSHandlerV3(**kwargs)
- else:
- context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
+ elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
+ context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
+ context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
+ if opts_no_check_certificate:
+ context.verify_mode = ssl.CERT_NONE
+ return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
+ else: # Python < 3.4
+ context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
context.verify_mode = (ssl.CERT_NONE
if opts_no_check_certificate
else ssl.CERT_REQUIRED)
@@ -766,10 +787,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
return ret
def http_request(self, req):
- for h,v in std_headers.items():
- if h in req.headers:
- del req.headers[h]
- req.add_header(h, v)
+ for h, v in std_headers.items():
+ if h not in req.headers:
+ req.add_header(h, v)
if 'Youtubedl-no-compression' in req.headers:
if 'Accept-encoding' in req.headers:
del req.headers['Accept-encoding']
@@ -1081,12 +1101,6 @@ def intlist_to_bytes(xs):
return bytes(xs)
-def get_cachedir(params={}):
- cache_root = os.environ.get('XDG_CACHE_HOME',
- os.path.expanduser('~/.cache'))
- return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
-
-
# Cross-platform file locking
if sys.platform == 'win32':
import ctypes.wintypes
@@ -1146,10 +1160,10 @@ else:
import fcntl
def _lock_file(f, exclusive):
- fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
+ fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
def _unlock_file(f):
- fcntl.lockf(f, fcntl.LOCK_UN)
+ fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
@@ -1323,9 +1337,10 @@ def str_or_none(v, default=None):
def str_to_int(int_str):
+ """ A more relaxed version of int_or_none """
if int_str is None:
return None
- int_str = re.sub(r'[,\.]', u'', int_str)
+ int_str = re.sub(r'[,\.\+]', u'', int_str)
return int(int_str)
@@ -1337,8 +1352,10 @@ def parse_duration(s):
if s is None:
return None
+ s = s.strip()
+
m = re.match(
- r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?(?P<ms>\.[0-9]+)?$', s)
+ r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
if not m:
return None
res = int(m.group('secs'))
@@ -1420,6 +1437,24 @@ def uppercase_escape(s):
lambda m: unicode_escape(m.group(0))[0],
s)
+
+def escape_rfc3986(s):
+    """Percent-encode `s`, leaving RFC 3986 reserved characters untouched."""
+    # On Python 2 a unicode string is turned into UTF-8 bytes before quoting
+    is_py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
+    quotable = s.encode('utf-8') if is_py2_unicode else s
+    return compat_urllib_parse.quote(quotable, "%/;:@&=+$,!~*'()?#[]")
+
+
+def escape_url(url):
+    """Return `url` with its path, params, query and fragment escaped.
+
+    Each of those components is passed through escape_rfc3986(); the
+    scheme and network location are left exactly as parsed.
+    """
+    parts = compat_urllib_parse_urlparse(url)
+    escaped = dict(
+        (field, escape_rfc3986(getattr(parts, field)))
+        for field in ('path', 'params', 'query', 'fragment'))
+    return parts._replace(**escaped).geturl()
+
try:
struct.pack(u'!I', 0)
except TypeError:
@@ -1554,3 +1589,13 @@ except AttributeError:
if ret:
raise subprocess.CalledProcessError(ret, p.args, output=output)
return output
+
+
+def limit_length(s, length):
+    """Shorten `s` to at most `length` characters, ending it with '...'.
+
+    Returns None when `s` is None and `s` unchanged when it already fits.
+    When `length` leaves no room for text in front of the ellipsis, only
+    the ellipsis (clipped to `length`) is returned, so the result never
+    exceeds the requested limit.
+    """
+    if s is None:
+        return None
+    ELLIPSES = '...'
+    if len(s) <= length:
+        return s
+    if length <= len(ELLIPSES):
+        # A negative slice bound would count from the end of `s` and give
+        # a result longer than `length`; clip the ellipsis itself instead.
+        return ELLIPSES[:max(length, 0)]
+    return s[:length - len(ELLIPSES)] + ELLIPSES
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 7939e48e9..cf0d862da 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2014.08.25.3'
+__version__ = '2014.09.15.1'