diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2017-09-23 12:55:40 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2017-09-23 12:55:40 +0100 |
commit | 460b6c8e581aa06b86f59eebd9e52edfe7adf417 (patch) | |
tree | 9115b412fa484e5394ec05e27137849be5037daa | |
parent | c348b54ab5c3e6c80fbf365b671974fd92f39113 (diff) | |
parent | bb86d05f4afab3ebfee2e897e295d61dbd8cc28e (diff) |
Merge remote-tracking branch 'remotes/bonzini/tags/for-upstream' into staging
* Speed up AddressSpaceDispatch creation (Alexey)
* Fix kvm.c assert (David)
* Memory fixes and further speedup (me)
* Persistent reservation manager infrastructure (me)
* virtio-serial: add enable_backend callback (Pavel)
* chardev GMainContext fixes (Peter)
# gpg: Signature made Fri 22 Sep 2017 20:07:33 BST
# gpg: using RSA key 0xBFFBD25F78C7AE83
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>"
# gpg: aka "Paolo Bonzini <pbonzini@redhat.com>"
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4 E2F7 7E15 100C CD36 69B1
# Subkey fingerprint: F133 3857 4B66 2389 866C 7682 BFFB D25F 78C7 AE83
* remotes/bonzini/tags/for-upstream: (32 commits)
chardev: remove context in chr_update_read_handler
chardev: use per-dev context for io_add_watch_poll
chardev: add Chardev.gcontext field
chardev: new qemu_chr_be_update_read_handlers()
scsi: add persistent reservation manager using qemu-pr-helper
scsi: add multipath support to qemu-pr-helper
scsi: build qemu-pr-helper
scsi, file-posix: add support for persistent reservation management
memory: Share special empty FlatView
memory: seek FlatView sharing candidates among children subregions
memory: trace FlatView creation and destruction
memory: Create FlatView directly
memory: Get rid of address_space_init_shareable
memory: Rework "info mtree" to print flat views and dispatch trees
memory: Do not allocate FlatView in address_space_init
memory: Share FlatView's and dispatch trees between address spaces
memory: Move address_space_update_ioeventfds
memory: Alloc dispatch tree where topology is generared
memory: Store physical root MR in FlatView
memory: Rename mem_begin/mem_commit/mem_add helpers
...
# Conflicts:
# configure
44 files changed, 2550 insertions, 310 deletions
@@ -372,6 +372,11 @@ qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o $(COMMON_LDADDS) fsdev/virtfs-proxy-helper$(EXESUF): fsdev/virtfs-proxy-helper.o fsdev/9p-marshal.o fsdev/9p-iov-marshal.o $(COMMON_LDADDS) fsdev/virtfs-proxy-helper$(EXESUF): LIBS += -lcap +scsi/qemu-pr-helper$(EXESUF): scsi/qemu-pr-helper.o scsi/utils.o $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS) +ifdef CONFIG_MPATH +scsi/qemu-pr-helper$(EXESUF): LIBS += -ludev -lmultipath -lmpathpersist +endif + qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx $(SRC_PATH)/scripts/hxtool $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"GEN","$@") @@ -488,7 +493,7 @@ clean: rm -f *.msi find . \( -name '*.so' -o -name '*.dll' -o -name '*.mo' -o -name '*.[oda]' \) -type f -exec rm {} + rm -f $(filter-out %.tlb,$(TOOLS)) $(HELPERS-y) qemu-ga TAGS cscope.* *.pod *~ */*~ - rm -f fsdev/*.pod + rm -f fsdev/*.pod scsi/*.pod rm -f qemu-img-cmds.h rm -f ui/shader/*-vert.h ui/shader/*-frag.h @# May not be present in GENERATED_FILES diff --git a/Makefile.objs b/Makefile.objs index 9e89100302..bdfa3b6177 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -171,6 +171,7 @@ trace-events-subdirs += qapi trace-events-subdirs += accel/tcg trace-events-subdirs += accel/kvm trace-events-subdirs += nbd +trace-events-subdirs += scsi trace-events-files = $(SRC_PATH)/trace-events $(trace-events-subdirs:%=$(SRC_PATH)/%/trace-events) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index b0181d7220..4f1997deec 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -722,7 +722,6 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, mem = kvm_lookup_matching_slot(kml, start_addr, size); if (!add) { if (!mem) { - g_assert(!memory_region_is_ram(mr) && !writeable && !mr->romd_mode); return; } if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { diff --git a/block/file-posix.c b/block/file-posix.c index 6acbd56238..ab12a2b591 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -33,6 +33,9 @@ #include "block/raw-aio.h" #include "qapi/qmp/qstring.h" +#include "scsi/pr-manager.h" +#include "scsi/constants.h" + #if defined(__APPLE__) && (__MACH__) #include <paths.h> #include <sys/param.h> @@ -155,6 +158,8 @@ typedef struct BDRVRawState { bool page_cache_inconsistent:1; bool has_fallocate; bool needs_alignment; + + PRManager *pr_mgr; } BDRVRawState; typedef struct BDRVRawReopenState { @@ -402,6 +407,11 @@ static QemuOptsList raw_runtime_opts = { .type = QEMU_OPT_STRING, .help = "file locking mode (on/off/auto, default: auto)", }, + { + .name = "pr-manager", + .type = QEMU_OPT_STRING, + .help = "id of persistent reservation manager object (default: none)", + }, { /* end of list */ } }, }; @@ -413,6 +423,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, QemuOpts *opts; Error *local_err = NULL; const char *filename = NULL; + const char *str; BlockdevAioOptions aio, aio_default; int fd, ret; struct stat st; @@ -476,6 +487,16 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, abort(); } + str = qemu_opt_get(opts, "pr-manager"); + if (str) { + s->pr_mgr = pr_manager_lookup(str, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + } + s->open_flags = open_flags; raw_parse_flags(bdrv_flags, &s->open_flags); @@ -2597,6 +2618,15 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, if (fd_open(bs) < 0) return NULL; + if (req == SG_IO && s->pr_mgr) { + struct sg_io_hdr *io_hdr = buf; + if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || + io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { + return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), + s->fd, io_hdr, cb, opaque); + } + } + acb = g_new(RawPosixAIOData, 1); acb->bs = bs; acb->aio_type = QEMU_AIO_IOCTL; diff --git a/chardev/char-fd.c b/chardev/char-fd.c index 6a62a545f2..2c9b2ce567 100644 --- a/chardev/char-fd.c +++ b/chardev/char-fd.c @@ -84,8 +84,7 @@ static GSource *fd_chr_add_watch(Chardev *chr, GIOCondition cond) return qio_channel_create_watch(s->ioc_out, cond); } -static void fd_chr_update_read_handler(Chardev *chr, - GMainContext *context) +static void fd_chr_update_read_handler(Chardev *chr) { FDChardev *s = FD_CHARDEV(chr); @@ -94,7 +93,7 @@ static void fd_chr_update_read_handler(Chardev *chr, chr->gsource = io_add_watch_poll(chr, s->ioc_in, fd_chr_read_poll, fd_chr_read, chr, - context); + chr->gcontext); } } diff --git a/chardev/char-fe.c b/chardev/char-fe.c index f3af6ae584..ee6d596100 100644 --- a/chardev/char-fe.c +++ b/chardev/char-fe.c @@ -253,7 +253,6 @@ void qemu_chr_fe_set_handlers(CharBackend *b, bool set_open) { Chardev *s; - ChardevClass *cc; int fe_open; s = b->chr; @@ -261,7 +260,6 @@ void qemu_chr_fe_set_handlers(CharBackend *b, return; } - cc = CHARDEV_GET_CLASS(s); if (!opaque && !fd_can_read && !fd_read && !fd_event) { fe_open = 0; remove_fd_in_watch(s); @@ -273,9 +271,8 @@ void qemu_chr_fe_set_handlers(CharBackend *b, b->chr_event = fd_event; b->chr_be_change = be_change; b->opaque = opaque; - if (cc->chr_update_read_handler) { - cc->chr_update_read_handler(s, context); - } + + qemu_chr_be_update_read_handlers(s, context); if (set_open) { qemu_chr_fe_set_open(b, fe_open); diff --git a/chardev/char-pty.c b/chardev/char-pty.c index e5d20a0e6a..761ae6dec1 100644 --- a/chardev/char-pty.c +++ b/chardev/char-pty.c @@ -112,8 +112,7 @@ static void pty_chr_update_read_handler_locked(Chardev *chr) } } -static void pty_chr_update_read_handler(Chardev *chr, - GMainContext *context) +static void pty_chr_update_read_handler(Chardev *chr) { qemu_mutex_lock(&chr->chr_write_lock); pty_chr_update_read_handler_locked(chr); @@ -219,7 +218,7 @@ static void pty_chr_state(Chardev *chr, int connected) chr->gsource = io_add_watch_poll(chr, s->ioc, pty_chr_read_poll, pty_chr_read, - chr, NULL); + chr, chr->gcontext); } } } diff --git a/chardev/char-socket.c b/chardev/char-socket.c index 1ae730a4cb..e65148fe97 100644 --- a/chardev/char-socket.c +++ b/chardev/char-socket.c @@ -516,13 +516,12 @@ static void tcp_chr_connect(void *opaque) chr->gsource = io_add_watch_poll(chr, s->ioc, tcp_chr_read_poll, tcp_chr_read, - chr, NULL); + chr, chr->gcontext); } qemu_chr_be_event(chr, CHR_EVENT_OPENED); } -static void tcp_chr_update_read_handler(Chardev *chr, - GMainContext *context) +static void tcp_chr_update_read_handler(Chardev *chr) { SocketChardev *s = SOCKET_CHARDEV(chr); @@ -535,7 +534,7 @@ static void tcp_chr_update_read_handler(Chardev *chr, chr->gsource = io_add_watch_poll(chr, s->ioc, tcp_chr_read_poll, tcp_chr_read, chr, - context); + chr->gcontext); } } diff --git a/chardev/char-udp.c b/chardev/char-udp.c index 4ee11d3ebf..d46ff7ab53 100644 --- a/chardev/char-udp.c +++ b/chardev/char-udp.c @@ -100,8 +100,7 @@ static gboolean udp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque) return TRUE; } -static void udp_chr_update_read_handler(Chardev *chr, - GMainContext *context) +static void udp_chr_update_read_handler(Chardev *chr) { UdpChardev *s = UDP_CHARDEV(chr); @@ -110,7 +109,7 @@ static void udp_chr_update_read_handler(Chardev *chr, chr->gsource = io_add_watch_poll(chr, s->ioc, udp_chr_read_poll, udp_chr_read, chr, - context); + chr->gcontext); } } diff --git a/chardev/char.c b/chardev/char.c index b6fd5eb6a6..2ae4f465ec 100644 --- a/chardev/char.c +++ b/chardev/char.c @@ -180,6 +180,17 @@ void qemu_chr_be_write(Chardev *s, uint8_t *buf, int len) } } +void qemu_chr_be_update_read_handlers(Chardev *s, + GMainContext *context) +{ + ChardevClass *cc = CHARDEV_GET_CLASS(s); + + s->gcontext = context; + if (cc->chr_update_read_handler) { + cc->chr_update_read_handler(s); + } +} + int qemu_chr_add_client(Chardev *s, int fd) { return CHARDEV_GET_CLASS(s)->chr_add_client ? @@ -290,6 +290,7 @@ netmap="no" sdl="" sdlabi="" virtfs="" +mpath="" vnc="yes" sparse="no" vde="" @@ -936,6 +937,10 @@ for opt do ;; --enable-virtfs) virtfs="yes" ;; + --disable-mpath) mpath="no" + ;; + --enable-mpath) mpath="yes" + ;; --disable-vnc) vnc="no" ;; --enable-vnc) vnc="yes" @@ -1479,6 +1484,7 @@ disabled with --disable-FEATURE, default is enabled if available: vnc-png PNG compression for VNC server cocoa Cocoa UI (Mac OS X only) virtfs VirtFS + mpath Multipath persistent reservation passthrough xen xen backend driver support xen-pci-passthrough brlapi BrlAPI (Braile) @@ -3295,6 +3301,30 @@ else fi ########################################## +# libmpathpersist probe + +if test "$mpath" != "no" ; then + cat > $TMPC <<EOF +#include <libudev.h> +#include <mpath_persist.h> +unsigned mpath_mx_alloc_len = 1024; +int logsink; +int main(void) { + struct udev *udev = udev_new(); + mpath_lib_init(udev); + return 0; +} +EOF + if compile_prog "" "-ludev -lmultipath -lmpathpersist" ; then + mpathpersist=yes + else + mpathpersist=no + fi +else + mpathpersist=no +fi + +########################################## # libcap probe if test "$cap" != "no" ; then @@ -5023,16 +5053,34 @@ if test "$want_tools" = "yes" ; then fi fi if test "$softmmu" = yes ; then - if test "$virtfs" != no ; then - if test "$cap" = yes && test "$linux" = yes && test "$attr" = yes ; then + if test "$linux" = yes; then + if test "$virtfs" != no && test "$cap" = yes && test "$attr" = yes ; then virtfs=yes tools="$tools fsdev/virtfs-proxy-helper\$(EXESUF)" else if test "$virtfs" = yes; then - error_exit "VirtFS is supported only on Linux and requires libcap devel and libattr devel" + error_exit "VirtFS requires libcap devel and libattr devel" fi virtfs=no fi + if test "$mpath" != no && test "$mpathpersist" = yes ; then + mpath=yes + else + if test "$mpath" = yes; then + error_exit "Multipath requires libmpathpersist devel" + fi + mpath=no + fi + tools="$tools scsi/qemu-pr-helper\$(EXESUF)" + else + if test "$virtfs" = yes; then + error_exit "VirtFS is supported only on Linux" + fi + virtfs=no + if test "$mpath" = yes; then + error_exit "Multipath is supported only on Linux" + fi + mpath=no fi fi @@ -5278,6 +5326,7 @@ echo "Audio drivers $audio_drv_list" echo "Block whitelist (rw) $block_drv_rw_whitelist" echo "Block whitelist (ro) $block_drv_ro_whitelist" echo "VirtFS support $virtfs" +echo "Multipath support $mpath" echo "VNC support $vnc" if test "$vnc" = "yes" ; then echo "VNC SASL support $vnc_sasl" @@ -5729,6 +5778,9 @@ fi if test "$virtfs" = "yes" ; then echo "CONFIG_VIRTFS=y" >> $config_host_mak fi +if test "$mpath" = "yes" ; then + echo "CONFIG_MPATH=y" >> $config_host_mak +fi if test "$vhost_scsi" = "yes" ; then echo "CONFIG_VHOST_SCSI=y" >> $config_host_mak fi @@ -6510,7 +6562,7 @@ fi # build tree in object directory in case the source is not in the current directory DIRS="tests tests/tcg tests/tcg/cris tests/tcg/lm32 tests/libqos tests/qapi-schema tests/tcg/xtensa tests/qemu-iotests tests/vm" -DIRS="$DIRS docs docs/interop fsdev" +DIRS="$DIRS docs docs/interop fsdev scsi" DIRS="$DIRS pc-bios/optionrom pc-bios/spapr-rtas pc-bios/s390-ccw" DIRS="$DIRS roms/seabios roms/vgabios" DIRS="$DIRS qapi-generated" @@ -1764,8 +1764,9 @@ void qemu_init_vcpu(CPUState *cpu) /* If the target cpu hasn't set up any address spaces itself, * give it the default one. */ - AddressSpace *as = address_space_init_shareable(cpu->memory, - "cpu-memory"); + AddressSpace *as = g_new0(AddressSpace, 1); + + address_space_init(as, cpu->memory, "cpu-memory"); cpu->num_ases = 1; cpu_address_space_init(cpu, as, 0); } diff --git a/docs/devel/atomics.txt b/docs/devel/atomics.txt index 3ef5d85b1b..10c5fa37e8 100644 --- a/docs/devel/atomics.txt +++ b/docs/devel/atomics.txt @@ -63,11 +63,23 @@ operations: typeof(*ptr) atomic_fetch_sub(ptr, val) typeof(*ptr) atomic_fetch_and(ptr, val) typeof(*ptr) atomic_fetch_or(ptr, val) + typeof(*ptr) atomic_fetch_xor(ptr, val) + typeof(*ptr) atomic_fetch_inc_nonzero(ptr) typeof(*ptr) atomic_xchg(ptr, val) typeof(*ptr) atomic_cmpxchg(ptr, old, new) all of which return the old value of *ptr. These operations are -polymorphic; they operate on any type that is as wide as an int. +polymorphic; they operate on any type that is as wide as a pointer. + +Similar operations return the new value of *ptr: + + typeof(*ptr) atomic_inc_fetch(ptr) + typeof(*ptr) atomic_dec_fetch(ptr) + typeof(*ptr) atomic_add_fetch(ptr, val) + typeof(*ptr) atomic_sub_fetch(ptr, val) + typeof(*ptr) atomic_and_fetch(ptr, val) + typeof(*ptr) atomic_or_fetch(ptr, val) + typeof(*ptr) atomic_xor_fetch(ptr, val) Sequentially consistent loads and stores can be done using: diff --git a/docs/interop/pr-helper.rst b/docs/interop/pr-helper.rst new file mode 100644 index 0000000000..9f76d5bcf9 --- /dev/null +++ b/docs/interop/pr-helper.rst @@ -0,0 +1,83 @@ +.. + +====================================== +Persistent reservation helper protocol +====================================== + +QEMU's SCSI passthrough devices, ``scsi-block`` and ``scsi-generic``, +can delegate implementation of persistent reservations to an external +(and typically privileged) program. Persistent Reservations allow +restricting access to block devices to specific initiators in a shared +storage setup. + +For a more detailed reference please refer the the SCSI Primary +Commands standard, specifically the section on Reservations and the +"PERSISTENT RESERVE IN" and "PERSISTENT RESERVE OUT" commands. + +This document describes the socket protocol used between QEMU's +``pr-manager-helper`` object and the external program. + +.. contents:: + +Connection and initialization +----------------------------- + +All data transmitted on the socket is big-endian. + +After connecting to the helper program's socket, the helper starts a simple +feature negotiation process by writing four bytes corresponding to +the features it exposes (``supported_features``). QEMU reads it, +then writes four bytes corresponding to the desired features of the +helper program (``requested_features``). + +If a bit is 1 in ``requested_features`` and 0 in ``supported_features``, +the corresponding feature is not supported by the helper and the connection +is closed. On the other hand, it is acceptable for a bit to be 0 in +``requested_features`` and 1 in ``supported_features``; in this case, +the helper will not enable the feature. + +Right now no feature is defined, so the two parties always write four +zero bytes. + +Command format +-------------- + +It is invalid to send multiple commands concurrently on the same +socket. It is however possible to connect multiple sockets to the +helper and send multiple commands to the helper for one or more +file descriptors. + +A command consists of a request and a response. A request consists +of a 16-byte SCSI CDB. A file descriptor must be passed to the helper +together with the SCSI CDB using ancillary data. + +The CDB has the following limitations: + +- the command (stored in the first byte) must be one of 0x5E + (PERSISTENT RESERVE IN) or 0x5F (PERSISTENT RESERVE OUT). + +- the allocation length (stored in bytes 7-8 of the CDB for PERSISTENT + RESERVE IN) or parameter list length (stored in bytes 5-8 of the CDB + for PERSISTENT RESERVE OUT) is limited to 8 KiB. + +For PERSISTENT RESERVE OUT, the parameter list is sent right after the +CDB. The length of the parameter list is taken from the CDB itself. + +The helper's reply has the following structure: + +- 4 bytes for the SCSI status + +- 4 bytes for the payload size (nonzero only for PERSISTENT RESERVE IN + and only if the SCSI status is 0x00, i.e. GOOD) + +- 96 bytes for the SCSI sense data + +- if the size is nonzero, the payload follows + +The sense data is always sent to keep the protocol simple, even though +it is only valid if the SCSI status is CHECK CONDITION (0x02). + +The payload size is always less than or equal to the allocation length +specified in the CDB for the PERSISTENT RESERVE IN command. + +If the protocol is violated, the helper closes the socket. diff --git a/docs/pr-manager.rst b/docs/pr-manager.rst new file mode 100644 index 0000000000..9b1de198b1 --- /dev/null +++ b/docs/pr-manager.rst @@ -0,0 +1,111 @@ +====================================== +Persistent reservation managers +====================================== + +SCSI persistent Reservations allow restricting access to block devices +to specific initiators in a shared storage setup. When implementing +clustering of virtual machines, it is a common requirement for virtual +machines to send persistent reservation SCSI commands. However, +the operating system restricts sending these commands to unprivileged +programs because incorrect usage can disrupt regular operation of the +storage fabric. + +For this reason, QEMU's SCSI passthrough devices, ``scsi-block`` +and ``scsi-generic`` (both are only available on Linux) can delegate +implementation of persistent reservations to a separate object, +the "persistent reservation manager". Only PERSISTENT RESERVE OUT and +PERSISTENT RESERVE IN commands are passed to the persistent reservation +manager object; other commands are processed by QEMU as usual. + +----------------------------------------- +Defining a persistent reservation manager +----------------------------------------- + +A persistent reservation manager is an instance of a subclass of the +"pr-manager" QOM class. + +Right now only one subclass is defined, ``pr-manager-helper``, which +forwards the commands to an external privileged helper program +over Unix sockets. The helper program only allows sending persistent +reservation commands to devices for which QEMU has a file descriptor, +so that QEMU will not be able to effect persistent reservations +unless it has access to both the socket and the device. + +``pr-manager-helper`` has a single string property, ``path``, which +accepts the path to the helper program's Unix socket. For example, +the following command line defines a ``pr-manager-helper`` object and +attaches it to a SCSI passthrough device:: + + $ qemu-system-x86_64 + -device virtio-scsi \ + -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock + -drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0 + -device scsi-block,drive=hd + +Alternatively, using ``-blockdev``:: + + $ qemu-system-x86_64 + -device virtio-scsi \ + -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock + -blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 + -device scsi-block,drive=hd + +---------------------------------- +Invoking :program:`qemu-pr-helper` +---------------------------------- + +QEMU provides an implementation of the persistent reservation helper, +called :program:`qemu-pr-helper`. The helper should be started as a +system service and supports the following option: + +-d, --daemon run in the background +-q, --quiet decrease verbosity +-v, --verbose increase verbosity +-f, --pidfile=path PID file when running as a daemon +-k, --socket=path path to the socket +-T, --trace=trace-opts tracing options + +By default, the socket and PID file are placed in the runtime state +directory, for example :file:`/var/run/qemu-pr-helper.sock` and +:file:`/var/run/qemu-pr-helper.pid`. The PID file is not created +unless :option:`-d` is passed too. + +:program:`qemu-pr-helper` can also use the systemd socket activation +protocol. In this case, the systemd socket unit should specify a +Unix stream socket, like this:: + + [Socket] + ListenStream=/var/run/qemu-pr-helper.sock + +After connecting to the socket, :program:`qemu-pr-helper`` can optionally drop +root privileges, except for those capabilities that are needed for +its operation. To do this, add the following options: + +-u, --user=user user to drop privileges to +-g, --group=group group to drop privileges to + +--------------------------------------------- +Multipath devices and persistent reservations +--------------------------------------------- + +Proper support of persistent reservation for multipath devices requires +communication with the multipath daemon, so that the reservation is +registered and applied when a path is newly discovered or becomes online +again. :command:`qemu-pr-helper` can do this if the ``libmpathpersist`` +library was available on the system at build time. + +As of August 2017, a reservation key must be specified in ``multipath.conf`` +for ``multipathd`` to check for persistent reservation for newly +discovered paths or reinstated paths. The attribute can be added +to the ``defaults`` section or the ``multipaths`` section; for example:: + + multipaths { + multipath { + wwid XXXXXXXXXXXXXXXX + alias yellow + reservation_key 0x123abc + } + } + +Linking :program:`qemu-pr-helper` to ``libmpathpersist`` does not impede +its usage on regular SCSI devices. @@ -187,21 +187,18 @@ typedef struct PhysPageMap { } PhysPageMap; struct AddressSpaceDispatch { - struct rcu_head rcu; - MemoryRegionSection *mru_section; /* This is a multi-level map on the physical address space. * The bottom level has pointers to MemoryRegionSections. */ PhysPageEntry phys_map; PhysPageMap map; - AddressSpace *as; }; #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK) typedef struct subpage_t { MemoryRegion iomem; - AddressSpace *as; + FlatView *fv; hwaddr base; uint16_t sub_section[]; } subpage_t; @@ -361,7 +358,7 @@ static void phys_page_compact(PhysPageEntry *lp, Node *nodes) } } -static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb) +void address_space_dispatch_compact(AddressSpaceDispatch *d) { if (d->phys_map.skip) { phys_page_compact(&d->phys_map, d->map.nodes); @@ -471,12 +468,13 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x } /* Called from RCU critical section */ -static MemoryRegionSection address_space_do_translate(AddressSpace *as, - hwaddr addr, - hwaddr *xlat, - hwaddr *plen, - bool is_write, - bool is_mmio) +static MemoryRegionSection flatview_do_translate(FlatView *fv, + hwaddr addr, + hwaddr *xlat, + hwaddr *plen, + bool is_write, + bool is_mmio, + AddressSpace **target_as) { IOMMUTLBEntry iotlb; MemoryRegionSection *section; @@ -484,8 +482,9 @@ static MemoryRegionSection address_space_do_translate(AddressSpace *as, IOMMUMemoryRegionClass *imrc; for (;;) { - AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch); - section = address_space_translate_internal(d, addr, &addr, plen, is_mmio); + section = address_space_translate_internal( + flatview_to_dispatch(fv), addr, &addr, + plen, is_mmio); iommu_mr = memory_region_get_iommu(section->mr); if (!iommu_mr) { @@ -502,7 +501,8 @@ static MemoryRegionSection address_space_do_translate(AddressSpace *as, goto translate_fail; } - as = iotlb.target_as; + fv = address_space_to_flatview(iotlb.target_as); + *target_as = iotlb.target_as; } *xlat = addr; @@ -524,8 +524,8 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, plen = (hwaddr)-1; /* This can never be MMIO. */ - section = address_space_do_translate(as, addr, &xlat, &plen, - is_write, false); + section = flatview_do_translate(address_space_to_flatview(as), addr, + &xlat, &plen, is_write, false, &as); /* Illegal translation */ if (section.mr == &io_mem_unassigned) { @@ -548,7 +548,7 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, plen -= 1; return (IOMMUTLBEntry) { - .target_as = section.address_space, + .target_as = as, .iova = addr & ~plen, .translated_addr = xlat & ~plen, .addr_mask = plen, @@ -561,15 +561,15 @@ iotlb_fail: } /* Called from RCU critical section */ -MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr, - hwaddr *xlat, hwaddr *plen, - bool is_write) +MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, + hwaddr *plen, bool is_write) { MemoryRegion *mr; MemoryRegionSection section; + AddressSpace *as = NULL; /* This can be MMIO, so setup MMIO bit. */ - section = address_space_do_translate(as, addr, xlat, plen, is_write, true); + section = flatview_do_translate(fv, addr, xlat, plen, is_write, true, &as); mr = section.mr; if (xen_enabled() && memory_access_is_direct(mr, is_write)) { @@ -1219,7 +1219,7 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu, } else { AddressSpaceDispatch *d; - d = atomic_rcu_read(§ion->address_space->dispatch); + d = flatview_to_dispatch(section->fv); iotlb = section - d->map.sections; iotlb += xlat; } @@ -1245,7 +1245,7 @@ hwaddr memory_region_section_get_iotlb(CPUState *cpu, static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end, uint16_t section); -static subpage_t *subpage_init(AddressSpace *as, hwaddr base); +static subpage_t *subpage_init(FlatView *fv, hwaddr base); static void *(*phys_mem_alloc)(size_t size, uint64_t *align) = qemu_anon_ram_alloc; @@ -1302,8 +1302,9 @@ static void phys_sections_free(PhysPageMap *map) g_free(map->nodes); } -static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section) +static void register_subpage(FlatView *fv, MemoryRegionSection *section) { + AddressSpaceDispatch *d = flatview_to_dispatch(fv); subpage_t *subpage; hwaddr base = section->offset_within_address_space & TARGET_PAGE_MASK; @@ -1317,8 +1318,8 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti assert(existing->mr->subpage || existing->mr == &io_mem_unassigned); if (!(existing->mr->subpage)) { - subpage = subpage_init(d->as, base); - subsection.address_space = d->as; + subpage = subpage_init(fv, base); + subsection.fv = fv; subsection.mr = &subpage->iomem; phys_page_set(d, base >> TARGET_PAGE_BITS, 1, phys_section_add(&d->map, &subsection)); @@ -1332,9 +1333,10 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti } -static void register_multipage(AddressSpaceDispatch *d, +static void register_multipage(FlatView *fv, MemoryRegionSection *section) { + AddressSpaceDispatch *d = flatview_to_dispatch(fv); hwaddr start_addr = section->offset_within_address_space; uint16_t section_index = phys_section_add(&d->map, section); uint64_t num_pages = int128_get64(int128_rshift(section->size, @@ -1344,10 +1346,8 @@ static void register_multipage(AddressSpaceDispatch *d, phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index); } -static void mem_add(MemoryListener *listener, MemoryRegionSection *section) +void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section) { - AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener); - AddressSpaceDispatch *d = as->next_dispatch; MemoryRegionSection now = *section, remain = *section; Int128 page_size = int128_make64(TARGET_PAGE_SIZE); @@ -1356,7 +1356,7 @@ static void mem_add(MemoryListener *listener, MemoryRegionSection *section) - now.offset_within_address_space; now.size = int128_min(int128_make64(left), now.size); - register_subpage(d, &now); + register_subpage(fv, &now); } else { now.size = int128_zero(); } @@ -1366,13 +1366,13 @@ static void mem_add(MemoryListener *listener, MemoryRegionSection *section) remain.offset_within_region += int128_get64(now.size); now = remain; if (int128_lt(remain.size, page_size)) { - register_subpage(d, &now); + register_subpage(fv, &now); } else if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) { now.size = page_size; - register_subpage(d, &now); + register_subpage(fv, &now); } else { now.size = int128_and(now.size, int128_neg(page_size)); - register_multipage(d, &now); + register_multipage(fv, &now); } } } @@ -2500,6 +2500,11 @@ static const MemoryRegionOps watch_mem_ops = { .endianness = DEVICE_NATIVE_ENDIAN, }; +static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + const uint8_t *buf, int len); +static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len, + bool is_write); + static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data, unsigned len, MemTxAttrs attrs) { @@ -2511,8 +2516,7 @@ static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data, printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__, subpage, len, addr); #endif - res = address_space_read(subpage->as, addr + subpage->base, - attrs, buf, len); + res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len); if (res) { return res; } @@ -2561,8 +2565,7 @@ static MemTxResult subpage_write(void *opaque, hwaddr addr, default: abort(); } - return address_space_write(subpage->as, addr + subpage->base, - attrs, buf, len); + return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len); } static bool subpage_accepts(void *opaque, hwaddr addr, @@ -2574,8 +2577,8 @@ static bool subpage_accepts(void *opaque, hwaddr addr, __func__, subpage, is_write ? 'w' : 'r', len, addr); #endif - return address_space_access_valid(subpage->as, addr + subpage->base, - len, is_write); + return flatview_access_valid(subpage->fv, addr + subpage->base, + len, is_write); } static const MemoryRegionOps subpage_ops = { @@ -2609,12 +2612,12 @@ static int subpage_register (subpage_t *mmio, uint32_t start, uint32_t end, return 0; } -static subpage_t *subpage_init(AddressSpace *as, hwaddr base) +static subpage_t *subpage_init(FlatView *fv, hwaddr base) { subpage_t *mmio; mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t)); - mmio->as = as; + mmio->fv = fv; mmio->base = base; memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio, NULL, TARGET_PAGE_SIZE); @@ -2628,12 +2631,11 @@ static subpage_t *subpage_init(AddressSpace *as, hwaddr base) return mmio; } -static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as, - MemoryRegion *mr) +static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr) { - assert(as); + assert(fv); MemoryRegionSection section = { - .address_space = as, + .fv = fv, .mr = mr, .offset_within_address_space = 0, .offset_within_region = 0, @@ -2670,46 +2672,31 @@ static void io_mem_init(void) NULL, UINT64_MAX); } -static void mem_begin(MemoryListener *listener) +AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv) { - AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener); AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1); uint16_t n; - n = dummy_section(&d->map, as, &io_mem_unassigned); + n = dummy_section(&d->map, fv, &io_mem_unassigned); assert(n == PHYS_SECTION_UNASSIGNED); - n = dummy_section(&d->map, as, &io_mem_notdirty); + n = dummy_section(&d->map, fv, &io_mem_notdirty); assert(n == PHYS_SECTION_NOTDIRTY); - n = dummy_section(&d->map, as, &io_mem_rom); + n = dummy_section(&d->map, fv, &io_mem_rom); assert(n == PHYS_SECTION_ROM); - n = dummy_section(&d->map, as, &io_mem_watch); + n = dummy_section(&d->map, fv, &io_mem_watch); assert(n == PHYS_SECTION_WATCH); d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 }; - d->as = as; - as->next_dispatch = d; + + return d; } -static void address_space_dispatch_free(AddressSpaceDispatch *d) +void address_space_dispatch_free(AddressSpaceDispatch *d) { phys_sections_free(&d->map); g_free(d); } -static void mem_commit(MemoryListener *listener) -{ - AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener); - AddressSpaceDispatch *cur = as->dispatch; - AddressSpaceDispatch *next = as->next_dispatch; - - phys_page_compact_all(next, next->map.nodes_nb); - - atomic_rcu_set(&as->dispatch, next); - if (cur) { - call_rcu(cur, address_space_dispatch_free, rcu); - } -} - static void tcg_commit(MemoryListener *listener) { CPUAddressSpace *cpuas; @@ -2723,39 +2710,11 @@ static void tcg_commit(MemoryListener *listener) * We reload the dispatch pointer now because cpu_reloading_memory_map() * may have split the RCU critical section. */ - d = atomic_rcu_read(&cpuas->as->dispatch); + d = address_space_to_dispatch(cpuas->as); atomic_rcu_set(&cpuas->memory_dispatch, d); tlb_flush(cpuas->cpu); } -void address_space_init_dispatch(AddressSpace *as) -{ - as->dispatch = NULL; - as->dispatch_listener = (MemoryListener) { - .begin = mem_begin, - .commit = mem_commit, - .region_add = mem_add, - .region_nop = mem_add, - .priority = 0, - }; - memory_listener_register(&as->dispatch_listener, as); -} - -void address_space_unregister(AddressSpace *as) -{ - memory_listener_unregister(&as->dispatch_listener); -} - -void address_space_destroy_dispatch(AddressSpace *as) -{ - AddressSpaceDispatch *d = as->dispatch; - - atomic_rcu_set(&as->dispatch, NULL); - if (d) { - call_rcu(d, address_space_dispatch_free, rcu); - } -} - static void memory_map_init(void) { system_memory = g_malloc(sizeof(*system_memory)); @@ -2899,11 +2858,11 @@ static bool prepare_mmio_access(MemoryRegion *mr) } /* Called within RCU critical section. */ -static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, - const uint8_t *buf, - int len, hwaddr addr1, - hwaddr l, MemoryRegion *mr) +static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, + const uint8_t *buf, + int len, hwaddr addr1, + hwaddr l, MemoryRegion *mr) { uint8_t *ptr; uint64_t val; @@ -2965,14 +2924,14 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr, } l = len; - mr = address_space_translate(as, addr, &addr1, &l, true); + mr = flatview_translate(fv, addr, &addr1, &l, true); } return result; } -MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, - const uint8_t *buf, int len) +static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + const uint8_t *buf, int len) { hwaddr l; hwaddr addr1; @@ -2982,20 +2941,27 @@ MemTxResult address_space_write(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, if (len > 0) { rcu_read_lock(); l = len; - mr = address_space_translate(as, addr, &addr1, &l, true); - result = address_space_write_continue(as, addr, attrs, buf, len, - addr1, l, mr); + mr = flatview_translate(fv, addr, &addr1, &l, true); + result = flatview_write_continue(fv, addr, attrs, buf, len, + addr1, l, mr); rcu_read_unlock(); } return result; } +MemTxResult address_space_write(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, + const uint8_t *buf, int len) +{ + return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len); +} + /* Called within RCU critical section. */ -MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, - int len, hwaddr addr1, hwaddr l, - MemoryRegion *mr) +MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, + int len, hwaddr addr1, hwaddr l, + MemoryRegion *mr) { uint8_t *ptr; uint64_t val; @@ -3055,14 +3021,14 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, } l = len; - mr = address_space_translate(as, addr, &addr1, &l, false); + mr = flatview_translate(fv, addr, &addr1, &l, false); } return result; } -MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, int len) +MemTxResult flatview_read_full(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, int len) { hwaddr l; hwaddr addr1; @@ -3072,25 +3038,33 @@ MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, if (len > 0) { rcu_read_lock(); l = len; - mr = address_space_translate(as, addr, &addr1, &l, false); - result = address_space_read_continue(as, addr, attrs, buf, len, - addr1, l, mr); + mr = flatview_translate(fv, addr, &addr1, &l, false); + result = flatview_read_continue(fv, addr, attrs, buf, len, + addr1, l, mr); rcu_read_unlock(); } return result; } -MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, - uint8_t *buf, int len, bool is_write) +static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + uint8_t *buf, int len, bool is_write) { if (is_write) { - return address_space_write(as, addr, attrs, (uint8_t *)buf, len); + return flatview_write(fv, addr, attrs, (uint8_t *)buf, len); } else { - return address_space_read(as, addr, attrs, (uint8_t *)buf, len); + return flatview_read(fv, addr, attrs, (uint8_t *)buf, len); } } +MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, + int len, bool is_write) +{ + return flatview_rw(address_space_to_flatview(as), + addr, attrs, buf, len, is_write); +} + void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf, int len, int is_write) { @@ -3248,7 +3222,8 @@ static void cpu_notify_map_clients(void) qemu_mutex_unlock(&map_client_list_lock); } -bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_write) +static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len, + bool is_write) { MemoryRegion *mr; hwaddr l, xlat; @@ -3256,7 +3231,7 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_ rcu_read_lock(); while (len > 0) { l = len; - mr = address_space_translate(as, addr, &xlat, &l, is_write); + mr = flatview_translate(fv, addr, &xlat, &l, is_write); if (!memory_access_is_direct(mr, is_write)) { l = memory_access_size(mr, l, addr); if (!memory_region_access_valid(mr, xlat, l, is_write)) { @@ -3272,8 +3247,16 @@ bool address_space_access_valid(AddressSpace *as, hwaddr addr, int len, bool is_ return true; } +bool address_space_access_valid(AddressSpace *as, hwaddr addr, + int len, bool is_write) +{ + return flatview_access_valid(address_space_to_flatview(as), + addr, len, is_write); +} + static hwaddr -address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_len, +flatview_extend_translation(FlatView *fv, hwaddr addr, + hwaddr target_len, MemoryRegion *mr, hwaddr base, hwaddr len, bool is_write) { @@ -3290,7 +3273,8 @@ address_space_extend_translation(AddressSpace *as, hwaddr addr, hwaddr target_le } len = target_len; - this_mr = address_space_translate(as, addr, &xlat, &len, is_write); + this_mr = flatview_translate(fv, addr, &xlat, + &len, is_write); if (this_mr != mr || xlat != base + done) { return done; } @@ -3313,6 +3297,7 @@ void *address_space_map(AddressSpace *as, hwaddr l, xlat; MemoryRegion *mr; void *ptr; + FlatView *fv = address_space_to_flatview(as); if (len == 0) { return NULL; @@ -3320,7 +3305,7 @@ void *address_space_map(AddressSpace *as, l = len; rcu_read_lock(); - mr = address_space_translate(as, addr, &xlat, &l, is_write); + mr = flatview_translate(fv, addr, &xlat, &l, is_write); if (!memory_access_is_direct(mr, is_write)) { if (atomic_xchg(&bounce.in_use, true)) { @@ -3336,7 +3321,7 @@ void *address_space_map(AddressSpace *as, memory_region_ref(mr); bounce.mr = mr; if (!is_write) { - address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED, + flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, bounce.buffer, l); } @@ -3347,7 +3332,8 @@ void *address_space_map(AddressSpace *as, memory_region_ref(mr); - *plen = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write); + *plen = flatview_extend_translation(fv, addr, len, mr, xlat, + l, is_write); ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); rcu_read_unlock(); @@ -3630,3 +3616,87 @@ void page_size_init(void) } qemu_host_page_mask = -(intptr_t)qemu_host_page_size; } + +#if !defined(CONFIG_USER_ONLY) + +static void mtree_print_phys_entries(fprintf_function mon, void *f, + int start, int end, int skip, int ptr) +{ + if (start == end - 1) { + mon(f, "\t%3d ", start); + } else { + mon(f, "\t%3d..%-3d ", start, end - 1); + } + mon(f, " skip=%d ", skip); + if (ptr == PHYS_MAP_NODE_NIL) { + mon(f, " ptr=NIL"); + } else if (!skip) { + mon(f, " ptr=#%d", ptr); + } else { + mon(f, " ptr=[%d]", ptr); + } + mon(f, "\n"); +} + +#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \ + int128_sub((size), int128_one())) : 0) + +void mtree_print_dispatch(fprintf_function mon, void *f, + AddressSpaceDispatch *d, MemoryRegion *root) +{ + int i; + + mon(f, " Dispatch\n"); + mon(f, " Physical sections\n"); + + for (i = 0; i < d->map.sections_nb; ++i) { + MemoryRegionSection *s = d->map.sections + i; + const char *names[] = { " [unassigned]", " [not dirty]", + " [ROM]", " [watch]" }; + + mon(f, " #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx " %s%s%s%s%s", + i, + s->offset_within_address_space, + s->offset_within_address_space + MR_SIZE(s->mr->size), + s->mr->name ? s->mr->name : "(noname)", + i < ARRAY_SIZE(names) ? names[i] : "", + s->mr == root ? " [ROOT]" : "", + s == d->mru_section ? " [MRU]" : "", + s->mr->is_iommu ? " [iommu]" : ""); + + if (s->mr->alias) { + mon(f, " alias=%s", s->mr->alias->name ? + s->mr->alias->name : "noname"); + } + mon(f, "\n"); + } + + mon(f, " Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n", + P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip); + for (i = 0; i < d->map.nodes_nb; ++i) { + int j, jprev; + PhysPageEntry prev; + Node *n = d->map.nodes + i; + + mon(f, " [%d]\n", i); + + for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) { + PhysPageEntry *pe = *n + j; + + if (pe->ptr == prev.ptr && pe->skip == prev.skip) { + continue; + } + + mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr); + + jprev = j; + prev = *pe; + } + + if (jprev != ARRAY_SIZE(*n)) { + mtree_print_phys_entries(mon, f, jprev, j, prev.skip, prev.ptr); + } + } +} + +#endif diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx index 1c6772597d..4f1ece93e5 100644 --- a/hmp-commands-info.hx +++ b/hmp-commands-info.hx @@ -250,9 +250,10 @@ ETEXI { .name = "mtree", - .args_type = "flatview:-f", - .params = "[-f]", - .help = "show memory tree (-f: dump flat view for address spaces)", + .args_type = "flatview:-f,dispatch_tree:-d", + .params = "[-f][-d]", + .help = "show memory tree (-f: dump flat view for address spaces;" + "-d: dump dispatch tree, valid with -f only)", .cmd = hmp_info_mtree, }, diff --git a/hw/arm/armv7m.c b/hw/arm/armv7m.c index 57a680687a..bb2dfc942b 100644 --- a/hw/arm/armv7m.c +++ b/hw/arm/armv7m.c @@ -41,7 +41,7 @@ static MemTxResult bitband_read(void *opaque, hwaddr offset, /* Find address in underlying memory and round down to multiple of size */ addr = bitband_addr(s, offset) & (-size); - res = address_space_read(s->source_as, addr, attrs, buf, size); + res = address_space_read(&s->source_as, addr, attrs, buf, size); if (res) { return res; } @@ -66,7 +66,7 @@ static MemTxResult bitband_write(void *opaque, hwaddr offset, uint64_t value, /* Find address in underlying memory and round down to multiple of size */ addr = bitband_addr(s, offset) & (-size); - res = address_space_read(s->source_as, addr, attrs, buf, size); + res = address_space_read(&s->source_as, addr, attrs, buf, size); if (res) { return res; } @@ -79,7 +79,7 @@ static MemTxResult bitband_write(void *opaque, hwaddr offset, uint64_t value, } else { buf[bitpos >> 3] &= ~bit; } - return address_space_write(s->source_as, addr, attrs, buf, size); + return address_space_write(&s->source_as, addr, attrs, buf, size); } static const MemoryRegionOps bitband_ops = { @@ -111,8 +111,7 @@ static void bitband_realize(DeviceState *dev, Error **errp) return; } - s->source_as = address_space_init_shareable(s->source_memory, - "bitband-source"); + address_space_init(&s->source_as, s->source_memory, "bitband-source"); } /* Board init. */ diff --git a/hw/char/virtio-console.c b/hw/char/virtio-console.c index 198b2a89c0..172c72d06c 100644 --- a/hw/char/virtio-console.c +++ b/hw/char/virtio-console.c @@ -187,6 +187,26 @@ static int chr_be_change(void *opaque) return 0; } +static void virtconsole_enable_backend(VirtIOSerialPort *port, bool enable) +{ + VirtConsole *vcon = VIRTIO_CONSOLE(port); + + if (!qemu_chr_fe_backend_connected(&vcon->chr)) { + return; + } + + if (enable) { + VirtIOSerialPortClass *k = VIRTIO_SERIAL_PORT_GET_CLASS(port); + + qemu_chr_fe_set_handlers(&vcon->chr, chr_can_read, chr_read, + k->is_console ? NULL : chr_event, + chr_be_change, vcon, NULL, false); + } else { + qemu_chr_fe_set_handlers(&vcon->chr, NULL, NULL, NULL, + NULL, NULL, NULL, false); + } +} + static void virtconsole_realize(DeviceState *dev, Error **errp) { VirtIOSerialPort *port = VIRTIO_SERIAL_PORT(dev); @@ -258,6 +278,7 @@ static void virtserialport_class_init(ObjectClass *klass, void *data) k->unrealize = virtconsole_unrealize; k->have_data = flush_buf; k->set_guest_connected = set_guest_connected; + k->enable_backend = virtconsole_enable_backend; k->guest_writable = guest_writable; dc->props = virtserialport_properties; } diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index 17a1bb008a..9470bd7be7 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -637,6 +637,13 @@ static void set_status(VirtIODevice *vdev, uint8_t status) if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { guest_reset(vser); } + + QTAILQ_FOREACH(port, &vser->ports, next) { + VirtIOSerialPortClass *vsc = VIRTIO_SERIAL_PORT_GET_CLASS(port); + if (vsc->enable_backend) { + vsc->enable_backend(port, vdev->vm_running); + } + } } static void vser_reset(VirtIODevice *vdev) diff --git a/hw/intc/openpic_kvm.c b/hw/intc/openpic_kvm.c index 0518e017c4..fa83420254 100644 --- a/hw/intc/openpic_kvm.c +++ b/hw/intc/openpic_kvm.c @@ -124,7 +124,7 @@ static void kvm_openpic_region_add(MemoryListener *listener, uint64_t reg_base; int ret; - if (section->address_space != &address_space_memory) { + if (section->fv != address_space_to_flatview(&address_space_memory)) { abort(); } diff --git a/include/chardev/char.h b/include/chardev/char.h index 66dde4637e..43aabccef5 100644 --- a/include/chardev/char.h +++ b/include/chardev/char.h @@ -55,6 +55,7 @@ struct Chardev { int logfd; int be_open; GSource *gsource; + GMainContext *gcontext; DECLARE_BITMAP(features, QEMU_CHAR_FEATURE_LAST); }; @@ -169,6 +170,16 @@ void qemu_chr_be_write(Chardev *s, uint8_t *buf, int len); void qemu_chr_be_write_impl(Chardev *s, uint8_t *buf, int len); /** + * @qemu_chr_be_update_read_handlers: + * + * Invoked when frontend read handlers are setup + * + * @context the gcontext that will be used to attach the watch sources + */ +void qemu_chr_be_update_read_handlers(Chardev *s, + GMainContext *context); + +/** * @qemu_chr_be_event: * * Send an event from the back end to the front end. @@ -227,7 +238,7 @@ typedef struct ChardevClass { int (*chr_write)(Chardev *s, const uint8_t *buf, int len); int (*chr_sync_read)(Chardev *s, const uint8_t *buf, int len); GSource *(*chr_add_watch)(Chardev *s, GIOCondition cond); - void (*chr_update_read_handler)(Chardev *s, GMainContext *context); + void (*chr_update_read_handler)(Chardev *s); int (*chr_ioctl)(Chardev *s, int cmd, void *arg); int (*get_msgfds)(Chardev *s, int* fds, int num); int (*set_msgfds)(Chardev *s, int *fds, int num); diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h index fb467acdba..647e9bd5c4 100644 --- a/include/exec/memory-internal.h +++ b/include/exec/memory-internal.h @@ -22,14 +22,22 @@ #ifndef CONFIG_USER_ONLY typedef struct AddressSpaceDispatch AddressSpaceDispatch; -void address_space_init_dispatch(AddressSpace *as); -void address_space_unregister(AddressSpace *as); -void address_space_destroy_dispatch(AddressSpace *as); - extern const MemoryRegionOps unassigned_mem_ops; bool memory_region_access_valid(MemoryRegion *mr, hwaddr addr, unsigned size, bool is_write); +void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section); +AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv); +void address_space_dispatch_compact(AddressSpaceDispatch *d); + +AddressSpaceDispatch *address_space_to_dispatch(AddressSpace *as); +AddressSpaceDispatch *flatview_to_dispatch(FlatView *fv); +void address_space_dispatch_free(AddressSpaceDispatch *d); + +void mtree_print_dispatch(fprintf_function mon, void *f, + struct AddressSpaceDispatch *d, + MemoryRegion *root); + #endif #endif diff --git a/include/exec/memory.h b/include/exec/memory.h index 1dcd3122d7..5ed4042f87 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -308,21 +308,18 @@ struct AddressSpace { struct rcu_head rcu; char *name; MemoryRegion *root; - int ref_count; - bool malloced; /* Accessed via RCU. */ struct FlatView *current_map; int ioeventfd_nb; struct MemoryRegionIoeventfd *ioeventfds; - struct AddressSpaceDispatch *dispatch; - struct AddressSpaceDispatch *next_dispatch; - MemoryListener dispatch_listener; QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners; QTAILQ_ENTRY(AddressSpace) address_spaces_link; }; +FlatView *address_space_to_flatview(AddressSpace *as); + /** * MemoryRegionSection: describes a fragment of a #MemoryRegion * @@ -336,7 +333,7 @@ struct AddressSpace { */ struct MemoryRegionSection { MemoryRegion *mr; - AddressSpace *address_space; + FlatView *fv; hwaddr offset_within_region; Int128 size; hwaddr offset_within_address_space; @@ -1515,7 +1512,8 @@ void memory_global_dirty_log_start(void); */ void memory_global_dirty_log_stop(void); -void mtree_info(fprintf_function mon_printf, void *f, bool flatview); +void mtree_info(fprintf_function mon_printf, void *f, bool flatview, + bool dispatch_tree); /** * memory_region_request_mmio_ptr: request a pointer to an mmio @@ -1585,23 +1583,6 @@ MemTxResult memory_region_dispatch_write(MemoryRegion *mr, void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name); /** - * address_space_init_shareable: return an address space for a memory region, - * creating it if it does not already exist - * - * @root: a #MemoryRegion that routes addresses for the address space - * @name: an address space name. The name is only used for debugging - * output. - * - * This function will return a pointer to an existing AddressSpace - * which was initialized with the specified MemoryRegion, or it will - * create and initialize one if it does not already exist. The ASes - * are reference-counted, so the memory will be freed automatically - * when the AddressSpace is destroyed via address_space_destroy. - */ -AddressSpace *address_space_init_shareable(MemoryRegion *root, - const char *name); - -/** * address_space_destroy: destroy an address space * * Releases all resources associated with an address space. After an address space @@ -1845,9 +1826,17 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, * @len: pointer to length * @is_write: indicates the transfer direction */ -MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr, - hwaddr *xlat, hwaddr *len, - bool is_write); +MemoryRegion *flatview_translate(FlatView *fv, + hwaddr addr, hwaddr *xlat, + hwaddr *len, bool is_write); + +static inline MemoryRegion *address_space_translate(AddressSpace *as, + hwaddr addr, hwaddr *xlat, + hwaddr *len, bool is_write) +{ + return flatview_translate(address_space_to_flatview(as), + addr, xlat, len, is_write); +} /* address_space_access_valid: check for validity of accessing an address * space range @@ -1898,12 +1887,13 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, /* Internal functions, part of the implementation of address_space_read. */ -MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, - int len, hwaddr addr1, hwaddr l, - MemoryRegion *mr); -MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, - MemTxAttrs attrs, uint8_t *buf, int len); +MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, + int len, hwaddr addr1, hwaddr l, + MemoryRegion *mr); + +MemTxResult flatview_read_full(FlatView *fv, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, int len); void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr); static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write) @@ -1930,8 +1920,8 @@ static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write) * @buf: buffer with the data transferred */ static inline __attribute__((__always_inline__)) -MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, - uint8_t *buf, int len) +MemTxResult flatview_read(FlatView *fv, hwaddr addr, MemTxAttrs attrs, + uint8_t *buf, int len) { MemTxResult result = MEMTX_OK; hwaddr l, addr1; @@ -1942,22 +1932,29 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, if (len) { rcu_read_lock(); l = len; - mr = address_space_translate(as, addr, &addr1, &l, false); + mr = flatview_translate(fv, addr, &addr1, &l, false); if (len == l && memory_access_is_direct(mr, false)) { ptr = qemu_map_ram_ptr(mr->ram_block, addr1); memcpy(buf, ptr, len); } else { - result = address_space_read_continue(as, addr, attrs, buf, len, - addr1, l, mr); + result = flatview_read_continue(fv, addr, attrs, buf, len, + addr1, l, mr); } rcu_read_unlock(); } } else { - result = address_space_read_full(as, addr, attrs, buf, len); + result = flatview_read_full(fv, addr, attrs, buf, len); } return result; } +static inline MemTxResult address_space_read(AddressSpace *as, hwaddr addr, + MemTxAttrs attrs, uint8_t *buf, + int len) +{ + return flatview_read(address_space_to_flatview(as), addr, attrs, buf, len); +} + /** * address_space_read_cached: read from a cached RAM region * diff --git a/include/hw/arm/armv7m.h b/include/hw/arm/armv7m.h index 9ad316c76e..35ab757264 100644 --- a/include/hw/arm/armv7m.h +++ b/include/hw/arm/armv7m.h @@ -21,7 +21,7 @@ typedef struct { SysBusDevice parent_obj; /*< public >*/ - AddressSpace *source_as; + AddressSpace source_as; MemoryRegion iomem; uint32_t base; MemoryRegion *source_memory; diff --git a/include/hw/virtio/virtio-serial.h b/include/hw/virtio/virtio-serial.h index b19c44727f..12657a9f39 100644 --- a/include/hw/virtio/virtio-serial.h +++ b/include/hw/virtio/virtio-serial.h @@ -58,6 +58,9 @@ typedef struct VirtIOSerialPortClass { /* Guest opened/closed device. */ void (*set_guest_connected)(VirtIOSerialPort *port, int guest_connected); + /* Enable/disable backend for virtio serial port */ + void (*enable_backend)(VirtIOSerialPort *port, bool enable); + /* Guest is now ready to accept data (virtqueues set up). */ void (*guest_ready)(VirtIOSerialPort *port); diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index b6b62fb771..d73c9e14d7 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -442,4 +442,12 @@ } while(0) #endif +#define atomic_fetch_inc_nonzero(ptr) ({ \ + typeof_strip_qual(*ptr) _oldn = atomic_read(ptr); \ + while (_oldn && atomic_cmpxchg(ptr, _oldn, _oldn + 1) != _oldn) { \ + _oldn = atomic_read(ptr); \ + } \ + _oldn; \ +}) + #endif /* QEMU_ATOMIC_H */ diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 163550214c..980d2b330e 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -30,6 +30,7 @@ typedef struct DisplaySurface DisplaySurface; typedef struct DriveInfo DriveInfo; typedef struct Error Error; typedef struct EventNotifier EventNotifier; +typedef struct FlatView FlatView; typedef struct FWCfgEntry FWCfgEntry; typedef struct FWCfgIoState FWCfgIoState; typedef struct FWCfgMemState FWCfgMemState; diff --git a/include/scsi/pr-manager.h b/include/scsi/pr-manager.h new file mode 100644 index 0000000000..b2b37d63bc --- /dev/null +++ b/include/scsi/pr-manager.h @@ -0,0 +1,56 @@ +#ifndef PR_MANAGER_H +#define PR_MANAGER_H + +#include "qom/object.h" +#include "qapi/qmp/qdict.h" +#include "qapi/visitor.h" +#include "qom/object_interfaces.h" +#include "block/aio.h" + +#define TYPE_PR_MANAGER "pr-manager" + +#define PR_MANAGER_CLASS(klass) \ + OBJECT_CLASS_CHECK(PRManagerClass, (klass), TYPE_PR_MANAGER) +#define PR_MANAGER_GET_CLASS(obj) \ + OBJECT_GET_CLASS(PRManagerClass, (obj), TYPE_PR_MANAGER) +#define PR_MANAGER(obj) \ + OBJECT_CHECK(PRManager, (obj), TYPE_PR_MANAGER) + +struct sg_io_hdr; + +typedef struct PRManager { + /* <private> */ + Object parent; +} PRManager; + +/** + * PRManagerClass: + * @parent_class: the base class + * @run: callback invoked in thread pool context + */ +typedef struct PRManagerClass { + /* <private> */ + ObjectClass parent_class; + + /* <public> */ + int (*run)(PRManager *pr_mgr, int fd, struct sg_io_hdr *hdr); +} PRManagerClass; + +BlockAIOCB *pr_manager_execute(PRManager *pr_mgr, + AioContext *ctx, int fd, + struct sg_io_hdr *hdr, + BlockCompletionFunc *complete, + void *opaque); + +#ifdef CONFIG_LINUX +PRManager *pr_manager_lookup(const char *id, Error **errp); +#else +static inline PRManager *pr_manager_lookup(const char *id, Error **errp) +{ + /* The classes do not exist at all! */ + error_setg(errp, "No persistent reservation manager with id '%s'", id); + return NULL; +} +#endif + +#endif diff --git a/include/scsi/utils.h b/include/scsi/utils.h index d301b31768..00a4bdb080 100644 --- a/include/scsi/utils.h +++ b/include/scsi/utils.h @@ -72,10 +72,14 @@ extern const struct SCSISense sense_code_IO_ERROR; extern const struct SCSISense sense_code_I_T_NEXUS_LOSS; /* Command aborted, Logical Unit failure */ extern const struct SCSISense sense_code_LUN_FAILURE; +/* Command aborted, LUN Communication failure */ +extern const struct SCSISense sense_code_LUN_COMM_FAILURE; /* Command aborted, Overlapped Commands Attempted */ extern const struct SCSISense sense_code_OVERLAPPED_COMMANDS; /* LUN not ready, Capacity data has changed */ extern const struct SCSISense sense_code_CAPACITY_CHANGED; +/* Unit attention, SCSI bus reset */ +extern const struct SCSISense sense_code_SCSI_BUS_RESET; /* LUN not ready, Medium not present */ extern const struct SCSISense sense_code_UNIT_ATTENTION_NO_MEDIUM; /* Unit attention, Power on, reset or bus device reset occurred */ @@ -47,6 +47,8 @@ static QTAILQ_HEAD(memory_listeners, MemoryListener) memory_listeners static QTAILQ_HEAD(, AddressSpace) address_spaces = QTAILQ_HEAD_INITIALIZER(address_spaces); +static GHashTable *flat_views; + typedef struct AddrRange AddrRange; /* @@ -154,7 +156,8 @@ enum ListenerDirection { Forward, Reverse }; /* No need to ref/unref .mr, the FlatRange keeps it alive. */ #define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...) \ do { \ - MemoryRegionSection mrs = section_from_flat_range(fr, as); \ + MemoryRegionSection mrs = section_from_flat_range(fr, \ + address_space_to_flatview(as)); \ MEMORY_LISTENER_CALL(as, callback, dir, &mrs, ##_args); \ } while(0) @@ -208,7 +211,6 @@ static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd a, } typedef struct FlatRange FlatRange; -typedef struct FlatView FlatView; /* Range of memory in the global map. Addresses are absolute. */ struct FlatRange { @@ -229,6 +231,8 @@ struct FlatView { FlatRange *ranges; unsigned nr; unsigned nr_allocated; + struct AddressSpaceDispatch *dispatch; + MemoryRegion *root; }; typedef struct AddressSpaceOps AddressSpaceOps; @@ -237,11 +241,11 @@ typedef struct AddressSpaceOps AddressSpaceOps; for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) static inline MemoryRegionSection -section_from_flat_range(FlatRange *fr, AddressSpace *as) +section_from_flat_range(FlatRange *fr, FlatView *fv) { return (MemoryRegionSection) { .mr = fr->mr, - .address_space = as, + .fv = fv, .offset_within_region = fr->offset_in_region, .size = fr->addr.size, .offset_within_address_space = int128_get64(fr->addr.start), @@ -258,12 +262,17 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b) && a->readonly == b->readonly; } -static void flatview_init(FlatView *view) +static FlatView *flatview_new(MemoryRegion *mr_root) { + FlatView *view; + + view = g_new0(FlatView, 1); view->ref = 1; - view->ranges = NULL; - view->nr = 0; - view->nr_allocated = 0; + view->root = mr_root; + memory_region_ref(mr_root); + trace_flatview_new(view, mr_root); + + return view; } /* Insert a range into a given position. Caller is responsible for maintaining @@ -287,25 +296,47 @@ static void flatview_destroy(FlatView *view) { int i; + trace_flatview_destroy(view, view->root); + if (view->dispatch) { + address_space_dispatch_free(view->dispatch); + } for (i = 0; i < view->nr; i++) { memory_region_unref(view->ranges[i].mr); } g_free(view->ranges); + memory_region_unref(view->root); g_free(view); } -static void flatview_ref(FlatView *view) +static bool flatview_ref(FlatView *view) { - atomic_inc(&view->ref); + return atomic_fetch_inc_nonzero(&view->ref) > 0; } static void flatview_unref(FlatView *view) { if (atomic_fetch_dec(&view->ref) == 1) { - flatview_destroy(view); + trace_flatview_destroy_rcu(view, view->root); + assert(view->root); + call_rcu(view, flatview_destroy, rcu); } } +FlatView *address_space_to_flatview(AddressSpace *as) +{ + return atomic_rcu_read(&as->current_map); +} + +AddressSpaceDispatch *flatview_to_dispatch(FlatView *fv) +{ + return fv->dispatch; +} + +AddressSpaceDispatch *address_space_to_dispatch(AddressSpace *as) +{ + return flatview_to_dispatch(address_space_to_flatview(as)); +} + static bool can_merge(FlatRange *r1, FlatRange *r2) { return int128_eq(addrrange_end(r1->addr), r2->addr.start) @@ -560,13 +591,14 @@ static MemTxResult access_with_adjusted_size(hwaddr addr, unsigned size, unsigned access_size_min, unsigned access_size_max, - MemTxResult (*access)(MemoryRegion *mr, - hwaddr addr, - uint64_t *value, - unsigned size, - unsigned shift, - uint64_t mask, - MemTxAttrs attrs), + MemTxResult (*access_fn) + (MemoryRegion *mr, + hwaddr addr, + uint64_t *value, + unsigned size, + unsigned shift, + uint64_t mask, + MemTxAttrs attrs), MemoryRegion *mr, MemTxAttrs attrs) { @@ -587,12 +619,12 @@ static MemTxResult access_with_adjusted_size(hwaddr addr, access_mask = -1ULL >> (64 - access_size * 8); if (memory_region_big_endian(mr)) { for (i = 0; i < size; i += access_size) { - r |= access(mr, addr + i, value, access_size, + r |= access_fn(mr, addr + i, value, access_size, (size - access_size - i) * 8, access_mask, attrs); } } else { for (i = 0; i < size; i += access_size) { - r |= access(mr, addr + i, value, access_size, i * 8, + r |= access_fn(mr, addr + i, value, access_size, i * 8, access_mask, attrs); } } @@ -701,13 +733,57 @@ static void render_memory_region(FlatView *view, } } +static MemoryRegion *memory_region_get_flatview_root(MemoryRegion *mr) +{ + while (mr->enabled) { + if (mr->alias) { + if (!mr->alias_offset && int128_ge(mr->size, mr->alias->size)) { + /* The alias is included in its entirety. Use it as + * the "real" root, so that we can share more FlatViews. + */ + mr = mr->alias; + continue; + } + } else if (!mr->terminates) { + unsigned int found = 0; + MemoryRegion *child, *next = NULL; + QTAILQ_FOREACH(child, &mr->subregions, subregions_link) { + if (child->enabled) { + if (++found > 1) { + next = NULL; + break; + } + if (!child->addr && int128_ge(mr->size, child->size)) { + /* A child is included in its entirety. If it's the only + * enabled one, use it in the hope of finding an alias down the + * way. This will also let us share FlatViews. + */ + next = child; + } + } + } + if (found == 0) { + return NULL; + } + if (next) { + mr = next; + continue; + } + } + + return mr; + } + + return NULL; +} + /* Render a memory topology into a list of disjoint absolute ranges. */ static FlatView *generate_memory_topology(MemoryRegion *mr) { + int i; FlatView *view; - view = g_new(FlatView, 1); - flatview_init(view); + view = flatview_new(mr); if (mr) { render_memory_region(view, mr, int128_zero(), @@ -715,6 +791,15 @@ static FlatView *generate_memory_topology(MemoryRegion *mr) } flatview_simplify(view); + view->dispatch = address_space_dispatch_new(view); + for (i = 0; i < view->nr; i++) { + MemoryRegionSection mrs = + section_from_flat_range(&view->ranges[i], view); + flatview_add_to_dispatch(view, &mrs); + } + address_space_dispatch_compact(view->dispatch); + g_hash_table_replace(flat_views, mr, view); + return view; } @@ -740,7 +825,7 @@ static void address_space_add_del_ioeventfds(AddressSpace *as, fds_new[inew]))) { fd = &fds_old[iold]; section = (MemoryRegionSection) { - .address_space = as, + .fv = address_space_to_flatview(as), .offset_within_address_space = int128_get64(fd->addr.start), .size = fd->addr.size, }; @@ -753,7 +838,7 @@ static void address_space_add_del_ioeventfds(AddressSpace *as, fds_old[iold]))) { fd = &fds_new[inew]; section = (MemoryRegionSection) { - .address_space = as, + .fv = address_space_to_flatview(as), .offset_within_address_space = int128_get64(fd->addr.start), .size = fd->addr.size, }; @@ -772,8 +857,12 @@ static FlatView *address_space_get_flatview(AddressSpace *as) FlatView *view; rcu_read_lock(); - view = atomic_rcu_read(&as->current_map); - flatview_ref(view); + do { + view = address_space_to_flatview(as); + /* If somebody has replaced as->current_map concurrently, + * flatview_ref returns false. + */ + } while (!flatview_ref(view)); rcu_read_unlock(); return view; } @@ -879,18 +968,81 @@ static void address_space_update_topology_pass(AddressSpace *as, } } +static void flatviews_init(void) +{ + static FlatView *empty_view; + + if (flat_views) { + return; + } -static void address_space_update_topology(AddressSpace *as) + flat_views = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, + (GDestroyNotify) flatview_unref); + if (!empty_view) { + empty_view = generate_memory_topology(NULL); + /* We keep it alive forever in the global variable. */ + flatview_ref(empty_view); + } else { + g_hash_table_replace(flat_views, NULL, empty_view); + flatview_ref(empty_view); + } +} + +static void flatviews_reset(void) +{ + AddressSpace *as; + + if (flat_views) { + g_hash_table_unref(flat_views); + flat_views = NULL; + } + flatviews_init(); + + /* Render unique FVs */ + QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + + if (g_hash_table_lookup(flat_views, physmr)) { + continue; + } + + generate_memory_topology(physmr); + } +} + +static void address_space_set_flatview(AddressSpace *as) { - FlatView *old_view = address_space_get_flatview(as); - FlatView *new_view = generate_memory_topology(as->root); + FlatView *old_view = address_space_to_flatview(as); + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + FlatView *new_view = g_hash_table_lookup(flat_views, physmr); - address_space_update_topology_pass(as, old_view, new_view, false); - address_space_update_topology_pass(as, old_view, new_view, true); + assert(new_view); + + if (old_view == new_view) { + return; + } + + if (old_view) { + flatview_ref(old_view); + } + + flatview_ref(new_view); + + if (!QTAILQ_EMPTY(&as->listeners)) { + FlatView tmpview = { .nr = 0 }, *old_view2 = old_view; + + if (!old_view2) { + old_view2 = &tmpview; + } + address_space_update_topology_pass(as, old_view2, new_view, false); + address_space_update_topology_pass(as, old_view2, new_view, true); + } /* Writes are protected by the BQL. */ atomic_rcu_set(&as->current_map, new_view); - call_rcu(old_view, flatview_unref, rcu); + if (old_view) { + flatview_unref(old_view); + } /* Note that all the old MemoryRegions are still alive up to this * point. This relieves most MemoryListeners from the need to @@ -898,9 +1050,20 @@ static void address_space_update_topology(AddressSpace *as) * outside the iothread mutex, in which case precise reference * counting is necessary. */ - flatview_unref(old_view); + if (old_view) { + flatview_unref(old_view); + } +} - address_space_update_ioeventfds(as); +static void address_space_update_topology(AddressSpace *as) +{ + MemoryRegion *physmr = memory_region_get_flatview_root(as->root); + + flatviews_init(); + if (!g_hash_table_lookup(flat_views, physmr)) { + generate_memory_topology(physmr); + } + address_space_set_flatview(as); } void memory_region_transaction_begin(void) @@ -919,10 +1082,13 @@ void memory_region_transaction_commit(void) --memory_region_transaction_depth; if (!memory_region_transaction_depth) { if (memory_region_update_pending) { + flatviews_reset(); + MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - address_space_update_topology(as); + address_space_set_flatview(as); + address_space_update_ioeventfds(as); } memory_region_update_pending = false; MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); @@ -1835,7 +2001,7 @@ void memory_region_sync_dirty_bitmap(MemoryRegion *mr) view = address_space_get_flatview(as); FOR_EACH_FLAT_RANGE(fr, view) { if (fr->mr == mr) { - MemoryRegionSection mrs = section_from_flat_range(fr, as); + MemoryRegionSection mrs = section_from_flat_range(fr, view); listener->log_sync(listener, &mrs); } } @@ -1938,7 +2104,7 @@ static void memory_region_update_coalesced_range_as(MemoryRegion *mr, AddressSpa FOR_EACH_FLAT_RANGE(fr, view) { if (fr->mr == mr) { section = (MemoryRegionSection) { - .address_space = as, + .fv = view, .offset_within_address_space = int128_get64(fr->addr.start), .size = fr->addr.size, }; @@ -2289,7 +2455,7 @@ static MemoryRegionSection memory_region_find_rcu(MemoryRegion *mr, } range = addrrange_make(int128_make64(addr), int128_make64(size)); - view = atomic_rcu_read(&as->current_map); + view = address_space_to_flatview(as); fr = flatview_lookup(view, range); if (!fr) { return ret; @@ -2300,7 +2466,7 @@ static MemoryRegionSection memory_region_find_rcu(MemoryRegion *mr, } ret.mr = fr->mr; - ret.address_space = as; + ret.fv = view; range = addrrange_intersection(range, fr->addr); ret.offset_within_region = fr->offset_in_region; ret.offset_within_region += int128_get64(int128_sub(range.start, @@ -2349,7 +2515,8 @@ void memory_global_dirty_log_sync(void) view = address_space_get_flatview(as); FOR_EACH_FLAT_RANGE(fr, view) { if (fr->dirty_log_mask) { - MemoryRegionSection mrs = section_from_flat_range(fr, as); + MemoryRegionSection mrs = section_from_flat_range(fr, view); + listener->log_sync(listener, &mrs); } } @@ -2434,7 +2601,7 @@ static void listener_add_address_space(MemoryListener *listener, FOR_EACH_FLAT_RANGE(fr, view) { MemoryRegionSection section = { .mr = fr->mr, - .address_space = as, + .fv = view, .offset_within_region = fr->offset_in_region, .size = fr->addr.size, .offset_within_address_space = int128_get64(fr->addr.start), @@ -2610,69 +2777,36 @@ void memory_region_invalidate_mmio_ptr(MemoryRegion *mr, hwaddr offset, void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) { memory_region_ref(root); - memory_region_transaction_begin(); - as->ref_count = 1; as->root = root; - as->malloced = false; - as->current_map = g_new(FlatView, 1); - flatview_init(as->current_map); + as->current_map = NULL; as->ioeventfd_nb = 0; as->ioeventfds = NULL; QTAILQ_INIT(&as->listeners); QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); as->name = g_strdup(name ? name : "anonymous"); - address_space_init_dispatch(as); - memory_region_update_pending |= root->enabled; - memory_region_transaction_commit(); + address_space_update_topology(as); + address_space_update_ioeventfds(as); } static void do_address_space_destroy(AddressSpace *as) { - bool do_free = as->malloced; - - address_space_destroy_dispatch(as); assert(QTAILQ_EMPTY(&as->listeners)); flatview_unref(as->current_map); g_free(as->name); g_free(as->ioeventfds); memory_region_unref(as->root); - if (do_free) { - g_free(as); - } -} - -AddressSpace *address_space_init_shareable(MemoryRegion *root, const char *name) -{ - AddressSpace *as; - - QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - if (root == as->root && as->malloced) { - as->ref_count++; - return as; - } - } - - as = g_malloc0(sizeof *as); - address_space_init(as, root, name); - as->malloced = true; - return as; } void address_space_destroy(AddressSpace *as) { MemoryRegion *root = as->root; - as->ref_count--; - if (as->ref_count) { - return; - } /* Flush out anything from MemoryListeners listening in on this */ memory_region_transaction_begin(); as->root = NULL; memory_region_transaction_commit(); QTAILQ_REMOVE(&address_spaces, as, address_spaces_link); - address_space_unregister(as); /* At this point, as->dispatch and as->current_map are dummy * entries that the guest should never use. Wait for the old @@ -2807,18 +2941,44 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f, } } -static void mtree_print_flatview(fprintf_function p, void *f, - AddressSpace *as) +struct FlatViewInfo { + fprintf_function mon_printf; + void *f; + int counter; + bool dispatch_tree; +}; + +static void mtree_print_flatview(gpointer key, gpointer value, + gpointer user_data) { - FlatView *view = address_space_get_flatview(as); + FlatView *view = key; + GArray *fv_address_spaces = value; + struct FlatViewInfo *fvi = user_data; + fprintf_function p = fvi->mon_printf; + void *f = fvi->f; FlatRange *range = &view->ranges[0]; MemoryRegion *mr; int n = view->nr; + int i; + AddressSpace *as; + + p(f, "FlatView #%d\n", fvi->counter); + ++fvi->counter; + + for (i = 0; i < fv_address_spaces->len; ++i) { + as = g_array_index(fv_address_spaces, AddressSpace*, i); + p(f, " AS \"%s\", root: %s", as->name, memory_region_name(as->root)); + if (as->root->alias) { + p(f, ", alias %s", memory_region_name(as->root->alias)); + } + p(f, "\n"); + } + + p(f, " Root memory region: %s\n", + view->root ? memory_region_name(view->root) : "(none)"); if (n <= 0) { - p(f, MTREE_INDENT "No rendered FlatView for " - "address space '%s'\n", as->name); - flatview_unref(view); + p(f, MTREE_INDENT "No rendered FlatView\n\n"); return; } @@ -2845,21 +3005,65 @@ static void mtree_print_flatview(fprintf_function p, void *f, range++; } +#if !defined(CONFIG_USER_ONLY) + if (fvi->dispatch_tree && view->root) { + mtree_print_dispatch(p, f, view->dispatch, view->root); + } +#endif + + p(f, "\n"); +} + +static gboolean mtree_info_flatview_free(gpointer key, gpointer value, + gpointer user_data) +{ + FlatView *view = key; + GArray *fv_address_spaces = value; + + g_array_unref(fv_address_spaces); flatview_unref(view); + + return true; } -void mtree_info(fprintf_function mon_printf, void *f, bool flatview) +void mtree_info(fprintf_function mon_printf, void *f, bool flatview, + bool dispatch_tree) { MemoryRegionListHead ml_head; MemoryRegionList *ml, *ml2; AddressSpace *as; if (flatview) { + FlatView *view; + struct FlatViewInfo fvi = { + .mon_printf = mon_printf, + .f = f, + .counter = 0, + .dispatch_tree = dispatch_tree + }; + GArray *fv_address_spaces; + GHashTable *views = g_hash_table_new(g_direct_hash, g_direct_equal); + + /* Gather all FVs in one table */ QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { - mon_printf(f, "address-space (flat view): %s\n", as->name); - mtree_print_flatview(mon_printf, f, as); - mon_printf(f, "\n"); + view = address_space_get_flatview(as); + + fv_address_spaces = g_hash_table_lookup(views, view); + if (!fv_address_spaces) { + fv_address_spaces = g_array_new(false, false, sizeof(as)); + g_hash_table_insert(views, view, fv_address_spaces); + } + + g_array_append_val(fv_address_spaces, as); } + + /* Print */ + g_hash_table_foreach(views, mtree_print_flatview, &fvi); + + /* Free */ + g_hash_table_foreach_remove(views, mtree_info_flatview_free, 0); + g_hash_table_unref(views); + return; } @@ -1703,8 +1703,9 @@ static void hmp_boot_set(Monitor *mon, const QDict *qdict) static void hmp_info_mtree(Monitor *mon, const QDict *qdict) { bool flatview = qdict_get_try_bool(qdict, "flatview", false); + bool dispatch_tree = qdict_get_try_bool(qdict, "dispatch_tree", false); - mtree_info((fprintf_function)monitor_printf, mon, flatview); + mtree_info((fprintf_function)monitor_printf, mon, flatview, dispatch_tree); } static void hmp_info_numa(Monitor *mon, const QDict *qdict) diff --git a/qapi/block-core.json b/qapi/block-core.json index bb11815608..c69a395804 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2241,6 +2241,9 @@ # Driver specific block device options for the file backend. # # @filename: path to the image file +# @pr-manager: the id for the object that will handle persistent reservations +# for this device (default: none, forward the commands via SG_IO; +# since 2.11) # @aio: AIO backend (default: threads) (since: 2.8) # @locking: whether to enable file locking. If set to 'auto', only enable # when Open File Descriptor (OFD) locking API is available @@ -2250,6 +2253,7 @@ ## { 'struct': 'BlockdevOptionsFile', 'data': { 'filename': 'str', + '*pr-manager': 'str', '*locking': 'OnOffAuto', '*aio': 'BlockdevAioOptions' } } diff --git a/scsi/Makefile.objs b/scsi/Makefile.objs index 31b82a5a36..4d25e476cf 100644 --- a/scsi/Makefile.objs +++ b/scsi/Makefile.objs @@ -1 +1,3 @@ block-obj-y += utils.o + +block-obj-$(CONFIG_LINUX) += pr-manager.o pr-manager-helper.o diff --git a/scsi/pr-helper.h b/scsi/pr-helper.h new file mode 100644 index 0000000000..96c50a9e5f --- /dev/null +++ b/scsi/pr-helper.h @@ -0,0 +1,41 @@ +/* Definitions for QEMU's persistent reservation helper daemon + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: + * Paolo Bonzini <pbonzini@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifndef QEMU_PR_HELPER_H +#define QEMU_PR_HELPER_H 1 + +#include <stdint.h> + +#define PR_HELPER_CDB_SIZE 16 +#define PR_HELPER_SENSE_SIZE 96 +#define PR_HELPER_DATA_SIZE 8192 + +typedef struct PRHelperResponse { + int32_t result; + int32_t sz; + uint8_t sense[PR_HELPER_SENSE_SIZE]; +} PRHelperResponse; + +#endif diff --git a/scsi/pr-manager-helper.c b/scsi/pr-manager-helper.c new file mode 100644 index 0000000000..82ff6b6123 --- /dev/null +++ b/scsi/pr-manager-helper.c @@ -0,0 +1,302 @@ +/* + * Persistent reservation manager that talks to qemu-pr-helper + * + * Copyright (c) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This code is licensed under the LGPL v2.1 or later. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "scsi/constants.h" +#include "scsi/pr-manager.h" +#include "scsi/utils.h" +#include "io/channel.h" +#include "io/channel-socket.h" +#include "pr-helper.h" + +#include <scsi/sg.h> + +#define PR_MAX_RECONNECT_ATTEMPTS 5 + +#define TYPE_PR_MANAGER_HELPER "pr-manager-helper" + +#define PR_MANAGER_HELPER(obj) \ + OBJECT_CHECK(PRManagerHelper, (obj), \ + TYPE_PR_MANAGER_HELPER) + +typedef struct PRManagerHelper { + /* <private> */ + PRManager parent; + + char *path; + + QemuMutex lock; + QIOChannel *ioc; +} PRManagerHelper; + +/* Called with lock held. */ +static int pr_manager_helper_read(PRManagerHelper *pr_mgr, + void *buf, int sz, Error **errp) +{ + ssize_t r = qio_channel_read_all(pr_mgr->ioc, buf, sz, errp); + + if (r < 0) { + object_unref(OBJECT(pr_mgr->ioc)); + pr_mgr->ioc = NULL; + return -EINVAL; + } + + return 0; +} + +/* Called with lock held. */ +static int pr_manager_helper_write(PRManagerHelper *pr_mgr, + int fd, + const void *buf, int sz, Error **errp) +{ + size_t nfds = (fd != -1); + while (sz > 0) { + struct iovec iov; + ssize_t n_written; + + iov.iov_base = (void *)buf; + iov.iov_len = sz; + n_written = qio_channel_writev_full(QIO_CHANNEL(pr_mgr->ioc), &iov, 1, + nfds ? &fd : NULL, nfds, errp); + + if (n_written <= 0) { + assert(n_written != QIO_CHANNEL_ERR_BLOCK); + object_unref(OBJECT(pr_mgr->ioc)); + return n_written < 0 ? -EINVAL : 0; + } + + nfds = 0; + buf += n_written; + sz -= n_written; + } + + return 0; +} + +/* Called with lock held. */ +static int pr_manager_helper_initialize(PRManagerHelper *pr_mgr, + Error **errp) +{ + char *path = g_strdup(pr_mgr->path); + SocketAddress saddr = { + .type = SOCKET_ADDRESS_TYPE_UNIX, + .u.q_unix.path = path + }; + QIOChannelSocket *sioc = qio_channel_socket_new(); + Error *local_err = NULL; + + uint32_t flags; + int r; + + assert(!pr_mgr->ioc); + qio_channel_set_name(QIO_CHANNEL(sioc), "pr-manager-helper"); + qio_channel_socket_connect_sync(sioc, + &saddr, + &local_err); + g_free(path); + if (local_err) { + object_unref(OBJECT(sioc)); + error_propagate(errp, local_err); + return -ENOTCONN; + } + + qio_channel_set_delay(QIO_CHANNEL(sioc), false); + pr_mgr->ioc = QIO_CHANNEL(sioc); + + /* A simple feature negotation protocol, even though there is + * no optional feature right now. + */ + r = pr_manager_helper_read(pr_mgr, &flags, sizeof(flags), errp); + if (r < 0) { + goto out_close; + } + + flags = 0; + r = pr_manager_helper_write(pr_mgr, -1, &flags, sizeof(flags), errp); + if (r < 0) { + goto out_close; + } + + return 0; + +out_close: + object_unref(OBJECT(pr_mgr->ioc)); + pr_mgr->ioc = NULL; + return r; +} + +static int pr_manager_helper_run(PRManager *p, + int fd, struct sg_io_hdr *io_hdr) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(p); + + uint32_t len; + PRHelperResponse resp; + int ret; + int expected_dir; + int attempts; + uint8_t cdb[PR_HELPER_CDB_SIZE] = { 0 }; + + if (!io_hdr->cmd_len || io_hdr->cmd_len > PR_HELPER_CDB_SIZE) { + return -EINVAL; + } + + memcpy(cdb, io_hdr->cmdp, io_hdr->cmd_len); + assert(cdb[0] == PERSISTENT_RESERVE_OUT || cdb[0] == PERSISTENT_RESERVE_IN); + expected_dir = + (cdb[0] == PERSISTENT_RESERVE_OUT ? SG_DXFER_TO_DEV : SG_DXFER_FROM_DEV); + if (io_hdr->dxfer_direction != expected_dir) { + return -EINVAL; + } + + len = scsi_cdb_xfer(cdb); + if (io_hdr->dxfer_len < len || len > PR_HELPER_DATA_SIZE) { + return -EINVAL; + } + + qemu_mutex_lock(&pr_mgr->lock); + + /* Try to reconnect while sending the CDB. */ + for (attempts = 0; attempts < PR_MAX_RECONNECT_ATTEMPTS; attempts++) { + if (!pr_mgr->ioc) { + ret = pr_manager_helper_initialize(pr_mgr, NULL); + if (ret < 0) { + qemu_mutex_unlock(&pr_mgr->lock); + g_usleep(G_USEC_PER_SEC); + qemu_mutex_lock(&pr_mgr->lock); + continue; + } + } + + ret = pr_manager_helper_write(pr_mgr, fd, cdb, ARRAY_SIZE(cdb), NULL); + if (ret >= 0) { + break; + } + } + if (ret < 0) { + goto out; + } + + /* After sending the CDB, any communications failure causes the + * command to fail. The failure is transient, retrying the command + * will invoke pr_manager_helper_initialize again. + */ + if (expected_dir == SG_DXFER_TO_DEV) { + io_hdr->resid = io_hdr->dxfer_len - len; + ret = pr_manager_helper_write(pr_mgr, -1, io_hdr->dxferp, len, NULL); + if (ret < 0) { + goto out; + } + } + ret = pr_manager_helper_read(pr_mgr, &resp, sizeof(resp), NULL); + if (ret < 0) { + goto out; + } + + resp.result = be32_to_cpu(resp.result); + resp.sz = be32_to_cpu(resp.sz); + if (io_hdr->dxfer_direction == SG_DXFER_FROM_DEV) { + assert(resp.sz <= io_hdr->dxfer_len); + ret = pr_manager_helper_read(pr_mgr, io_hdr->dxferp, resp.sz, NULL); + if (ret < 0) { + goto out; + } + io_hdr->resid = io_hdr->dxfer_len - resp.sz; + } else { + assert(resp.sz == 0); + } + + io_hdr->status = resp.result; + if (resp.result == CHECK_CONDITION) { + io_hdr->driver_status = SG_ERR_DRIVER_SENSE; + io_hdr->sb_len_wr = MIN(io_hdr->mx_sb_len, PR_HELPER_SENSE_SIZE); + memcpy(io_hdr->sbp, resp.sense, io_hdr->sb_len_wr); + } + +out: + if (ret < 0) { + int sense_len = scsi_build_sense(io_hdr->sbp, + SENSE_CODE(LUN_COMM_FAILURE)); + io_hdr->driver_status = SG_ERR_DRIVER_SENSE; + io_hdr->sb_len_wr = MIN(io_hdr->mx_sb_len, sense_len); + io_hdr->status = CHECK_CONDITION; + } + qemu_mutex_unlock(&pr_mgr->lock); + return ret; +} + +static void pr_manager_helper_complete(UserCreatable *uc, Error **errp) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(uc); + + qemu_mutex_lock(&pr_mgr->lock); + pr_manager_helper_initialize(pr_mgr, errp); + qemu_mutex_unlock(&pr_mgr->lock); +} + +static char *get_path(Object *obj, Error **errp) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(obj); + + return g_strdup(pr_mgr->path); +} + +static void set_path(Object *obj, const char *str, Error **errp) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(obj); + + g_free(pr_mgr->path); + pr_mgr->path = g_strdup(str); +} + +static void pr_manager_helper_instance_finalize(Object *obj) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(obj); + + object_unref(OBJECT(pr_mgr->ioc)); + qemu_mutex_destroy(&pr_mgr->lock); +} + +static void pr_manager_helper_instance_init(Object *obj) +{ + PRManagerHelper *pr_mgr = PR_MANAGER_HELPER(obj); + + qemu_mutex_init(&pr_mgr->lock); +} + +static void pr_manager_helper_class_init(ObjectClass *klass, + void *class_data G_GNUC_UNUSED) +{ + PRManagerClass *prmgr_klass = PR_MANAGER_CLASS(klass); + UserCreatableClass *uc_klass = USER_CREATABLE_CLASS(klass); + + object_class_property_add_str(klass, "path", get_path, set_path, + &error_abort); + uc_klass->complete = pr_manager_helper_complete; + prmgr_klass->run = pr_manager_helper_run; +} + +static const TypeInfo pr_manager_helper_info = { + .parent = TYPE_PR_MANAGER, + .name = TYPE_PR_MANAGER_HELPER, + .instance_size = sizeof(PRManagerHelper), + .instance_init = pr_manager_helper_instance_init, + .instance_finalize = pr_manager_helper_instance_finalize, + .class_init = pr_manager_helper_class_init, +}; + +static void pr_manager_helper_register_types(void) +{ + type_register_static(&pr_manager_helper_info); +} + +type_init(pr_manager_helper_register_types); diff --git a/scsi/pr-manager.c b/scsi/pr-manager.c new file mode 100644 index 0000000000..87c45db5d4 --- /dev/null +++ b/scsi/pr-manager.c @@ -0,0 +1,109 @@ +/* + * Persistent reservation manager abstract class + * + * Copyright (c) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This code is licensed under the LGPL. + * + */ + +#include "qemu/osdep.h" +#include <scsi/sg.h> + +#include "qapi/error.h" +#include "block/aio.h" +#include "block/thread-pool.h" +#include "scsi/pr-manager.h" +#include "trace.h" + +typedef struct PRManagerData { + PRManager *pr_mgr; + struct sg_io_hdr *hdr; + int fd; +} PRManagerData; + +static int pr_manager_worker(void *opaque) +{ + PRManagerData *data = opaque; + PRManager *pr_mgr = data->pr_mgr; + PRManagerClass *pr_mgr_class = + PR_MANAGER_GET_CLASS(pr_mgr); + struct sg_io_hdr *hdr = data->hdr; + int fd = data->fd; + int r; + + g_free(data); + trace_pr_manager_run(fd, hdr->cmdp[0], hdr->cmdp[1]); + + /* The reference was taken in pr_manager_execute. */ + r = pr_mgr_class->run(pr_mgr, fd, hdr); + object_unref(OBJECT(pr_mgr)); + return r; +} + + +BlockAIOCB *pr_manager_execute(PRManager *pr_mgr, + AioContext *ctx, int fd, + struct sg_io_hdr *hdr, + BlockCompletionFunc *complete, + void *opaque) +{ + PRManagerData *data = g_new(PRManagerData, 1); + ThreadPool *pool = aio_get_thread_pool(ctx); + + trace_pr_manager_execute(fd, hdr->cmdp[0], hdr->cmdp[1], opaque); + data->pr_mgr = pr_mgr; + data->fd = fd; + data->hdr = hdr; + + /* The matching object_unref is in pr_manager_worker. */ + object_ref(OBJECT(pr_mgr)); + return thread_pool_submit_aio(pool, pr_manager_worker, + data, complete, opaque); +} + +static const TypeInfo pr_manager_info = { + .parent = TYPE_OBJECT, + .name = TYPE_PR_MANAGER, + .class_size = sizeof(PRManagerClass), + .abstract = true, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +PRManager *pr_manager_lookup(const char *id, Error **errp) +{ + Object *obj; + PRManager *pr_mgr; + + obj = object_resolve_path_component(object_get_objects_root(), id); + if (!obj) { + error_setg(errp, "No persistent reservation manager with id '%s'", id); + return NULL; + } + + pr_mgr = (PRManager *) + object_dynamic_cast(obj, + TYPE_PR_MANAGER); + if (!pr_mgr) { + error_setg(errp, + "Object with id '%s' is not a persistent reservation manager", + id); + return NULL; + } + + return pr_mgr; +} + +static void +pr_manager_register_types(void) +{ + type_register_static(&pr_manager_info); +} + + +type_init(pr_manager_register_types); diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c new file mode 100644 index 0000000000..d58184833f --- /dev/null +++ b/scsi/qemu-pr-helper.c @@ -0,0 +1,1075 @@ +/* + * Privileged helper to handle persistent reservation commands for QEMU + * + * Copyright (C) 2017 Red Hat, Inc. <pbonzini@redhat.com> + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include <getopt.h> +#include <sys/ioctl.h> +#include <linux/dm-ioctl.h> +#include <scsi/sg.h> + +#ifdef CONFIG_LIBCAP +#include <cap-ng.h> +#endif +#include <pwd.h> +#include <grp.h> + +#ifdef CONFIG_MPATH +#include <libudev.h> +#include <mpath_cmd.h> +#include <mpath_persist.h> +#endif + +#include "qapi/error.h" +#include "qemu-common.h" +#include "qemu/cutils.h" +#include "qemu/main-loop.h" +#include "qemu/error-report.h" +#include "qemu/config-file.h" +#include "qemu/bswap.h" +#include "qemu/log.h" +#include "qemu/systemd.h" +#include "qapi/util.h" +#include "qapi/qmp/qstring.h" +#include "io/channel-socket.h" +#include "trace/control.h" +#include "qemu-version.h" + +#include "block/aio.h" +#include "block/thread-pool.h" + +#include "scsi/constants.h" +#include "scsi/utils.h" +#include "pr-helper.h" + +#define PR_OUT_FIXED_PARAM_SIZE 24 + +static char *socket_path; +static char *pidfile; +static enum { RUNNING, TERMINATE, TERMINATING } state; +static QIOChannelSocket *server_ioc; +static int server_watch; +static int num_active_sockets = 1; +static int noisy; +static int verbose; + +#ifdef CONFIG_LIBCAP +static int uid = -1; +static int gid = -1; +#endif + +static void usage(const char *name) +{ + (printf) ( +"Usage: %s [OPTIONS] FILE\n" +"Persistent Reservation helper program for QEMU\n" +"\n" +" -h, --help display this help and exit\n" +" -V, --version output version information and exit\n" +"\n" +" -d, --daemon run in the background\n" +" -f, --pidfile=PATH PID file when running as a daemon\n" +" (default '%s')\n" +" -k, --socket=PATH path to the unix socket\n" +" (default '%s')\n" +" -T, --trace [[enable=]<pattern>][,events=<file>][,file=<file>]\n" +" specify tracing options\n" +#ifdef CONFIG_LIBCAP +" -u, --user=USER user to drop privileges to\n" +" -g, --group=GROUP group to drop privileges to\n" +#endif +"\n" +QEMU_HELP_BOTTOM "\n" + , name, pidfile, socket_path); +} + +static void version(const char *name) +{ + printf( +"%s " QEMU_VERSION QEMU_PKGVERSION "\n" +"Written by Paolo Bonzini.\n" +"\n" +QEMU_COPYRIGHT "\n" +"This is free software; see the source for copying conditions. There is NO\n" +"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n" + , name); +} + +static void write_pidfile(void) +{ + int pidfd; + char pidstr[32]; + + pidfd = qemu_open(pidfile, O_CREAT|O_WRONLY, S_IRUSR|S_IWUSR); + if (pidfd == -1) { + error_report("Cannot open pid file, %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + if (lockf(pidfd, F_TLOCK, 0)) { + error_report("Cannot lock pid file, %s", strerror(errno)); + goto fail; + } + if (ftruncate(pidfd, 0)) { + error_report("Failed to truncate pid file"); + goto fail; + } + + snprintf(pidstr, sizeof(pidstr), "%d\n", getpid()); + if (write(pidfd, pidstr, strlen(pidstr)) != strlen(pidstr)) { + error_report("Failed to write pid file"); + goto fail; + } + return; + +fail: + unlink(pidfile); + close(pidfd); + exit(EXIT_FAILURE); +} + +/* SG_IO support */ + +typedef struct PRHelperSGIOData { + int fd; + const uint8_t *cdb; + uint8_t *sense; + uint8_t *buf; + int sz; /* input/output */ + int dir; +} PRHelperSGIOData; + +static int do_sgio_worker(void *opaque) +{ + PRHelperSGIOData *data = opaque; + struct sg_io_hdr io_hdr; + int ret; + int status; + SCSISense sense_code; + + memset(data->sense, 0, PR_HELPER_SENSE_SIZE); + memset(&io_hdr, 0, sizeof(io_hdr)); + io_hdr.interface_id = 'S'; + io_hdr.cmd_len = PR_HELPER_CDB_SIZE; + io_hdr.cmdp = (uint8_t *)data->cdb; + io_hdr.sbp = data->sense; + io_hdr.mx_sb_len = PR_HELPER_SENSE_SIZE; + io_hdr.timeout = 1; + io_hdr.dxfer_direction = data->dir; + io_hdr.dxferp = (char *)data->buf; + io_hdr.dxfer_len = data->sz; + ret = ioctl(data->fd, SG_IO, &io_hdr); + status = sg_io_sense_from_errno(ret < 0 ? errno : 0, &io_hdr, + &sense_code); + if (status == GOOD) { + data->sz -= io_hdr.resid; + } else { + data->sz = 0; + } + + if (status == CHECK_CONDITION && + !(io_hdr.driver_status & SG_ERR_DRIVER_SENSE)) { + scsi_build_sense(data->sense, sense_code); + } + + return status; +} + +static int do_sgio(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *buf, int *sz, int dir) +{ + ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); + int r; + + PRHelperSGIOData data = { + .fd = fd, + .cdb = cdb, + .sense = sense, + .buf = buf, + .sz = *sz, + .dir = dir, + }; + + r = thread_pool_submit_co(pool, do_sgio_worker, &data); + *sz = data.sz; + return r; +} + +/* Device mapper interface */ + +#ifdef CONFIG_MPATH +#define CONTROL_PATH "/dev/mapper/control" + +typedef struct DMData { + struct dm_ioctl dm; + uint8_t data[1024]; +} DMData; + +static int control_fd; + +static void *dm_ioctl(int ioc, struct dm_ioctl *dm) +{ + static DMData d; + memcpy(&d.dm, dm, sizeof(d.dm)); + QEMU_BUILD_BUG_ON(sizeof(d.data) < sizeof(struct dm_target_spec)); + + d.dm.version[0] = DM_VERSION_MAJOR; + d.dm.version[1] = 0; + d.dm.version[2] = 0; + d.dm.data_size = 1024; + d.dm.data_start = offsetof(DMData, data); + if (ioctl(control_fd, ioc, &d) < 0) { + return NULL; + } + memcpy(dm, &d.dm, sizeof(d.dm)); + return &d.data; +} + +static void *dm_dev_ioctl(int fd, int ioc, struct dm_ioctl *dm) +{ + struct stat st; + int r; + + r = fstat(fd, &st); + if (r < 0) { + perror("fstat"); + exit(1); + } + + dm->dev = st.st_rdev; + return dm_ioctl(ioc, dm); +} + +static void dm_init(void) +{ + control_fd = open(CONTROL_PATH, O_RDWR); + if (control_fd < 0) { + perror("Cannot open " CONTROL_PATH); + exit(1); + } + struct dm_ioctl dm = { 0 }; + if (!dm_ioctl(DM_VERSION, &dm)) { + perror("ioctl"); + exit(1); + } + if (dm.version[0] != DM_VERSION_MAJOR) { + fprintf(stderr, "Unsupported device mapper interface"); + exit(1); + } +} + +/* Variables required by libmultipath and libmpathpersist. */ +QEMU_BUILD_BUG_ON(PR_HELPER_DATA_SIZE > MPATH_MAX_PARAM_LEN); +unsigned mpath_mx_alloc_len = PR_HELPER_DATA_SIZE; +int logsink; + +static void multipath_pr_init(void) +{ + static struct udev *udev; + + udev = udev_new(); + mpath_lib_init(udev); +} + +static int is_mpath(int fd) +{ + struct dm_ioctl dm = { .flags = DM_NOFLUSH_FLAG }; + struct dm_target_spec *tgt; + + tgt = dm_dev_ioctl(fd, DM_TABLE_STATUS, &dm); + if (!tgt) { + if (errno == ENXIO) { + return 0; + } + perror("ioctl"); + exit(EXIT_FAILURE); + } + return !strncmp(tgt->target_type, "multipath", DM_MAX_TYPE_NAME); +} + +static int mpath_reconstruct_sense(int fd, int r, uint8_t *sense) +{ + switch (r) { + case MPATH_PR_SUCCESS: + return GOOD; + case MPATH_PR_SENSE_NOT_READY: + case MPATH_PR_SENSE_MEDIUM_ERROR: + case MPATH_PR_SENSE_HARDWARE_ERROR: + case MPATH_PR_SENSE_ABORTED_COMMAND: + { + /* libmpathpersist ate the exact sense. Try to find it by + * issuing TEST UNIT READY. + */ + uint8_t cdb[6] = { TEST_UNIT_READY }; + int sz = 0; + return do_sgio(fd, cdb, sense, NULL, &sz, SG_DXFER_NONE); + } + + case MPATH_PR_SENSE_UNIT_ATTENTION: + /* Congratulations libmpathpersist, you ruined the Unit Attention... + * Return a heavyweight one. + */ + scsi_build_sense(sense, SENSE_CODE(SCSI_BUS_RESET)); + return CHECK_CONDITION; + case MPATH_PR_SENSE_INVALID_OP: + /* Only one valid sense. */ + scsi_build_sense(sense, SENSE_CODE(INVALID_OPCODE)); + return CHECK_CONDITION; + case MPATH_PR_ILLEGAL_REQ: + /* Guess. */ + scsi_build_sense(sense, SENSE_CODE(INVALID_PARAM)); + return CHECK_CONDITION; + case MPATH_PR_NO_SENSE: + scsi_build_sense(sense, SENSE_CODE(NO_SENSE)); + return CHECK_CONDITION; + + case MPATH_PR_RESERV_CONFLICT: + return RESERVATION_CONFLICT; + + case MPATH_PR_OTHER: + default: + scsi_build_sense(sense, SENSE_CODE(LUN_COMM_FAILURE)); + return CHECK_CONDITION; + } +} + +static int multipath_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *data, int sz) +{ + int rq_servact = cdb[1]; + struct prin_resp resp; + size_t written; + int r; + + switch (rq_servact) { + case MPATH_PRIN_RKEY_SA: + case MPATH_PRIN_RRES_SA: + case MPATH_PRIN_RCAP_SA: + break; + case MPATH_PRIN_RFSTAT_SA: + /* Nobody implements it anyway, so bail out. */ + default: + /* Cannot parse any other output. */ + scsi_build_sense(sense, SENSE_CODE(INVALID_FIELD)); + return CHECK_CONDITION; + } + + r = mpath_persistent_reserve_in(fd, rq_servact, &resp, noisy, verbose); + if (r == MPATH_PR_SUCCESS) { + switch (rq_servact) { + case MPATH_PRIN_RKEY_SA: + case MPATH_PRIN_RRES_SA: { + struct prin_readdescr *out = &resp.prin_descriptor.prin_readkeys; + assert(sz >= 8); + written = MIN(out->additional_length + 8, sz); + stl_be_p(&data[0], out->prgeneration); + stl_be_p(&data[4], out->additional_length); + memcpy(&data[8], out->key_list, written - 8); + break; + } + case MPATH_PRIN_RCAP_SA: { + struct prin_capdescr *out = &resp.prin_descriptor.prin_readcap; + assert(sz >= 6); + written = 6; + stw_be_p(&data[0], out->length); + data[2] = out->flags[0]; + data[3] = out->flags[1]; + stw_be_p(&data[4], out->pr_type_mask); + break; + } + default: + scsi_build_sense(sense, SENSE_CODE(INVALID_OPCODE)); + return CHECK_CONDITION; + } + assert(written <= sz); + memset(data + written, 0, sz - written); + } + + return mpath_reconstruct_sense(fd, r, sense); +} + +static int multipath_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, + const uint8_t *param, int sz) +{ + int rq_servact = cdb[1]; + int rq_scope = cdb[2] >> 4; + int rq_type = cdb[2] & 0xf; + struct prout_param_descriptor paramp; + char transportids[PR_HELPER_DATA_SIZE]; + int r; + + switch (rq_servact) { + case MPATH_PROUT_REG_SA: + case MPATH_PROUT_RES_SA: + case MPATH_PROUT_REL_SA: + case MPATH_PROUT_CLEAR_SA: + case MPATH_PROUT_PREE_SA: + case MPATH_PROUT_PREE_AB_SA: + case MPATH_PROUT_REG_IGN_SA: + break; + case MPATH_PROUT_REG_MOV_SA: + /* Not supported by struct prout_param_descriptor. */ + default: + /* Cannot parse any other input. */ + scsi_build_sense(sense, SENSE_CODE(INVALID_FIELD)); + return CHECK_CONDITION; + } + + /* Convert input data, especially transport IDs, to the structs + * used by libmpathpersist (which, of course, will immediately + * do the opposite). + */ + memset(¶mp, 0, sizeof(paramp)); + memcpy(¶mp.key, ¶m[0], 8); + memcpy(¶mp.sa_key, ¶m[8], 8); + paramp.sa_flags = param[10]; + if (sz > PR_OUT_FIXED_PARAM_SIZE) { + size_t transportid_len; + int i, j; + if (sz < PR_OUT_FIXED_PARAM_SIZE + 4) { + scsi_build_sense(sense, SENSE_CODE(INVALID_PARAM_LEN)); + return CHECK_CONDITION; + } + transportid_len = ldl_be_p(¶m[24]) + PR_OUT_FIXED_PARAM_SIZE + 4; + if (transportid_len > sz) { + scsi_build_sense(sense, SENSE_CODE(INVALID_PARAM)); + return CHECK_CONDITION; + } + for (i = PR_OUT_FIXED_PARAM_SIZE + 4, j = 0; i < transportid_len; ) { + struct transportid *id = (struct transportid *) &transportids[j]; + int len; + + id->format_code = param[i] & 0xc0; + id->protocol_id = param[i] & 0x0f; + switch (param[i] & 0xcf) { + case 0: + /* FC transport. */ + if (i + 24 > transportid_len) { + goto illegal_req; + } + memcpy(id->n_port_name, ¶m[i + 8], 8); + j += offsetof(struct transportid, n_port_name[8]); + i += 24; + break; + case 3: + case 0x43: + /* iSCSI transport. */ + len = lduw_be_p(¶m[i + 2]); + if (len > 252 || (len & 3) || i + len + 4 > transportid_len) { + /* For format code 00, the standard says the maximum is 223 + * plus the NUL terminator. For format code 01 there is no + * maximum length, but libmpathpersist ignores the first + * byte of id->iscsi_name so our maximum is 252. + */ + goto illegal_req; + } + if (memchr(¶m[i + 4], 0, len) == NULL) { + goto illegal_req; + } + memcpy(id->iscsi_name, ¶m[i + 2], len + 2); + j += offsetof(struct transportid, iscsi_name[len + 2]); + i += len + 4; + break; + case 6: + /* SAS transport. */ + if (i + 24 > transportid_len) { + goto illegal_req; + } + memcpy(id->sas_address, ¶m[i + 4], 8); + j += offsetof(struct transportid, sas_address[8]); + i += 24; + break; + default: + illegal_req: + scsi_build_sense(sense, SENSE_CODE(INVALID_PARAM)); + return CHECK_CONDITION; + } + + paramp.trnptid_list[paramp.num_transportid++] = id; + } + } + + r = mpath_persistent_reserve_out(fd, rq_servact, rq_scope, rq_type, + ¶mp, noisy, verbose); + return mpath_reconstruct_sense(fd, r, sense); +} +#endif + +static int do_pr_in(int fd, const uint8_t *cdb, uint8_t *sense, + uint8_t *data, int *resp_sz) +{ +#ifdef CONFIG_MPATH + if (is_mpath(fd)) { + /* multipath_pr_in fills the whole input buffer. */ + return multipath_pr_in(fd, cdb, sense, data, *resp_sz); + } +#endif + + return do_sgio(fd, cdb, sense, data, resp_sz, + SG_DXFER_FROM_DEV); +} + +static int do_pr_out(int fd, const uint8_t *cdb, uint8_t *sense, + const uint8_t *param, int sz) +{ + int resp_sz; +#ifdef CONFIG_MPATH + if (is_mpath(fd)) { + return multipath_pr_out(fd, cdb, sense, param, sz); + } +#endif + + resp_sz = sz; + return do_sgio(fd, cdb, sense, (uint8_t *)param, &resp_sz, + SG_DXFER_TO_DEV); +} + +/* Client */ + +typedef struct PRHelperClient { + QIOChannelSocket *ioc; + Coroutine *co; + int fd; + uint8_t data[PR_HELPER_DATA_SIZE]; +} PRHelperClient; + +typedef struct PRHelperRequest { + int fd; + size_t sz; + uint8_t cdb[PR_HELPER_CDB_SIZE]; +} PRHelperRequest; + +static int coroutine_fn prh_read(PRHelperClient *client, void *buf, int sz, + Error **errp) +{ + int ret = 0; + + while (sz > 0) { + int *fds = NULL; + size_t nfds = 0; + int i; + struct iovec iov; + ssize_t n_read; + + iov.iov_base = buf; + iov.iov_len = sz; + n_read = qio_channel_readv_full(QIO_CHANNEL(client->ioc), &iov, 1, + &fds, &nfds, errp); + + if (n_read == QIO_CHANNEL_ERR_BLOCK) { + qio_channel_yield(QIO_CHANNEL(client->ioc), G_IO_IN); + continue; + } + if (n_read <= 0) { + ret = n_read ? n_read : -1; + goto err; + } + + /* Stash one file descriptor per request. */ + if (nfds) { + bool too_many = false; + for (i = 0; i < nfds; i++) { + if (client->fd == -1) { + client->fd = fds[i]; + } else { + close(fds[i]); + too_many = true; + } + } + g_free(fds); + if (too_many) { + ret = -1; + goto err; + } + } + + buf += n_read; + sz -= n_read; + } + + return 0; + +err: + if (client->fd != -1) { + close(client->fd); + client->fd = -1; + } + return ret; +} + +static int coroutine_fn prh_read_request(PRHelperClient *client, + PRHelperRequest *req, + PRHelperResponse *resp, Error **errp) +{ + uint32_t sz; + + if (prh_read(client, req->cdb, sizeof(req->cdb), NULL) < 0) { + return -1; + } + + if (client->fd == -1) { + error_setg(errp, "No file descriptor in request."); + return -1; + } + + if (req->cdb[0] != PERSISTENT_RESERVE_OUT && + req->cdb[0] != PERSISTENT_RESERVE_IN) { + error_setg(errp, "Invalid CDB, closing socket."); + goto out_close; + } + + sz = scsi_cdb_xfer(req->cdb); + if (sz > sizeof(client->data)) { + goto out_close; + } + + if (req->cdb[0] == PERSISTENT_RESERVE_OUT) { + if (qio_channel_read_all(QIO_CHANNEL(client->ioc), + (char *)client->data, sz, + errp) < 0) { + goto out_close; + } + if ((fcntl(client->fd, F_GETFL) & O_ACCMODE) == O_RDONLY) { + scsi_build_sense(resp->sense, SENSE_CODE(INVALID_OPCODE)); + sz = 0; + } else if (sz < PR_OUT_FIXED_PARAM_SIZE) { + /* Illegal request, Parameter list length error. This isn't fatal; + * we have read the data, send an error without closing the socket. + */ + scsi_build_sense(resp->sense, SENSE_CODE(INVALID_PARAM_LEN)); + sz = 0; + } + if (sz == 0) { + resp->result = CHECK_CONDITION; + close(client->fd); + client->fd = -1; + } + } + + req->fd = client->fd; + req->sz = sz; + client->fd = -1; + return sz; + +out_close: + close(client->fd); + client->fd = -1; + return -1; +} + +static int coroutine_fn prh_write_response(PRHelperClient *client, + PRHelperRequest *req, + PRHelperResponse *resp, Error **errp) +{ + ssize_t r; + size_t sz; + + if (req->cdb[0] == PERSISTENT_RESERVE_IN && resp->result == GOOD) { + assert(resp->sz <= req->sz && resp->sz <= sizeof(client->data)); + } else { + assert(resp->sz == 0); + } + + sz = resp->sz; + + resp->result = cpu_to_be32(resp->result); + resp->sz = cpu_to_be32(resp->sz); + r = qio_channel_write_all(QIO_CHANNEL(client->ioc), + (char *) resp, sizeof(*resp), errp); + if (r < 0) { + return r; + } + + r = qio_channel_write_all(QIO_CHANNEL(client->ioc), + (char *) client->data, + sz, errp); + return r < 0 ? r : 0; +} + +static void coroutine_fn prh_co_entry(void *opaque) +{ + PRHelperClient *client = opaque; + Error *local_err = NULL; + uint32_t flags; + int r; + + qio_channel_set_blocking(QIO_CHANNEL(client->ioc), + false, NULL); + qio_channel_attach_aio_context(QIO_CHANNEL(client->ioc), + qemu_get_aio_context()); + + /* A very simple negotiation for future extensibility. No features + * are defined so write 0. + */ + flags = cpu_to_be32(0); + r = qio_channel_write_all(QIO_CHANNEL(client->ioc), + (char *) &flags, sizeof(flags), NULL); + if (r < 0) { + goto out; + } + + r = qio_channel_read_all(QIO_CHANNEL(client->ioc), + (char *) &flags, sizeof(flags), NULL); + if (be32_to_cpu(flags) != 0 || r < 0) { + goto out; + } + + while (atomic_read(&state) == RUNNING) { + PRHelperRequest req; + PRHelperResponse resp; + int sz; + + sz = prh_read_request(client, &req, &resp, &local_err); + if (sz < 0) { + break; + } + + if (sz > 0) { + num_active_sockets++; + if (req.cdb[0] == PERSISTENT_RESERVE_OUT) { + r = do_pr_out(req.fd, req.cdb, resp.sense, + client->data, sz); + resp.sz = 0; + } else { + resp.sz = sizeof(client->data); + r = do_pr_in(req.fd, req.cdb, resp.sense, + client->data, &resp.sz); + resp.sz = MIN(resp.sz, sz); + } + num_active_sockets--; + close(req.fd); + if (r == -1) { + break; + } + resp.result = r; + } + + if (prh_write_response(client, &req, &resp, &local_err) < 0) { + break; + } + } + + if (local_err) { + if (verbose == 0) { + error_free(local_err); + } else { + error_report_err(local_err); + } + } + +out: + qio_channel_detach_aio_context(QIO_CHANNEL(client->ioc)); + object_unref(OBJECT(client->ioc)); + g_free(client); +} + +static gboolean accept_client(QIOChannel *ioc, GIOCondition cond, gpointer opaque) +{ + QIOChannelSocket *cioc; + PRHelperClient *prh; + + cioc = qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc), + NULL); + if (!cioc) { + return TRUE; + } + + prh = g_new(PRHelperClient, 1); + prh->ioc = cioc; + prh->fd = -1; + prh->co = qemu_coroutine_create(prh_co_entry, prh); + qemu_coroutine_enter(prh->co); + + return TRUE; +} + + +/* + * Check socket parameters compatibility when socket activation is used. + */ +static const char *socket_activation_validate_opts(void) +{ + if (socket_path != NULL) { + return "Unix socket can't be set when using socket activation"; + } + + return NULL; +} + +static void compute_default_paths(void) +{ + if (!socket_path) { + socket_path = qemu_get_local_state_pathname("run/qemu-pr-helper.sock"); + } +} + +static void termsig_handler(int signum) +{ + atomic_cmpxchg(&state, RUNNING, TERMINATE); + qemu_notify_event(); +} + +static void close_server_socket(void) +{ + assert(server_ioc); + + g_source_remove(server_watch); + server_watch = -1; + object_unref(OBJECT(server_ioc)); + num_active_sockets--; +} + +#ifdef CONFIG_LIBCAP +static int drop_privileges(void) +{ + /* clear all capabilities */ + capng_clear(CAPNG_SELECT_BOTH); + + if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE | CAPNG_PERMITTED, + CAP_SYS_RAWIO) < 0) { + return -1; + } + +#ifdef CONFIG_MPATH + /* For /dev/mapper/control ioctls */ + if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE | CAPNG_PERMITTED, + CAP_SYS_ADMIN) < 0) { + return -1; + } +#endif + + /* Change user/group id, retaining the capabilities. Because file descriptors + * are passed via SCM_RIGHTS, we don't need supplementary groups (and in + * fact the helper can run as "nobody"). + */ + if (capng_change_id(uid != -1 ? uid : getuid(), + gid != -1 ? gid : getgid(), + CAPNG_DROP_SUPP_GRP | CAPNG_CLEAR_BOUNDING)) { + return -1; + } + + return 0; +} +#endif + +int main(int argc, char **argv) +{ + const char *sopt = "hVk:fdT:u:g:vq"; + struct option lopt[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "socket", required_argument, NULL, 'k' }, + { "pidfile", no_argument, NULL, 'f' }, + { "daemon", no_argument, NULL, 'd' }, + { "trace", required_argument, NULL, 'T' }, + { "user", required_argument, NULL, 'u' }, + { "group", required_argument, NULL, 'g' }, + { "verbose", no_argument, NULL, 'v' }, + { "quiet", no_argument, NULL, 'q' }, + { NULL, 0, NULL, 0 } + }; + int opt_ind = 0; + int loglevel = 1; + int quiet = 0; + int ch; + Error *local_err = NULL; + char *trace_file = NULL; + bool daemonize = false; + unsigned socket_activation; + + struct sigaction sa_sigterm; + memset(&sa_sigterm, 0, sizeof(sa_sigterm)); + sa_sigterm.sa_handler = termsig_handler; + sigaction(SIGTERM, &sa_sigterm, NULL); + sigaction(SIGINT, &sa_sigterm, NULL); + sigaction(SIGHUP, &sa_sigterm, NULL); + + signal(SIGPIPE, SIG_IGN); + + module_call_init(MODULE_INIT_TRACE); + module_call_init(MODULE_INIT_QOM); + qemu_add_opts(&qemu_trace_opts); + qemu_init_exec_dir(argv[0]); + + pidfile = qemu_get_local_state_pathname("run/qemu-pr-helper.pid"); + + while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) { + switch (ch) { + case 'k': + socket_path = optarg; + if (socket_path[0] != '/') { + error_report("socket path must be absolute"); + exit(EXIT_FAILURE); + } + break; + case 'f': + pidfile = optarg; + break; +#ifdef CONFIG_LIBCAP + case 'u': { + unsigned long res; + struct passwd *userinfo = getpwnam(optarg); + if (userinfo) { + uid = userinfo->pw_uid; + } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 && + (uid_t)res == res) { + uid = res; + } else { + error_report("invalid user '%s'", optarg); + exit(EXIT_FAILURE); + } + break; + } + case 'g': { + unsigned long res; + struct group *groupinfo = getgrnam(optarg); + if (groupinfo) { + gid = groupinfo->gr_gid; + } else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 && + (gid_t)res == res) { + gid = res; + } else { + error_report("invalid group '%s'", optarg); + exit(EXIT_FAILURE); + } + break; + } +#else + case 'u': + case 'g': + error_report("-%c not supported by this %s", ch, argv[0]); + exit(1); +#endif + case 'd': + daemonize = true; + break; + case 'q': + quiet = 1; + break; + case 'v': + ++loglevel; + break; + case 'T': + g_free(trace_file); + trace_file = trace_opt_parse(optarg); + break; + case 'V': + version(argv[0]); + exit(EXIT_SUCCESS); + break; + case 'h': + usage(argv[0]); + exit(EXIT_SUCCESS); + break; + case '?': + error_report("Try `%s --help' for more information.", argv[0]); + exit(EXIT_FAILURE); + } + } + + /* set verbosity */ + noisy = !quiet && (loglevel >= 3); + verbose = quiet ? 0 : MIN(loglevel, 3); + + if (!trace_init_backends()) { + exit(EXIT_FAILURE); + } + trace_init_file(trace_file); + qemu_set_log(LOG_TRACE); + +#ifdef CONFIG_MPATH + dm_init(); + multipath_pr_init(); +#endif + + socket_activation = check_socket_activation(); + if (socket_activation == 0) { + SocketAddress saddr; + compute_default_paths(); + saddr = (SocketAddress){ + .type = SOCKET_ADDRESS_TYPE_UNIX, + .u.q_unix.path = g_strdup(socket_path) + }; + server_ioc = qio_channel_socket_new(); + if (qio_channel_socket_listen_sync(server_ioc, &saddr, &local_err) < 0) { + object_unref(OBJECT(server_ioc)); + error_report_err(local_err); + return 1; + } + g_free(saddr.u.q_unix.path); + } else { + /* Using socket activation - check user didn't use -p etc. */ + const char *err_msg = socket_activation_validate_opts(); + if (err_msg != NULL) { + error_report("%s", err_msg); + exit(EXIT_FAILURE); + } + + /* Can only listen on a single socket. */ + if (socket_activation > 1) { + error_report("%s does not support socket activation with LISTEN_FDS > 1", + argv[0]); + exit(EXIT_FAILURE); + } + server_ioc = qio_channel_socket_new_fd(FIRST_SOCKET_ACTIVATION_FD, + &local_err); + if (server_ioc == NULL) { + error_report("Failed to use socket activation: %s", + error_get_pretty(local_err)); + exit(EXIT_FAILURE); + } + socket_path = NULL; + } + + if (qemu_init_main_loop(&local_err)) { + error_report_err(local_err); + exit(EXIT_FAILURE); + } + + server_watch = qio_channel_add_watch(QIO_CHANNEL(server_ioc), + G_IO_IN, + accept_client, + NULL, NULL); + +#ifdef CONFIG_LIBCAP + if (drop_privileges() < 0) { + error_report("Failed to drop privileges: %s", strerror(errno)); + exit(EXIT_FAILURE); + } +#endif + + if (daemonize) { + if (daemon(0, 0) < 0) { + error_report("Failed to daemonize: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + write_pidfile(); + } + + state = RUNNING; + do { + main_loop_wait(false); + if (state == TERMINATE) { + state = TERMINATING; + close_server_socket(); + } + } while (num_active_sockets > 0); + + exit(EXIT_SUCCESS); +} diff --git a/scsi/trace-events b/scsi/trace-events new file mode 100644 index 0000000000..45f5b6e49b --- /dev/null +++ b/scsi/trace-events @@ -0,0 +1,3 @@ +# scsi/pr-manager.c +pr_manager_execute(int fd, int cmd, int sa, void *opaque) "fd=%d cmd=0x%02x service action=0x%02x opaque=%p" +pr_manager_run(int fd, int cmd, int sa) "fd=%d cmd=0x%02x service action=0x%02x" diff --git a/scsi/utils.c b/scsi/utils.c index fab60bdf20..5684951b12 100644 --- a/scsi/utils.c +++ b/scsi/utils.c @@ -206,6 +206,11 @@ const struct SCSISense sense_code_OVERLAPPED_COMMANDS = { .key = ABORTED_COMMAND, .asc = 0x4e, .ascq = 0x00 }; +/* Command aborted, LUN Communication Failure */ +const struct SCSISense sense_code_LUN_COMM_FAILURE = { + .key = ABORTED_COMMAND, .asc = 0x08, .ascq = 0x00 +}; + /* Unit attention, Capacity data has changed */ const struct SCSISense sense_code_CAPACITY_CHANGED = { .key = UNIT_ATTENTION, .asc = 0x2a, .ascq = 0x09 @@ -216,6 +221,11 @@ const struct SCSISense sense_code_RESET = { .key = UNIT_ATTENTION, .asc = 0x29, .ascq = 0x00 }; +/* Unit attention, SCSI bus reset */ +const struct SCSISense sense_code_SCSI_BUS_RESET = { + .key = UNIT_ATTENTION, .asc = 0x29, .ascq = 0x02 +}; + /* Unit attention, No medium */ const struct SCSISense sense_code_UNIT_ATTENTION_NO_MEDIUM = { .key = UNIT_ATTENTION, .asc = 0x3a, .ascq = 0x00 diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 33449790e5..4300de66e2 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -691,6 +691,9 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) CPUARMState *env = &cpu->env; int pagebits; Error *local_err = NULL; +#ifndef CONFIG_USER_ONLY + AddressSpace *as; +#endif cpu_exec_realizefn(cs, &local_err); if (local_err != NULL) { @@ -881,24 +884,21 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) #ifndef CONFIG_USER_ONLY if (cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY)) { - AddressSpace *as; + as = g_new0(AddressSpace, 1); cs->num_ases = 2; if (!cpu->secure_memory) { cpu->secure_memory = cs->memory; } - as = address_space_init_shareable(cpu->secure_memory, - "cpu-secure-memory"); + address_space_init(as, cpu->secure_memory, "cpu-secure-memory"); cpu_address_space_init(cs, as, ARMASIdx_S); } else { cs->num_ases = 1; } - - cpu_address_space_init(cs, - address_space_init_shareable(cs->memory, - "cpu-memory"), - ARMASIdx_NS); + as = g_new0(AddressSpace, 1); + address_space_init(as, cs->memory, "cpu-memory"); + cpu_address_space_init(cs, as, ARMASIdx_NS); #endif qemu_init_vcpu(cs); diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 0aa28fc775..98732cd65f 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3738,10 +3738,11 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) #ifndef CONFIG_USER_ONLY if (tcg_enabled()) { - AddressSpace *as_normal = address_space_init_shareable(cs->memory, - "cpu-memory"); + AddressSpace *as_normal = g_new0(AddressSpace, 1); AddressSpace *as_smm = g_new(AddressSpace, 1); + address_space_init(as_normal, cs->memory, "cpu-memory"); + cpu->cpu_as_mem = g_new(MemoryRegion, 1); cpu->cpu_as_root = g_new(MemoryRegion, 1); diff --git a/trace-events b/trace-events index 1f50f56d9d..1d2eb5d3e4 100644 --- a/trace-events +++ b/trace-events @@ -64,6 +64,9 @@ memory_region_tb_read(int cpu_index, uint64_t addr, uint64_t value, unsigned siz memory_region_tb_write(int cpu_index, uint64_t addr, uint64_t value, unsigned size) "cpu %d addr 0x%"PRIx64" value 0x%"PRIx64" size %u" memory_region_ram_device_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" memory_region_ram_device_write(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" +flatview_new(FlatView *view, MemoryRegion *root) "%p (root %p)" +flatview_destroy(FlatView *view, MemoryRegion *root) "%p (root %p)" +flatview_destroy_rcu(FlatView *view, MemoryRegion *root) "%p (root %p)" ### Guest events, keep at bottom @@ -2889,7 +2889,8 @@ static int machine_set_property(void *opaque, */ static bool object_create_initial(const char *type) { - if (g_str_equal(type, "rng-egd")) { + if (g_str_equal(type, "rng-egd") || + g_str_has_prefix(type, "pr-manager-")) { return false; } |