diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2016-05-31 09:29:23 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2016-05-31 09:29:23 +0100 |
commit | 07e070aac4eeb186905148461f331e43f2b828aa (patch) | |
tree | 725db4314abc3e4975ec17d427d560cc0c1bbc13 | |
parent | d6550e9ed2e1a60d889dfb721de00d9a4e3bafbe (diff) | |
parent | 0878d0e11ba8013dd759c6921cbf05ba6a41bd71 (diff) |
Merge remote-tracking branch 'remotes/bonzini/tags/for-upstream' into staging
* docs/atomics fixes and atomic_rcu_* optimization (Emilio)
* NBD bugfix (Eric)
* Memory fixes and cleanups (Paolo, Paul)
* scsi-block support for SCSI status, including persistent
  reservations (Paolo)
* kvm_stat moves to the Linux repository
* SCSI bug fixes (Peter, Prasad)
* Killing qemu_char_get_next_serial, non-ARM parts (Xiaoqiang)
# gpg: Signature made Sun 29 May 2016 08:11:20 BST using RSA key ID 78C7AE83
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>"
# gpg: aka "Paolo Bonzini <pbonzini@redhat.com>"
* remotes/bonzini/tags/for-upstream: (30 commits)
exec: hide mr->ram_addr from qemu_get_ram_ptr users
memory: split memory_region_from_host from qemu_ram_addr_from_host
exec: remove ram_addr argument from qemu_ram_block_from_host
memory: remove qemu_get_ram_fd, qemu_set_ram_fd, qemu_ram_block_host_ptr
scsi-generic: Merge block max xfer len in INQUIRY response
scsi-block: always use SG_IO
scsi-disk: introduce scsi_disk_req_check_error
scsi-disk: add need_fua_emulation to SCSIDiskClass
scsi-disk: introduce dma_readv and dma_writev
scsi-disk: introduce a common base class
xen-hvm: ignore background I/O sections
docs/atomics: update comparison with Linux
atomics: do not emit consume barrier for atomic_rcu_read
atomics: emit an smp_read_barrier_depends() barrier only for Alpha and Thread Sanitizer
docs/atomics: update atomic_read/set comparison with Linux
bt: rewrite csrhci_write to avoid out-of-bounds writes
block/iscsi: avoid potential overflow of acb->task->cdb
scsi: megasas: check 'read_queue_head' index value
scsi: megasas: initialise local configuration data buffer
scsi: megasas: use appropriate property buffer size
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
36 files changed, 709 insertions, 1242 deletions
@@ -92,9 +92,6 @@ HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF) ifdef BUILD_DOCS DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qemu-ga.8 DOCS+=qmp-commands.txt -ifdef CONFIG_LINUX -DOCS+=kvm_stat.1 -endif ifdef CONFIG_VIRTFS DOCS+=fsdev/virtfs-proxy-helper.1 endif @@ -571,12 +568,6 @@ qemu-ga.8: qemu-ga.texi $(POD2MAN) --section=8 --center=" " --release=" " qemu-ga.pod > $@, \ " GEN $@") -kvm_stat.1: scripts/kvm/kvm_stat.texi - $(call quiet-command, \ - perl -Ww -- $(SRC_PATH)/scripts/texi2pod.pl $< kvm_stat.pod && \ - $(POD2MAN) --section=1 --center=" " --release=" " kvm_stat.pod > $@, \ - " GEN $@") - dvi: qemu-doc.dvi qemu-tech.dvi html: qemu-doc.html qemu-tech.html info: qemu-doc.info qemu-tech.info diff --git a/block/iscsi.c b/block/iscsi.c index 2ca8e72967..e7d5f7b0c3 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -833,6 +833,13 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, return &acb->common; } + if (acb->ioh->cmd_len > SCSI_CDB_MAX_SIZE) { + error_report("iSCSI: ioctl error CDB exceeds max size (%d > %d)", + acb->ioh->cmd_len, SCSI_CDB_MAX_SIZE); + qemu_aio_unref(acb); + return NULL; + } + acb->task = malloc(sizeof(struct scsi_task)); if (acb->task == NULL) { error_report("iSCSI: Failed to allocate task for scsi command. %s", @@ -246,7 +246,8 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr) { ram_addr_t ram_addr; - if (qemu_ram_addr_from_host(ptr, &ram_addr) == NULL) { + ram_addr = qemu_ram_addr_from_host(ptr); + if (ram_addr == RAM_ADDR_INVALID) { fprintf(stderr, "Bad ram pointer %p\n", ptr); abort(); } diff --git a/docs/atomics.txt b/docs/atomics.txt index bba771ecd6..c95950b6c5 100644 --- a/docs/atomics.txt +++ b/docs/atomics.txt @@ -326,21 +326,41 @@ and memory barriers, and the equivalents in QEMU: use a boxed atomic_t type; atomic operations in QEMU are polymorphic and use normal C types. 
-- atomic_read and atomic_set in Linux give no guarantee at all; - atomic_read and atomic_set in QEMU include a compiler barrier - (similar to the READ_ONCE/WRITE_ONCE macros in Linux). - -- most atomic read-modify-write operations in Linux return void; - in QEMU, all of them return the old value of the variable. +- Originally, atomic_read and atomic_set in Linux gave no guarantee + at all. Linux 4.1 updated them to implement volatile + semantics via ACCESS_ONCE (or the more recent READ/WRITE_ONCE). + + QEMU's atomic_read/set implement, if the compiler supports it, C11 + atomic relaxed semantics, and volatile semantics otherwise. + Both semantics prevent the compiler from doing certain transformations; + the difference is that atomic accesses are guaranteed to be atomic, + while volatile accesses aren't. Thus, in the volatile case we just cross + our fingers hoping that the compiler will generate atomic accesses, + since we assume the variables passed are machine-word sized and + properly aligned. + No barriers are implied by atomic_read/set in either Linux or QEMU. + +- atomic read-modify-write operations in Linux are of three kinds: + + atomic_OP returns void + atomic_OP_return returns new value of the variable + atomic_fetch_OP returns the old value of the variable + atomic_cmpxchg returns the old value of the variable + + In QEMU, the second kind does not exist. Currently Linux has + atomic_fetch_or only. QEMU provides and, or, inc, dec, add, sub. - different atomic read-modify-write operations in Linux imply a different set of memory barriers; in QEMU, all of them enforce sequential consistency, which means they imply full memory barriers before and after the operation. -- Linux does not have an equivalent of atomic_mb_read() and - atomic_mb_set(). In particular, note that set_mb() is a little - weaker than atomic_mb_set(). +- Linux does not have an equivalent of atomic_mb_set(). 
In particular, + note that smp_store_mb() is a little weaker than atomic_mb_set(). + atomic_mb_read() compiles to the same instructions as Linux's + smp_load_acquire(), but this should be treated as an implementation + detail. If required, QEMU might later add atomic_load_acquire() and + atomic_store_release() macros. SOURCES @@ -1815,40 +1815,6 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) } #endif /* !_WIN32 */ -int qemu_get_ram_fd(ram_addr_t addr) -{ - RAMBlock *block; - int fd; - - rcu_read_lock(); - block = qemu_get_ram_block(addr); - fd = block->fd; - rcu_read_unlock(); - return fd; -} - -void qemu_set_ram_fd(ram_addr_t addr, int fd) -{ - RAMBlock *block; - - rcu_read_lock(); - block = qemu_get_ram_block(addr); - block->fd = fd; - rcu_read_unlock(); -} - -void *qemu_get_ram_block_host_ptr(ram_addr_t addr) -{ - RAMBlock *block; - void *ptr; - - rcu_read_lock(); - block = qemu_get_ram_block(addr); - ptr = ramblock_ptr(block, 0); - rcu_read_unlock(); - return ptr; -} - /* Return a host pointer to ram allocated with qemu_ram_alloc. * This should not be used for general purpose DMA. Use address_space_map * or address_space_rw instead. For local memory (e.g. video ram) that the @@ -1856,12 +1822,13 @@ void *qemu_get_ram_block_host_ptr(ram_addr_t addr) * * Called within RCU critical section. */ -void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr) +void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr) { RAMBlock *block = ram_block; if (block == NULL) { block = qemu_get_ram_block(addr); + addr -= block->offset; } if (xen_enabled() && block->host == NULL) { @@ -1875,10 +1842,10 @@ void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr) block->host = xen_map_cache(block->offset, block->max_length, 1); } - return ramblock_ptr(block, addr - block->offset); + return ramblock_ptr(block, addr); } -/* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr +/* Return a host pointer to guest's ram. 
Similar to qemu_map_ram_ptr * but takes a size argument. * * Called within RCU critical section. @@ -1887,16 +1854,15 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr, hwaddr *size) { RAMBlock *block = ram_block; - ram_addr_t offset_inside_block; if (*size == 0) { return NULL; } if (block == NULL) { block = qemu_get_ram_block(addr); + addr -= block->offset; } - offset_inside_block = addr - block->offset; - *size = MIN(*size, block->max_length - offset_inside_block); + *size = MIN(*size, block->max_length - addr); if (xen_enabled() && block->host == NULL) { /* We need to check if the requested address is in the RAM @@ -1910,7 +1876,7 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr, block->host = xen_map_cache(block->offset, block->max_length, 1); } - return ramblock_ptr(block, offset_inside_block); + return ramblock_ptr(block, addr); } /* @@ -1931,16 +1897,16 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr, * ram_addr_t. */ RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, - ram_addr_t *ram_addr, ram_addr_t *offset) { RAMBlock *block; uint8_t *host = ptr; if (xen_enabled()) { + ram_addr_t ram_addr; rcu_read_lock(); - *ram_addr = xen_ram_addr_from_mapcache(ptr); - block = qemu_get_ram_block(*ram_addr); + ram_addr = xen_ram_addr_from_mapcache(ptr); + block = qemu_get_ram_block(ram_addr); if (block) { *offset = (host - block->host); } @@ -1972,7 +1938,6 @@ found: if (round_offset) { *offset &= TARGET_PAGE_MASK; } - *ram_addr = block->offset + *offset; rcu_read_unlock(); return block; } @@ -1999,18 +1964,17 @@ RAMBlock *qemu_ram_block_by_name(const char *name) /* Some of the softmmu routines need to translate from a host pointer (typically a TLB entry) back to a ram offset. 
*/ -MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) +ram_addr_t qemu_ram_addr_from_host(void *ptr) { RAMBlock *block; - ram_addr_t offset; /* Not used */ - - block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset); + ram_addr_t offset; + block = qemu_ram_block_from_host(ptr, false, &offset); if (!block) { - return NULL; + return RAM_ADDR_INVALID; } - return block->mr; + return block->offset + offset; } /* Called within RCU critical section. */ @@ -2022,13 +1986,13 @@ static void notdirty_mem_write(void *opaque, hwaddr ram_addr, } switch (size) { case 1: - stb_p(qemu_get_ram_ptr(NULL, ram_addr), val); + stb_p(qemu_map_ram_ptr(NULL, ram_addr), val); break; case 2: - stw_p(qemu_get_ram_ptr(NULL, ram_addr), val); + stw_p(qemu_map_ram_ptr(NULL, ram_addr), val); break; case 4: - stl_p(qemu_get_ram_ptr(NULL, ram_addr), val); + stl_p(qemu_map_ram_ptr(NULL, ram_addr), val); break; default: abort(); @@ -2490,6 +2454,8 @@ static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr, hwaddr length) { uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr); + addr += memory_region_get_ram_addr(mr); + /* No early return if dirty_log_mask is or becomes 0, because * cpu_physical_memory_set_dirty_range will still call * xen_modified_memory. 
@@ -2602,9 +2568,8 @@ static MemTxResult address_space_write_continue(AddressSpace *as, hwaddr addr, abort(); } } else { - addr1 += memory_region_get_ram_addr(mr); /* RAM case */ - ptr = qemu_get_ram_ptr(mr->ram_block, addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); memcpy(ptr, buf, l); invalidate_and_set_dirty(mr, addr1, l); } @@ -2695,8 +2660,7 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, } } else { /* RAM case */ - ptr = qemu_get_ram_ptr(mr->ram_block, - memory_region_get_ram_addr(mr) + addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); memcpy(buf, ptr, l); } @@ -2779,9 +2743,8 @@ static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as, memory_region_is_romd(mr))) { l = memory_access_size(mr, l, addr1); } else { - addr1 += memory_region_get_ram_addr(mr); /* ROM/RAM case */ - ptr = qemu_get_ram_ptr(mr->ram_block, addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (type) { case WRITE_DATA: memcpy(ptr, buf, l); @@ -2939,7 +2902,6 @@ void *address_space_map(AddressSpace *as, hwaddr done = 0; hwaddr l, xlat, base; MemoryRegion *mr, *this_mr; - ram_addr_t raddr; void *ptr; if (len == 0) { @@ -2974,7 +2936,6 @@ void *address_space_map(AddressSpace *as, } base = xlat; - raddr = memory_region_get_ram_addr(mr); for (;;) { len -= l; @@ -2993,7 +2954,7 @@ void *address_space_map(AddressSpace *as, memory_region_ref(mr); *plen = done; - ptr = qemu_ram_ptr_length(mr->ram_block, raddr + base, plen); + ptr = qemu_ram_ptr_length(mr->ram_block, base, plen); rcu_read_unlock(); return ptr; @@ -3010,7 +2971,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, MemoryRegion *mr; ram_addr_t addr1; - mr = qemu_ram_addr_from_host(buffer, &addr1); + mr = memory_region_from_host(buffer, &addr1); assert(mr != NULL); if (is_write) { invalidate_and_set_dirty(mr, addr1, access_len); @@ -3077,8 +3038,7 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr, #endif } else { /* 
RAM case */ - ptr = qemu_get_ram_ptr(mr->ram_block, - memory_region_get_ram_addr(mr) + addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (endian) { case DEVICE_LITTLE_ENDIAN: val = ldl_le_p(ptr); @@ -3171,8 +3131,7 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr, #endif } else { /* RAM case */ - ptr = qemu_get_ram_ptr(mr->ram_block, - memory_region_get_ram_addr(mr) + addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (endian) { case DEVICE_LITTLE_ENDIAN: val = ldq_le_p(ptr); @@ -3285,8 +3244,7 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as, #endif } else { /* RAM case */ - ptr = qemu_get_ram_ptr(mr->ram_block, - memory_region_get_ram_addr(mr) + addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (endian) { case DEVICE_LITTLE_ENDIAN: val = lduw_le_p(ptr); @@ -3368,13 +3326,13 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val, r = memory_region_dispatch_write(mr, addr1, val, 4, attrs); } else { - addr1 += memory_region_get_ram_addr(mr); - ptr = qemu_get_ram_ptr(mr->ram_block, addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); stl_p(ptr, val); dirty_log_mask = memory_region_get_dirty_log_mask(mr); dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE); - cpu_physical_memory_set_dirty_range(addr1, 4, dirty_log_mask); + cpu_physical_memory_set_dirty_range(memory_region_get_ram_addr(mr) + addr, + 4, dirty_log_mask); r = MEMTX_OK; } if (result) { @@ -3423,8 +3381,7 @@ static inline void address_space_stl_internal(AddressSpace *as, r = memory_region_dispatch_write(mr, addr1, val, 4, attrs); } else { /* RAM case */ - addr1 += memory_region_get_ram_addr(mr); - ptr = qemu_get_ram_ptr(mr->ram_block, addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (endian) { case DEVICE_LITTLE_ENDIAN: stl_le_p(ptr, val); @@ -3533,8 +3490,7 @@ static inline void address_space_stw_internal(AddressSpace *as, r = memory_region_dispatch_write(mr, addr1, val, 2, attrs); 
} else { /* RAM case */ - addr1 += memory_region_get_ram_addr(mr); - ptr = qemu_get_ram_ptr(mr->ram_block, addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); switch (endian) { case DEVICE_LITTLE_ENDIAN: stw_le_p(ptr, val); diff --git a/hw/bt/hci-csr.c b/hw/bt/hci-csr.c index e6b8998253..d688372ca3 100644 --- a/hw/bt/hci-csr.c +++ b/hw/bt/hci-csr.c @@ -39,9 +39,14 @@ struct csrhci_s { int out_size; uint8_t outfifo[FIFO_LEN * 2]; uint8_t inpkt[FIFO_LEN]; + enum { + CSR_HDR_LEN, + CSR_DATA_LEN, + CSR_DATA + } in_state; int in_len; int in_hdr; - int in_data; + int in_needed; QEMUTimer *out_tm; int64_t baud_delay; @@ -296,38 +301,60 @@ static int csrhci_data_len(const uint8_t *pkt) exit(-1); } +static void csrhci_ready_for_next_inpkt(struct csrhci_s *s) +{ + s->in_state = CSR_HDR_LEN; + s->in_len = 0; + s->in_needed = 2; + s->in_hdr = INT_MAX; +} + static int csrhci_write(struct CharDriverState *chr, const uint8_t *buf, int len) { struct csrhci_s *s = (struct csrhci_s *) chr->opaque; - int plen = s->in_len; + int total = 0; if (!s->enable) return 0; - s->in_len += len; - memcpy(s->inpkt + plen, buf, len); + for (;;) { + int cnt = MIN(len, s->in_needed - s->in_len); + if (cnt) { + memcpy(s->inpkt + s->in_len, buf, cnt); + s->in_len += cnt; + buf += cnt; + len -= cnt; + total += cnt; + } + + if (s->in_len < s->in_needed) { + break; + } - while (1) { - if (s->in_len >= 2 && plen < 2) + if (s->in_state == CSR_HDR_LEN) { s->in_hdr = csrhci_header_len(s->inpkt) + 1; + assert(s->in_hdr >= s->in_needed); + s->in_needed = s->in_hdr; + s->in_state = CSR_DATA_LEN; + continue; + } - if (s->in_len >= s->in_hdr && plen < s->in_hdr) - s->in_data = csrhci_data_len(s->inpkt) + s->in_hdr; + if (s->in_state == CSR_DATA_LEN) { + s->in_needed += csrhci_data_len(s->inpkt); + /* hci_acl_hdr could specify more than 4096 bytes, so assert. 
*/ + assert(s->in_needed <= sizeof(s->inpkt)); + s->in_state = CSR_DATA; + continue; + } - if (s->in_len >= s->in_data) { + if (s->in_state == CSR_DATA) { csrhci_in_packet(s, s->inpkt); - - memmove(s->inpkt, s->inpkt + s->in_len, s->in_len - s->in_data); - s->in_len -= s->in_data; - s->in_hdr = INT_MAX; - s->in_data = INT_MAX; - plen = 0; - } else - break; + csrhci_ready_for_next_inpkt(s); + } } - return len; + return total; } static void csrhci_out_hci_packet_event(void *opaque, @@ -389,11 +416,9 @@ static void csrhci_reset(struct csrhci_s *s) { s->out_len = 0; s->out_size = FIFO_LEN; - s->in_len = 0; + csrhci_ready_for_next_inpkt(s); s->baud_delay = NANOSECONDS_PER_SECOND; s->enable = 0; - s->in_hdr = INT_MAX; - s->in_data = INT_MAX; s->modem_state = 0; /* After a while... (but sooner than 10ms) */ diff --git a/hw/char/escc.c b/hw/char/escc.c index 7bf09a0077..8e6a7df465 100644 --- a/hw/char/escc.c +++ b/hw/char/escc.c @@ -983,9 +983,10 @@ void slavio_serial_ms_kbd_init(hwaddr base, qemu_irq irq, sysbus_mmio_map(s, 0, base); } -static int escc_init1(SysBusDevice *dev) +static void escc_init1(Object *obj) { - ESCCState *s = ESCC(dev); + ESCCState *s = ESCC(obj); + SysBusDevice *dev = SYS_BUS_DEVICE(obj); unsigned int i; s->chn[0].disabled = s->disabled; @@ -994,17 +995,26 @@ static int escc_init1(SysBusDevice *dev) sysbus_init_irq(dev, &s->chn[i].irq); s->chn[i].chn = 1 - i; s->chn[i].clock = s->frequency / 2; - if (s->chn[i].chr) { - qemu_chr_add_handlers(s->chn[i].chr, serial_can_receive, - serial_receive1, serial_event, &s->chn[i]); - } } s->chn[0].otherchn = &s->chn[1]; s->chn[1].otherchn = &s->chn[0]; - memory_region_init_io(&s->mmio, OBJECT(s), &escc_mem_ops, s, "escc", + memory_region_init_io(&s->mmio, obj, &escc_mem_ops, s, "escc", ESCC_SIZE << s->it_shift); sysbus_init_mmio(dev, &s->mmio); +} + +static void escc_realize(DeviceState *dev, Error **errp) +{ + ESCCState *s = ESCC(dev); + unsigned int i; + + for (i = 0; i < 2; i++) { + if (s->chn[i].chr) { + 
qemu_chr_add_handlers(s->chn[i].chr, serial_can_receive, + serial_receive1, serial_event, &s->chn[i]); + } + } if (s->chn[0].type == mouse) { qemu_add_mouse_event_handler(sunmouse_event, &s->chn[0], 0, @@ -1014,8 +1024,6 @@ static int escc_init1(SysBusDevice *dev) s->chn[1].hs = qemu_input_handler_register((DeviceState *)(&s->chn[1]), &sunkbd_handler); } - - return 0; } static Property escc_properties[] = { @@ -1032,10 +1040,9 @@ static Property escc_properties[] = { static void escc_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass); - k->init = escc_init1; dc->reset = escc_reset; + dc->realize = escc_realize; dc->vmsd = &vmstate_escc; dc->props = escc_properties; set_bit(DEVICE_CATEGORY_INPUT, dc->categories); @@ -1045,6 +1052,7 @@ static const TypeInfo escc_info = { .name = TYPE_ESCC, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(ESCCState), + .instance_init = escc_init1, .class_init = escc_class_init, }; diff --git a/hw/char/etraxfs_ser.c b/hw/char/etraxfs_ser.c index 146b387e7e..04ca04fe2c 100644 --- a/hw/char/etraxfs_ser.c +++ b/hw/char/etraxfs_ser.c @@ -159,6 +159,11 @@ static const MemoryRegionOps ser_ops = { } }; +static Property etraxfs_ser_properties[] = { + DEFINE_PROP_CHR("chardev", ETRAXSerial, chr), + DEFINE_PROP_END_OF_LIST(), +}; + static void serial_receive(void *opaque, const uint8_t *buf, int size) { ETRAXSerial *s = opaque; @@ -209,40 +214,42 @@ static void etraxfs_ser_reset(DeviceState *d) } -static int etraxfs_ser_init(SysBusDevice *dev) +static void etraxfs_ser_init(Object *obj) { - ETRAXSerial *s = ETRAX_SERIAL(dev); + ETRAXSerial *s = ETRAX_SERIAL(obj); + SysBusDevice *dev = SYS_BUS_DEVICE(obj); sysbus_init_irq(dev, &s->irq); - memory_region_init_io(&s->mmio, OBJECT(s), &ser_ops, s, + memory_region_init_io(&s->mmio, obj, &ser_ops, s, "etraxfs-serial", R_MAX * 4); sysbus_init_mmio(dev, &s->mmio); +} + +static void etraxfs_ser_realize(DeviceState 
*dev, Error **errp) +{ + ETRAXSerial *s = ETRAX_SERIAL(dev); - /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */ - s->chr = qemu_char_get_next_serial(); if (s->chr) { qemu_chr_add_handlers(s->chr, serial_can_receive, serial_receive, serial_event, s); } - return 0; } static void etraxfs_ser_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass); - k->init = etraxfs_ser_init; dc->reset = etraxfs_ser_reset; - /* Reason: init() method uses qemu_char_get_next_serial() */ - dc->cannot_instantiate_with_device_add_yet = true; + dc->props = etraxfs_ser_properties; + dc->realize = etraxfs_ser_realize; } static const TypeInfo etraxfs_ser_info = { .name = TYPE_ETRAX_FS_SERIAL, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(ETRAXSerial), + .instance_init = etraxfs_ser_init, .class_init = etraxfs_ser_class_init, }; diff --git a/hw/char/lm32_juart.c b/hw/char/lm32_juart.c index 5bf8acfe8f..28c2cf702d 100644 --- a/hw/char/lm32_juart.c +++ b/hw/char/lm32_juart.c @@ -114,17 +114,13 @@ static void juart_reset(DeviceState *d) s->jrx = 0; } -static int lm32_juart_init(SysBusDevice *dev) +static void lm32_juart_realize(DeviceState *dev, Error **errp) { LM32JuartState *s = LM32_JUART(dev); - /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */ - s->chr = qemu_char_get_next_serial(); if (s->chr) { qemu_chr_add_handlers(s->chr, juart_can_rx, juart_rx, juart_event, s); } - - return 0; } static const VMStateDescription vmstate_lm32_juart = { @@ -138,16 +134,19 @@ static const VMStateDescription vmstate_lm32_juart = { } }; +static Property lm32_juart_properties[] = { + DEFINE_PROP_CHR("chardev", LM32JuartState, chr), + DEFINE_PROP_END_OF_LIST(), +}; + static void lm32_juart_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass); - k->init = lm32_juart_init; dc->reset = 
juart_reset; dc->vmsd = &vmstate_lm32_juart; - /* Reason: init() method uses qemu_char_get_next_serial() */ - dc->cannot_instantiate_with_device_add_yet = true; + dc->props = lm32_juart_properties; + dc->realize = lm32_juart_realize; } static const TypeInfo lm32_juart_info = { diff --git a/hw/char/lm32_uart.c b/hw/char/lm32_uart.c index 036813d0f3..b5c760dda3 100644 --- a/hw/char/lm32_uart.c +++ b/hw/char/lm32_uart.c @@ -249,23 +249,25 @@ static void uart_reset(DeviceState *d) s->regs[R_LSR] = LSR_THRE | LSR_TEMT; } -static int lm32_uart_init(SysBusDevice *dev) +static void lm32_uart_init(Object *obj) { - LM32UartState *s = LM32_UART(dev); + LM32UartState *s = LM32_UART(obj); + SysBusDevice *dev = SYS_BUS_DEVICE(obj); sysbus_init_irq(dev, &s->irq); - memory_region_init_io(&s->iomem, OBJECT(s), &uart_ops, s, + memory_region_init_io(&s->iomem, obj, &uart_ops, s, "uart", R_MAX * 4); sysbus_init_mmio(dev, &s->iomem); +} + +static void lm32_uart_realize(DeviceState *dev, Error **errp) +{ + LM32UartState *s = LM32_UART(dev); - /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */ - s->chr = qemu_char_get_next_serial(); if (s->chr) { qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s); } - - return 0; } static const VMStateDescription vmstate_lm32_uart = { @@ -278,22 +280,26 @@ static const VMStateDescription vmstate_lm32_uart = { } }; +static Property lm32_uart_properties[] = { + DEFINE_PROP_CHR("chardev", LM32UartState, chr), + DEFINE_PROP_END_OF_LIST(), +}; + static void lm32_uart_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass); - k->init = lm32_uart_init; dc->reset = uart_reset; dc->vmsd = &vmstate_lm32_uart; - /* Reason: init() method uses qemu_char_get_next_serial() */ - dc->cannot_instantiate_with_device_add_yet = true; + dc->props = lm32_uart_properties; + dc->realize = lm32_uart_realize; } static const TypeInfo lm32_uart_info = { .name = 
TYPE_LM32_UART, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(LM32UartState), + .instance_init = lm32_uart_init, .class_init = lm32_uart_class_init, }; diff --git a/hw/char/milkymist-uart.c b/hw/char/milkymist-uart.c index 03b36b2236..72f8484668 100644 --- a/hw/char/milkymist-uart.c +++ b/hw/char/milkymist-uart.c @@ -200,8 +200,6 @@ static void milkymist_uart_realize(DeviceState *dev, Error **errp) { MilkymistUartState *s = MILKYMIST_UART(dev); - /* FIXME use a qdev chardev prop instead of qemu_char_get_next_serial() */ - s->chr = qemu_char_get_next_serial(); if (s->chr) { qemu_chr_add_handlers(s->chr, uart_can_rx, uart_rx, uart_event, s); } @@ -229,6 +227,11 @@ static const VMStateDescription vmstate_milkymist_uart = { } }; +static Property milkymist_uart_properties[] = { + DEFINE_PROP_CHR("chardev", MilkymistUartState, chr), + DEFINE_PROP_END_OF_LIST(), +}; + static void milkymist_uart_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -236,8 +239,7 @@ static void milkymist_uart_class_init(ObjectClass *klass, void *data) dc->realize = milkymist_uart_realize; dc->reset = milkymist_uart_reset; dc->vmsd = &vmstate_milkymist_uart; - /* Reason: realize() method uses qemu_char_get_next_serial() */ - dc->cannot_instantiate_with_device_add_yet = true; + dc->props = milkymist_uart_properties; } static const TypeInfo milkymist_uart_info = { diff --git a/hw/cris/axis_dev88.c b/hw/cris/axis_dev88.c index 9f58658741..60df8877c1 100644 --- a/hw/cris/axis_dev88.c +++ b/hw/cris/axis_dev88.c @@ -37,6 +37,7 @@ #include "sysemu/block-backend.h" #include "exec/address-spaces.h" #include "sysemu/qtest.h" +#include "sysemu/sysemu.h" #define D(x) #define DNAND(x) @@ -341,8 +342,7 @@ void axisdev88_init(MachineState *machine) sysbus_create_varargs("etraxfs,timer", 0x3005e000, irq[0x1b], nmi[1], NULL); for (i = 0; i < 4; i++) { - sysbus_create_simple("etraxfs,serial", 0x30026000 + i * 0x2000, - irq[0x14 + i]); + etraxfs_ser_create(0x30026000 + 
i * 0x2000, irq[0x14 + i], serial_hds[i]); } if (kernel_filename) { diff --git a/hw/lm32/lm32.h b/hw/lm32/lm32.h index 18aa6fdc15..e338bfeae5 100644 --- a/hw/lm32/lm32.h +++ b/hw/lm32/lm32.h @@ -16,14 +16,31 @@ static inline DeviceState *lm32_pic_init(qemu_irq cpu_irq) return dev; } -static inline DeviceState *lm32_juart_init(void) +static inline DeviceState *lm32_juart_init(CharDriverState *chr) { DeviceState *dev; dev = qdev_create(NULL, TYPE_LM32_JUART); + qdev_prop_set_chr(dev, "chardev", chr); qdev_init_nofail(dev); return dev; } +static inline DeviceState *lm32_uart_create(hwaddr addr, + qemu_irq irq, + CharDriverState *chr) +{ + DeviceState *dev; + SysBusDevice *s; + + dev = qdev_create(NULL, "lm32-uart"); + s = SYS_BUS_DEVICE(dev); + qdev_prop_set_chr(dev, "chardev", chr); + qdev_init_nofail(dev); + sysbus_mmio_map(s, 0, addr); + sysbus_connect_irq(s, 0, irq); + return dev; +} + #endif diff --git a/hw/lm32/lm32_boards.c b/hw/lm32/lm32_boards.c index c0290560fc..8f0c3079d6 100644 --- a/hw/lm32/lm32_boards.c +++ b/hw/lm32/lm32_boards.c @@ -31,6 +31,7 @@ #include "lm32_hwsetup.h" #include "lm32.h" #include "exec/address-spaces.h" +#include "sysemu/sysemu.h" typedef struct { LM32CPU *cpu; @@ -131,12 +132,12 @@ static void lm32_evr_init(MachineState *machine) irq[i] = qdev_get_gpio_in(env->pic_state, i); } - sysbus_create_simple("lm32-uart", uart0_base, irq[uart0_irq]); + lm32_uart_create(uart0_base, irq[uart0_irq], serial_hds[0]); sysbus_create_simple("lm32-timer", timer0_base, irq[timer0_irq]); sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]); /* make sure juart isn't the first chardev */ - env->juart_state = lm32_juart_init(); + env->juart_state = lm32_juart_init(serial_hds[1]); reset_info->bootstrap_pc = flash_base; @@ -232,13 +233,13 @@ static void lm32_uclinux_init(MachineState *machine) irq[i] = qdev_get_gpio_in(env->pic_state, i); } - sysbus_create_simple("lm32-uart", uart0_base, irq[uart0_irq]); + lm32_uart_create(uart0_base, 
irq[uart0_irq], serial_hds[0]); sysbus_create_simple("lm32-timer", timer0_base, irq[timer0_irq]); sysbus_create_simple("lm32-timer", timer1_base, irq[timer1_irq]); sysbus_create_simple("lm32-timer", timer2_base, irq[timer2_irq]); /* make sure juart isn't the first chardev */ - env->juart_state = lm32_juart_init(); + env->juart_state = lm32_juart_init(serial_hds[1]); reset_info->bootstrap_pc = flash_base; diff --git a/hw/lm32/milkymist-hw.h b/hw/lm32/milkymist-hw.h index f857d2846f..eb6a3a2559 100644 --- a/hw/lm32/milkymist-hw.h +++ b/hw/lm32/milkymist-hw.h @@ -5,11 +5,13 @@ #include "net/net.h" static inline DeviceState *milkymist_uart_create(hwaddr base, - qemu_irq irq) + qemu_irq irq, + CharDriverState *chr) { DeviceState *dev; dev = qdev_create(NULL, "milkymist-uart"); + qdev_prop_set_chr(dev, "chardev", chr); qdev_init_nofail(dev); sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base); sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, irq); diff --git a/hw/lm32/milkymist.c b/hw/lm32/milkymist.c index 1abdf6e2e6..5cae0f19dd 100644 --- a/hw/lm32/milkymist.c +++ b/hw/lm32/milkymist.c @@ -159,7 +159,7 @@ milkymist_init(MachineState *machine) } g_free(bios_filename); - milkymist_uart_create(0x60000000, irq[0]); + milkymist_uart_create(0x60000000, irq[0], serial_hds[0]); milkymist_sysctl_create(0x60001000, irq[1], irq[2], irq[3], 80000000, 0x10014d31, 0x0000041f, 0x00000001); milkymist_hpdmc_create(0x60002000); @@ -175,7 +175,7 @@ milkymist_init(MachineState *machine) 0x20000000, 0x1000, 0x20020000, 0x2000); /* make sure juart isn't the first chardev */ - env->juart_state = lm32_juart_init(); + env->juart_state = lm32_juart_init(serial_hds[1]); if (kernel_filename) { uint64_t entry; diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index e40f23bfc2..90be9f7617 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -33,7 +33,6 @@ #include "sysemu/hostmem.h" #include "sysemu/qtest.h" #include "qapi/visitor.h" -#include "exec/ram_addr.h" #include "hw/misc/ivshmem.h" @@ -533,7 
+532,7 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp) } memory_region_init_ram_ptr(&s->server_bar2, OBJECT(s), "ivshmem.bar2", size, ptr); - qemu_set_ram_fd(memory_region_get_ram_addr(&s->server_bar2), fd); + memory_region_set_fd(&s->server_bar2, fd); s->ivshmem_bar2 = &s->server_bar2; } @@ -940,7 +939,7 @@ static void ivshmem_exit(PCIDevice *dev) strerror(errno)); } - fd = qemu_get_ram_fd(memory_region_get_ram_addr(s->ivshmem_bar2)); + fd = memory_region_get_fd(s->ivshmem_bar2); close(fd); } diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index a63a581550..cc66d36186 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -650,7 +650,9 @@ static int megasas_init_firmware(MegasasState *s, MegasasCmd *cmd) pa_hi = le32_to_cpu(initq->pi_addr_hi); s->producer_pa = ((uint64_t) pa_hi << 32) | pa_lo; s->reply_queue_head = ldl_le_pci_dma(pcid, s->producer_pa); + s->reply_queue_head %= MEGASAS_MAX_FRAMES; s->reply_queue_tail = ldl_le_pci_dma(pcid, s->consumer_pa); + s->reply_queue_tail %= MEGASAS_MAX_FRAMES; flags = le32_to_cpu(initq->flags); if (flags & MFI_QUEUE_FLAG_CONTEXT64) { s->flags |= MEGASAS_MASK_USE_QUEUE64; @@ -1293,7 +1295,7 @@ static int megasas_dcmd_ld_get_info(MegasasState *s, MegasasCmd *cmd) static int megasas_dcmd_cfg_read(MegasasState *s, MegasasCmd *cmd) { - uint8_t data[4096]; + uint8_t data[4096] = { 0 }; struct mfi_config_data *info; int num_pd_disks = 0, array_offset, ld_offset; BusChild *kid; @@ -1446,7 +1448,7 @@ static int megasas_dcmd_set_properties(MegasasState *s, MegasasCmd *cmd) dcmd_size); return MFI_STAT_INVALID_PARAMETER; } - dma_buf_write((uint8_t *)&info, cmd->iov_size, &cmd->qsg); + dma_buf_write((uint8_t *)&info, dcmd_size, &cmd->qsg); trace_megasas_dcmd_unsupported(cmd->index, cmd->iov_size); return MFI_STAT_OK; } diff --git a/hw/scsi/mptsas.c b/hw/scsi/mptsas.c index 499c1465ae..be88e161a9 100644 --- a/hw/scsi/mptsas.c +++ b/hw/scsi/mptsas.c @@ -754,11 +754,6 @@ static void 
mptsas_fetch_request(MPTSASState *s) hwaddr addr; int size; - if (s->state != MPI_IOC_STATE_OPERATIONAL) { - mptsas_set_fault(s, MPI_IOCSTATUS_INVALID_STATE); - return; - } - /* Read the message header from the guest first. */ addr = s->host_mfa_high_addr | MPTSAS_FIFO_GET(s, request_post); pci_dma_read(pci, addr, req, sizeof(hdr)); @@ -789,6 +784,10 @@ static void mptsas_fetch_requests(void *opaque) { MPTSASState *s = opaque; + if (s->state != MPI_IOC_STATE_OPERATIONAL) { + mptsas_set_fault(s, MPI_IOCSTATUS_INVALID_STATE); + return; + } while (!MPTSAS_FIFO_EMPTY(s, request_post)) { mptsas_fetch_request(s); } diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 8865da53e8..ace65e0720 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -53,7 +53,21 @@ do { printf("scsi-disk: " fmt , ## __VA_ARGS__); } while (0) #define DEFAULT_MAX_UNMAP_SIZE (1 << 30) /* 1 GB */ #define DEFAULT_MAX_IO_SIZE INT_MAX /* 2 GB - 1 block */ -typedef struct SCSIDiskState SCSIDiskState; +#define TYPE_SCSI_DISK_BASE "scsi-disk-base" + +#define SCSI_DISK_BASE(obj) \ + OBJECT_CHECK(SCSIDiskState, (obj), TYPE_SCSI_DISK_BASE) +#define SCSI_DISK_BASE_CLASS(klass) \ + OBJECT_CLASS_CHECK(SCSIDiskClass, (klass), TYPE_SCSI_DISK_BASE) +#define SCSI_DISK_BASE_GET_CLASS(obj) \ + OBJECT_GET_CLASS(SCSIDiskClass, (obj), TYPE_SCSI_DISK_BASE) + +typedef struct SCSIDiskClass { + SCSIDeviceClass parent_class; + DMAIOFunc *dma_readv; + DMAIOFunc *dma_writev; + bool (*need_fua_emulation)(SCSICommand *cmd); +} SCSIDiskClass; typedef struct SCSIDiskReq { SCSIRequest req; @@ -62,16 +76,18 @@ typedef struct SCSIDiskReq { uint32_t sector_count; uint32_t buflen; bool started; + bool need_fua_emulation; struct iovec iov; QEMUIOVector qiov; BlockAcctCookie acct; + unsigned char *status; } SCSIDiskReq; #define SCSI_DISK_F_REMOVABLE 0 #define SCSI_DISK_F_DPOFUA 1 #define SCSI_DISK_F_NO_REMOVABLE_DEVOPS 2 -struct SCSIDiskState +typedef struct SCSIDiskState { SCSIDevice qdev; uint32_t features; @@ -88,7 
+104,7 @@ struct SCSIDiskState char *product; bool tray_open; bool tray_locked; -}; +} SCSIDiskState; static int scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed); @@ -161,6 +177,29 @@ static void scsi_disk_load_request(QEMUFile *f, SCSIRequest *req) qemu_iovec_init_external(&r->qiov, &r->iov, 1); } +static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed) +{ + if (r->req.io_canceled) { + scsi_req_cancel_complete(&r->req); + return true; + } + + if (ret < 0) { + return scsi_handle_rw_error(r, -ret, acct_failed); + } + + if (r->status && *r->status) { + if (acct_failed) { + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct); + } + scsi_req_complete(&r->req, *r->status); + return true; + } + + return false; +} + static void scsi_aio_complete(void *opaque, int ret) { SCSIDiskReq *r = (SCSIDiskReq *)opaque; @@ -168,17 +207,10 @@ static void scsi_aio_complete(void *opaque, int ret) assert(r->req.aiocb != NULL); r->req.aiocb = NULL; - if (r->req.io_canceled) { - scsi_req_cancel_complete(&r->req); + if (scsi_disk_req_check_error(r, ret, true)) { goto done; } - if (ret < 0) { - if (scsi_handle_rw_error(r, -ret, true)) { - goto done; - } - } - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); scsi_req_complete(&r->req, GOOD); @@ -217,13 +249,9 @@ static void scsi_write_do_fua(SCSIDiskReq *r) SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); assert(r->req.aiocb == NULL); + assert(!r->req.io_canceled); - if (r->req.io_canceled) { - scsi_req_cancel_complete(&r->req); - goto done; - } - - if (scsi_is_cmd_fua(&r->req.cmd)) { + if (r->need_fua_emulation) { block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0, BLOCK_ACCT_FLUSH); r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_aio_complete, r); @@ -231,26 +259,16 @@ static void scsi_write_do_fua(SCSIDiskReq *r) } scsi_req_complete(&r->req, GOOD); - -done: scsi_req_unref(&r->req); } 
static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret) { assert(r->req.aiocb == NULL); - - if (r->req.io_canceled) { - scsi_req_cancel_complete(&r->req); + if (scsi_disk_req_check_error(r, ret, false)) { goto done; } - if (ret < 0) { - if (scsi_handle_rw_error(r, -ret, false)) { - goto done; - } - } - r->sector += r->sector_count; r->sector_count = 0; if (r->req.cmd.mode == SCSI_XFER_TO_DEV) { @@ -288,17 +306,10 @@ static void scsi_read_complete(void * opaque, int ret) assert(r->req.aiocb != NULL); r->req.aiocb = NULL; - if (r->req.io_canceled) { - scsi_req_cancel_complete(&r->req); + if (scsi_disk_req_check_error(r, ret, true)) { goto done; } - if (ret < 0) { - if (scsi_handle_rw_error(r, -ret, true)) { - goto done; - } - } - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->qiov.size); @@ -315,36 +326,29 @@ done: static void scsi_do_read(SCSIDiskReq *r, int ret) { SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s)); assert (r->req.aiocb == NULL); - - if (r->req.io_canceled) { - scsi_req_cancel_complete(&r->req); + if (scsi_disk_req_check_error(r, ret, false)) { goto done; } - if (ret < 0) { - if (scsi_handle_rw_error(r, -ret, false)) { - goto done; - } - } - /* The request is used as the AIO opaque value, so add a ref. 
*/ scsi_req_ref(&r->req); if (r->req.sg) { dma_acct_start(s->qdev.conf.blk, &r->acct, r->req.sg, BLOCK_ACCT_READ); r->req.resid -= r->req.sg->size; - r->req.aiocb = dma_blk_read(s->qdev.conf.blk, r->req.sg, - r->sector << BDRV_SECTOR_BITS, - scsi_dma_complete, r); + r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk), + r->req.sg, r->sector << BDRV_SECTOR_BITS, + sdc->dma_readv, r, scsi_dma_complete, r, + DMA_DIRECTION_FROM_DEVICE); } else { scsi_init_iovec(r, SCSI_DMA_BUF_SIZE); block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, r->qiov.size, BLOCK_ACCT_READ); - r->req.aiocb = blk_aio_preadv(s->qdev.conf.blk, - r->sector << BDRV_SECTOR_BITS, &r->qiov, - 0, scsi_read_complete, r); + r->req.aiocb = sdc->dma_readv(r->sector << BDRV_SECTOR_BITS, &r->qiov, + scsi_read_complete, r, r); } done: @@ -399,7 +403,7 @@ static void scsi_read_data(SCSIRequest *req) first = !r->started; r->started = true; - if (first && scsi_is_cmd_fua(&r->req.cmd)) { + if (first && r->need_fua_emulation) { block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, 0, BLOCK_ACCT_FLUSH); r->req.aiocb = blk_aio_flush(s->qdev.conf.blk, scsi_do_read_cb, r); @@ -456,18 +460,10 @@ static void scsi_write_complete_noio(SCSIDiskReq *r, int ret) uint32_t n; assert (r->req.aiocb == NULL); - - if (r->req.io_canceled) { - scsi_req_cancel_complete(&r->req); + if (scsi_disk_req_check_error(r, ret, false)) { goto done; } - if (ret < 0) { - if (scsi_handle_rw_error(r, -ret, false)) { - goto done; - } - } - n = r->qiov.size / 512; r->sector += n; r->sector_count -= n; @@ -504,6 +500,7 @@ static void scsi_write_data(SCSIRequest *req) { SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req); SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s)); /* No data transfer may already be in progress */ assert(r->req.aiocb == NULL); @@ -540,15 +537,15 @@ static void scsi_write_data(SCSIRequest *req) if (r->req.sg) { dma_acct_start(s->qdev.conf.blk, 
&r->acct, r->req.sg, BLOCK_ACCT_WRITE); r->req.resid -= r->req.sg->size; - r->req.aiocb = dma_blk_write(s->qdev.conf.blk, r->req.sg, - r->sector << BDRV_SECTOR_BITS, - scsi_dma_complete, r); + r->req.aiocb = dma_blk_io(blk_get_aio_context(s->qdev.conf.blk), + r->req.sg, r->sector << BDRV_SECTOR_BITS, + sdc->dma_writev, r, scsi_dma_complete, r, + DMA_DIRECTION_TO_DEVICE); } else { block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct, r->qiov.size, BLOCK_ACCT_WRITE); - r->req.aiocb = blk_aio_pwritev(s->qdev.conf.blk, - r->sector << BDRV_SECTOR_BITS, &r->qiov, - 0, scsi_write_complete, r); + r->req.aiocb = sdc->dma_writev(r->sector << BDRV_SECTOR_BITS, &r->qiov, + scsi_write_complete, r, r); } } @@ -1600,18 +1597,10 @@ static void scsi_unmap_complete_noio(UnmapCBData *data, int ret) uint32_t nb_sectors; assert(r->req.aiocb == NULL); - - if (r->req.io_canceled) { - scsi_req_cancel_complete(&r->req); + if (scsi_disk_req_check_error(r, ret, false)) { goto done; } - if (ret < 0) { - if (scsi_handle_rw_error(r, -ret, false)) { - goto done; - } - } - if (data->count > 0) { sector_num = ldq_be_p(&data->inbuf[0]); nb_sectors = ldl_be_p(&data->inbuf[8]) & 0xffffffffULL; @@ -1711,17 +1700,10 @@ static void scsi_write_same_complete(void *opaque, int ret) assert(r->req.aiocb != NULL); r->req.aiocb = NULL; - if (r->req.io_canceled) { - scsi_req_cancel_complete(&r->req); + if (scsi_disk_req_check_error(r, ret, true)) { goto done; } - if (ret < 0) { - if (scsi_handle_rw_error(r, -ret, true)) { - goto done; - } - } - block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct); data->nb_sectors -= data->iov.iov_len / 512; @@ -2138,6 +2120,7 @@ static int32_t scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf) { SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req); SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev); + SCSIDiskClass *sdc = (SCSIDiskClass *) object_get_class(OBJECT(s)); uint32_t len; uint8_t command; @@ -2196,6 +2179,7 @@ static int32_t 
scsi_disk_dma_command(SCSIRequest *req, uint8_t *buf) scsi_check_condition(r, SENSE_CODE(LBA_OUT_OF_RANGE)); return 0; } + r->need_fua_emulation = sdc->need_fua_emulation(&r->req.cmd); if (r->sector_count == 0) { scsi_req_complete(&r->req, GOOD); } @@ -2578,16 +2562,145 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp) scsi_generic_read_device_identification(&s->qdev); } +typedef struct SCSIBlockReq { + SCSIDiskReq req; + sg_io_hdr_t io_header; + + /* Selected bytes of the original CDB, copied into our own CDB. */ + uint8_t cmd, cdb1, group_number; + + /* CDB passed to SG_IO. */ + uint8_t cdb[16]; +} SCSIBlockReq; + +static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req, + int64_t offset, QEMUIOVector *iov, + int direction, + BlockCompletionFunc *cb, void *opaque) +{ + sg_io_hdr_t *io_header = &req->io_header; + SCSIDiskReq *r = &req->req; + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + int nb_logical_blocks; + uint64_t lba; + BlockAIOCB *aiocb; + + /* This is not supported yet. It can only happen if the guest does + * reads and writes that are not aligned to one logical sectors + * _and_ cover multiple MemoryRegions. + */ + assert(offset % s->qdev.blocksize == 0); + assert(iov->size % s->qdev.blocksize == 0); + + io_header->interface_id = 'S'; + + /* The data transfer comes from the QEMUIOVector. */ + io_header->dxfer_direction = direction; + io_header->dxfer_len = iov->size; + io_header->dxferp = (void *)iov->iov; + io_header->iovec_count = iov->niov; + assert(io_header->iovec_count == iov->niov); /* no overflow! */ + + /* Build a new CDB with the LBA and length patched in, in case + * DMA helpers split the transfer in multiple segments. Do not + * build a CDB smaller than what the guest wanted, and only build + * a larger one if strictly necessary. 
+ */ + io_header->cmdp = req->cdb; + lba = offset / s->qdev.blocksize; + nb_logical_blocks = io_header->dxfer_len / s->qdev.blocksize; + + if ((req->cmd >> 5) == 0 && lba <= 0x1ffff) { + /* 6-byte CDB */ + stl_be_p(&req->cdb[0], lba | (req->cmd << 24)); + req->cdb[4] = nb_logical_blocks; + req->cdb[5] = 0; + io_header->cmd_len = 6; + } else if ((req->cmd >> 5) <= 1 && lba <= 0xffffffffULL) { + /* 10-byte CDB */ + req->cdb[0] = (req->cmd & 0x1f) | 0x20; + req->cdb[1] = req->cdb1; + stl_be_p(&req->cdb[2], lba); + req->cdb[6] = req->group_number; + stw_be_p(&req->cdb[7], nb_logical_blocks); + req->cdb[9] = 0; + io_header->cmd_len = 10; + } else if ((req->cmd >> 5) != 4 && lba <= 0xffffffffULL) { + /* 12-byte CDB */ + req->cdb[0] = (req->cmd & 0x1f) | 0xA0; + req->cdb[1] = req->cdb1; + stl_be_p(&req->cdb[2], lba); + stl_be_p(&req->cdb[6], nb_logical_blocks); + req->cdb[10] = req->group_number; + req->cdb[11] = 0; + io_header->cmd_len = 12; + } else { + /* 16-byte CDB */ + req->cdb[0] = (req->cmd & 0x1f) | 0x80; + req->cdb[1] = req->cdb1; + stq_be_p(&req->cdb[2], lba); + stl_be_p(&req->cdb[10], nb_logical_blocks); + req->cdb[14] = req->group_number; + req->cdb[15] = 0; + io_header->cmd_len = 16; + } + + /* The rest is as in scsi-generic.c. 
*/ + io_header->mx_sb_len = sizeof(r->req.sense); + io_header->sbp = r->req.sense; + io_header->timeout = UINT_MAX; + io_header->usr_ptr = r; + io_header->flags |= SG_FLAG_DIRECT_IO; + + aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, io_header, cb, opaque); + assert(aiocb != NULL); + return aiocb; +} + +static bool scsi_block_no_fua(SCSICommand *cmd) +{ + return false; +} + +static BlockAIOCB *scsi_block_dma_readv(int64_t offset, + QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque) +{ + SCSIBlockReq *r = opaque; + return scsi_block_do_sgio(r, offset, iov, + SG_DXFER_FROM_DEV, cb, cb_opaque); +} + +static BlockAIOCB *scsi_block_dma_writev(int64_t offset, + QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque) +{ + SCSIBlockReq *r = opaque; + return scsi_block_do_sgio(r, offset, iov, + SG_DXFER_TO_DEV, cb, cb_opaque); +} + static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf) { switch (buf[0]) { + case VERIFY_10: + case VERIFY_12: + case VERIFY_16: + /* Check if BYTCHK == 0x01 (data-out buffer contains data + * for the number of logical blocks specified in the length + * field). For other modes, do not use scatter/gather operation. + */ + if ((buf[1] & 6) == 2) { + return false; + } + break; + case READ_6: case READ_10: case READ_12: case READ_16: - case VERIFY_10: - case VERIFY_12: - case VERIFY_16: case WRITE_6: case WRITE_10: case WRITE_12: @@ -2595,21 +2708,8 @@ static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf) case WRITE_VERIFY_10: case WRITE_VERIFY_12: case WRITE_VERIFY_16: - /* If we are not using O_DIRECT, we might read stale data from the - * host cache if writes were made using other commands than these - * ones (such as WRITE SAME or EXTENDED COPY, etc.). So, without - * O_DIRECT everything must go through SG_IO. 
- */ - if (!(blk_get_flags(s->qdev.conf.blk) & BDRV_O_NOCACHE)) { - break; - } - - /* MMC writing cannot be done via pread/pwrite, because it sometimes + /* MMC writing cannot be done via DMA helpers, because it sometimes * involves writing beyond the maximum LBA or to negative LBA (lead-in). - * And once you do these writes, reading from the block device is - * unreliable, too. It is even possible that reads deliver random data - * from the host page cache (this is probably a Linux bug). - * * We might use scsi_disk_dma_reqops as long as no writing commands are * seen, but performance usually isn't paramount on optical media. So, * just make scsi-block operate the same as scsi-generic for them. @@ -2627,6 +2727,55 @@ static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf) } +static int32_t scsi_block_dma_command(SCSIRequest *req, uint8_t *buf) +{ + SCSIBlockReq *r = (SCSIBlockReq *)req; + r->cmd = req->cmd.buf[0]; + switch (r->cmd >> 5) { + case 0: + /* 6-byte CDB. */ + r->cdb1 = r->group_number = 0; + break; + case 1: + /* 10-byte CDB. */ + r->cdb1 = req->cmd.buf[1]; + r->group_number = req->cmd.buf[6]; + break; + case 5: + /* 12-byte CDB. */ + r->cdb1 = req->cmd.buf[1]; + r->group_number = req->cmd.buf[10]; + break; + case 4: + /* 16-byte CDB. */ + r->cdb1 = req->cmd.buf[1]; + r->group_number = req->cmd.buf[14]; + break; + default: + abort(); + } + + if (r->cdb1 & 0xe0) { + /* Protection information is not supported. 
*/ + scsi_check_condition(&r->req, SENSE_CODE(INVALID_FIELD)); + return 0; + } + + r->req.status = &r->io_header.status; + return scsi_disk_dma_command(req, buf); +} + +static const SCSIReqOps scsi_block_dma_reqops = { + .size = sizeof(SCSIBlockReq), + .free_req = scsi_free_request, + .send_command = scsi_block_dma_command, + .read_data = scsi_read_data, + .write_data = scsi_write_data, + .get_buf = scsi_get_buf, + .load_request = scsi_disk_load_request, + .save_request = scsi_disk_save_request, +}; + static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag, uint32_t lun, uint8_t *buf, void *hba_private) @@ -2637,7 +2785,7 @@ static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag, return scsi_req_alloc(&scsi_generic_req_ops, &s->qdev, tag, lun, hba_private); } else { - return scsi_req_alloc(&scsi_disk_dma_reqops, &s->qdev, tag, lun, + return scsi_req_alloc(&scsi_block_dma_reqops, &s->qdev, tag, lun, hba_private); } } @@ -2656,6 +2804,46 @@ static int scsi_block_parse_cdb(SCSIDevice *d, SCSICommand *cmd, #endif +static +BlockAIOCB *scsi_dma_readv(int64_t offset, QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque) +{ + SCSIDiskReq *r = opaque; + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + return blk_aio_preadv(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque); +} + +static +BlockAIOCB *scsi_dma_writev(int64_t offset, QEMUIOVector *iov, + BlockCompletionFunc *cb, void *cb_opaque, + void *opaque) +{ + SCSIDiskReq *r = opaque; + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); + return blk_aio_pwritev(s->qdev.conf.blk, offset, iov, 0, cb, cb_opaque); +} + +static void scsi_disk_base_class_initfn(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass); + + dc->fw_name = "disk"; + dc->reset = scsi_disk_reset; + sdc->dma_readv = scsi_dma_readv; + sdc->dma_writev = scsi_dma_writev; + sdc->need_fua_emulation = 
scsi_is_cmd_fua; +} + +static const TypeInfo scsi_disk_base_info = { + .name = TYPE_SCSI_DISK_BASE, + .parent = TYPE_SCSI_DEVICE, + .class_init = scsi_disk_base_class_initfn, + .instance_size = sizeof(SCSIDiskState), + .class_size = sizeof(SCSIDiskClass), +}; + #define DEFINE_SCSI_DISK_PROPERTIES() \ DEFINE_BLOCK_PROPERTIES(SCSIDiskState, qdev.conf), \ DEFINE_PROP_STRING("ver", SCSIDiskState, version), \ @@ -2703,17 +2891,14 @@ static void scsi_hd_class_initfn(ObjectClass *klass, void *data) sc->realize = scsi_hd_realize; sc->alloc_req = scsi_new_request; sc->unit_attention_reported = scsi_disk_unit_attention_reported; - dc->fw_name = "disk"; dc->desc = "virtual SCSI disk"; - dc->reset = scsi_disk_reset; dc->props = scsi_hd_properties; dc->vmsd = &vmstate_scsi_disk_state; } static const TypeInfo scsi_hd_info = { .name = "scsi-hd", - .parent = TYPE_SCSI_DEVICE, - .instance_size = sizeof(SCSIDiskState), + .parent = TYPE_SCSI_DISK_BASE, .class_init = scsi_hd_class_initfn, }; @@ -2735,17 +2920,14 @@ static void scsi_cd_class_initfn(ObjectClass *klass, void *data) sc->realize = scsi_cd_realize; sc->alloc_req = scsi_new_request; sc->unit_attention_reported = scsi_disk_unit_attention_reported; - dc->fw_name = "disk"; dc->desc = "virtual SCSI CD-ROM"; - dc->reset = scsi_disk_reset; dc->props = scsi_cd_properties; dc->vmsd = &vmstate_scsi_disk_state; } static const TypeInfo scsi_cd_info = { .name = "scsi-cd", - .parent = TYPE_SCSI_DEVICE, - .instance_size = sizeof(SCSIDiskState), + .parent = TYPE_SCSI_DISK_BASE, .class_init = scsi_cd_class_initfn, }; @@ -2759,21 +2941,22 @@ static void scsi_block_class_initfn(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); SCSIDeviceClass *sc = SCSI_DEVICE_CLASS(klass); + SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass); sc->realize = scsi_block_realize; sc->alloc_req = scsi_block_new_request; sc->parse_cdb = scsi_block_parse_cdb; - dc->fw_name = "disk"; + sdc->dma_readv = scsi_block_dma_readv; + sdc->dma_writev = 
scsi_block_dma_writev; + sdc->need_fua_emulation = scsi_block_no_fua; dc->desc = "SCSI block device passthrough"; - dc->reset = scsi_disk_reset; dc->props = scsi_block_properties; dc->vmsd = &vmstate_scsi_disk_state; } static const TypeInfo scsi_block_info = { .name = "scsi-block", - .parent = TYPE_SCSI_DEVICE, - .instance_size = sizeof(SCSIDiskState), + .parent = TYPE_SCSI_DISK_BASE, .class_init = scsi_block_class_initfn, }; #endif @@ -2811,13 +2994,13 @@ static void scsi_disk_class_initfn(ObjectClass *klass, void *data) static const TypeInfo scsi_disk_info = { .name = "scsi-disk", - .parent = TYPE_SCSI_DEVICE, - .instance_size = sizeof(SCSIDiskState), + .parent = TYPE_SCSI_DISK_BASE, .class_init = scsi_disk_class_initfn, }; static void scsi_disk_register_types(void) { + type_register_static(&scsi_disk_base_info); type_register_static(&scsi_hd_info); type_register_static(&scsi_cd_info); #ifdef __linux__ diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c index 7459465f60..71372a8383 100644 --- a/hw/scsi/scsi-generic.c +++ b/hw/scsi/scsi-generic.c @@ -222,6 +222,18 @@ static void scsi_read_complete(void * opaque, int ret) r->buf[3] |= 0x80; } } + if (s->type == TYPE_DISK && + r->req.cmd.buf[0] == INQUIRY && + r->req.cmd.buf[2] == 0xb0) { + uint32_t max_xfer_len = blk_get_max_transfer_length(s->conf.blk); + if (max_xfer_len) { + stl_be_p(&r->buf[8], max_xfer_len); + /* Also take care of the opt xfer len. 
*/ + if (ldl_be_p(&r->buf[12]) > max_xfer_len) { + stl_be_p(&r->buf[12], max_xfer_len); + } + } + } scsi_req_data(&r->req, len); scsi_req_unref(&r->req); } diff --git a/hw/scsi/vmw_pvscsi.c b/hw/scsi/vmw_pvscsi.c index f67b5bf7d3..2d7528d1dd 100644 --- a/hw/scsi/vmw_pvscsi.c +++ b/hw/scsi/vmw_pvscsi.c @@ -153,7 +153,7 @@ pvscsi_log2(uint32_t input) return log; } -static void +static int pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri) { int i; @@ -161,6 +161,10 @@ pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri) uint32_t req_ring_size, cmp_ring_size; m->rs_pa = ri->ringsStatePPN << VMW_PAGE_SHIFT; + if ((ri->reqRingNumPages > PVSCSI_SETUP_RINGS_MAX_NUM_PAGES) + || (ri->cmpRingNumPages > PVSCSI_SETUP_RINGS_MAX_NUM_PAGES)) { + return -1; + } req_ring_size = ri->reqRingNumPages * PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE; cmp_ring_size = ri->cmpRingNumPages * PVSCSI_MAX_NUM_CMP_ENTRIES_PER_PAGE; txr_len_log2 = pvscsi_log2(req_ring_size - 1); @@ -192,15 +196,20 @@ pvscsi_ring_init_data(PVSCSIRingInfo *m, PVSCSICmdDescSetupRings *ri) /* Flush ring state page changes */ smp_wmb(); + + return 0; } -static void +static int pvscsi_ring_init_msg(PVSCSIRingInfo *m, PVSCSICmdDescSetupMsgRing *ri) { int i; uint32_t len_log2; uint32_t ring_size; + if (ri->numPages > PVSCSI_SETUP_MSG_RING_MAX_NUM_PAGES) { + return -1; + } ring_size = ri->numPages * PVSCSI_MAX_NUM_MSG_ENTRIES_PER_PAGE; len_log2 = pvscsi_log2(ring_size - 1); @@ -220,6 +229,8 @@ pvscsi_ring_init_msg(PVSCSIRingInfo *m, PVSCSICmdDescSetupMsgRing *ri) /* Flush ring state page changes */ smp_wmb(); + + return 0; } static void @@ -770,7 +781,10 @@ pvscsi_on_cmd_setup_rings(PVSCSIState *s) trace_pvscsi_on_cmd_arrived("PVSCSI_CMD_SETUP_RINGS"); pvscsi_dbg_dump_tx_rings_config(rc); - pvscsi_ring_init_data(&s->rings, rc); + if (pvscsi_ring_init_data(&s->rings, rc) < 0) { + return PVSCSI_COMMAND_PROCESSING_FAILED; + } + s->rings_info_valid = TRUE; return PVSCSI_COMMAND_PROCESSING_SUCCEEDED; 
} @@ -850,7 +864,9 @@ pvscsi_on_cmd_setup_msg_ring(PVSCSIState *s) } if (s->rings_info_valid) { - pvscsi_ring_init_msg(&s->rings, rc); + if (pvscsi_ring_init_msg(&s->rings, rc) < 0) { + return PVSCSI_COMMAND_PROCESSING_FAILED; + } s->msg_ring_info_valid = TRUE; } return sizeof(PVSCSICmdDescSetupMsgRing) / sizeof(uint32_t); diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 5914e85107..495e09fd4e 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -17,7 +17,6 @@ #include "sysemu/kvm.h" #include "qemu/error-report.h" #include "qemu/sockets.h" -#include "exec/ram_addr.h" #include "migration/migration.h" #include <sys/ioctl.h> @@ -247,18 +246,18 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev, for (i = 0; i < dev->mem->nregions; ++i) { struct vhost_memory_region *reg = dev->mem->regions + i; - ram_addr_t ram_addr; + ram_addr_t offset; + MemoryRegion *mr; assert((uintptr_t)reg->userspace_addr == reg->userspace_addr); - qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr, - &ram_addr); - fd = qemu_get_ram_fd(ram_addr); + mr = memory_region_from_host((void *)(uintptr_t)reg->userspace_addr, + &offset); + fd = memory_region_get_fd(mr); if (fd > 0) { msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr; msg.payload.memory.regions[fd_num].memory_size = reg->memory_size; msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr; - msg.payload.memory.regions[fd_num].mmap_offset = reg->userspace_addr - - (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr); + msg.payload.memory.regions[fd_num].mmap_offset = offset; assert(fd_num < VHOST_MEMORY_MAX_NREGIONS); fds[fd_num++] = fd; } @@ -616,17 +615,15 @@ static bool vhost_user_can_merge(struct vhost_dev *dev, uint64_t start1, uint64_t size1, uint64_t start2, uint64_t size2) { - ram_addr_t ram_addr; + ram_addr_t offset; int mfd, rfd; MemoryRegion *mr; - mr = qemu_ram_addr_from_host((void *)(uintptr_t)start1, &ram_addr); - assert(mr); - mfd = 
qemu_get_ram_fd(ram_addr); + mr = memory_region_from_host((void *)(uintptr_t)start1, &offset); + mfd = memory_region_get_fd(mr); - mr = qemu_ram_addr_from_host((void *)(uintptr_t)start2, &ram_addr); - assert(mr); - rfd = qemu_get_ram_fd(ram_addr); + mr = memory_region_from_host((void *)(uintptr_t)start2, &offset); + rfd = memory_region_get_fd(mr); return mfd == rfd; } diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index a2c3b92742..aaee995634 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -57,10 +57,10 @@ typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr); void qemu_ram_remap(ram_addr_t addr, ram_addr_t length); /* This should not be used by devices. */ -MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr); +ram_addr_t qemu_ram_addr_from_host(void *ptr); RAMBlock *qemu_ram_block_by_name(const char *name); RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, - ram_addr_t *ram_addr, ram_addr_t *offset); + ram_addr_t *offset); void qemu_ram_set_idstr(RAMBlock *block, const char *name, DeviceState *dev); void qemu_ram_unset_idstr(RAMBlock *block); const char *qemu_ram_get_idstr(RAMBlock *rb); diff --git a/include/exec/memory.h b/include/exec/memory.h index f649697ee9..4ab680052f 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -32,6 +32,8 @@ #include "qom/object.h" #include "qemu/rcu.h" +#define RAM_ADDR_INVALID (~(ram_addr_t)0) + #define MAX_PHYS_ADDR_SPACE_BITS 62 #define MAX_PHYS_ADDR (((hwaddr)1 << MAX_PHYS_ADDR_SPACE_BITS) - 1) @@ -667,6 +669,35 @@ static inline bool memory_region_is_rom(MemoryRegion *mr) int memory_region_get_fd(MemoryRegion *mr); /** + * memory_region_set_fd: Mark a RAM memory region as backed by a + * file descriptor. + * + * This function is typically used after memory_region_init_ram_ptr(). + * + * @mr: the memory region being queried. + * @fd: the file descriptor that backs @mr. 
+ */ +void memory_region_set_fd(MemoryRegion *mr, int fd); + +/** + * memory_region_from_host: Convert a pointer into a RAM memory region + * and an offset within it. + * + * Given a host pointer inside a RAM memory region (created with + * memory_region_init_ram() or memory_region_init_ram_ptr()), return + * the MemoryRegion and the offset within it. + * + * Use with care; by the time this function returns, the returned pointer is + * not protected by RCU anymore. If the caller is not within an RCU critical + * section and does not hold the iothread lock, it must have other means of + * protecting the pointer, such as a reference to the region that includes + * the incoming ram_addr_t. + * + * @mr: the memory region being queried. + */ +MemoryRegion *memory_region_from_host(void *ptr, ram_addr_t *offset); + +/** * memory_region_get_ram_ptr: Get a pointer into a RAM memory region. * * Returns a host pointer to a RAM memory region (created with @@ -1362,7 +1393,7 @@ MemTxResult address_space_read_continue(AddressSpace *as, hwaddr addr, MemoryRegion *mr); MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, uint8_t *buf, int len); -void *qemu_get_ram_ptr(RAMBlock *ram_block, ram_addr_t addr); +void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr); static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write) { @@ -1400,8 +1431,7 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, l = len; mr = address_space_translate(as, addr, &addr1, &l, false); if (len == l && memory_access_is_direct(mr, false)) { - addr1 += memory_region_get_ram_addr(mr); - ptr = qemu_get_ram_ptr(mr->ram_block, addr1); + ptr = qemu_map_ram_ptr(mr->ram_block, addr1); memcpy(buf, ptr, len); } else { result = address_space_read_continue(as, addr, attrs, buf, len, diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 5b6e1b8b86..2a9465da11 100644 --- a/include/exec/ram_addr.h +++ 
b/include/exec/ram_addr.h @@ -105,9 +105,6 @@ RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t max_size, uint64_t length, void *host), MemoryRegion *mr, Error **errp); -int qemu_get_ram_fd(ram_addr_t addr); -void qemu_set_ram_fd(ram_addr_t addr, int fd); -void *qemu_get_ram_block_host_ptr(ram_addr_t addr); void qemu_ram_free(RAMBlock *block); int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp); diff --git a/include/hw/cris/etraxfs.h b/include/hw/cris/etraxfs.h index 73a6134c1e..eb664181e7 100644 --- a/include/hw/cris/etraxfs.h +++ b/include/hw/cris/etraxfs.h @@ -46,4 +46,20 @@ etraxfs_eth_init(NICInfo *nd, hwaddr base, int phyaddr, return dev; } +static inline DeviceState *etraxfs_ser_create(hwaddr addr, + qemu_irq irq, + CharDriverState *chr) +{ + DeviceState *dev; + SysBusDevice *s; + + dev = qdev_create(NULL, "etraxfs,serial"); + s = SYS_BUS_DEVICE(dev); + qdev_prop_set_chr(dev, "chardev", chr); + qdev_init_nofail(dev); + sysbus_mmio_map(s, 0, addr); + sysbus_connect_irq(s, 0, irq); + return dev; +} + #endif diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index 5bc4d6cc47..7a590969b5 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -36,7 +36,18 @@ #define smp_wmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); barrier(); }) #define smp_rmb() ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); barrier(); }) +/* Most compilers currently treat consume and acquire the same, but really + * no processors except Alpha need a barrier here. Leave it in if + * using Thread Sanitizer to avoid warnings, otherwise optimize it away. 
+ */ +#if defined(__SANITIZE_THREAD__) #define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); barrier(); }) +#elif defined(__alpha__) +#define smp_read_barrier_depends() asm volatile("mb":::"memory") +#else +#define smp_read_barrier_depends() barrier() +#endif + /* Weak atomic operations prevent the compiler moving other * loads/stores past the atomic operation load/store. However there is @@ -56,13 +67,23 @@ __atomic_store(ptr, &_val, __ATOMIC_RELAXED); \ } while(0) -/* Atomic RCU operations imply weak memory barriers */ +/* See above: most compilers currently treat consume and acquire the + * same, but this slows down atomic_rcu_read unnecessarily. + */ +#ifdef __SANITIZE_THREAD__ +#define atomic_rcu_read__nocheck(ptr, valptr) \ + __atomic_load(ptr, valptr, __ATOMIC_CONSUME); +#else +#define atomic_rcu_read__nocheck(ptr, valptr) \ + __atomic_load(ptr, valptr, __ATOMIC_RELAXED); \ + smp_read_barrier_depends(); +#endif #define atomic_rcu_read(ptr) \ ({ \ QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \ typeof(*ptr) _val; \ - __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \ + atomic_rcu_read__nocheck(ptr, &_val); \ _val; \ }) @@ -33,8 +33,6 @@ //#define DEBUG_UNASSIGNED -#define RAM_ADDR_INVALID (~(ram_addr_t)0) - static unsigned memory_region_transaction_depth; static bool memory_region_update_pending; static bool ioeventfd_update_pending; @@ -227,6 +225,7 @@ struct FlatRange { hwaddr offset_in_region; AddrRange addr; uint8_t dirty_log_mask; + bool romd_mode; bool readonly; }; @@ -251,6 +250,7 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b) return a->mr == b->mr && addrrange_equal(a->addr, b->addr) && a->offset_in_region == b->offset_in_region + && a->romd_mode == b->romd_mode && a->readonly == b->readonly; } @@ -310,6 +310,7 @@ static bool can_merge(FlatRange *r1, FlatRange *r2) r1->addr.size), int128_make64(r2->offset_in_region)) && r1->dirty_log_mask == r2->dirty_log_mask + && r1->romd_mode == r2->romd_mode && 
r1->readonly == r2->readonly; } @@ -663,6 +664,7 @@ static void render_memory_region(FlatView *view, fr.mr = mr; fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr); + fr.romd_mode = mr->romd_mode; fr.readonly = readonly; /* Render the region itself into any gaps left by the current view. */ @@ -1622,13 +1624,26 @@ void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr, int memory_region_get_fd(MemoryRegion *mr) { - if (mr->alias) { - return memory_region_get_fd(mr->alias); + int fd; + + rcu_read_lock(); + while (mr->alias) { + mr = mr->alias; } + fd = mr->ram_block->fd; + rcu_read_unlock(); - assert(mr->ram_block); + return fd; +} - return qemu_get_ram_fd(memory_region_get_ram_addr(mr)); +void memory_region_set_fd(MemoryRegion *mr, int fd) +{ + rcu_read_lock(); + while (mr->alias) { + mr = mr->alias; + } + mr->ram_block->fd = fd; + rcu_read_unlock(); } void *memory_region_get_ram_ptr(MemoryRegion *mr) @@ -1642,10 +1657,22 @@ void *memory_region_get_ram_ptr(MemoryRegion *mr) mr = mr->alias; } assert(mr->ram_block); - ptr = qemu_get_ram_ptr(mr->ram_block, memory_region_get_ram_addr(mr)); + ptr = qemu_map_ram_ptr(mr->ram_block, offset); rcu_read_unlock(); - return ptr + offset; + return ptr; +} + +MemoryRegion *memory_region_from_host(void *ptr, ram_addr_t *offset) +{ + RAMBlock *block; + + block = qemu_ram_block_from_host(ptr, false, offset); + if (!block) { + return NULL; + } + + return block->mr; } ram_addr_t memory_region_get_ram_addr(MemoryRegion *mr) diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index fbd0064fce..cf7dcd25d4 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -407,7 +407,6 @@ static void *postcopy_ram_fault_thread(void *opaque) while (true) { ram_addr_t rb_offset; - ram_addr_t in_raspace; struct pollfd pfd[2]; /* @@ -459,7 +458,7 @@ static void *postcopy_ram_fault_thread(void *opaque) rb = qemu_ram_block_from_host( (void *)(uintptr_t)msg.arg.pagefault.address, - true, &in_raspace, &rb_offset); + 
true, &rb_offset); if (!rb) { error_report("postcopy_ram_fault_thread: Fault outside guest: %" PRIx64, (uint64_t)msg.arg.pagefault.address); diff --git a/nbd/server.c b/nbd/server.c index fa862cd622..b2cfeb9843 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1153,12 +1153,20 @@ static void nbd_trip(void *opaque) break; case NBD_CMD_TRIM: TRACE("Request type is TRIM"); - ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset) - / BDRV_SECTOR_SIZE, - request.len / BDRV_SECTOR_SIZE); - if (ret < 0) { - LOG("discard failed"); - reply.error = -ret; + /* Ignore unaligned head or tail, until block layer adds byte + * interface */ + if (request.len >= BDRV_SECTOR_SIZE) { + request.len -= (request.from + request.len) % BDRV_SECTOR_SIZE; + ret = blk_co_discard(exp->blk, + DIV_ROUND_UP(request.from + exp->dev_offset, + BDRV_SECTOR_SIZE), + request.len / BDRV_SECTOR_SIZE); + if (ret < 0) { + LOG("discard failed"); + reply.error = -ret; + } + } else { + TRACE("trim request too small, ignoring"); } if (nbd_co_send_reply(req, &reply, 0) < 0) { goto out; diff --git a/scripts/dump-guest-memory.py b/scripts/dump-guest-memory.py index eb24f7874b..9956fc036c 100644 --- a/scripts/dump-guest-memory.py +++ b/scripts/dump-guest-memory.py @@ -328,23 +328,10 @@ def qlist_foreach(head, field_str): yield var -def qemu_get_ram_block(ram_addr): - """Returns the RAMBlock struct to which the given address belongs.""" - - ram_blocks = gdb.parse_and_eval("ram_list.blocks") - - for block in qlist_foreach(ram_blocks, "next"): - if (ram_addr - block["offset"]) < block["used_length"]: - return block - - raise gdb.GdbError("Bad ram offset %x" % ram_addr) - - -def qemu_get_ram_ptr(ram_addr): +def qemu_map_ram_ptr(block, offset): """Returns qemu vaddr for given guest physical address.""" - block = qemu_get_ram_block(ram_addr) - return block["host"] + (ram_addr - block["offset"]) + return block["host"] + offset def memory_region_get_ram_ptr(memory_region): @@ -352,7 +339,7 @@ def 
memory_region_get_ram_ptr(memory_region): return (memory_region_get_ram_ptr(memory_region["alias"].dereference()) + memory_region["alias_offset"]) - return qemu_get_ram_ptr(memory_region["ram_block"]["offset"]) + return qemu_map_ram_ptr(memory_region["ram_block"], 0) def get_guest_phys_blocks(): diff --git a/scripts/kvm/kvm_stat b/scripts/kvm/kvm_stat deleted file mode 100755 index 769d884b6d..0000000000 --- a/scripts/kvm/kvm_stat +++ /dev/null @@ -1,825 +0,0 @@ -#!/usr/bin/python -# -# top-like utility for displaying kvm statistics -# -# Copyright 2006-2008 Qumranet Technologies -# Copyright 2008-2011 Red Hat, Inc. -# -# Authors: -# Avi Kivity <avi@redhat.com> -# -# This work is licensed under the terms of the GNU GPL, version 2. See -# the COPYING file in the top-level directory. - -import curses -import sys -import os -import time -import optparse -import ctypes -import fcntl -import resource -import struct -import re -from collections import defaultdict -from time import sleep - -VMX_EXIT_REASONS = { - 'EXCEPTION_NMI': 0, - 'EXTERNAL_INTERRUPT': 1, - 'TRIPLE_FAULT': 2, - 'PENDING_INTERRUPT': 7, - 'NMI_WINDOW': 8, - 'TASK_SWITCH': 9, - 'CPUID': 10, - 'HLT': 12, - 'INVLPG': 14, - 'RDPMC': 15, - 'RDTSC': 16, - 'VMCALL': 18, - 'VMCLEAR': 19, - 'VMLAUNCH': 20, - 'VMPTRLD': 21, - 'VMPTRST': 22, - 'VMREAD': 23, - 'VMRESUME': 24, - 'VMWRITE': 25, - 'VMOFF': 26, - 'VMON': 27, - 'CR_ACCESS': 28, - 'DR_ACCESS': 29, - 'IO_INSTRUCTION': 30, - 'MSR_READ': 31, - 'MSR_WRITE': 32, - 'INVALID_STATE': 33, - 'MWAIT_INSTRUCTION': 36, - 'MONITOR_INSTRUCTION': 39, - 'PAUSE_INSTRUCTION': 40, - 'MCE_DURING_VMENTRY': 41, - 'TPR_BELOW_THRESHOLD': 43, - 'APIC_ACCESS': 44, - 'EPT_VIOLATION': 48, - 'EPT_MISCONFIG': 49, - 'WBINVD': 54, - 'XSETBV': 55, - 'APIC_WRITE': 56, - 'INVPCID': 58, -} - -SVM_EXIT_REASONS = { - 'READ_CR0': 0x000, - 'READ_CR3': 0x003, - 'READ_CR4': 0x004, - 'READ_CR8': 0x008, - 'WRITE_CR0': 0x010, - 'WRITE_CR3': 0x013, - 'WRITE_CR4': 0x014, - 'WRITE_CR8': 0x018, - 
'READ_DR0': 0x020, - 'READ_DR1': 0x021, - 'READ_DR2': 0x022, - 'READ_DR3': 0x023, - 'READ_DR4': 0x024, - 'READ_DR5': 0x025, - 'READ_DR6': 0x026, - 'READ_DR7': 0x027, - 'WRITE_DR0': 0x030, - 'WRITE_DR1': 0x031, - 'WRITE_DR2': 0x032, - 'WRITE_DR3': 0x033, - 'WRITE_DR4': 0x034, - 'WRITE_DR5': 0x035, - 'WRITE_DR6': 0x036, - 'WRITE_DR7': 0x037, - 'EXCP_BASE': 0x040, - 'INTR': 0x060, - 'NMI': 0x061, - 'SMI': 0x062, - 'INIT': 0x063, - 'VINTR': 0x064, - 'CR0_SEL_WRITE': 0x065, - 'IDTR_READ': 0x066, - 'GDTR_READ': 0x067, - 'LDTR_READ': 0x068, - 'TR_READ': 0x069, - 'IDTR_WRITE': 0x06a, - 'GDTR_WRITE': 0x06b, - 'LDTR_WRITE': 0x06c, - 'TR_WRITE': 0x06d, - 'RDTSC': 0x06e, - 'RDPMC': 0x06f, - 'PUSHF': 0x070, - 'POPF': 0x071, - 'CPUID': 0x072, - 'RSM': 0x073, - 'IRET': 0x074, - 'SWINT': 0x075, - 'INVD': 0x076, - 'PAUSE': 0x077, - 'HLT': 0x078, - 'INVLPG': 0x079, - 'INVLPGA': 0x07a, - 'IOIO': 0x07b, - 'MSR': 0x07c, - 'TASK_SWITCH': 0x07d, - 'FERR_FREEZE': 0x07e, - 'SHUTDOWN': 0x07f, - 'VMRUN': 0x080, - 'VMMCALL': 0x081, - 'VMLOAD': 0x082, - 'VMSAVE': 0x083, - 'STGI': 0x084, - 'CLGI': 0x085, - 'SKINIT': 0x086, - 'RDTSCP': 0x087, - 'ICEBP': 0x088, - 'WBINVD': 0x089, - 'MONITOR': 0x08a, - 'MWAIT': 0x08b, - 'MWAIT_COND': 0x08c, - 'XSETBV': 0x08d, - 'NPF': 0x400, -} - -# EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h) -AARCH64_EXIT_REASONS = { - 'UNKNOWN': 0x00, - 'WFI': 0x01, - 'CP15_32': 0x03, - 'CP15_64': 0x04, - 'CP14_MR': 0x05, - 'CP14_LS': 0x06, - 'FP_ASIMD': 0x07, - 'CP10_ID': 0x08, - 'CP14_64': 0x0C, - 'ILL_ISS': 0x0E, - 'SVC32': 0x11, - 'HVC32': 0x12, - 'SMC32': 0x13, - 'SVC64': 0x15, - 'HVC64': 0x16, - 'SMC64': 0x17, - 'SYS64': 0x18, - 'IABT': 0x20, - 'IABT_HYP': 0x21, - 'PC_ALIGN': 0x22, - 'DABT': 0x24, - 'DABT_HYP': 0x25, - 'SP_ALIGN': 0x26, - 'FP_EXC32': 0x28, - 'FP_EXC64': 0x2C, - 'SERROR': 0x2F, - 'BREAKPT': 0x30, - 'BREAKPT_HYP': 0x31, - 'SOFTSTP': 0x32, - 'SOFTSTP_HYP': 0x33, - 'WATCHPT': 0x34, - 'WATCHPT_HYP': 0x35, - 'BKPT32': 0x38, - 'VECTOR32': 0x3A, - 
'BRK64': 0x3C, -} - -# From include/uapi/linux/kvm.h, KVM_EXIT_xxx -USERSPACE_EXIT_REASONS = { - 'UNKNOWN': 0, - 'EXCEPTION': 1, - 'IO': 2, - 'HYPERCALL': 3, - 'DEBUG': 4, - 'HLT': 5, - 'MMIO': 6, - 'IRQ_WINDOW_OPEN': 7, - 'SHUTDOWN': 8, - 'FAIL_ENTRY': 9, - 'INTR': 10, - 'SET_TPR': 11, - 'TPR_ACCESS': 12, - 'S390_SIEIC': 13, - 'S390_RESET': 14, - 'DCR': 15, - 'NMI': 16, - 'INTERNAL_ERROR': 17, - 'OSI': 18, - 'PAPR_HCALL': 19, - 'S390_UCONTROL': 20, - 'WATCHDOG': 21, - 'S390_TSCH': 22, - 'EPR': 23, - 'SYSTEM_EVENT': 24, -} - -IOCTL_NUMBERS = { - 'SET_FILTER': 0x40082406, - 'ENABLE': 0x00002400, - 'DISABLE': 0x00002401, - 'RESET': 0x00002403, -} - -class Arch(object): - """Class that encapsulates global architecture specific data like - syscall and ioctl numbers. - - """ - @staticmethod - def get_arch(): - machine = os.uname()[4] - - if machine.startswith('ppc'): - return ArchPPC() - elif machine.startswith('aarch64'): - return ArchA64() - elif machine.startswith('s390'): - return ArchS390() - else: - # X86_64 - for line in open('/proc/cpuinfo'): - if not line.startswith('flags'): - continue - - flags = line.split() - if 'vmx' in flags: - return ArchX86(VMX_EXIT_REASONS) - if 'svm' in flags: - return ArchX86(SVM_EXIT_REASONS) - return - -class ArchX86(Arch): - def __init__(self, exit_reasons): - self.sc_perf_evt_open = 298 - self.ioctl_numbers = IOCTL_NUMBERS - self.exit_reasons = exit_reasons - -class ArchPPC(Arch): - def __init__(self): - self.sc_perf_evt_open = 319 - self.ioctl_numbers = IOCTL_NUMBERS - self.ioctl_numbers['ENABLE'] = 0x20002400 - self.ioctl_numbers['DISABLE'] = 0x20002401 - - # PPC comes in 32 and 64 bit and some generated ioctl - # numbers depend on the wordsize. 
- char_ptr_size = ctypes.sizeof(ctypes.c_char_p) - self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 - -class ArchA64(Arch): - def __init__(self): - self.sc_perf_evt_open = 241 - self.ioctl_numbers = IOCTL_NUMBERS - self.exit_reasons = AARCH64_EXIT_REASONS - -class ArchS390(Arch): - def __init__(self): - self.sc_perf_evt_open = 331 - self.ioctl_numbers = IOCTL_NUMBERS - self.exit_reasons = None - -ARCH = Arch.get_arch() - - -def walkdir(path): - """Returns os.walk() data for specified directory. - - As it is only a wrapper it returns the same 3-tuple of (dirpath, - dirnames, filenames). - """ - return next(os.walk(path)) - - -def parse_int_list(list_string): - """Returns an int list from a string of comma separated integers and - integer ranges.""" - integers = [] - members = list_string.split(',') - - for member in members: - if '-' not in member: - integers.append(int(member)) - else: - int_range = member.split('-') - integers.extend(range(int(int_range[0]), - int(int_range[1]) + 1)) - - return integers - - -def get_online_cpus(): - with open('/sys/devices/system/cpu/online') as cpu_list: - cpu_string = cpu_list.readline() - return parse_int_list(cpu_string) - - -def get_filters(): - filters = {} - filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS) - if ARCH.exit_reasons: - filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons) - return filters - -libc = ctypes.CDLL('libc.so.6', use_errno=True) -syscall = libc.syscall - -class perf_event_attr(ctypes.Structure): - _fields_ = [('type', ctypes.c_uint32), - ('size', ctypes.c_uint32), - ('config', ctypes.c_uint64), - ('sample_freq', ctypes.c_uint64), - ('sample_type', ctypes.c_uint64), - ('read_format', ctypes.c_uint64), - ('flags', ctypes.c_uint64), - ('wakeup_events', ctypes.c_uint32), - ('bp_type', ctypes.c_uint32), - ('bp_addr', ctypes.c_uint64), - ('bp_len', ctypes.c_uint64), - ] - - def __init__(self): - super(self.__class__, self).__init__() - self.type = PERF_TYPE_TRACEPOINT 
- self.size = ctypes.sizeof(self) - self.read_format = PERF_FORMAT_GROUP - -def perf_event_open(attr, pid, cpu, group_fd, flags): - return syscall(ARCH.sc_perf_evt_open, ctypes.pointer(attr), - ctypes.c_int(pid), ctypes.c_int(cpu), - ctypes.c_int(group_fd), ctypes.c_long(flags)) - -PERF_TYPE_TRACEPOINT = 2 -PERF_FORMAT_GROUP = 1 << 3 - -PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing' -PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm' - -class Group(object): - def __init__(self): - self.events = [] - - def add_event(self, event): - self.events.append(event) - - def read(self): - length = 8 * (1 + len(self.events)) - read_format = 'xxxxxxxx' + 'Q' * len(self.events) - return dict(zip([event.name for event in self.events], - struct.unpack(read_format, - os.read(self.events[0].fd, length)))) - -class Event(object): - def __init__(self, name, group, trace_cpu, trace_point, trace_filter, - trace_set='kvm'): - self.name = name - self.fd = None - self.setup_event(group, trace_cpu, trace_point, trace_filter, - trace_set) - - def setup_event_attribute(self, trace_set, trace_point): - id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set, - trace_point, 'id') - - event_attr = perf_event_attr() - event_attr.config = int(open(id_path).read()) - return event_attr - - def setup_event(self, group, trace_cpu, trace_point, trace_filter, - trace_set): - event_attr = self.setup_event_attribute(trace_set, trace_point) - - group_leader = -1 - if group.events: - group_leader = group.events[0].fd - - fd = perf_event_open(event_attr, -1, trace_cpu, - group_leader, 0) - if fd == -1: - err = ctypes.get_errno() - raise OSError(err, os.strerror(err), - 'while calling sys_perf_event_open().') - - if trace_filter: - fcntl.ioctl(fd, ARCH.ioctl_numbers['SET_FILTER'], - trace_filter) - - self.fd = fd - - def enable(self): - fcntl.ioctl(self.fd, ARCH.ioctl_numbers['ENABLE'], 0) - - def disable(self): - fcntl.ioctl(self.fd, ARCH.ioctl_numbers['DISABLE'], 0) - - def reset(self): - 
fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0) - -class TracepointProvider(object): - def __init__(self): - self.group_leaders = [] - self.filters = get_filters() - self._fields = self.get_available_fields() - self.setup_traces() - self.fields = self._fields - - def get_available_fields(self): - path = os.path.join(PATH_DEBUGFS_TRACING, 'events', 'kvm') - fields = walkdir(path)[1] - extra = [] - for field in fields: - if field in self.filters: - filter_name_, filter_dicts = self.filters[field] - for name in filter_dicts: - extra.append(field + '(' + name + ')') - fields += extra - return fields - - def setup_traces(self): - cpus = get_online_cpus() - - # The constant is needed as a buffer for python libs, std - # streams and other files that the script opens. - newlim = len(cpus) * len(self._fields) + 50 - try: - softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE) - - if hardlim < newlim: - # Now we need CAP_SYS_RESOURCE, to increase the hard limit. - resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, newlim)) - else: - # Raising the soft limit is sufficient. 
- resource.setrlimit(resource.RLIMIT_NOFILE, (newlim, hardlim)) - - except ValueError: - sys.exit("NOFILE rlimit could not be raised to {0}".format(newlim)) - - for cpu in cpus: - group = Group() - for name in self._fields: - tracepoint = name - tracefilter = None - match = re.match(r'(.*)\((.*)\)', name) - if match: - tracepoint, sub = match.groups() - tracefilter = ('%s==%d\0' % - (self.filters[tracepoint][0], - self.filters[tracepoint][1][sub])) - - group.add_event(Event(name=name, - group=group, - trace_cpu=cpu, - trace_point=tracepoint, - trace_filter=tracefilter)) - self.group_leaders.append(group) - - def available_fields(self): - return self.get_available_fields() - - @property - def fields(self): - return self._fields - - @fields.setter - def fields(self, fields): - self._fields = fields - for group in self.group_leaders: - for index, event in enumerate(group.events): - if event.name in fields: - event.reset() - event.enable() - else: - # Do not disable the group leader. - # It would disable all of its events. 
- if index != 0: - event.disable() - - def read(self): - ret = defaultdict(int) - for group in self.group_leaders: - for name, val in group.read().iteritems(): - if name in self._fields: - ret[name] += val - return ret - -class DebugfsProvider(object): - def __init__(self): - self._fields = self.get_available_fields() - - def get_available_fields(self): - return walkdir(PATH_DEBUGFS_KVM)[2] - - @property - def fields(self): - return self._fields - - @fields.setter - def fields(self, fields): - self._fields = fields - - def read(self): - def val(key): - return int(file(PATH_DEBUGFS_KVM + '/' + key).read()) - return dict([(key, val(key)) for key in self._fields]) - -class Stats(object): - def __init__(self, providers, fields=None): - self.providers = providers - self._fields_filter = fields - self.values = {} - self.update_provider_filters() - - def update_provider_filters(self): - def wanted(key): - if not self._fields_filter: - return True - return re.match(self._fields_filter, key) is not None - - # As we reset the counters when updating the fields we can - # also clear the cache of old values. 
- self.values = {} - for provider in self.providers: - provider_fields = [key for key in provider.get_available_fields() - if wanted(key)] - provider.fields = provider_fields - - @property - def fields_filter(self): - return self._fields_filter - - @fields_filter.setter - def fields_filter(self, fields_filter): - self._fields_filter = fields_filter - self.update_provider_filters() - - def get(self): - for provider in self.providers: - new = provider.read() - for key in provider.fields: - oldval = self.values.get(key, (0, 0)) - newval = new.get(key, 0) - newdelta = None - if oldval is not None: - newdelta = newval - oldval[0] - self.values[key] = (newval, newdelta) - return self.values - -LABEL_WIDTH = 40 -NUMBER_WIDTH = 10 - -class Tui(object): - def __init__(self, stats): - self.stats = stats - self.screen = None - self.drilldown = False - self.update_drilldown() - - def __enter__(self): - """Initialises curses for later use. Based on curses.wrapper - implementation from the Python standard library.""" - self.screen = curses.initscr() - curses.noecho() - curses.cbreak() - - # The try/catch works around a minor bit of - # over-conscientiousness in the curses module, the error - # return from C start_color() is ignorable. - try: - curses.start_color() - except: - pass - - curses.use_default_colors() - return self - - def __exit__(self, *exception): - """Resets the terminal to its normal state. 
Based on curses.wrappre - implementation from the Python standard library.""" - if self.screen: - self.screen.keypad(0) - curses.echo() - curses.nocbreak() - curses.endwin() - - def update_drilldown(self): - if not self.stats.fields_filter: - self.stats.fields_filter = r'^[^\(]*$' - - elif self.stats.fields_filter == r'^[^\(]*$': - self.stats.fields_filter = None - - def refresh(self, sleeptime): - self.screen.erase() - self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) - self.screen.addstr(2, 1, 'Event') - self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - - len('Total'), 'Total') - self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - - len('Current'), 'Current') - row = 3 - stats = self.stats.get() - def sortkey(x): - if stats[x][1]: - return (-stats[x][1], -stats[x][0]) - else: - return (0, -stats[x][0]) - for key in sorted(stats.keys(), key=sortkey): - - if row >= self.screen.getmaxyx()[0]: - break - values = stats[key] - if not values[0] and not values[1]: - break - col = 1 - self.screen.addstr(row, col, key) - col += LABEL_WIDTH - self.screen.addstr(row, col, '%10d' % (values[0],)) - col += NUMBER_WIDTH - if values[1] is not None: - self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) - row += 1 - self.screen.refresh() - - def show_filter_selection(self): - while True: - self.screen.erase() - self.screen.addstr(0, 0, - "Show statistics for events matching a regex.", - curses.A_BOLD) - self.screen.addstr(2, 0, - "Current regex: {0}" - .format(self.stats.fields_filter)) - self.screen.addstr(3, 0, "New regex: ") - curses.echo() - regex = self.screen.getstr() - curses.noecho() - if len(regex) == 0: - return - try: - re.compile(regex) - self.stats.fields_filter = regex - return - except re.error: - continue - - def show_stats(self): - sleeptime = 0.25 - while True: - self.refresh(sleeptime) - curses.halfdelay(int(sleeptime * 10)) - sleeptime = 3 - try: - char = self.screen.getkey() - if char == 'x': - self.drilldown = not 
self.drilldown - self.update_drilldown() - if char == 'q': - break - if char == 'f': - self.show_filter_selection() - except KeyboardInterrupt: - break - except curses.error: - continue - -def batch(stats): - s = stats.get() - time.sleep(1) - s = stats.get() - for key in sorted(s.keys()): - values = s[key] - print '%-42s%10d%10d' % (key, values[0], values[1]) - -def log(stats): - keys = sorted(stats.get().iterkeys()) - def banner(): - for k in keys: - print '%s' % k, - print - def statline(): - s = stats.get() - for k in keys: - print ' %9d' % s[k][1], - print - line = 0 - banner_repeat = 20 - while True: - time.sleep(1) - if line % banner_repeat == 0: - banner() - statline() - line += 1 - -def get_options(): - description_text = """ -This script displays various statistics about VMs running under KVM. -The statistics are gathered from the KVM debugfs entries and / or the -currently available perf traces. - -The monitoring takes additional cpu cycles and might affect the VM's -performance. - -Requirements: -- Access to: - /sys/kernel/debug/kvm - /sys/kernel/debug/trace/events/* - /proc/pid/task -- /proc/sys/kernel/perf_event_paranoid < 1 if user has no - CAP_SYS_ADMIN and perf events are used. -- CAP_SYS_RESOURCE if the hard limit is not high enough to allow - the large number of files that are possibly opened. 
-""" - - class PlainHelpFormatter(optparse.IndentedHelpFormatter): - def format_description(self, description): - if description: - return description + "\n" - else: - return "" - - optparser = optparse.OptionParser(description=description_text, - formatter=PlainHelpFormatter()) - optparser.add_option('-1', '--once', '--batch', - action='store_true', - default=False, - dest='once', - help='run in batch mode for one second', - ) - optparser.add_option('-l', '--log', - action='store_true', - default=False, - dest='log', - help='run in logging mode (like vmstat)', - ) - optparser.add_option('-t', '--tracepoints', - action='store_true', - default=False, - dest='tracepoints', - help='retrieve statistics from tracepoints', - ) - optparser.add_option('-d', '--debugfs', - action='store_true', - default=False, - dest='debugfs', - help='retrieve statistics from debugfs', - ) - optparser.add_option('-f', '--fields', - action='store', - default=None, - dest='fields', - help='fields to display (regex)', - ) - (options, _) = optparser.parse_args(sys.argv) - return options - -def get_providers(options): - providers = [] - - if options.tracepoints: - providers.append(TracepointProvider()) - if options.debugfs: - providers.append(DebugfsProvider()) - if len(providers) == 0: - providers.append(TracepointProvider()) - - return providers - -def check_access(options): - if not os.path.exists('/sys/kernel/debug'): - sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.') - sys.exit(1) - - if not os.path.exists(PATH_DEBUGFS_KVM): - sys.stderr.write("Please make sure, that debugfs is mounted and " - "readable by the current user:\n" - "('mount -t debugfs debugfs /sys/kernel/debug')\n" - "Also ensure, that the kvm modules are loaded.\n") - sys.exit(1) - - if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints - or not options.debugfs): - sys.stderr.write("Please enable CONFIG_TRACING in your kernel " - "when using the option -t (default).\n" - "If it is enabled, 
make {0} readable by the " - "current user.\n" - .format(PATH_DEBUGFS_TRACING)) - if options.tracepoints: - sys.exit(1) - - sys.stderr.write("Falling back to debugfs statistics!\n") - options.debugfs = True - sleep(5) - - return options - -def main(): - options = get_options() - options = check_access(options) - providers = get_providers(options) - stats = Stats(providers, fields=options.fields) - - if options.log: - log(stats) - elif not options.once: - with Tui(stats) as tui: - tui.show_stats() - else: - batch(stats) - -if __name__ == "__main__": - main() diff --git a/scripts/kvm/kvm_stat.texi b/scripts/kvm/kvm_stat.texi deleted file mode 100644 index 6ce00d80e7..0000000000 --- a/scripts/kvm/kvm_stat.texi +++ /dev/null @@ -1,55 +0,0 @@ -@example -@c man begin SYNOPSIS -usage: kvm_stat [OPTION]... -@c man end -@end example - -@c man begin DESCRIPTION - -kvm_stat prints counts of KVM kernel module trace events. These events signify -state transitions such as guest mode entry and exit. - -This tool is useful for observing guest behavior from the host perspective. -Often conclusions about performance or buggy behavior can be drawn from the -output. - -The set of KVM kernel module trace events may be specific to the kernel version -or architecture. It is best to check the KVM kernel module source code for the -meaning of events. - -Note that trace events are counted globally across all running guests. - -@c man end - -@c man begin OPTIONS -@table @option -@item -1, --once, --batch - run in batch mode for one second -@item -l, --log - run in logging mode (like vmstat) -@item -t, --tracepoints - retrieve statistics from tracepoints -@item -d, --debugfs - retrieve statistics from debugfs -@item -f, --fields=@var{fields} - fields to display (regex) -@item -h, --help - show help message -@end table - -@c man end - -@ignore - -@setfilename kvm_stat -@settitle Report KVM kernel module event counters. 
- -@c man begin AUTHOR -Stefan Hajnoczi <stefanha@redhat.com> -@c man end - -@c man begin SEEALSO -perf(1), trace-cmd(1) -@c man end - -@end ignore diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 7b3667a089..abf50e6632 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -411,7 +411,8 @@ int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) if ((env->mcg_cap & MCG_SER_P) && addr && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) { - if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL || + ram_addr = qemu_ram_addr_from_host(addr); + if (ram_addr == RAM_ADDR_INVALID || !kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { fprintf(stderr, "Hardware memory error for memory used by " "QEMU itself instead of guest system!\n"); @@ -445,7 +446,8 @@ int kvm_arch_on_sigbus(int code, void *addr) hwaddr paddr; /* Hope we are lucky for AO MCE */ - if (qemu_ram_addr_from_host(addr, &ram_addr) == NULL || + ram_addr = qemu_ram_addr_from_host(addr); + if (ram_addr == RAM_ADDR_INVALID || !kvm_physical_memory_addr_from_host(first_cpu->kvm_state, addr, &paddr)) { fprintf(stderr, "Hardware memory error for memory used by " @@ -511,8 +511,13 @@ static void xen_io_add(MemoryListener *listener, MemoryRegionSection *section) { XenIOState *state = container_of(listener, XenIOState, io_listener); + MemoryRegion *mr = section->mr; - memory_region_ref(section->mr); + if (mr->ops == &unassigned_io_ops) { + return; + } + + memory_region_ref(mr); xen_map_io_section(xen_xc, xen_domid, state->ioservid, section); } @@ -521,10 +526,15 @@ static void xen_io_del(MemoryListener *listener, MemoryRegionSection *section) { XenIOState *state = container_of(listener, XenIOState, io_listener); + MemoryRegion *mr = section->mr; + + if (mr->ops == &unassigned_io_ops) { + return; + } xen_unmap_io_section(xen_xc, xen_domid, state->ioservid, section); - memory_region_unref(section->mr); + memory_region_unref(mr); } static void xen_device_realize(DeviceListener *listener, |