58 files changed, 3837 insertions, 574 deletions
@@ -777,7 +777,6 @@ bepo cz ifdef INSTALL_BLOBS BLOBS=bios.bin bios-256k.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \ vgabios-stdvga.bin vgabios-vmware.bin vgabios-qxl.bin vgabios-virtio.bin \ -acpi-dsdt.aml \ ppc_rom.bin openbios-sparc32 openbios-sparc64 openbios-ppc QEMU,tcx.bin QEMU,cgthree.bin \ pxe-e1000.rom pxe-eepro100.rom pxe-ne2k_pci.rom \ pxe-pcnet.rom pxe-rtl8139.rom pxe-virtio.rom \ @@ -1048,6 +1047,9 @@ endif include $(SRC_PATH)/tests/docker/Makefile.include include $(SRC_PATH)/tests/vm/Makefile.include +printgen: + @echo $(GENERATED_FILES) + .PHONY: help help: @echo 'Generic targets:' @@ -1 +1 @@ -2.11.50 +2.11.90 diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 2e358b5bce..beeed0c43f 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -26,9 +26,20 @@ #include <sys/socket.h> #include <sys/eventfd.h> #include <sys/mman.h> +#include "qemu/compiler.h" + +#if defined(__linux__) +#include <sys/syscall.h> +#include <fcntl.h> +#include <sys/ioctl.h> #include <linux/vhost.h> -#include "qemu/compiler.h" +#ifdef __NR_userfaultfd +#include <linux/userfaultfd.h> +#endif + +#endif + #include "qemu/atomic.h" #include "libvhost-user.h" @@ -86,6 +97,9 @@ vu_request_to_string(unsigned int req) REQ(VHOST_USER_SET_VRING_ENDIAN), REQ(VHOST_USER_GET_CONFIG), REQ(VHOST_USER_SET_CONFIG), + REQ(VHOST_USER_POSTCOPY_ADVISE), + REQ(VHOST_USER_POSTCOPY_LISTEN), + REQ(VHOST_USER_POSTCOPY_END), REQ(VHOST_USER_MAX), }; #undef REQ @@ -171,6 +185,35 @@ vmsg_close_fds(VhostUserMsg *vmsg) } } +/* A test to see if we have userfault available */ +static bool +have_userfault(void) +{ +#if defined(__linux__) && defined(__NR_userfaultfd) &&\ + defined(UFFD_FEATURE_MISSING_SHMEM) &&\ + defined(UFFD_FEATURE_MISSING_HUGETLBFS) + /* Now test the kernel we're running on really has the features */ + int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + struct uffdio_api api_struct; + if (ufd < 0) { + return false; + } + + api_struct.api = UFFD_API; + api_struct.features = UFFD_FEATURE_MISSING_SHMEM | + UFFD_FEATURE_MISSING_HUGETLBFS; + if (ioctl(ufd, UFFDIO_API, &api_struct)) { + close(ufd); + return false; + } + close(ufd); + return true; + +#else + return false; +#endif +} + static bool vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) { @@ -245,6 +288,31 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) { int rc; uint8_t *p = (uint8_t *)vmsg; + char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + }; + struct cmsghdr *cmsg; + + memset(control, 0, sizeof(control)); + assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS); + if (vmsg->fd_num > 0) { + size_t fdsize = vmsg->fd_num * sizeof(int); + msg.msg_controllen = CMSG_SPACE(fdsize); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); + } else { + msg.msg_controllen = 0; + } /* Set the version in the flags when sending the reply */ vmsg->flags &= ~VHOST_USER_VERSION_MASK; @@ -252,7 +320,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) vmsg->flags |= VHOST_USER_REPLY_MASK; do { - rc = write(conn_fd, p, VHOST_USER_HDR_SIZE); + rc = sendmsg(conn_fd, &msg, 0); } while (rc < 0 && (errno == EINTR || errno == 
EAGAIN)); do { @@ -345,6 +413,7 @@ vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg) } vmsg->size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); @@ -410,6 +479,148 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) } static bool +vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg) +{ + int i; + VhostUserMemory *memory = &vmsg->payload.memory; + dev->nregions = memory->nregions; + + DPRINT("Nregions: %d\n", memory->nregions); + for (i = 0; i < dev->nregions; i++) { + void *mmap_addr; + VhostUserMemoryRegion *msg_region = &memory->regions[i]; + VuDevRegion *dev_region = &dev->regions[i]; + + DPRINT("Region %d\n", i); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. + * In postcopy we're using PROT_NONE here to catch anyone + * accessing it before we userfault + */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_NONE, MAP_SHARED, + vmsg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vu_panic(dev, "region mmap error: %s", strerror(errno)); + } else { + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", + dev_region->mmap_addr); + } + + /* Return the address to QEMU so that it can translate the ufd + * fault addresses back. + */ + msg_region->userspace_addr = (uintptr_t)(mmap_addr + + dev_region->mmap_offset); + close(vmsg->fds[i]); + } + + /* Send the message back to qemu with the addresses filled in */ + vmsg->fd_num = 0; + if (!vu_message_write(dev, dev->sock, vmsg)) { + vu_panic(dev, "failed to respond to set-mem-table for postcopy"); + return false; + } + + /* Wait for QEMU to confirm that it's registered the handler for the + * faults. + */ + if (!vu_message_read(dev, dev->sock, vmsg) || + vmsg->size != sizeof(vmsg->payload.u64) || + vmsg->payload.u64 != 0) { + vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table"); + return false; + } + + /* OK, now we can go and register the memory and generate faults */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *dev_region = &dev->regions[i]; + int ret; +#ifdef UFFDIO_REGISTER + /* We should already have an open ufd. Mark each memory + * range as ufd. + * Discard any mapping we have here; note I can't use MADV_REMOVE + * or fallocate to make the hole since I don't want to lose + * data that's already arrived in the shared process. + * TODO: How to do hugepage + */ + ret = madvise((void *)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + MADV_DONTNEED); + if (ret) { + fprintf(stderr, + "%s: Failed to madvise(DONTNEED) region %d: %s\n", + __func__, i, strerror(errno)); + } + /* Turn off transparent hugepages so we dont get lose wakeups + * in neighbouring pages. + * TODO: Turn this backon later. 
+ */ + ret = madvise((void *)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + MADV_NOHUGEPAGE); + if (ret) { + /* Note: This can happen legally on kernels that are configured + * without madvise'able hugepages + */ + fprintf(stderr, + "%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n", + __func__, i, strerror(errno)); + } + struct uffdio_register reg_struct; + reg_struct.range.start = (uintptr_t)dev_region->mmap_addr; + reg_struct.range.len = dev_region->size + dev_region->mmap_offset; + reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; + + if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, ®_struct)) { + vu_panic(dev, "%s: Failed to userfault region %d " + "@%p + size:%zx offset: %zx: (ufd=%d)%s\n", + __func__, i, + dev_region->mmap_addr, + dev_region->size, dev_region->mmap_offset, + dev->postcopy_ufd, strerror(errno)); + return false; + } + if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) { + vu_panic(dev, "%s Region (%d) doesn't support COPY", + __func__, i); + return false; + } + DPRINT("%s: region %d: Registered userfault for %llx + %llx\n", + __func__, i, reg_struct.range.start, reg_struct.range.len); + /* Now it's registered we can let the client at it */ + if (mprotect((void *)dev_region->mmap_addr, + dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE)) { + vu_panic(dev, "failed to mprotect region %d for postcopy (%s)", + i, strerror(errno)); + return false; + } + /* TODO: Stash 'zero' support flags somewhere */ +#endif + } + + return false; +} + +static bool vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) { int i; @@ -425,6 +636,10 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) } dev->nregions = memory->nregions; + if (dev->postcopy_listening) { + return vu_set_mem_table_exec_postcopy(dev, vmsg); + } + DPRINT("Nregions: %d\n", memory->nregions); for (i = 0; i < dev->nregions; i++) { void *mmap_addr; @@ -500,6 +715,7 @@ vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg) dev->log_size = log_mmap_size; vmsg->size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; return true; } @@ -752,12 +968,17 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD | 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ; + if (have_userfault()) { + features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT; + } + if (dev->iface->get_protocol_features) { features |= dev->iface->get_protocol_features(dev); } vmsg->payload.u64 = features; vmsg->size = sizeof(vmsg->payload.u64); + vmsg->fd_num = 0; return true; } @@ -857,6 +1078,77 @@ vu_set_config(VuDev *dev, VhostUserMsg *vmsg) } static bool +vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg) +{ + dev->postcopy_ufd = -1; +#ifdef UFFDIO_API + struct uffdio_api api_struct; + + dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + vmsg->size = 0; +#endif + + if (dev->postcopy_ufd == -1) { + vu_panic(dev, "Userfaultfd not available: %s", strerror(errno)); + goto out; + } + +#ifdef UFFDIO_API + api_struct.api = UFFD_API; + api_struct.features = 0; + if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) { + vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno)); + close(dev->postcopy_ufd); + dev->postcopy_ufd = -1; + goto out; + } + /* TODO: Stash feature flags somewhere */ +#endif + +out: + /* Return a ufd to the QEMU */ + vmsg->fd_num = 1; + vmsg->fds[0] = dev->postcopy_ufd; + return true; /* = send a reply */ +} + +static bool +vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg) +{ + vmsg->payload.u64 = -1; + vmsg->size = 
sizeof(vmsg->payload.u64); + + if (dev->nregions) { + vu_panic(dev, "Regions already registered at postcopy-listen"); + return true; + } + dev->postcopy_listening = true; + + vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK; + vmsg->payload.u64 = 0; /* Success */ + return true; +} + +static bool +vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("%s: Entry\n", __func__); + dev->postcopy_listening = false; + if (dev->postcopy_ufd > 0) { + close(dev->postcopy_ufd); + dev->postcopy_ufd = -1; + DPRINT("%s: Done close\n", __func__); + } + + vmsg->fd_num = 0; + vmsg->payload.u64 = 0; + vmsg->size = sizeof(vmsg->payload.u64); + vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK; + DPRINT("%s: exit\n", __func__); + return true; +} + +static bool vu_process_message(VuDev *dev, VhostUserMsg *vmsg) { int do_reply = 0; @@ -927,6 +1219,12 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg) return vu_set_config(dev, vmsg); case VHOST_USER_NONE: break; + case VHOST_USER_POSTCOPY_ADVISE: + return vu_set_postcopy_advise(dev, vmsg); + case VHOST_USER_POSTCOPY_LISTEN: + return vu_set_postcopy_listen(dev, vmsg); + case VHOST_USER_POSTCOPY_END: + return vu_set_postcopy_end(dev, vmsg); default: vmsg_close_fds(vmsg); vu_panic(dev, "Unhandled request: %d", vmsg->request); diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h index 18f95f65d7..79f7a53ee8 100644 --- a/contrib/libvhost-user/libvhost-user.h +++ b/contrib/libvhost-user/libvhost-user.h @@ -48,6 +48,8 @@ enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_NET_MTU = 4, VHOST_USER_PROTOCOL_F_SLAVE_REQ = 5, VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, + VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, VHOST_USER_PROTOCOL_F_MAX }; @@ -81,6 +83,11 @@ typedef enum VhostUserRequest { VHOST_USER_SET_VRING_ENDIAN = 23, VHOST_USER_GET_CONFIG = 24, VHOST_USER_SET_CONFIG = 25, + VHOST_USER_CREATE_CRYPTO_SESSION = 26, + VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, VHOST_USER_MAX } VhostUserRequest; @@ -277,6 +284,10 @@ struct VuDev { * re-initialize */ vu_panic_cb panic; const VuDevIface *iface; + + /* Postcopy data */ + int postcopy_ufd; + bool postcopy_listening; }; typedef struct VuVirtqElement { diff --git a/docs/devel/migration.rst b/docs/devel/migration.rst index 9d1b7657f0..e32b087f6e 100644 --- a/docs/devel/migration.rst +++ b/docs/devel/migration.rst @@ -577,3 +577,44 @@ Postcopy now works with hugetlbfs backed memory: hugepages works well, however 1GB hugepages are likely to be problematic since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, and until the full page is transferred the destination thread is blocked. + +Postcopy with shared memory +--------------------------- + +Postcopy migration with shared memory needs explicit support from the other +processes that share memory and from QEMU. There are restrictions on the type of +memory that userfault can support shared. + +The Linux kernel userfault support works on `/dev/shm` memory and on `hugetlbfs` +(although the kernel doesn't provide an equivalent to `madvise(MADV_DONTNEED)` +for hugetlbfs which may be a problem in some configurations). + +The vhost-user code in QEMU supports clients that have Postcopy support, +and the `vhost-user-bridge` (in `tests/`) and the DPDK package have changes +to support postcopy. 
+
+The client needs to open a userfaultfd and register the areas of memory
+that it maps with userfault. The client must then pass the userfaultfd
+back to QEMU together with a mapping table that allows fault addresses in
+the client's address space to be converted back to RAMBlock/offsets. The
+client's userfaultfd is added to the postcopy fault-thread, and page
+requests are made on behalf of the client by QEMU. QEMU performs 'wake'
+operations on the client's userfaultfd to allow it to continue after a
+page has arrived.
+
+.. note::
+   There are two future improvements that would be nice:
+    a) Some way to make QEMU ignorant of the addresses in the client's
+       address space
+    b) Avoiding the need for QEMU to perform ufd-wake calls after the
+       pages have arrived
+
+Retro-fitting postcopy to existing clients is possible:
+  a) A mechanism is needed for the registration with userfault as above,
+     and the registration needs to be coordinated with the phases of
+     postcopy. In vhost-user extra messages are added to the existing
+     control channel.
+  b) Any thread that can block due to guest memory accesses must be
+     identified and the implication understood; for example, if the
+     guest memory access is made while holding a lock then all other
+     threads waiting for that lock will also be blocked.
diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
index cb3a7595aa..c058c407df 100644
--- a/docs/interop/vhost-user.txt
+++ b/docs/interop/vhost-user.txt
@@ -290,6 +290,15 @@
 Once the source has finished migration, rings will be stopped by the
 source. No further update must be done before rings are restarted.
 
+In postcopy migration the slave is started before all the memory has been
+received from the source host, and care must be taken to avoid accessing
+pages that have yet to be received. The slave opens a 'userfault'-fd and
+registers the memory with it; this fd is then passed back over to the
+master. The master services requests on the userfaultfd for pages that
+are accessed, and when a page is available it performs WAKE ioctls on the
+userfaultfd to wake the stalled slave. The client indicates support for
+this via the VHOST_USER_PROTOCOL_F_PAGEFAULT feature.
+
 Memory access
 -------------
 
@@ -369,6 +378,7 @@ Protocol features
 #define VHOST_USER_PROTOCOL_F_SLAVE_REQ      5
 #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN   6
 #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
+#define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
 
 Master message types
 --------------------
 
@@ -445,12 +455,21 @@ Master message types
       Id: 5
       Equivalent ioctl: VHOST_SET_MEM_TABLE
       Master payload: memory regions description
+      Slave payload: (postcopy only) memory regions description
 
       Sets the memory map regions on the slave so it can translate the
       vring addresses. In the ancillary data there is an array of file
       descriptors for each memory mapped region. The size and ordering of
       the fds matches the number and ordering of memory regions.
 
+      When VHOST_USER_POSTCOPY_LISTEN has been received, SET_MEM_TABLE
+      replies with the bases of the memory mapped regions to the master.
+      The slave must have mmap'd the regions but not yet accessed them and
+      should not yet generate a userfault event. Note that NEED_REPLY_MASK
+      is not set in this case. QEMU will then acknowledge the list of
+      mappings with an empty VHOST_USER_SET_MEM_TABLE reply; only upon
+      reception of this message may the guest start accessing the memory
+      and generating faults.
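As an aside for implementers, the postcopy SET_MEM_TABLE exchange above,
seen from the slave side, distills to roughly the sketch below. This is a
minimal illustration modelled on the libvhost-user changes in this patch,
not normative spec text: reply_base_to_master() and wait_for_master_ack()
are hypothetical stand-ins for the reply/ack messages, and 'ufd' is assumed
to have been opened earlier, at VHOST_USER_POSTCOPY_ADVISE time, via
syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK).

#define _GNU_SOURCE
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/userfaultfd.h>

/* Hypothetical stand-ins for the SET_MEM_TABLE reply/ack exchange */
extern void reply_base_to_master(void *base);
extern void wait_for_master_ack(void);

/* Map one region, hand its base back to the master, then arm userfault */
static int arm_postcopy_region(int ufd, int region_fd, size_t len)
{
    /* PROT_NONE catches any access made before userfault is armed */
    void *base = mmap(NULL, len, PROT_NONE, MAP_SHARED, region_fd, 0);
    if (base == MAP_FAILED) {
        return -1;
    }

    reply_base_to_master(base); /* slave payload of the SET_MEM_TABLE reply */
    wait_for_master_ack();      /* the empty SET_MEM_TABLE acknowledgment */

    /* Drop anything already instantiated so later accesses fault, and
     * avoid transparent huge pages so wakeups stay page-granular.
     */
    madvise(base, len, MADV_DONTNEED);
    madvise(base, len, MADV_NOHUGEPAGE);

    struct uffdio_register reg = {
        .range = { .start = (uintptr_t)base, .len = len },
        .mode = UFFDIO_REGISTER_MODE_MISSING,
    };
    if (ioctl(ufd, UFFDIO_REGISTER, &reg) < 0) {
        return -1;
    }

    /* Only now is it safe to let the rest of the slave touch the region */
    return mprotect(base, len, PROT_READ | PROT_WRITE);
}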
+ * VHOST_USER_SET_LOG_BASE Id: 6 @@ -689,6 +708,39 @@ Master message types feature has been successfully negotiated. It's a required feature for crypto devices. + * VHOST_USER_POSTCOPY_ADVISE + Id: 28 + Master payload: N/A + Slave payload: userfault fd + + When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, the + master advises slave that a migration with postcopy enabled is underway, + the slave must open a userfaultfd for later use. + Note that at this stage the migration is still in precopy mode. + + * VHOST_USER_POSTCOPY_LISTEN + Id: 29 + Master payload: N/A + + Master advises slave that a transition to postcopy mode has happened. + The slave must ensure that shared memory is registered with userfaultfd + to cause faulting of non-present pages. + + This is always sent sometime after a VHOST_USER_POSTCOPY_ADVISE, and + thus only when VHOST_USER_PROTOCOL_F_PAGEFAULT is supported. + + * VHOST_USER_POSTCOPY_END + Id: 30 + Slave payload: u64 + + Master advises that postcopy migration has now completed. The + slave must disable the userfaultfd. The response is an acknowledgement + only. + When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, this message + is sent at the end of the migration, after VHOST_USER_POSTCOPY_LISTEN + was previously sent. + The value returned is an error indication; 0 is success. + Slave message types ------------------- @@ -99,6 +99,11 @@ static MemoryRegion io_mem_unassigned; */ #define RAM_RESIZEABLE (1 << 2) +/* UFFDIO_ZEROPAGE is available on this RAMBlock to atomically + * zero the page and wake waiting processes. + * (Set during postcopy) + */ +#define RAM_UF_ZEROPAGE (1 << 3) #endif #ifdef TARGET_PAGE_BITS_VARY @@ -1790,6 +1795,17 @@ bool qemu_ram_is_shared(RAMBlock *rb) return rb->flags & RAM_SHARED; } +/* Note: Only set at the start of postcopy */ +bool qemu_ram_is_uf_zeroable(RAMBlock *rb) +{ + return rb->flags & RAM_UF_ZEROPAGE; +} + +void qemu_ram_set_uf_zeroable(RAMBlock *rb) +{ + rb->flags |= RAM_UF_ZEROPAGE; +} + /* Called with iothread lock held. */ void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev) { @@ -2320,6 +2336,16 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr, return ramblock_ptr(block, addr); } +/* Return the offset of a hostpointer within a ramblock */ +ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host) +{ + ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host; + assert((uintptr_t)host >= (uintptr_t)rb->host); + assert(res < rb->max_length); + + return res; +} + /* * Translates a host ptr back to a RAMBlock, a ram_addr and an offset * in that RAMBlock. @@ -3744,6 +3770,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) } if ((start + length) <= rb->used_length) { + bool need_madvise, need_fallocate; uint8_t *host_endaddr = host_startaddr + length; if ((uintptr_t)host_endaddr & (rb->page_size - 1)) { error_report("ram_block_discard_range: Unaligned end address: %p", @@ -3753,29 +3780,60 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) errno = ENOTSUP; /* If we are missing MADVISE etc */ - if (rb->page_size == qemu_host_page_size) { -#if defined(CONFIG_MADVISE) - /* Note: We need the madvise MADV_DONTNEED behaviour of definitely - * freeing the page. - */ - ret = madvise(host_startaddr, length, MADV_DONTNEED); -#endif - } else { - /* Huge page case - unfortunately it can't do DONTNEED, but - * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the - * huge page file. 
+ /* The logic here is messy; + * madvise DONTNEED fails for hugepages + * fallocate works on hugepages and shmem + */ + need_madvise = (rb->page_size == qemu_host_page_size); + need_fallocate = rb->fd != -1; + if (need_fallocate) { + /* For a file, this causes the area of the file to be zero'd + * if read, and for hugetlbfs also causes it to be unmapped + * so a userfault will trigger. */ #ifdef CONFIG_FALLOCATE_PUNCH_HOLE ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, start, length); + if (ret) { + ret = -errno; + error_report("ram_block_discard_range: Failed to fallocate " + "%s:%" PRIx64 " +%zx (%d)", + rb->idstr, start, length, ret); + goto err; + } +#else + ret = -ENOSYS; + error_report("ram_block_discard_range: fallocate not available/file" + "%s:%" PRIx64 " +%zx (%d)", + rb->idstr, start, length, ret); + goto err; #endif } - if (ret) { - ret = -errno; - error_report("ram_block_discard_range: Failed to discard range " + if (need_madvise) { + /* For normal RAM this causes it to be unmapped, + * for shared memory it causes the local mapping to disappear + * and to fall back on the file contents (which we just + * fallocate'd away). + */ +#if defined(CONFIG_MADVISE) + ret = madvise(host_startaddr, length, MADV_DONTNEED); + if (ret) { + ret = -errno; + error_report("ram_block_discard_range: Failed to discard range " + "%s:%" PRIx64 " +%zx (%d)", + rb->idstr, start, length, ret); + goto err; + } +#else + ret = -ENOSYS; + error_report("ram_block_discard_range: MADVISE not available" "%s:%" PRIx64 " +%zx (%d)", rb->idstr, start, length, ret); + goto err; +#endif } + trace_ram_block_discard_range(rb->idstr, host_startaddr, length, + need_madvise, need_fallocate, ret); } else { error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64 "/%zx/" RAM_ADDR_FMT")", @@ -1321,7 +1321,7 @@ void hmp_savevm(Monitor *mon, const QDict *qdict) void hmp_delvm(Monitor *mon, const QDict *qdict) { BlockDriverState *bs; - Error *err; + Error *err = NULL; const char *name = qdict_get_str(qdict, "name"); if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) { @@ -2423,7 +2423,18 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) switch (value->type) { case MEMORY_DEVICE_INFO_KIND_DIMM: di = value->u.dimm.data; + break; + + case MEMORY_DEVICE_INFO_KIND_NVDIMM: + di = value->u.nvdimm.data; + break; + default: + di = NULL; + break; + } + + if (di) { monitor_printf(mon, "Memory device [%s]: \"%s\"\n", MemoryDeviceInfoKind_str(value->type), di->id ? di->id : ""); @@ -2436,9 +2447,6 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict) di->hotplugged ? "true" : "false"); monitor_printf(mon, " hotpluggable: %s\n", di->hotpluggable ? 
"true" : "false"); - break; - default: - break; } } } diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index 36a6cc450e..3fa557cea1 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -258,6 +258,22 @@ static void build_append_int(GArray *table, uint64_t value) } } +/* Generic Address Structure (GAS) + * ACPI 2.0/3.0: 5.2.3.1 Generic Address Structure + * 2.0 compat note: + * @access_width must be 0, see ACPI 2.0:Table 5-1 + */ +void build_append_gas(GArray *table, AmlAddressSpace as, + uint8_t bit_width, uint8_t bit_offset, + uint8_t access_width, uint64_t address) +{ + build_append_int_noprefix(table, as, 1); + build_append_int_noprefix(table, bit_width, 1); + build_append_int_noprefix(table, bit_offset, 1); + build_append_int_noprefix(table, access_width, 1); + build_append_int_noprefix(table, address, 8); +} + /* * Build NAME(XXXX, 0x00000000) where 0x00000000 is encoded as a dword, * and return the offset to 0x00000000 for runtime patching. @@ -1662,3 +1678,127 @@ void build_slit(GArray *table_data, BIOSLinker *linker) "SLIT", table_data->len - slit_start, 1, NULL, NULL); } + +/* build rev1/rev3/rev5.1 FADT */ +void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, + const char *oem_id, const char *oem_table_id) +{ + int off; + int fadt_start = tbl->len; + + acpi_data_push(tbl, sizeof(AcpiTableHeader)); + + /* FACS address to be filled by Guest linker at runtime */ + off = tbl->len; + build_append_int_noprefix(tbl, 0, 4); /* FIRMWARE_CTRL */ + if (f->facs_tbl_offset) { /* don't patch if not supported by platform */ + bios_linker_loader_add_pointer(linker, + ACPI_BUILD_TABLE_FILE, off, 4, + ACPI_BUILD_TABLE_FILE, *f->facs_tbl_offset); + } + + /* DSDT address to be filled by Guest linker at runtime */ + off = tbl->len; + build_append_int_noprefix(tbl, 0, 4); /* DSDT */ + if (f->dsdt_tbl_offset) { /* don't patch if not supported by platform */ + bios_linker_loader_add_pointer(linker, + ACPI_BUILD_TABLE_FILE, off, 4, + ACPI_BUILD_TABLE_FILE, *f->dsdt_tbl_offset); + } + + /* ACPI1.0: INT_MODEL, ACPI2.0+: Reserved */ + build_append_int_noprefix(tbl, f->int_model /* Multiple APIC */, 1); + /* Preferred_PM_Profile */ + build_append_int_noprefix(tbl, 0 /* Unspecified */, 1); + build_append_int_noprefix(tbl, f->sci_int, 2); /* SCI_INT */ + build_append_int_noprefix(tbl, f->smi_cmd, 4); /* SMI_CMD */ + build_append_int_noprefix(tbl, f->acpi_enable_cmd, 1); /* ACPI_ENABLE */ + build_append_int_noprefix(tbl, f->acpi_disable_cmd, 1); /* ACPI_DISABLE */ + build_append_int_noprefix(tbl, 0 /* not supported */, 1); /* S4BIOS_REQ */ + /* ACPI1.0: Reserved, ACPI2.0+: PSTATE_CNT */ + build_append_int_noprefix(tbl, 0, 1); + build_append_int_noprefix(tbl, f->pm1a_evt.address, 4); /* PM1a_EVT_BLK */ + build_append_int_noprefix(tbl, 0, 4); /* PM1b_EVT_BLK */ + build_append_int_noprefix(tbl, f->pm1a_cnt.address, 4); /* PM1a_CNT_BLK */ + build_append_int_noprefix(tbl, 0, 4); /* PM1b_CNT_BLK */ + build_append_int_noprefix(tbl, 0, 4); /* PM2_CNT_BLK */ + build_append_int_noprefix(tbl, f->pm_tmr.address, 4); /* PM_TMR_BLK */ + build_append_int_noprefix(tbl, f->gpe0_blk.address, 4); /* GPE0_BLK */ + build_append_int_noprefix(tbl, 0, 4); /* GPE1_BLK */ + /* PM1_EVT_LEN */ + build_append_int_noprefix(tbl, f->pm1a_evt.bit_width / 8, 1); + /* PM1_CNT_LEN */ + build_append_int_noprefix(tbl, f->pm1a_cnt.bit_width / 8, 1); + build_append_int_noprefix(tbl, 0, 1); /* PM2_CNT_LEN */ + build_append_int_noprefix(tbl, f->pm_tmr.bit_width / 8, 1); /* PM_TMR_LEN */ + /* GPE0_BLK_LEN */ + 
build_append_int_noprefix(tbl, f->gpe0_blk.bit_width / 8, 1); + build_append_int_noprefix(tbl, 0, 1); /* GPE1_BLK_LEN */ + build_append_int_noprefix(tbl, 0, 1); /* GPE1_BASE */ + build_append_int_noprefix(tbl, 0, 1); /* CST_CNT */ + build_append_int_noprefix(tbl, f->plvl2_lat, 2); /* P_LVL2_LAT */ + build_append_int_noprefix(tbl, f->plvl3_lat, 2); /* P_LVL3_LAT */ + build_append_int_noprefix(tbl, 0, 2); /* FLUSH_SIZE */ + build_append_int_noprefix(tbl, 0, 2); /* FLUSH_STRIDE */ + build_append_int_noprefix(tbl, 0, 1); /* DUTY_OFFSET */ + build_append_int_noprefix(tbl, 0, 1); /* DUTY_WIDTH */ + build_append_int_noprefix(tbl, 0, 1); /* DAY_ALRM */ + build_append_int_noprefix(tbl, 0, 1); /* MON_ALRM */ + build_append_int_noprefix(tbl, f->rtc_century, 1); /* CENTURY */ + build_append_int_noprefix(tbl, 0, 2); /* IAPC_BOOT_ARCH */ + build_append_int_noprefix(tbl, 0, 1); /* Reserved */ + build_append_int_noprefix(tbl, f->flags, 4); /* Flags */ + + if (f->rev == 1) { + goto build_hdr; + } + + build_append_gas_from_struct(tbl, &f->reset_reg); /* RESET_REG */ + build_append_int_noprefix(tbl, f->reset_val, 1); /* RESET_VALUE */ + /* Since ACPI 5.1 */ + if ((f->rev >= 6) || ((f->rev == 5) && f->minor_ver > 0)) { + build_append_int_noprefix(tbl, f->arm_boot_arch, 2); /* ARM_BOOT_ARCH */ + /* FADT Minor Version */ + build_append_int_noprefix(tbl, f->minor_ver, 1); + } else { + build_append_int_noprefix(tbl, 0, 3); /* Reserved upto ACPI 5.0 */ + } + build_append_int_noprefix(tbl, 0, 8); /* X_FIRMWARE_CTRL */ + + /* XDSDT address to be filled by Guest linker at runtime */ + off = tbl->len; + build_append_int_noprefix(tbl, 0, 8); /* X_DSDT */ + if (f->xdsdt_tbl_offset) { + bios_linker_loader_add_pointer(linker, + ACPI_BUILD_TABLE_FILE, off, 8, + ACPI_BUILD_TABLE_FILE, *f->xdsdt_tbl_offset); + } + + build_append_gas_from_struct(tbl, &f->pm1a_evt); /* X_PM1a_EVT_BLK */ + /* X_PM1b_EVT_BLK */ + build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); + build_append_gas_from_struct(tbl, &f->pm1a_cnt); /* X_PM1a_CNT_BLK */ + /* X_PM1b_CNT_BLK */ + build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); + /* X_PM2_CNT_BLK */ + build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); + build_append_gas_from_struct(tbl, &f->pm_tmr); /* X_PM_TMR_BLK */ + build_append_gas_from_struct(tbl, &f->gpe0_blk); /* X_GPE0_BLK */ + build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); /* X_GPE1_BLK */ + + if (f->rev <= 4) { + goto build_hdr; + } + + /* SLEEP_CONTROL_REG */ + build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); + /* SLEEP_STATUS_REG */ + build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); + + /* TODO: extra fields need to be added to support revisions above rev5 */ + assert(f->rev == 5); + +build_hdr: + build_header(linker, tbl, (void *)(tbl->data + fadt_start), + "FACP", tbl->len - fadt_start, f->rev, oem_id, oem_table_id); +} diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index f7fa795278..c7c6a57ec5 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -651,42 +651,33 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } /* FADT */ -static void build_fadt(GArray *table_data, BIOSLinker *linker, - VirtMachineState *vms, unsigned dsdt_tbl_offset) +static void build_fadt_rev5(GArray *table_data, BIOSLinker *linker, + VirtMachineState *vms, unsigned dsdt_tbl_offset) { - int fadt_start = table_data->len; - AcpiFadtDescriptorRev5_1 *fadt = acpi_data_push(table_data, sizeof(*fadt)); - unsigned xdsdt_entry_offset = (char *)&fadt->x_dsdt 
- table_data->data; - uint16_t bootflags; + /* ACPI v5.1 */ + AcpiFadtData fadt = { + .rev = 5, + .minor_ver = 1, + .flags = 1 << ACPI_FADT_F_HW_REDUCED_ACPI, + .xdsdt_tbl_offset = &dsdt_tbl_offset, + }; switch (vms->psci_conduit) { case QEMU_PSCI_CONDUIT_DISABLED: - bootflags = 0; + fadt.arm_boot_arch = 0; break; case QEMU_PSCI_CONDUIT_HVC: - bootflags = ACPI_FADT_ARM_PSCI_COMPLIANT | ACPI_FADT_ARM_PSCI_USE_HVC; + fadt.arm_boot_arch = ACPI_FADT_ARM_PSCI_COMPLIANT | + ACPI_FADT_ARM_PSCI_USE_HVC; break; case QEMU_PSCI_CONDUIT_SMC: - bootflags = ACPI_FADT_ARM_PSCI_COMPLIANT; + fadt.arm_boot_arch = ACPI_FADT_ARM_PSCI_COMPLIANT; break; default: g_assert_not_reached(); } - /* Hardware Reduced = 1 and use PSCI 0.2+ */ - fadt->flags = cpu_to_le32(1 << ACPI_FADT_F_HW_REDUCED_ACPI); - fadt->arm_boot_flags = cpu_to_le16(bootflags); - - /* ACPI v5.1 (fadt->revision.fadt->minor_revision) */ - fadt->minor_revision = 0x1; - - /* DSDT address to be filled by Guest linker */ - bios_linker_loader_add_pointer(linker, - ACPI_BUILD_TABLE_FILE, xdsdt_entry_offset, sizeof(fadt->x_dsdt), - ACPI_BUILD_TABLE_FILE, dsdt_tbl_offset); - - build_header(linker, table_data, (void *)(table_data->data + fadt_start), - "FACP", table_data->len - fadt_start, 5, NULL, NULL); + build_fadt(table_data, linker, &fadt, NULL, NULL); } /* DSDT */ @@ -761,7 +752,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) /* FADT MADT GTDT MCFG SPCR pointed to by RSDT */ acpi_add_table(table_offsets, tables_blob); - build_fadt(tables_blob, tables->linker, vms, dsdt); + build_fadt_rev5(tables_blob, tables->linker, vms, dsdt); acpi_add_table(table_offsets, tables_blob); build_madt(tables_blob, tables->linker, vms); diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index a66fb2dcd2..3cf2a1679c 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -91,17 +91,11 @@ typedef struct AcpiMcfgInfo { } AcpiMcfgInfo; typedef struct AcpiPmInfo { - bool force_rev1_fadt; bool s3_disabled; bool s4_disabled; bool pcihp_bridge_en; uint8_t s4_val; - uint16_t sci_int; - uint8_t acpi_enable_cmd; - uint8_t acpi_disable_cmd; - uint32_t gpe0_blk; - uint32_t gpe0_blk_len; - uint32_t io_base; + AcpiFadtData fadt; uint16_t cpu_hp_io_base; uint16_t pcihp_io_base; uint16_t pcihp_io_len; @@ -124,21 +118,59 @@ typedef struct AcpiBuildPciBusHotplugState { bool pcihp_bridge_en; } AcpiBuildPciBusHotplugState; +static void init_common_fadt_data(Object *o, AcpiFadtData *data) +{ + uint32_t io = object_property_get_uint(o, ACPI_PM_PROP_PM_IO_BASE, NULL); + AmlAddressSpace as = AML_AS_SYSTEM_IO; + AcpiFadtData fadt = { + .rev = 3, + .flags = + (1 << ACPI_FADT_F_WBINVD) | + (1 << ACPI_FADT_F_PROC_C1) | + (1 << ACPI_FADT_F_SLP_BUTTON) | + (1 << ACPI_FADT_F_RTC_S4) | + (1 << ACPI_FADT_F_USE_PLATFORM_CLOCK) | + /* APIC destination mode ("Flat Logical") has an upper limit of 8 + * CPUs for more than 8 CPUs, "Clustered Logical" mode has to be + * used + */ + ((max_cpus > 8) ? 
(1 << ACPI_FADT_F_FORCE_APIC_CLUSTER_MODEL) : 0), + .int_model = 1 /* Multiple APIC */, + .rtc_century = RTC_CENTURY, + .plvl2_lat = 0xfff /* C2 state not supported */, + .plvl3_lat = 0xfff /* C3 state not supported */, + .smi_cmd = ACPI_PORT_SMI_CMD, + .sci_int = object_property_get_uint(o, ACPI_PM_PROP_SCI_INT, NULL), + .acpi_enable_cmd = + object_property_get_uint(o, ACPI_PM_PROP_ACPI_ENABLE_CMD, NULL), + .acpi_disable_cmd = + object_property_get_uint(o, ACPI_PM_PROP_ACPI_DISABLE_CMD, NULL), + .pm1a_evt = { .space_id = as, .bit_width = 4 * 8, .address = io }, + .pm1a_cnt = { .space_id = as, .bit_width = 2 * 8, + .address = io + 0x04 }, + .pm_tmr = { .space_id = as, .bit_width = 4 * 8, .address = io + 0x08 }, + .gpe0_blk = { .space_id = as, .bit_width = + object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK_LEN, NULL) * 8, + .address = object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK, NULL) + }, + }; + *data = fadt; +} + static void acpi_get_pm_info(AcpiPmInfo *pm) { Object *piix = piix4_pm_find(); Object *lpc = ich9_lpc_find(); - Object *obj = NULL; + Object *obj = piix ? piix : lpc; QObject *o; - - pm->force_rev1_fadt = false; pm->cpu_hp_io_base = 0; pm->pcihp_io_base = 0; pm->pcihp_io_len = 0; + + init_common_fadt_data(obj, &pm->fadt); if (piix) { /* w2k requires FADT(rev1) or it won't boot, keep PC compatible */ - pm->force_rev1_fadt = true; - obj = piix; + pm->fadt.rev = 1; pm->cpu_hp_io_base = PIIX4_CPU_HOTPLUG_IO_BASE; pm->pcihp_io_base = object_property_get_uint(obj, ACPI_PCIHP_IO_BASE_PROP, NULL); @@ -146,11 +178,19 @@ static void acpi_get_pm_info(AcpiPmInfo *pm) object_property_get_uint(obj, ACPI_PCIHP_IO_LEN_PROP, NULL); } if (lpc) { - obj = lpc; + struct AcpiGenericAddress r = { .space_id = AML_AS_SYSTEM_IO, + .bit_width = 8, .address = ICH9_RST_CNT_IOPORT }; + pm->fadt.reset_reg = r; + pm->fadt.reset_val = 0xf; + pm->fadt.flags |= 1 << ACPI_FADT_F_RESET_REG_SUP; pm->cpu_hp_io_base = ICH9_CPU_HOTPLUG_IO_BASE; } assert(obj); + /* The above need not be conditional on machine type because the reset port + * happens to be the same on PIIX (pc) and ICH9 (q35). */ + QEMU_BUILD_BUG_ON(ICH9_RST_CNT_IOPORT != RCR_IOPORT); + /* Fill in optional s3/s4 related properties */ o = object_property_get_qobject(obj, ACPI_PM_PROP_S3_DISABLED, NULL); if (o) { @@ -174,22 +214,6 @@ static void acpi_get_pm_info(AcpiPmInfo *pm) } qobject_decref(o); - /* Fill in mandatory properties */ - pm->sci_int = object_property_get_uint(obj, ACPI_PM_PROP_SCI_INT, NULL); - - pm->acpi_enable_cmd = object_property_get_uint(obj, - ACPI_PM_PROP_ACPI_ENABLE_CMD, - NULL); - pm->acpi_disable_cmd = - object_property_get_uint(obj, - ACPI_PM_PROP_ACPI_DISABLE_CMD, - NULL); - pm->io_base = object_property_get_uint(obj, ACPI_PM_PROP_PM_IO_BASE, - NULL); - pm->gpe0_blk = object_property_get_uint(obj, ACPI_PM_PROP_GPE0_BLK, - NULL); - pm->gpe0_blk_len = object_property_get_uint(obj, ACPI_PM_PROP_GPE0_BLK_LEN, - NULL); pm->pcihp_bridge_en = object_property_get_bool(obj, "acpi-pci-hotplug-with-bridge-support", NULL); @@ -257,8 +281,6 @@ static void acpi_get_pci_holes(Range *hole, Range *hole64) NULL)); } -#define ACPI_PORT_SMI_CMD 0x00b2 /* TODO: this is APM_CNT_IOPORT */ - static void acpi_align_size(GArray *blob, unsigned align) { /* Align size to multiple of given size. 
This reduces the chance @@ -276,106 +298,6 @@ build_facs(GArray *table_data, BIOSLinker *linker) facs->length = cpu_to_le32(sizeof(*facs)); } -/* Load chipset information in FADT */ -static void fadt_setup(AcpiFadtDescriptorRev3 *fadt, AcpiPmInfo *pm) -{ - fadt->model = 1; - fadt->reserved1 = 0; - fadt->sci_int = cpu_to_le16(pm->sci_int); - fadt->smi_cmd = cpu_to_le32(ACPI_PORT_SMI_CMD); - fadt->acpi_enable = pm->acpi_enable_cmd; - fadt->acpi_disable = pm->acpi_disable_cmd; - /* EVT, CNT, TMR offset matches hw/acpi/core.c */ - fadt->pm1a_evt_blk = cpu_to_le32(pm->io_base); - fadt->pm1a_cnt_blk = cpu_to_le32(pm->io_base + 0x04); - fadt->pm_tmr_blk = cpu_to_le32(pm->io_base + 0x08); - fadt->gpe0_blk = cpu_to_le32(pm->gpe0_blk); - /* EVT, CNT, TMR length matches hw/acpi/core.c */ - fadt->pm1_evt_len = 4; - fadt->pm1_cnt_len = 2; - fadt->pm_tmr_len = 4; - fadt->gpe0_blk_len = pm->gpe0_blk_len; - fadt->plvl2_lat = cpu_to_le16(0xfff); /* C2 state not supported */ - fadt->plvl3_lat = cpu_to_le16(0xfff); /* C3 state not supported */ - fadt->flags = cpu_to_le32((1 << ACPI_FADT_F_WBINVD) | - (1 << ACPI_FADT_F_PROC_C1) | - (1 << ACPI_FADT_F_SLP_BUTTON) | - (1 << ACPI_FADT_F_RTC_S4)); - fadt->flags |= cpu_to_le32(1 << ACPI_FADT_F_USE_PLATFORM_CLOCK); - /* APIC destination mode ("Flat Logical") has an upper limit of 8 CPUs - * For more than 8 CPUs, "Clustered Logical" mode has to be used - */ - if (max_cpus > 8) { - fadt->flags |= cpu_to_le32(1 << ACPI_FADT_F_FORCE_APIC_CLUSTER_MODEL); - } - fadt->century = RTC_CENTURY; - if (pm->force_rev1_fadt) { - return; - } - - fadt->flags |= cpu_to_le32(1 << ACPI_FADT_F_RESET_REG_SUP); - fadt->reset_value = 0xf; - fadt->reset_register.space_id = AML_SYSTEM_IO; - fadt->reset_register.bit_width = 8; - fadt->reset_register.address = cpu_to_le64(ICH9_RST_CNT_IOPORT); - /* The above need not be conditional on machine type because the reset port - * happens to be the same on PIIX (pc) and ICH9 (q35). 
*/ - QEMU_BUILD_BUG_ON(ICH9_RST_CNT_IOPORT != RCR_IOPORT); - - fadt->xpm1a_event_block.space_id = AML_SYSTEM_IO; - fadt->xpm1a_event_block.bit_width = fadt->pm1_evt_len * 8; - fadt->xpm1a_event_block.address = cpu_to_le64(pm->io_base); - - fadt->xpm1a_control_block.space_id = AML_SYSTEM_IO; - fadt->xpm1a_control_block.bit_width = fadt->pm1_cnt_len * 8; - fadt->xpm1a_control_block.address = cpu_to_le64(pm->io_base + 0x4); - - fadt->xpm_timer_block.space_id = AML_SYSTEM_IO; - fadt->xpm_timer_block.bit_width = fadt->pm_tmr_len * 8; - fadt->xpm_timer_block.address = cpu_to_le64(pm->io_base + 0x8); - - fadt->xgpe0_block.space_id = AML_SYSTEM_IO; - fadt->xgpe0_block.bit_width = pm->gpe0_blk_len * 8; - fadt->xgpe0_block.address = cpu_to_le64(pm->gpe0_blk); -} - - -/* FADT */ -static void -build_fadt(GArray *table_data, BIOSLinker *linker, AcpiPmInfo *pm, - unsigned facs_tbl_offset, unsigned dsdt_tbl_offset, - const char *oem_id, const char *oem_table_id) -{ - AcpiFadtDescriptorRev3 *fadt = acpi_data_push(table_data, sizeof(*fadt)); - unsigned fw_ctrl_offset = (char *)&fadt->firmware_ctrl - table_data->data; - unsigned dsdt_entry_offset = (char *)&fadt->dsdt - table_data->data; - unsigned xdsdt_entry_offset = (char *)&fadt->x_dsdt - table_data->data; - int fadt_size = sizeof(*fadt); - int rev = 3; - - /* FACS address to be filled by Guest linker */ - bios_linker_loader_add_pointer(linker, - ACPI_BUILD_TABLE_FILE, fw_ctrl_offset, sizeof(fadt->firmware_ctrl), - ACPI_BUILD_TABLE_FILE, facs_tbl_offset); - - /* DSDT address to be filled by Guest linker */ - fadt_setup(fadt, pm); - bios_linker_loader_add_pointer(linker, - ACPI_BUILD_TABLE_FILE, dsdt_entry_offset, sizeof(fadt->dsdt), - ACPI_BUILD_TABLE_FILE, dsdt_tbl_offset); - if (pm->force_rev1_fadt) { - rev = 1; - fadt_size = offsetof(typeof(*fadt), reset_register); - } else { - bios_linker_loader_add_pointer(linker, - ACPI_BUILD_TABLE_FILE, xdsdt_entry_offset, sizeof(fadt->x_dsdt), - ACPI_BUILD_TABLE_FILE, dsdt_tbl_offset); - } - - build_header(linker, table_data, - (void *)fadt, "FACP", fadt_size, rev, oem_id, oem_table_id); -} - void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid, const CPUArchIdList *apic_ids, GArray *entry) { @@ -2053,7 +1975,12 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, aml_append(dev, aml_name_decl("_STA", aml_int(0xB))); crs = aml_resource_template(); aml_append(crs, - aml_io(AML_DECODE16, pm->gpe0_blk, pm->gpe0_blk, 1, pm->gpe0_blk_len) + aml_io( + AML_DECODE16, + pm->fadt.gpe0_blk.address, + pm->fadt.gpe0_blk.address, + 1, + pm->fadt.gpe0_blk.bit_width / 8) ); aml_append(dev, aml_name_decl("_CRS", crs)); aml_append(scope, dev); @@ -2323,6 +2250,55 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog) #define HOLE_640K_START (640 * 1024) #define HOLE_640K_END (1024 * 1024) +static void build_srat_hotpluggable_memory(GArray *table_data, uint64_t base, + uint64_t len, int default_node) +{ + MemoryDeviceInfoList *info_list = qmp_pc_dimm_device_list(); + MemoryDeviceInfoList *info; + MemoryDeviceInfo *mi; + PCDIMMDeviceInfo *di; + uint64_t end = base + len, cur, size; + bool is_nvdimm; + AcpiSratMemoryAffinity *numamem; + MemoryAffinityFlags flags; + + for (cur = base, info = info_list; + cur < end; + cur += size, info = info->next) { + numamem = acpi_data_push(table_data, sizeof *numamem); + + if (!info) { + build_srat_memory(numamem, cur, end - cur, default_node, + MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); + break; + } + + mi = info->value; + is_nvdimm = (mi->type == 
MEMORY_DEVICE_INFO_KIND_NVDIMM); + di = !is_nvdimm ? mi->u.dimm.data : mi->u.nvdimm.data; + + if (cur < di->addr) { + build_srat_memory(numamem, cur, di->addr - cur, default_node, + MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); + numamem = acpi_data_push(table_data, sizeof *numamem); + } + + size = di->size; + + flags = MEM_AFFINITY_ENABLED; + if (di->hotpluggable) { + flags |= MEM_AFFINITY_HOTPLUGGABLE; + } + if (is_nvdimm) { + flags |= MEM_AFFINITY_NON_VOLATILE; + } + + build_srat_memory(numamem, di->addr, size, di->node, flags); + } + + qapi_free_MemoryDeviceInfoList(info_list); +} + static void build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) { @@ -2434,10 +2410,9 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) * providing _PXM method if necessary. */ if (hotplugabble_address_space_size) { - numamem = acpi_data_push(table_data, sizeof *numamem); - build_srat_memory(numamem, pcms->hotplug_memory.base, - hotplugabble_address_space_size, pcms->numa_nodes - 1, - MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED); + build_srat_hotpluggable_memory(table_data, pcms->hotplug_memory.base, + hotplugabble_address_space_size, + pcms->numa_nodes - 1); } build_header(linker, table_data, @@ -2700,7 +2675,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine) /* ACPI tables pointed to by RSDT */ fadt = tables_blob->len; acpi_add_table(table_offsets, tables_blob); - build_fadt(tables_blob, tables->linker, &pm, facs, dsdt, + pm.fadt.facs_tbl_offset = &facs; + pm.fadt.dsdt_tbl_offset = &dsdt; + pm.fadt.xdsdt_tbl_offset = &dsdt; + build_fadt(tables_blob, tables->linker, &pm.fadt, slic_oem.id, slic_oem.table_id); aml_len += tables_blob->len - fadt; diff --git a/hw/isa/apm.c b/hw/isa/apm.c index e232b0da03..c3101ef52f 100644 --- a/hw/isa/apm.c +++ b/hw/isa/apm.c @@ -34,7 +34,6 @@ #endif /* fixed I/O location */ -#define APM_CNT_IOPORT 0xb2 #define APM_STS_IOPORT 0xb3 static void apm_ioport_writeb(void *opaque, hwaddr addr, uint64_t val, diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 6e74b61cb6..51350d9c2d 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -20,6 +20,7 @@ #include "qemu/osdep.h" #include "hw/mem/pc-dimm.h" +#include "hw/mem/nvdimm.h" #include "qapi/error.h" #include "qemu/config-file.h" #include "qapi/visitor.h" @@ -162,45 +163,6 @@ uint64_t get_plugged_memory_size(void) return pc_existing_dimms_capacity(&error_abort); } -int qmp_pc_dimm_device_list(Object *obj, void *opaque) -{ - MemoryDeviceInfoList ***prev = opaque; - - if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { - DeviceState *dev = DEVICE(obj); - - if (dev->realized) { - MemoryDeviceInfoList *elem = g_new0(MemoryDeviceInfoList, 1); - MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1); - PCDIMMDeviceInfo *di = g_new0(PCDIMMDeviceInfo, 1); - DeviceClass *dc = DEVICE_GET_CLASS(obj); - PCDIMMDevice *dimm = PC_DIMM(obj); - - if (dev->id) { - di->has_id = true; - di->id = g_strdup(dev->id); - } - di->hotplugged = dev->hotplugged; - di->hotpluggable = dc->hotpluggable; - di->addr = dimm->addr; - di->slot = dimm->slot; - di->node = dimm->node; - di->size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP, - NULL); - di->memdev = object_get_canonical_path(OBJECT(dimm->hostmem)); - - info->u.dimm.data = di; - elem->value = info; - elem->next = NULL; - **prev = elem; - *prev = &elem->next; - } - } - - object_child_foreach(obj, qmp_pc_dimm_device_list, opaque); - return 0; -} - static int pc_dimm_slot2bitmap(Object *obj, void *opaque) { unsigned long 
*bitmap = opaque; @@ -276,6 +238,57 @@ static int pc_dimm_built_list(Object *obj, void *opaque) return 0; } +MemoryDeviceInfoList *qmp_pc_dimm_device_list(void) +{ + GSList *dimms = NULL, *item; + MemoryDeviceInfoList *list = NULL, *prev = NULL; + + object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &dimms); + + for (item = dimms; item; item = g_slist_next(item)) { + PCDIMMDevice *dimm = PC_DIMM(item->data); + Object *obj = OBJECT(dimm); + MemoryDeviceInfoList *elem = g_new0(MemoryDeviceInfoList, 1); + MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1); + PCDIMMDeviceInfo *di = g_new0(PCDIMMDeviceInfo, 1); + bool is_nvdimm = object_dynamic_cast(obj, TYPE_NVDIMM); + DeviceClass *dc = DEVICE_GET_CLASS(obj); + DeviceState *dev = DEVICE(obj); + + if (dev->id) { + di->has_id = true; + di->id = g_strdup(dev->id); + } + di->hotplugged = dev->hotplugged; + di->hotpluggable = dc->hotpluggable; + di->addr = dimm->addr; + di->slot = dimm->slot; + di->node = dimm->node; + di->size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, NULL); + di->memdev = object_get_canonical_path(OBJECT(dimm->hostmem)); + + if (!is_nvdimm) { + info->u.dimm.data = di; + info->type = MEMORY_DEVICE_INFO_KIND_DIMM; + } else { + info->u.nvdimm.data = di; + info->type = MEMORY_DEVICE_INFO_KIND_NVDIMM; + } + elem->value = info; + elem->next = NULL; + if (prev) { + prev->next = elem; + } else { + list = elem; + } + prev = elem; + } + + g_slist_free(dimms); + + return list; +} + uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, uint64_t address_space_size, uint64_t *hint, uint64_t align, uint64_t size, diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 188744e17d..67ad38cfe4 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -26,6 +26,7 @@ #include "qapi/qapi-events-net.h" #include "hw/virtio/virtio-access.h" #include "migration/misc.h" +#include "standard-headers/linux/ethtool.h" #define VIRTIO_NET_VM_VERSION 11 @@ -48,19 +49,21 @@ (offsetof(container, field) + sizeof(((container *)0)->field)) typedef struct VirtIOFeature { - uint32_t flags; + uint64_t flags; size_t end; } VirtIOFeature; static VirtIOFeature feature_sizes[] = { - {.flags = 1 << VIRTIO_NET_F_MAC, + {.flags = 1ULL << VIRTIO_NET_F_MAC, .end = endof(struct virtio_net_config, mac)}, - {.flags = 1 << VIRTIO_NET_F_STATUS, + {.flags = 1ULL << VIRTIO_NET_F_STATUS, .end = endof(struct virtio_net_config, status)}, - {.flags = 1 << VIRTIO_NET_F_MQ, + {.flags = 1ULL << VIRTIO_NET_F_MQ, .end = endof(struct virtio_net_config, max_virtqueue_pairs)}, - {.flags = 1 << VIRTIO_NET_F_MTU, + {.flags = 1ULL << VIRTIO_NET_F_MTU, .end = endof(struct virtio_net_config, mtu)}, + {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX, + .end = endof(struct virtio_net_config, duplex)}, {} }; @@ -89,6 +92,8 @@ static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config) virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queues); virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu); memcpy(netcfg.mac, n->mac, ETH_ALEN); + virtio_stl_p(vdev, &netcfg.speed, n->net_conf.speed); + netcfg.duplex = n->net_conf.duplex; memcpy(config, &netcfg, n->config_size); } @@ -1938,7 +1943,26 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) int i; if (n->net_conf.mtu) { - n->host_features |= (0x1 << VIRTIO_NET_F_MTU); + n->host_features |= (1ULL << VIRTIO_NET_F_MTU); + } + + if (n->net_conf.duplex_str) { + if (strncmp(n->net_conf.duplex_str, "half", 5) == 0) { + n->net_conf.duplex = DUPLEX_HALF; + } else if (strncmp(n->net_conf.duplex_str, 
"full", 5) == 0) { + n->net_conf.duplex = DUPLEX_FULL; + } else { + error_setg(errp, "'duplex' must be 'half' or 'full'"); + } + n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX); + } else { + n->net_conf.duplex = DUPLEX_UNKNOWN; + } + + if (n->net_conf.speed < SPEED_UNKNOWN) { + error_setg(errp, "'speed' must be between 0 and INT_MAX"); + } else if (n->net_conf.speed >= 0) { + n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX); } virtio_net_set_config_size(n, n->host_features); @@ -2109,45 +2133,46 @@ static const VMStateDescription vmstate_virtio_net = { }; static Property virtio_net_properties[] = { - DEFINE_PROP_BIT("csum", VirtIONet, host_features, VIRTIO_NET_F_CSUM, true), - DEFINE_PROP_BIT("guest_csum", VirtIONet, host_features, + DEFINE_PROP_BIT64("csum", VirtIONet, host_features, + VIRTIO_NET_F_CSUM, true), + DEFINE_PROP_BIT64("guest_csum", VirtIONet, host_features, VIRTIO_NET_F_GUEST_CSUM, true), - DEFINE_PROP_BIT("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true), - DEFINE_PROP_BIT("guest_tso4", VirtIONet, host_features, + DEFINE_PROP_BIT64("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true), + DEFINE_PROP_BIT64("guest_tso4", VirtIONet, host_features, VIRTIO_NET_F_GUEST_TSO4, true), - DEFINE_PROP_BIT("guest_tso6", VirtIONet, host_features, + DEFINE_PROP_BIT64("guest_tso6", VirtIONet, host_features, VIRTIO_NET_F_GUEST_TSO6, true), - DEFINE_PROP_BIT("guest_ecn", VirtIONet, host_features, + DEFINE_PROP_BIT64("guest_ecn", VirtIONet, host_features, VIRTIO_NET_F_GUEST_ECN, true), - DEFINE_PROP_BIT("guest_ufo", VirtIONet, host_features, + DEFINE_PROP_BIT64("guest_ufo", VirtIONet, host_features, VIRTIO_NET_F_GUEST_UFO, true), - DEFINE_PROP_BIT("guest_announce", VirtIONet, host_features, + DEFINE_PROP_BIT64("guest_announce", VirtIONet, host_features, VIRTIO_NET_F_GUEST_ANNOUNCE, true), - DEFINE_PROP_BIT("host_tso4", VirtIONet, host_features, + DEFINE_PROP_BIT64("host_tso4", VirtIONet, host_features, VIRTIO_NET_F_HOST_TSO4, true), - DEFINE_PROP_BIT("host_tso6", VirtIONet, host_features, + DEFINE_PROP_BIT64("host_tso6", VirtIONet, host_features, VIRTIO_NET_F_HOST_TSO6, true), - DEFINE_PROP_BIT("host_ecn", VirtIONet, host_features, + DEFINE_PROP_BIT64("host_ecn", VirtIONet, host_features, VIRTIO_NET_F_HOST_ECN, true), - DEFINE_PROP_BIT("host_ufo", VirtIONet, host_features, + DEFINE_PROP_BIT64("host_ufo", VirtIONet, host_features, VIRTIO_NET_F_HOST_UFO, true), - DEFINE_PROP_BIT("mrg_rxbuf", VirtIONet, host_features, + DEFINE_PROP_BIT64("mrg_rxbuf", VirtIONet, host_features, VIRTIO_NET_F_MRG_RXBUF, true), - DEFINE_PROP_BIT("status", VirtIONet, host_features, + DEFINE_PROP_BIT64("status", VirtIONet, host_features, VIRTIO_NET_F_STATUS, true), - DEFINE_PROP_BIT("ctrl_vq", VirtIONet, host_features, + DEFINE_PROP_BIT64("ctrl_vq", VirtIONet, host_features, VIRTIO_NET_F_CTRL_VQ, true), - DEFINE_PROP_BIT("ctrl_rx", VirtIONet, host_features, + DEFINE_PROP_BIT64("ctrl_rx", VirtIONet, host_features, VIRTIO_NET_F_CTRL_RX, true), - DEFINE_PROP_BIT("ctrl_vlan", VirtIONet, host_features, + DEFINE_PROP_BIT64("ctrl_vlan", VirtIONet, host_features, VIRTIO_NET_F_CTRL_VLAN, true), - DEFINE_PROP_BIT("ctrl_rx_extra", VirtIONet, host_features, + DEFINE_PROP_BIT64("ctrl_rx_extra", VirtIONet, host_features, VIRTIO_NET_F_CTRL_RX_EXTRA, true), - DEFINE_PROP_BIT("ctrl_mac_addr", VirtIONet, host_features, + DEFINE_PROP_BIT64("ctrl_mac_addr", VirtIONet, host_features, VIRTIO_NET_F_CTRL_MAC_ADDR, true), - DEFINE_PROP_BIT("ctrl_guest_offloads", VirtIONet, host_features, + 
DEFINE_PROP_BIT64("ctrl_guest_offloads", VirtIONet, host_features, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, true), - DEFINE_PROP_BIT("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false), + DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false), DEFINE_NIC_PROPERTIES(VirtIONet, nic_conf), DEFINE_PROP_UINT32("x-txtimer", VirtIONet, net_conf.txtimer, TX_TIMER_INTERVAL), @@ -2160,6 +2185,8 @@ static Property virtio_net_properties[] = { DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0), DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend, true), + DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN), + DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 67a3f72bd6..80bc45930d 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2048,18 +2048,6 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) } } -static void pci_default_realize(PCIDevice *dev, Error **errp) -{ - PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev); - - if (pc->init) { - if (pc->init(dev) < 0) { - error_setg(errp, "Device initialization failed"); - return; - } - } -} - PCIDevice *pci_create_multifunction(PCIBus *bus, int devfn, bool multifunction, const char *name) { @@ -2532,13 +2520,11 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev) static void pci_device_class_init(ObjectClass *klass, void *data) { DeviceClass *k = DEVICE_CLASS(klass); - PCIDeviceClass *pc = PCI_DEVICE_CLASS(klass); k->realize = pci_qdev_realize; k->unrealize = pci_qdev_unrealize; k->bus_type = TYPE_PCI_BUS; k->props = pci_props; - pc->realize = pci_default_realize; } static void pci_device_class_base_init(ObjectClass *klass, void *data) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index fba76abee2..2c0be8c898 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -722,8 +722,7 @@ static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt) } if (hotplug_lmb_start) { - MemoryDeviceInfoList **prev = &dimms; - qmp_pc_dimm_device_list(qdev_get_machine(), &prev); + dimms = qmp_pc_dimm_device_list(); } /* ibm,dynamic-memory */ diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 742ff0f90b..1422ff03ab 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -3,9 +3,23 @@ # hw/virtio/vhost.c vhost_commit(bool started, bool changed) "Started: %d Changed: %d" vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64 -vhost_region_add_section_abut(const char *name, uint64_t new_size) "%s: 0x%"PRIx64 +vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64 +vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64 vhost_section(const char *name, int r) "%s:%d" +# hw/virtio/vhost-user.c +vhost_user_postcopy_end_entry(void) "" +vhost_user_postcopy_end_exit(void) "" +vhost_user_postcopy_fault_handler(const char *name, uint64_t fault_address, int nregions) "%s: @0x%"PRIx64" nregions:%d" +vhost_user_postcopy_fault_handler_loop(int i, uint64_t client_base, uint64_t size) "%d: client 0x%"PRIx64" +0x%"PRIx64 +vhost_user_postcopy_fault_handler_found(int i, uint64_t region_offset, uint64_t rb_offset) "%d: region_offset: 0x%"PRIx64" rb_offset:0x%"PRIx64 +vhost_user_postcopy_listen(void) "" +vhost_user_set_mem_table_postcopy(uint64_t 
client_addr, uint64_t qhva, int reply_i, int region_i) "client:0x%"PRIx64" for hva: 0x%"PRIx64" reply %d region %d" +vhost_user_set_mem_table_withfd(int index, const char *name, uint64_t memory_size, uint64_t guest_phys_addr, uint64_t userspace_addr, uint64_t offset) "%d:%s: size:0x%"PRIx64" GPA:0x%"PRIx64" QVA/userspace:0x%"PRIx64" RB offset:0x%"PRIx64 +vhost_user_postcopy_waker(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64 +vhost_user_postcopy_waker_found(uint64_t client_addr) "0x%"PRIx64 +vhost_user_postcopy_waker_nomatch(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64 + # hw/virtio/virtio.c virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u" virtqueue_fill(void *vq, const void *elem, unsigned int len, unsigned int idx) "vq %p elem %p len %u idx %u" diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 41ff5cff41..44aea5c0a8 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -18,11 +18,15 @@ #include "qemu/error-report.h" #include "qemu/sockets.h" #include "sysemu/cryptodev.h" +#include "migration/migration.h" +#include "migration/postcopy-ram.h" +#include "trace.h" #include <sys/ioctl.h> #include <sys/socket.h> #include <sys/un.h> #include <linux/vhost.h> +#include <linux/userfaultfd.h> #define VHOST_MEMORY_MAX_NREGIONS 8 #define VHOST_USER_F_PROTOCOL_FEATURES 30 @@ -41,7 +45,7 @@ enum VhostUserProtocolFeature { VHOST_USER_PROTOCOL_F_SLAVE_REQ = 5, VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6, VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7, - + VHOST_USER_PROTOCOL_F_PAGEFAULT = 8, VHOST_USER_PROTOCOL_F_MAX }; @@ -76,6 +80,9 @@ typedef enum VhostUserRequest { VHOST_USER_SET_CONFIG = 25, VHOST_USER_CREATE_CRYPTO_SESSION = 26, VHOST_USER_CLOSE_CRYPTO_SESSION = 27, + VHOST_USER_POSTCOPY_ADVISE = 28, + VHOST_USER_POSTCOPY_LISTEN = 29, + VHOST_USER_POSTCOPY_END = 30, VHOST_USER_MAX } VhostUserRequest; @@ -164,8 +171,23 @@ static VhostUserMsg m __attribute__ ((unused)); #define VHOST_USER_VERSION (0x1) struct vhost_user { + struct vhost_dev *dev; CharBackend *chr; int slave_fd; + NotifierWithReturn postcopy_notifier; + struct PostCopyFD postcopy_fd; + uint64_t postcopy_client_bases[VHOST_MEMORY_MAX_NREGIONS]; + /* Length of the region_rb and region_rb_offset arrays */ + size_t region_rb_len; + /* RAMBlock associated with a given region */ + RAMBlock **region_rb; + /* The offset from the start of the RAMBlock to the start of the + * vhost region. 
+ */ + ram_addr_t *region_rb_offset; + + /* True once we've entered postcopy_listen */ + bool postcopy_listen; }; static bool ioeventfd_enabled(void) @@ -330,14 +352,167 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, return 0; } +static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev, + struct vhost_memory *mem) +{ + struct vhost_user *u = dev->opaque; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int i, fd; + size_t fd_num = 0; + bool reply_supported = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_REPLY_ACK); + VhostUserMsg msg_reply; + int region_i, msg_i; + + VhostUserMsg msg = { + .hdr.request = VHOST_USER_SET_MEM_TABLE, + .hdr.flags = VHOST_USER_VERSION, + }; + + if (reply_supported) { + msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK; + } + + if (u->region_rb_len < dev->mem->nregions) { + u->region_rb = g_renew(RAMBlock*, u->region_rb, dev->mem->nregions); + u->region_rb_offset = g_renew(ram_addr_t, u->region_rb_offset, + dev->mem->nregions); + memset(&(u->region_rb[u->region_rb_len]), '\0', + sizeof(RAMBlock *) * (dev->mem->nregions - u->region_rb_len)); + memset(&(u->region_rb_offset[u->region_rb_len]), '\0', + sizeof(ram_addr_t) * (dev->mem->nregions - u->region_rb_len)); + u->region_rb_len = dev->mem->nregions; + } + + for (i = 0; i < dev->mem->nregions; ++i) { + struct vhost_memory_region *reg = dev->mem->regions + i; + ram_addr_t offset; + MemoryRegion *mr; + + assert((uintptr_t)reg->userspace_addr == reg->userspace_addr); + mr = memory_region_from_host((void *)(uintptr_t)reg->userspace_addr, + &offset); + fd = memory_region_get_fd(mr); + if (fd > 0) { + trace_vhost_user_set_mem_table_withfd(fd_num, mr->name, + reg->memory_size, + reg->guest_phys_addr, + reg->userspace_addr, offset); + u->region_rb_offset[i] = offset; + u->region_rb[i] = mr->ram_block; + msg.payload.memory.regions[fd_num].userspace_addr = + reg->userspace_addr; + msg.payload.memory.regions[fd_num].memory_size = reg->memory_size; + msg.payload.memory.regions[fd_num].guest_phys_addr = + reg->guest_phys_addr; + msg.payload.memory.regions[fd_num].mmap_offset = offset; + assert(fd_num < VHOST_MEMORY_MAX_NREGIONS); + fds[fd_num++] = fd; + } else { + u->region_rb_offset[i] = 0; + u->region_rb[i] = NULL; + } + } + + msg.payload.memory.nregions = fd_num; + + if (!fd_num) { + error_report("Failed initializing vhost-user memory map, " + "consider using -object memory-backend-file share=on"); + return -1; + } + + msg.hdr.size = sizeof(msg.payload.memory.nregions); + msg.hdr.size += sizeof(msg.payload.memory.padding); + msg.hdr.size += fd_num * sizeof(VhostUserMemoryRegion); + + if (vhost_user_write(dev, &msg, fds, fd_num) < 0) { + return -1; + } + + if (vhost_user_read(dev, &msg_reply) < 0) { + return -1; + } + + if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) { + error_report("%s: Received unexpected msg type." + "Expected %d received %d", __func__, + VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request); + return -1; + } + /* We're using the same structure, just reusing one of the + * fields, so it should be the same size. 
+     */
+    if (msg_reply.hdr.size != msg.hdr.size) {
+        error_report("%s: Unexpected size for postcopy reply "
+                     "%d vs %d", __func__, msg_reply.hdr.size, msg.hdr.size);
+        return -1;
+    }
+
+    memset(u->postcopy_client_bases, 0,
+           sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
+
+    /* They're in the same order as the regions that were sent
+     * but some of the regions were skipped (above) if they
+     * didn't have fds
+     */
+    for (msg_i = 0, region_i = 0;
+         region_i < dev->mem->nregions;
+         region_i++) {
+        if (msg_i < fd_num &&
+            msg_reply.payload.memory.regions[msg_i].guest_phys_addr ==
+            dev->mem->regions[region_i].guest_phys_addr) {
+            u->postcopy_client_bases[region_i] =
+                msg_reply.payload.memory.regions[msg_i].userspace_addr;
+            trace_vhost_user_set_mem_table_postcopy(
+                msg_reply.payload.memory.regions[msg_i].userspace_addr,
+                msg.payload.memory.regions[msg_i].userspace_addr,
+                msg_i, region_i);
+            msg_i++;
+        }
+    }
+    if (msg_i != fd_num) {
+        error_report("%s: postcopy reply not fully consumed "
+                     "%d vs %zd",
+                     __func__, msg_i, fd_num);
+        return -1;
+    }
+    /* Now we've registered this with the postcopy code, we ack to the client,
+     * because now we're in the position to be able to deal with any faults
+     * it generates.
+     */
+    /* TODO: Use this for failure cases as well with a bad value */
+    msg.hdr.size = sizeof(msg.payload.u64);
+    msg.payload.u64 = 0; /* OK */
+    if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
+        return -1;
+    }
+
+    if (reply_supported) {
+        return process_message_reply(dev, &msg);
+    }
+
+    return 0;
+}
+
 static int vhost_user_set_mem_table(struct vhost_dev *dev,
                                     struct vhost_memory *mem)
 {
+    struct vhost_user *u = dev->opaque;
     int fds[VHOST_MEMORY_MAX_NREGIONS];
     int i, fd;
     size_t fd_num = 0;
+    bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler;
     bool reply_supported = virtio_has_feature(dev->protocol_features,
-                                              VHOST_USER_PROTOCOL_F_REPLY_ACK);
+                                              VHOST_USER_PROTOCOL_F_REPLY_ACK) &&
+                           !do_postcopy;
+
+    if (do_postcopy) {
+        /* Postcopy has enough differences that it's best done in its own
+         * version
+         */
+        return vhost_user_set_mem_table_postcopy(dev, mem);
+    }
 
     VhostUserMsg msg = {
         .hdr.request = VHOST_USER_SET_MEM_TABLE,
         .hdr.flags = VHOST_USER_VERSION,
@@ -362,9 +537,11 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
             error_report("Failed preparing vhost-user memory table msg");
             return -1;
         }
-        msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
+        msg.payload.memory.regions[fd_num].userspace_addr =
+            reg->userspace_addr;
         msg.payload.memory.regions[fd_num].memory_size = reg->memory_size;
-        msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
+        msg.payload.memory.regions[fd_num].guest_phys_addr =
+            reg->guest_phys_addr;
         msg.payload.memory.regions[fd_num].mmap_offset = offset;
         fds[fd_num++] = fd;
     }
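
The handshake above leaves the master holding one slave-side base address per fd-backed region (postcopy_client_bases), matched back to its region table by guest physical address. A minimal sketch of the resulting translation, with hypothetical type and helper names standing in for the real region_rb/region_rb_offset bookkeeping in struct vhost_user: given a fault address in the slave's address space, find the covering region and derive the offset into the backing RAMBlock.

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

typedef struct {
    uint64_t client_base;   /* slave's mmap address, from the reply */
    uint64_t size;          /* region size */
    uint64_t rb_offset;     /* offset of the region within its RAMBlock */
} PostcopyRegion;           /* hypothetical stand-in, not a QEMU type */

static bool client_addr_to_rb_offset(const PostcopyRegion *regs, size_t n,
                                     uint64_t client_addr,
                                     size_t *region, uint64_t *rb_offset)
{
    for (size_t i = 0; i < n; i++) {
        if (client_addr >= regs[i].client_base &&
            client_addr < regs[i].client_base + regs[i].size) {
            *region = i;
            *rb_offset = (client_addr - regs[i].client_base) +
                         regs[i].rb_offset;
            return true;
        }
    }
    return false; /* fault outside any shared region */
}

This is exactly the search the fault handler below performs, just stripped of the tracing and QEMU types.
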
@@ -791,6 +968,219 @@ out:
     return ret;
 }
 
+/*
+ * Called back from the postcopy fault thread when a fault is received on our
+ * ufd.
+ * TODO: This is Linux specific
+ */
+static int vhost_user_postcopy_fault_handler(struct PostCopyFD *pcfd,
+                                             void *ufd)
+{
+    struct vhost_dev *dev = pcfd->data;
+    struct vhost_user *u = dev->opaque;
+    struct uffd_msg *msg = ufd;
+    uint64_t faultaddr = msg->arg.pagefault.address;
+    RAMBlock *rb = NULL;
+    uint64_t rb_offset;
+    int i;
+
+    trace_vhost_user_postcopy_fault_handler(pcfd->idstr, faultaddr,
+                                            dev->mem->nregions);
+    for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
+        trace_vhost_user_postcopy_fault_handler_loop(i,
+                u->postcopy_client_bases[i], dev->mem->regions[i].memory_size);
+        if (faultaddr >= u->postcopy_client_bases[i]) {
+            /* Offset of the fault address in the vhost region */
+            uint64_t region_offset = faultaddr - u->postcopy_client_bases[i];
+            if (region_offset < dev->mem->regions[i].memory_size) {
+                rb_offset = region_offset + u->region_rb_offset[i];
+                trace_vhost_user_postcopy_fault_handler_found(i,
+                        region_offset, rb_offset);
+                rb = u->region_rb[i];
+                return postcopy_request_shared_page(pcfd, rb, faultaddr,
+                                                    rb_offset);
+            }
+        }
+    }
+    error_report("%s: Failed to find region for fault %" PRIx64,
+                 __func__, faultaddr);
+    return -1;
+}
+
+static int vhost_user_postcopy_waker(struct PostCopyFD *pcfd, RAMBlock *rb,
+                                     uint64_t offset)
+{
+    struct vhost_dev *dev = pcfd->data;
+    struct vhost_user *u = dev->opaque;
+    int i;
+
+    trace_vhost_user_postcopy_waker(qemu_ram_get_idstr(rb), offset);
+
+    if (!u) {
+        return 0;
+    }
+    /* Translate the offset into an address in the client's address space */
+    for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
+        if (u->region_rb[i] == rb &&
+            offset >= u->region_rb_offset[i] &&
+            offset < (u->region_rb_offset[i] +
+                      dev->mem->regions[i].memory_size)) {
+            uint64_t client_addr = (offset - u->region_rb_offset[i]) +
+                                   u->postcopy_client_bases[i];
+            trace_vhost_user_postcopy_waker_found(client_addr);
+            return postcopy_wake_shared(pcfd, client_addr, rb);
+        }
+    }
+
+    trace_vhost_user_postcopy_waker_nomatch(qemu_ram_get_idstr(rb), offset);
+    return 0;
+}
+
+/*
+ * Called at the start of an inbound postcopy on reception of the
+ * 'advise' command.
+ */
+static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp)
+{
+    struct vhost_user *u = dev->opaque;
+    CharBackend *chr = u->chr;
+    int ufd;
+    VhostUserMsg msg = {
+        .hdr.request = VHOST_USER_POSTCOPY_ADVISE,
+        .hdr.flags = VHOST_USER_VERSION,
+    };
+
+    if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
+        error_setg(errp, "Failed to send postcopy_advise to vhost");
+        return -1;
+    }
+
+    if (vhost_user_read(dev, &msg) < 0) {
+        error_setg(errp, "Failed to get postcopy_advise reply from vhost");
+        return -1;
+    }
+
+    if (msg.hdr.request != VHOST_USER_POSTCOPY_ADVISE) {
+        error_setg(errp, "Unexpected msg type. Expected %d received %d",
+                   VHOST_USER_POSTCOPY_ADVISE, msg.hdr.request);
+        return -1;
+    }
+
+    if (msg.hdr.size) {
+        error_setg(errp, "Received bad msg size.");
+        return -1;
+    }
+    ufd = qemu_chr_fe_get_msgfd(chr);
+    if (ufd < 0) {
+        error_setg(errp, "%s: Failed to get ufd", __func__);
+        return -1;
+    }
+    fcntl(ufd, F_SETFL, O_NONBLOCK);
+
+    /* register ufd with userfault thread */
+    u->postcopy_fd.fd = ufd;
+    u->postcopy_fd.data = dev;
+    u->postcopy_fd.handler = vhost_user_postcopy_fault_handler;
+    u->postcopy_fd.waker = vhost_user_postcopy_waker;
+    u->postcopy_fd.idstr = "vhost-user"; /* Need to find unique name */
+    postcopy_register_shared_ufd(&u->postcopy_fd);
+    return 0;
+}
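
On the QEMU side, the userfaultfd handed over during 'advise' is serviced by the postcopy fault thread (in migration/postcopy-ram.c), which reads uffd_msg events from the fd and dispatches them to the handler registered via postcopy_register_shared_ufd(). A rough sketch of that dispatch step, assuming a handler with the shape of vhost_user_postcopy_fault_handler; this is illustrative, not QEMU's actual loop:

#include <linux/userfaultfd.h>
#include <unistd.h>
#include <stdio.h>

/* Hypothetical handler type matching the PostCopyFD.handler shape above */
typedef int (*pcfd_handler)(void *opaque, struct uffd_msg *msg);

/* Drain one fault event from a slave's userfaultfd and dispatch it */
static int drain_one_fault(int ufd, pcfd_handler handler, void *opaque)
{
    struct uffd_msg msg;
    ssize_t len = read(ufd, &msg, sizeof(msg));

    if (len != (ssize_t)sizeof(msg)) {
        return -1; /* EAGAIN on an O_NONBLOCK fd, or a short read */
    }
    if (msg.event != UFFD_EVENT_PAGEFAULT) {
        fprintf(stderr, "unexpected uffd event %d\n", (int)msg.event);
        return -1;
    }
    /* For vhost-user this lands in vhost_user_postcopy_fault_handler */
    return handler(opaque, &msg);
}
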
+/*
+ * Called at the switch to postcopy on reception of the 'listen' command.
+ */
+static int vhost_user_postcopy_listen(struct vhost_dev *dev, Error **errp)
+{
+    struct vhost_user *u = dev->opaque;
+    int ret;
+    VhostUserMsg msg = {
+        .hdr.request = VHOST_USER_POSTCOPY_LISTEN,
+        .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
+    };
+    u->postcopy_listen = true;
+    trace_vhost_user_postcopy_listen();
+    if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
+        error_setg(errp, "Failed to send postcopy_listen to vhost");
+        return -1;
+    }
+
+    ret = process_message_reply(dev, &msg);
+    if (ret) {
+        error_setg(errp, "Failed to receive reply to postcopy_listen");
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Called at the end of postcopy
+ */
+static int vhost_user_postcopy_end(struct vhost_dev *dev, Error **errp)
+{
+    VhostUserMsg msg = {
+        .hdr.request = VHOST_USER_POSTCOPY_END,
+        .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
+    };
+    int ret;
+    struct vhost_user *u = dev->opaque;
+
+    trace_vhost_user_postcopy_end_entry();
+    if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
+        error_setg(errp, "Failed to send postcopy_end to vhost");
+        return -1;
+    }
+
+    ret = process_message_reply(dev, &msg);
+    if (ret) {
+        error_setg(errp, "Failed to receive reply to postcopy_end");
+        return ret;
+    }
+    postcopy_unregister_shared_ufd(&u->postcopy_fd);
+    u->postcopy_fd.handler = NULL;
+
+    trace_vhost_user_postcopy_end_exit();
+
+    return 0;
+}
+
+static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
+                                        void *opaque)
+{
+    struct PostcopyNotifyData *pnd = opaque;
+    struct vhost_user *u = container_of(notifier, struct vhost_user,
+                                        postcopy_notifier);
+    struct vhost_dev *dev = u->dev;
+
+    switch (pnd->reason) {
+    case POSTCOPY_NOTIFY_PROBE:
+        if (!virtio_has_feature(dev->protocol_features,
+                                VHOST_USER_PROTOCOL_F_PAGEFAULT)) {
+            /* TODO: Get the device name into this error somehow */
+            error_setg(pnd->errp,
+                       "vhost-user backend not capable of postcopy");
+            return -ENOENT;
+        }
+        break;
+
+    case POSTCOPY_NOTIFY_INBOUND_ADVISE:
+        return vhost_user_postcopy_advise(dev, pnd->errp);
+
+    case POSTCOPY_NOTIFY_INBOUND_LISTEN:
+        return vhost_user_postcopy_listen(dev, pnd->errp);
+
+    case POSTCOPY_NOTIFY_INBOUND_END:
+        return vhost_user_postcopy_end(dev, pnd->errp);
+
+    default:
+        /* We ignore notifications we don't know */
+        break;
+    }
+
+    return 0;
+}
+
 static int vhost_user_init(struct vhost_dev *dev, void *opaque)
 {
     uint64_t features, protocol_features;
@@ -802,6 +1192,7 @@ static int vhost_user_init(struct vhost_dev *dev, void *opaque)
     u = g_new0(struct vhost_user, 1);
     u->chr = opaque;
     u->slave_fd = -1;
+    u->dev = dev;
     dev->opaque = u;
 
     err = vhost_user_get_features(dev, &features);
@@ -858,6 +1249,9 @@ static int vhost_user_init(struct vhost_dev *dev, void *opaque)
         return err;
     }
 
+    u->postcopy_notifier.notify = vhost_user_postcopy_notifier;
+    postcopy_add_notifier(&u->postcopy_notifier);
+
     return 0;
 }
 
@@ -868,11 +1262,20 @@ static int vhost_user_cleanup(struct vhost_dev *dev)
     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
     u = dev->opaque;
+    if (u->postcopy_notifier.notify) {
+        postcopy_remove_notifier(&u->postcopy_notifier);
+        u->postcopy_notifier.notify = NULL;
+    }
     if (u->slave_fd >= 0) {
         qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
         close(u->slave_fd);
         u->slave_fd = -1;
     }
+    g_free(u->region_rb);
+    u->region_rb = NULL;
+    g_free(u->region_rb_offset);
+    u->region_rb_offset = NULL;
+    u->region_rb_len = 0;
     g_free(u);
     dev->opaque = 0;
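
The notifier registered in vhost_user_init() is how the migration code reaches the backend: postcopy raises POSTCOPY_NOTIFY_* events through a NotifierWithReturn chain, and the first non-zero return aborts the walk. The pattern re-created in miniature for illustration (this is not QEMU's implementation, which lives in its notifier utilities):

#include <stddef.h>

/* Callbacks are chained; the first non-zero return stops the walk. */
typedef struct NotifierSketch NotifierSketch;
struct NotifierSketch {
    int (*notify)(NotifierSketch *n, void *data);
    NotifierSketch *next;
};

static int notify_all(NotifierSketch *head, void *data)
{
    for (NotifierSketch *n = head; n; n = n->next) {
        int ret = n->notify(n, data);
        if (ret) {
            return ret; /* e.g. -ENOENT when a backend lacks F_PAGEFAULT */
        }
    }
    return 0;
}
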
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index d8d0ef92e1..250f886acb 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -522,10 +522,28 @@ static void vhost_region_add_section(struct vhost_dev *dev,
     uint64_t mrs_gpa = section->offset_within_address_space;
     uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
                          section->offset_within_region;
+    RAMBlock *mrs_rb = section->mr->ram_block;
+    size_t mrs_page = qemu_ram_pagesize(mrs_rb);
 
     trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
                                    mrs_host);
 
+    /* Round the section to its page size */
+    /* First align the start down to a page boundary */
+    uint64_t alignage = mrs_host & (mrs_page - 1);
+    if (alignage) {
+        mrs_host -= alignage;
+        mrs_size += alignage;
+        mrs_gpa  -= alignage;
+    }
+    /* Now align the size up to a page boundary */
+    alignage = mrs_size & (mrs_page - 1);
+    if (alignage) {
+        mrs_size += mrs_page - alignage;
+    }
+    trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size,
+                                           mrs_host);
+
     if (dev->n_tmp_sections) {
         /* Since we already have at least one section, lets see if
          * this extends it; since we're scanning in order, we only
@@ -542,18 +560,46 @@ static void vhost_region_add_section(struct vhost_dev *dev,
                                     prev_sec->offset_within_region;
         uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
 
-        if (prev_gpa_end + 1 == mrs_gpa &&
-            prev_host_end + 1 == mrs_host &&
-            section->mr == prev_sec->mr &&
-            (!dev->vhost_ops->vhost_backend_can_merge ||
-             dev->vhost_ops->vhost_backend_can_merge(dev,
+        if (mrs_gpa <= (prev_gpa_end + 1)) {
+            /* OK, looks like overlapping/intersecting - it's possible that
+             * the rounding to page sizes has made them overlap, but they should
+             * match up in the same RAMBlock if they do.
+             */
+            if (mrs_gpa < prev_gpa_start) {
+                error_report("%s:Section rounded to %"PRIx64
+                             " prior to previous %"PRIx64,
+                             __func__, mrs_gpa, prev_gpa_start);
+                /* A way to cleanly fail here would be better */
+                return;
+            }
+            /* Offset from the start of the previous GPA to this GPA */
+            size_t offset = mrs_gpa - prev_gpa_start;
+
+            if (prev_host_start + offset == mrs_host &&
+                section->mr == prev_sec->mr &&
+                (!dev->vhost_ops->vhost_backend_can_merge ||
+                 dev->vhost_ops->vhost_backend_can_merge(dev,
                     mrs_host, mrs_size,
                     prev_host_start, prev_size))) {
-            /* The two sections abut */
-            need_add = false;
-            prev_sec->size = int128_add(prev_sec->size, section->size);
-            trace_vhost_region_add_section_abut(section->mr->name,
-                                                mrs_size + prev_size);
+                uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
+                need_add = false;
+                prev_sec->offset_within_address_space =
+                    MIN(prev_gpa_start, mrs_gpa);
+                prev_sec->offset_within_region =
+                    MIN(prev_host_start, mrs_host) -
+                    (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
+                prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
+                                                             mrs_host));
+                trace_vhost_region_add_section_merge(section->mr->name,
+                                        int128_get64(prev_sec->size),
+                                        prev_sec->offset_within_address_space,
+                                        prev_sec->offset_within_region);
+            } else {
+                error_report("%s: Overlapping but not coherent sections "
+                             "at %"PRIx64,
+                             __func__, mrs_gpa);
+                return;
+            }
+        }
     }
 }
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 74341b19d2..24d335f95d 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -68,10 +68,14 @@ ram_addr_t qemu_ram_addr_from_host(void *ptr);
 RAMBlock *qemu_ram_block_by_name(const char *name);
 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
                                    ram_addr_t *offset);
+ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host);
 void qemu_ram_set_idstr(RAMBlock *block, const char
*name, DeviceState *dev); void qemu_ram_unset_idstr(RAMBlock *block); const char *qemu_ram_get_idstr(RAMBlock *rb); bool qemu_ram_is_shared(RAMBlock *rb); +bool qemu_ram_is_uf_zeroable(RAMBlock *rb); +void qemu_ram_set_uf_zeroable(RAMBlock *rb); + size_t qemu_ram_pagesize(RAMBlock *block); size_t qemu_ram_pagesize_largest(void); diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index 80c8099a23..5955eb4fc0 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -40,18 +40,6 @@ enum { ACPI_FADT_F_LOW_POWER_S0_IDLE_CAPABLE, }; -/* - * ACPI 2.0 Generic Address Space definition. - */ -struct Acpi20GenericAddress { - uint8_t address_space_id; - uint8_t register_bit_width; - uint8_t register_bit_offset; - uint8_t reserved; - uint64_t address; -} QEMU_PACKED; -typedef struct Acpi20GenericAddress Acpi20GenericAddress; - struct AcpiRsdpDescriptor { /* Root System Descriptor Pointer */ uint64_t signature; /* ACPI signature, contains "RSD PTR " */ uint8_t checksum; /* To make sum of struct == 0 */ @@ -87,104 +75,44 @@ struct AcpiTableHeader { } QEMU_PACKED; typedef struct AcpiTableHeader AcpiTableHeader; -/* - * ACPI Fixed ACPI Description Table (FADT) - */ -#define ACPI_FADT_COMMON_DEF /* FADT common definition */ \ - ACPI_TABLE_HEADER_DEF /* ACPI common table header */ \ - uint32_t firmware_ctrl; /* Physical address of FACS */ \ - uint32_t dsdt; /* Physical address of DSDT */ \ - uint8_t model; /* System Interrupt Model */ \ - uint8_t reserved1; /* Reserved */ \ - uint16_t sci_int; /* System vector of SCI interrupt */ \ - uint32_t smi_cmd; /* Port address of SMI command port */ \ - uint8_t acpi_enable; /* Value to write to smi_cmd to enable ACPI */ \ - uint8_t acpi_disable; /* Value to write to smi_cmd to disable ACPI */ \ - /* Value to write to SMI CMD to enter S4BIOS state */ \ - uint8_t S4bios_req; \ - uint8_t reserved2; /* Reserved - must be zero */ \ - /* Port address of Power Mgt 1a acpi_event Reg Blk */ \ - uint32_t pm1a_evt_blk; \ - /* Port address of Power Mgt 1b acpi_event Reg Blk */ \ - uint32_t pm1b_evt_blk; \ - uint32_t pm1a_cnt_blk; /* Port address of Power Mgt 1a Control Reg Blk */ \ - uint32_t pm1b_cnt_blk; /* Port address of Power Mgt 1b Control Reg Blk */ \ - uint32_t pm2_cnt_blk; /* Port address of Power Mgt 2 Control Reg Blk */ \ - uint32_t pm_tmr_blk; /* Port address of Power Mgt Timer Ctrl Reg Blk */ \ - /* Port addr of General Purpose acpi_event 0 Reg Blk */ \ - uint32_t gpe0_blk; \ - /* Port addr of General Purpose acpi_event 1 Reg Blk */ \ - uint32_t gpe1_blk; \ - uint8_t pm1_evt_len; /* Byte length of ports at pm1_x_evt_blk */ \ - uint8_t pm1_cnt_len; /* Byte length of ports at pm1_x_cnt_blk */ \ - uint8_t pm2_cnt_len; /* Byte Length of ports at pm2_cnt_blk */ \ - uint8_t pm_tmr_len; /* Byte Length of ports at pm_tm_blk */ \ - uint8_t gpe0_blk_len; /* Byte Length of ports at gpe0_blk */ \ - uint8_t gpe1_blk_len; /* Byte Length of ports at gpe1_blk */ \ - uint8_t gpe1_base; /* Offset in gpe model where gpe1 events start */ \ - uint8_t reserved3; /* Reserved */ \ - uint16_t plvl2_lat; /* Worst case HW latency to enter/exit C2 state */ \ - uint16_t plvl3_lat; /* Worst case HW latency to enter/exit C3 state */ \ - uint16_t flush_size; /* Size of area read to flush caches */ \ - uint16_t flush_stride; /* Stride used in flushing caches */ \ - uint8_t duty_offset; /* Bit location of duty cycle field in p_cnt reg */ \ - uint8_t duty_width; /* Bit width of duty cycle field in p_cnt reg */ \ - uint8_t day_alrm; /* Index to day-of-month 
alarm in RTC CMOS RAM */ \ - uint8_t mon_alrm; /* Index to month-of-year alarm in RTC CMOS RAM */ \ - uint8_t century; /* Index to century in RTC CMOS RAM */ \ - /* IA-PC Boot Architecture Flags (see below for individual flags) */ \ - uint16_t boot_flags; \ - uint8_t reserved; /* Reserved, must be zero */ \ - /* Miscellaneous flag bits (see below for individual flags) */ \ - uint32_t flags; \ - /* 64-bit address of the Reset register */ \ - struct AcpiGenericAddress reset_register; \ - /* Value to write to the reset_register port to reset the system */ \ - uint8_t reset_value; \ - /* ARM-Specific Boot Flags (see below for individual flags) (ACPI 5.1) */ \ - uint16_t arm_boot_flags; \ - uint8_t minor_revision; /* FADT Minor Revision (ACPI 5.1) */ \ - uint64_t x_facs; /* 64-bit physical address of FACS */ \ - uint64_t x_dsdt; /* 64-bit physical address of DSDT */ \ - /* 64-bit Extended Power Mgt 1a Event Reg Blk address */ \ - struct AcpiGenericAddress xpm1a_event_block; \ - /* 64-bit Extended Power Mgt 1b Event Reg Blk address */ \ - struct AcpiGenericAddress xpm1b_event_block; \ - /* 64-bit Extended Power Mgt 1a Control Reg Blk address */ \ - struct AcpiGenericAddress xpm1a_control_block; \ - /* 64-bit Extended Power Mgt 1b Control Reg Blk address */ \ - struct AcpiGenericAddress xpm1b_control_block; \ - /* 64-bit Extended Power Mgt 2 Control Reg Blk address */ \ - struct AcpiGenericAddress xpm2_control_block; \ - /* 64-bit Extended Power Mgt Timer Ctrl Reg Blk address */ \ - struct AcpiGenericAddress xpm_timer_block; \ - /* 64-bit Extended General Purpose Event 0 Reg Blk address */ \ - struct AcpiGenericAddress xgpe0_block; \ - /* 64-bit Extended General Purpose Event 1 Reg Blk address */ \ - struct AcpiGenericAddress xgpe1_block; \ - struct AcpiGenericAddress { uint8_t space_id; /* Address space where struct or register exists */ uint8_t bit_width; /* Size in bits of given register */ uint8_t bit_offset; /* Bit offset within the register */ - uint8_t access_width; /* Minimum Access size (ACPI 3.0) */ + uint8_t access_width; /* ACPI 3.0: Minimum Access size (ACPI 3.0), + ACPI 2.0: Reserved, Table 5-1 */ uint64_t address; /* 64-bit address of struct or register */ } QEMU_PACKED; -struct AcpiFadtDescriptorRev3 { - ACPI_FADT_COMMON_DEF -} QEMU_PACKED; -typedef struct AcpiFadtDescriptorRev3 AcpiFadtDescriptorRev3; - -struct AcpiFadtDescriptorRev5_1 { - ACPI_FADT_COMMON_DEF - /* 64-bit Sleep Control register (ACPI 5.0) */ - struct AcpiGenericAddress sleep_control; - /* 64-bit Sleep Status register (ACPI 5.0) */ - struct AcpiGenericAddress sleep_status; -} QEMU_PACKED; - -typedef struct AcpiFadtDescriptorRev5_1 AcpiFadtDescriptorRev5_1; +typedef struct AcpiFadtData { + struct AcpiGenericAddress pm1a_cnt; /* PM1a_CNT_BLK */ + struct AcpiGenericAddress pm1a_evt; /* PM1a_EVT_BLK */ + struct AcpiGenericAddress pm_tmr; /* PM_TMR_BLK */ + struct AcpiGenericAddress gpe0_blk; /* GPE0_BLK */ + struct AcpiGenericAddress reset_reg; /* RESET_REG */ + uint8_t reset_val; /* RESET_VALUE */ + uint8_t rev; /* Revision */ + uint32_t flags; /* Flags */ + uint32_t smi_cmd; /* SMI_CMD */ + uint16_t sci_int; /* SCI_INT */ + uint8_t int_model; /* INT_MODEL */ + uint8_t acpi_enable_cmd; /* ACPI_ENABLE */ + uint8_t acpi_disable_cmd; /* ACPI_DISABLE */ + uint8_t rtc_century; /* CENTURY */ + uint16_t plvl2_lat; /* P_LVL2_LAT */ + uint16_t plvl3_lat; /* P_LVL3_LAT */ + uint16_t arm_boot_arch; /* ARM_BOOT_ARCH */ + uint8_t minor_ver; /* FADT Minor Version */ + + /* + * respective tables offsets within 
ACPI_BUILD_TABLE_FILE, + * NULL if table doesn't exist (in that case field's value + * won't be patched by linker and will be kept set to 0) + */ + unsigned *facs_tbl_offset; /* FACS offset in */ + unsigned *dsdt_tbl_offset; + unsigned *xdsdt_tbl_offset; +} AcpiFadtData; #define ACPI_FADT_ARM_PSCI_COMPLIANT (1 << 0) #define ACPI_FADT_ARM_PSCI_USE_HVC (1 << 1) @@ -456,7 +384,7 @@ typedef struct AcpiGenericTimerTable AcpiGenericTimerTable; struct Acpi20Hpet { ACPI_TABLE_HEADER_DEF /* ACPI common table header */ uint32_t timer_block_id; - Acpi20GenericAddress addr; + struct AcpiGenericAddress addr; uint8_t hpet_number; uint16_t min_tick; uint8_t page_protect; diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h index 88d0738d76..6c36903c0a 100644 --- a/include/hw/acpi/aml-build.h +++ b/include/hw/acpi/aml-build.h @@ -78,6 +78,15 @@ typedef enum { } AmlUpdateRule; typedef enum { + AML_AS_SYSTEM_MEMORY = 0X00, + AML_AS_SYSTEM_IO = 0X01, + AML_AS_PCI_CONFIG = 0X02, + AML_AS_EMBEDDED_CTRL = 0X03, + AML_AS_SMBUS = 0X04, + AML_AS_FFH = 0X7F, +} AmlAddressSpace; + +typedef enum { AML_SYSTEM_MEMORY = 0X00, AML_SYSTEM_IO = 0X01, AML_PCI_CONFIG = 0X02, @@ -389,8 +398,22 @@ int build_append_named_dword(GArray *array, const char *name_format, ...) GCC_FMT_ATTR(2, 3); +void build_append_gas(GArray *table, AmlAddressSpace as, + uint8_t bit_width, uint8_t bit_offset, + uint8_t access_width, uint64_t address); + +static inline void +build_append_gas_from_struct(GArray *table, const struct AcpiGenericAddress *s) +{ + build_append_gas(table, s->space_id, s->bit_width, s->bit_offset, + s->access_width, s->address); +} + void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base, uint64_t len, int node, MemoryAffinityFlags flags); void build_slit(GArray *table_data, BIOSLinker *linker); + +void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, + const char *oem_id, const char *oem_table_id); #endif diff --git a/include/hw/isa/apm.h b/include/hw/isa/apm.h index 4839ff1df2..b7098bf7ca 100644 --- a/include/hw/isa/apm.h +++ b/include/hw/isa/apm.h @@ -5,6 +5,9 @@ #include "hw/hw.h" #include "exec/memory.h" +#define APM_CNT_IOPORT 0xb2 +#define ACPI_PORT_SMI_CMD APM_CNT_IOPORT + typedef void (*apm_ctrl_changed_t)(uint32_t val, void *arg); typedef struct APMState { diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index d83b957829..1fc479281c 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -93,7 +93,7 @@ uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp); -int qmp_pc_dimm_device_list(Object *obj, void *opaque); +MemoryDeviceInfoList *qmp_pc_dimm_device_list(void); uint64_t pc_existing_dimms_capacity(Error **errp); uint64_t get_plugged_memory_size(void); void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index e255941b5a..a9c3ee5aa2 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -217,7 +217,6 @@ typedef struct PCIDeviceClass { DeviceClass parent_class; void (*realize)(PCIDevice *dev, Error **errp); - int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */ PCIUnregisterFunc *exit; PCIConfigReadFunc *config_read; PCIConfigWriteFunc *config_write; diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index b81b6a4624..02484dc94c 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -38,6 +38,9 @@ 
typedef struct virtio_net_conf uint16_t rx_queue_size; uint16_t tx_queue_size; uint16_t mtu; + int32_t speed; + char *duplex_str; + uint8_t duplex; } virtio_net_conf; /* Maximum packet size we can receive from tap device: header + 64k */ @@ -67,7 +70,7 @@ typedef struct VirtIONet { uint32_t has_vnet_hdr; size_t host_hdr_len; size_t guest_hdr_len; - uint32_t host_features; + uint64_t host_features; uint8_t has_ufo; uint32_t mergeable_rx_bufs; uint8_t promisc; diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h new file mode 100644 index 0000000000..94aacb7adf --- /dev/null +++ b/include/standard-headers/linux/ethtool.h @@ -0,0 +1,1821 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ethtool.h: Defines for Linux ethtool. + * + * Copyright (C) 1998 David S. Miller (davem@redhat.com) + * Copyright 2001 Jeff Garzik <jgarzik@pobox.com> + * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) + * Portions Copyright 2002 Intel (eli.kupermann@intel.com, + * christopher.leech@intel.com, + * scott.feldman@intel.com) + * Portions Copyright (C) Sun Microsystems 2008 + */ + +#ifndef _LINUX_ETHTOOL_H +#define _LINUX_ETHTOOL_H + +#include "net/eth.h" + +#include "standard-headers/linux/kernel.h" +#include "standard-headers/linux/types.h" +#include "standard-headers/linux/if_ether.h" + +#include <limits.h> /* for INT_MAX */ + +/* All structures exposed to userland should be defined such that they + * have the same layout for 32-bit and 64-bit userland. + */ + +/** + * struct ethtool_cmd - DEPRECATED, link control and status + * This structure is DEPRECATED, please use struct ethtool_link_settings. + * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET + * @supported: Bitmask of %SUPPORTED_* flags for the link modes, + * physical connectors and other link features for which the + * interface supports autonegotiation or auto-detection. + * Read-only. + * @advertising: Bitmask of %ADVERTISED_* flags for the link modes, + * physical connectors and other link features that are + * advertised through autonegotiation or enabled for + * auto-detection. + * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @transceiver: Historically used to distinguish different possible + * PHY types, but not in a consistent way. Deprecated. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @maxtxpkt: Historically used to report TX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @maxrxpkt: Historically used to report RX IRQ coalescing; now + * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. + * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. 
+ * When written successfully, the link should be renegotiated if + * necessary. + * @lp_advertising: Bitmask of %ADVERTISED_* flags for the link modes + * and other link features that the link partner advertised + * through autonegotiation; 0 if unknown or not applicable. + * Read-only. + * + * The link speed in Mbps is split between @speed and @speed_hi. Use + * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to + * access it. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GSET to get the current values before making specific + * changes and then applying them with %ETHTOOL_SSET. + * + * Drivers that implement set_settings() should validate all fields + * other than @cmd that are not described as read-only or deprecated, + * and must ignore all fields described as read-only. + * + * Deprecated fields should be ignored by both users and drivers. + */ +struct ethtool_cmd { + uint32_t cmd; + uint32_t supported; + uint32_t advertising; + uint16_t speed; + uint8_t duplex; + uint8_t port; + uint8_t phy_address; + uint8_t transceiver; + uint8_t autoneg; + uint8_t mdio_support; + uint32_t maxtxpkt; + uint32_t maxrxpkt; + uint16_t speed_hi; + uint8_t eth_tp_mdix; + uint8_t eth_tp_mdix_ctrl; + uint32_t lp_advertising; + uint32_t reserved[2]; +}; + +static inline void ethtool_cmd_speed_set(struct ethtool_cmd *ep, + uint32_t speed) +{ + ep->speed = (uint16_t)(speed & 0xFFFF); + ep->speed_hi = (uint16_t)(speed >> 16); +} + +static inline uint32_t ethtool_cmd_speed(const struct ethtool_cmd *ep) +{ + return (ep->speed_hi << 16) | ep->speed; +} + +/* Device supports clause 22 register access to PHY or peripherals + * using the interface defined in "standard-headers/linux/mii.h". This should not be + * set if there are known to be no such peripherals present or if + * the driver only emulates clause 22 registers for compatibility. + */ +#define ETH_MDIO_SUPPORTS_C22 1 + +/* Device supports clause 45 register access to PHY or peripherals + * using the interface defined in "standard-headers/linux/mii.h" and <linux/mdio.h>. + * This should not be set if there are known to be no such peripherals + * present. + */ +#define ETH_MDIO_SUPPORTS_C45 2 + +#define ETHTOOL_FWVERS_LEN 32 +#define ETHTOOL_BUSINFO_LEN 32 +#define ETHTOOL_EROMVERS_LEN 32 + +/** + * struct ethtool_drvinfo - general driver and device information + * @cmd: Command number = %ETHTOOL_GDRVINFO + * @driver: Driver short name. This should normally match the name + * in its bus driver structure (e.g. pci_driver::name). Must + * not be an empty string. 
+ * @version: Driver version string; may be an empty string
+ * @fw_version: Firmware version string; may be an empty string
+ * @erom_version: Expansion ROM version string; may be an empty string
+ * @bus_info: Device bus address.  This should match the dev_name()
+ *	string for the underlying bus device, if there is one.  May be
+ *	an empty string.
+ * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and
+ *	%ETHTOOL_SPFLAGS commands; also the number of strings in the
+ *	%ETH_SS_PRIV_FLAGS set
+ * @n_stats: Number of uint64_t statistics returned by the %ETHTOOL_GSTATS
+ *	command; also the number of strings in the %ETH_SS_STATS set
+ * @testinfo_len: Number of results returned by the %ETHTOOL_TEST
+ *	command; also the number of strings in the %ETH_SS_TEST set
+ * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM
+ *	and %ETHTOOL_SEEPROM commands, in bytes
+ * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS
+ *	command, in bytes
+ *
+ * Users can use the %ETHTOOL_GSSET_INFO command to get the number of
+ * strings in any string set (from Linux 2.6.34).
+ *
+ * Drivers should set at most @driver, @version, @fw_version and
+ * @bus_info in their get_drvinfo() implementation.  The ethtool
+ * core fills in the other fields using other driver operations.
+ */
+struct ethtool_drvinfo {
+	uint32_t	cmd;
+	char	driver[32];
+	char	version[32];
+	char	fw_version[ETHTOOL_FWVERS_LEN];
+	char	bus_info[ETHTOOL_BUSINFO_LEN];
+	char	erom_version[ETHTOOL_EROMVERS_LEN];
+	char	reserved2[12];
+	uint32_t	n_priv_flags;
+	uint32_t	n_stats;
+	uint32_t	testinfo_len;
+	uint32_t	eedump_len;
+	uint32_t	regdump_len;
+};
+
+#define SOPASS_MAX	6
+
+/**
+ * struct ethtool_wolinfo - Wake-On-Lan configuration
+ * @cmd: Command number = %ETHTOOL_GWOL or %ETHTOOL_SWOL
+ * @supported: Bitmask of %WAKE_* flags for supported Wake-On-Lan modes.
+ *	Read-only.
+ * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes.
+ * @sopass: SecureOn(tm) password; meaningful only if %WAKE_MAGICSECURE
+ *	is set in @wolopts.
+ */
+struct ethtool_wolinfo {
+	uint32_t	cmd;
+	uint32_t	supported;
+	uint32_t	wolopts;
+	uint8_t	sopass[SOPASS_MAX];
+};
+
+/* for passing single values */
+struct ethtool_value {
+	uint32_t	cmd;
+	uint32_t	data;
+};
+
+enum tunable_id {
+	ETHTOOL_ID_UNSPEC,
+	ETHTOOL_RX_COPYBREAK,
+	ETHTOOL_TX_COPYBREAK,
+	/*
+	 * Add your fresh new tunable attribute above and remember to update
+	 * tunable_strings[] in net/core/ethtool.c
+	 */
+	__ETHTOOL_TUNABLE_COUNT,
+};
+
+enum tunable_type_id {
+	ETHTOOL_TUNABLE_UNSPEC,
+	ETHTOOL_TUNABLE_U8,
+	ETHTOOL_TUNABLE_U16,
+	ETHTOOL_TUNABLE_U32,
+	ETHTOOL_TUNABLE_U64,
+	ETHTOOL_TUNABLE_STRING,
+	ETHTOOL_TUNABLE_S8,
+	ETHTOOL_TUNABLE_S16,
+	ETHTOOL_TUNABLE_S32,
+	ETHTOOL_TUNABLE_S64,
+};
+
+struct ethtool_tunable {
+	uint32_t	cmd;
+	uint32_t	id;
+	uint32_t	type_id;
+	uint32_t	len;
+	void	*data[0];
+};
+
+#define DOWNSHIFT_DEV_DEFAULT_COUNT	0xff
+#define DOWNSHIFT_DEV_DISABLE		0
+
+enum phy_tunable_id {
+	ETHTOOL_PHY_ID_UNSPEC,
+	ETHTOOL_PHY_DOWNSHIFT,
+	/*
+	 * Add your fresh new phy tunable attribute above and remember to update
+	 * phy_tunable_strings[] in net/core/ethtool.c
+	 */
+	__ETHTOOL_PHY_TUNABLE_COUNT,
+};
+
+/**
+ * struct ethtool_regs - hardware register dump
+ * @cmd: Command number = %ETHTOOL_GREGS
+ * @version: Dump format version.  This is driver-specific and may
+ *	distinguish different chips/revisions.  Drivers must use new
+ *	version numbers whenever the dump format changes in an
+ *	incompatible way.
+ * @len: On entry, the real length of @data. On return, the number of + * bytes used. + * @data: Buffer for the register dump + * + * Users should use %ETHTOOL_GDRVINFO to find the maximum length of + * a register dump for the interface. They must allocate the buffer + * immediately following this structure. + */ +struct ethtool_regs { + uint32_t cmd; + uint32_t version; + uint32_t len; + uint8_t data[0]; +}; + +/** + * struct ethtool_eeprom - EEPROM dump + * @cmd: Command number = %ETHTOOL_GEEPROM, %ETHTOOL_GMODULEEEPROM or + * %ETHTOOL_SEEPROM + * @magic: A 'magic cookie' value to guard against accidental changes. + * The value passed in to %ETHTOOL_SEEPROM must match the value + * returned by %ETHTOOL_GEEPROM for the same device. This is + * unused when @cmd is %ETHTOOL_GMODULEEEPROM. + * @offset: Offset within the EEPROM to begin reading/writing, in bytes + * @len: On entry, number of bytes to read/write. On successful + * return, number of bytes actually read/written. In case of + * error, this may indicate at what point the error occurred. + * @data: Buffer to read/write from + * + * Users may use %ETHTOOL_GDRVINFO or %ETHTOOL_GMODULEINFO to find + * the length of an on-board or module EEPROM, respectively. They + * must allocate the buffer immediately following this structure. + */ +struct ethtool_eeprom { + uint32_t cmd; + uint32_t magic; + uint32_t offset; + uint32_t len; + uint8_t data[0]; +}; + +/** + * struct ethtool_eee - Energy Efficient Ethernet information + * @cmd: ETHTOOL_{G,S}EEE + * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations + * for which there is EEE support. + * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations + * advertised as eee capable. + * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex + * combinations advertised by the link partner as eee capable. + * @eee_active: Result of the eee auto negotiation. + * @eee_enabled: EEE configured mode (enabled/disabled). + * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given + * that eee was negotiated. + * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting + * its tx lpi (after reaching 'idle' state). Effective only when eee + * was negotiated and tx_lpi_enabled was set. + */ +struct ethtool_eee { + uint32_t cmd; + uint32_t supported; + uint32_t advertised; + uint32_t lp_advertised; + uint32_t eee_active; + uint32_t eee_enabled; + uint32_t tx_lpi_enabled; + uint32_t tx_lpi_timer; + uint32_t reserved[2]; +}; + +/** + * struct ethtool_modinfo - plugin module eeprom information + * @cmd: %ETHTOOL_GMODULEINFO + * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx + * @eeprom_len: Length of the eeprom + * + * This structure is used to return the information to + * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM. + * The type code indicates the eeprom data format + */ +struct ethtool_modinfo { + uint32_t cmd; + uint32_t type; + uint32_t eeprom_len; + uint32_t reserved[8]; +}; + +/** + * struct ethtool_coalesce - coalescing parameters for IRQs and stats updates + * @cmd: ETHTOOL_{G,S}COALESCE + * @rx_coalesce_usecs: How many usecs to delay an RX interrupt after + * a packet arrives. + * @rx_max_coalesced_frames: Maximum number of packets to receive + * before an RX interrupt. + * @rx_coalesce_usecs_irq: Same as @rx_coalesce_usecs, except that + * this value applies while an IRQ is being serviced by the host. 
+ * @rx_max_coalesced_frames_irq: Same as @rx_max_coalesced_frames,
+ *	except that this value applies while an IRQ is being serviced
+ *	by the host.
+ * @tx_coalesce_usecs: How many usecs to delay a TX interrupt after
+ *	a packet is sent.
+ * @tx_max_coalesced_frames: Maximum number of packets to be sent
+ *	before a TX interrupt.
+ * @tx_coalesce_usecs_irq: Same as @tx_coalesce_usecs, except that
+ *	this value applies while an IRQ is being serviced by the host.
+ * @tx_max_coalesced_frames_irq: Same as @tx_max_coalesced_frames,
+ *	except that this value applies while an IRQ is being serviced
+ *	by the host.
+ * @stats_block_coalesce_usecs: How many usecs to delay in-memory
+ *	statistics block updates.  Some drivers do not have an
+ *	in-memory statistic block, and in such cases this value is
+ *	ignored.  This value must not be zero.
+ * @use_adaptive_rx_coalesce: Enable adaptive RX coalescing.
+ * @use_adaptive_tx_coalesce: Enable adaptive TX coalescing.
+ * @pkt_rate_low: Threshold for low packet rate (packets per second).
+ * @rx_coalesce_usecs_low: How many usecs to delay an RX interrupt after
+ *	a packet arrives, when the packet rate is below @pkt_rate_low.
+ * @rx_max_coalesced_frames_low: Maximum number of packets to be received
+ *	before an RX interrupt, when the packet rate is below @pkt_rate_low.
+ * @tx_coalesce_usecs_low: How many usecs to delay a TX interrupt after
+ *	a packet is sent, when the packet rate is below @pkt_rate_low.
+ * @tx_max_coalesced_frames_low: Maximum number of packets to be sent before
+ *	a TX interrupt, when the packet rate is below @pkt_rate_low.
+ * @pkt_rate_high: Threshold for high packet rate (packets per second).
+ * @rx_coalesce_usecs_high: How many usecs to delay an RX interrupt after
+ *	a packet arrives, when the packet rate is above @pkt_rate_high.
+ * @rx_max_coalesced_frames_high: Maximum number of packets to be received
+ *	before an RX interrupt, when the packet rate is above @pkt_rate_high.
+ * @tx_coalesce_usecs_high: How many usecs to delay a TX interrupt after
+ *	a packet is sent, when the packet rate is above @pkt_rate_high.
+ * @tx_max_coalesced_frames_high: Maximum number of packets to be sent before
+ *	a TX interrupt, when the packet rate is above @pkt_rate_high.
+ * @rate_sample_interval: How often to do adaptive coalescing packet rate
+ *	sampling, measured in seconds.  Must not be zero.
+ *
+ * Each pair of (usecs, max_frames) fields specifies that interrupts
+ * should be coalesced until
+ *	(usecs > 0 && time_since_first_completion >= usecs) ||
+ *	(max_frames > 0 && completed_frames >= max_frames)
+ *
+ * It is illegal to set both usecs and max_frames to zero as this
+ * would cause interrupts to never be generated.  To disable
+ * coalescing, set usecs = 0 and max_frames = 1.
+ *
+ * Some implementations ignore the value of max_frames and use the
+ * condition time_since_first_completion >= usecs
+ *
+ * This is deprecated.  Drivers for hardware that does not support
+ * counting completions should validate that max_frames == !rx_usecs.
+ *
+ * Adaptive RX/TX coalescing is an algorithm implemented by some
+ * drivers to improve latency under low packet rates and improve
+ * throughput under high packet rates.  Some drivers only implement
+ * one of RX or TX adaptive coalescing.  Anything not implemented by
+ * the driver causes these values to be silently ignored.
+ *
+ * When the packet rate is below @pkt_rate_high but above
+ * @pkt_rate_low (both measured in packets per second) the
+ * normal {rx,tx}_* coalescing parameters are used.
+ */
+struct ethtool_coalesce {
+	uint32_t	cmd;
+	uint32_t	rx_coalesce_usecs;
+	uint32_t	rx_max_coalesced_frames;
+	uint32_t	rx_coalesce_usecs_irq;
+	uint32_t	rx_max_coalesced_frames_irq;
+	uint32_t	tx_coalesce_usecs;
+	uint32_t	tx_max_coalesced_frames;
+	uint32_t	tx_coalesce_usecs_irq;
+	uint32_t	tx_max_coalesced_frames_irq;
+	uint32_t	stats_block_coalesce_usecs;
+	uint32_t	use_adaptive_rx_coalesce;
+	uint32_t	use_adaptive_tx_coalesce;
+	uint32_t	pkt_rate_low;
+	uint32_t	rx_coalesce_usecs_low;
+	uint32_t	rx_max_coalesced_frames_low;
+	uint32_t	tx_coalesce_usecs_low;
+	uint32_t	tx_max_coalesced_frames_low;
+	uint32_t	pkt_rate_high;
+	uint32_t	rx_coalesce_usecs_high;
+	uint32_t	rx_max_coalesced_frames_high;
+	uint32_t	tx_coalesce_usecs_high;
+	uint32_t	tx_max_coalesced_frames_high;
+	uint32_t	rate_sample_interval;
+};
+
+/**
+ * struct ethtool_ringparam - RX/TX ring parameters
+ * @cmd: Command number = %ETHTOOL_GRINGPARAM or %ETHTOOL_SRINGPARAM
+ * @rx_max_pending: Maximum supported number of pending entries per
+ *	RX ring.  Read-only.
+ * @rx_mini_max_pending: Maximum supported number of pending entries
+ *	per RX mini ring.  Read-only.
+ * @rx_jumbo_max_pending: Maximum supported number of pending entries
+ *	per RX jumbo ring.  Read-only.
+ * @tx_max_pending: Maximum supported number of pending entries per
+ *	TX ring.  Read-only.
+ * @rx_pending: Current maximum number of pending entries per RX ring
+ * @rx_mini_pending: Current maximum number of pending entries per RX
+ *	mini ring
+ * @rx_jumbo_pending: Current maximum number of pending entries per RX
+ *	jumbo ring
+ * @tx_pending: Current maximum supported number of pending entries
+ *	per TX ring
+ *
+ * If the interface does not have separate RX mini and/or jumbo rings,
+ * @rx_mini_max_pending and/or @rx_jumbo_max_pending will be 0.
+ *
+ * There may also be driver-dependent minimum values for the number
+ * of entries per ring.
+ */
+struct ethtool_ringparam {
+	uint32_t	cmd;
+	uint32_t	rx_max_pending;
+	uint32_t	rx_mini_max_pending;
+	uint32_t	rx_jumbo_max_pending;
+	uint32_t	tx_max_pending;
+	uint32_t	rx_pending;
+	uint32_t	rx_mini_pending;
+	uint32_t	rx_jumbo_pending;
+	uint32_t	tx_pending;
+};
+
+/**
+ * struct ethtool_channels - configuring number of network channels
+ * @cmd: ETHTOOL_{G,S}CHANNELS
+ * @max_rx: Read only. Maximum number of receive channels the driver supports.
+ * @max_tx: Read only. Maximum number of transmit channels the driver supports.
+ * @max_other: Read only. Maximum number of other channels the driver supports.
+ * @max_combined: Read only. Maximum number of combined channels the driver
+ *	supports. Set of queues RX, TX or other.
+ * @rx_count: Valid values are in the range 1 to the max_rx.
+ * @tx_count: Valid values are in the range 1 to the max_tx.
+ * @other_count: Valid values are in the range 1 to the max_other.
+ * @combined_count: Valid values are in the range 1 to the max_combined.
+ *
+ * This can be used to configure RX, TX and other channels.
+ */
+
+struct ethtool_channels {
+	uint32_t	cmd;
+	uint32_t	max_rx;
+	uint32_t	max_tx;
+	uint32_t	max_other;
+	uint32_t	max_combined;
+	uint32_t	rx_count;
+	uint32_t	tx_count;
+	uint32_t	other_count;
+	uint32_t	combined_count;
+};
+
+/**
+ * struct ethtool_pauseparam - Ethernet pause (flow control) parameters
+ * @cmd: Command number = %ETHTOOL_GPAUSEPARAM or %ETHTOOL_SPAUSEPARAM
+ * @autoneg: Flag to enable autonegotiation of pause frame use
+ * @rx_pause: Flag to enable reception of pause frames
+ * @tx_pause: Flag to enable transmission of pause frames
+ *
+ * Drivers should reject a non-zero setting of @autoneg when
+ * autonegotiation is disabled (or not supported) for the link.
+ *
+ * If the link is autonegotiated, drivers should use
+ * mii_advertise_flowctrl() or similar code to set the advertised
+ * pause frame capabilities based on the @rx_pause and @tx_pause flags,
+ * even if @autoneg is zero.  They should also allow the advertised
+ * pause frame capabilities to be controlled directly through the
+ * advertising field of &struct ethtool_cmd.
+ *
+ * If @autoneg is non-zero, the MAC is configured to send and/or
+ * receive pause frames according to the result of autonegotiation.
+ * Otherwise, it is configured directly based on the @rx_pause and
+ * @tx_pause flags.
+ */
+struct ethtool_pauseparam {
+	uint32_t	cmd;
+	uint32_t	autoneg;
+	uint32_t	rx_pause;
+	uint32_t	tx_pause;
+};
+
+#define ETH_GSTRING_LEN	32
+
+/**
+ * enum ethtool_stringset - string set ID
+ * @ETH_SS_TEST: Self-test result names, for use with %ETHTOOL_TEST
+ * @ETH_SS_STATS: Statistic names, for use with %ETHTOOL_GSTATS
+ * @ETH_SS_PRIV_FLAGS: Driver private flag names, for use with
+ *	%ETHTOOL_GPFLAGS and %ETHTOOL_SPFLAGS
+ * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE;
+ *	now deprecated
+ * @ETH_SS_FEATURES: Device feature names
+ * @ETH_SS_RSS_HASH_FUNCS: RSS hash function names
+ * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS
+ * @ETH_SS_PHY_TUNABLES: PHY tunable names
+ */
+enum ethtool_stringset {
+	ETH_SS_TEST		= 0,
+	ETH_SS_STATS,
+	ETH_SS_PRIV_FLAGS,
+	ETH_SS_NTUPLE_FILTERS,
+	ETH_SS_FEATURES,
+	ETH_SS_RSS_HASH_FUNCS,
+	ETH_SS_TUNABLES,
+	ETH_SS_PHY_STATS,
+	ETH_SS_PHY_TUNABLES,
+};
+
+/**
+ * struct ethtool_gstrings - string set for data tagging
+ * @cmd: Command number = %ETHTOOL_GSTRINGS
+ * @string_set: String set ID; one of &enum ethtool_stringset
+ * @len: On return, the number of strings in the string set
+ * @data: Buffer for strings.  Each string is null-padded to a size of
+ *	%ETH_GSTRING_LEN.
+ *
+ * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in
+ * the string set.  They must allocate a buffer of the appropriate
+ * size immediately following this structure.
+ */
+struct ethtool_gstrings {
+	uint32_t	cmd;
+	uint32_t	string_set;
+	uint32_t	len;
+	uint8_t	data[0];
+};
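
As the comment above prescribes, callers size the %ETHTOOL_GSTRINGS buffer at len * ETH_GSTRING_LEN bytes placed directly after the header. A small illustrative helper; the ETHTOOL_GSTRINGS command number is defined further down in this header, and error handling is elided:

#include <stdint.h>
#include <stdlib.h>

static struct ethtool_gstrings *alloc_gstrings(uint32_t string_set,
                                               uint32_t count)
{
    /* header plus count null-padded strings of ETH_GSTRING_LEN each */
    struct ethtool_gstrings *gs =
        calloc(1, sizeof(*gs) + (size_t)count * ETH_GSTRING_LEN);

    if (gs) {
        gs->cmd = ETHTOOL_GSTRINGS;   /* command number, defined below */
        gs->string_set = string_set;  /* e.g. ETH_SS_STATS */
        gs->len = count;              /* driver overwrites this on return */
    }
    return gs;
}
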
+
+/**
+ * struct ethtool_sset_info - string set information
+ * @cmd: Command number = %ETHTOOL_GSSET_INFO
+ * @sset_mask: On entry, a bitmask of string sets to query, with bits
+ *	numbered according to &enum ethtool_stringset.  On return, a
+ *	bitmask of those string sets queried that are supported.
+ * @data: Buffer for string set sizes.  On return, this contains the
+ *	size of each string set that was queried and supported, in
+ *	order of ID.
+ *
+ * Example: The user passes in @sset_mask = 0x7 (sets 0, 1, 2) and on
+ * return @sset_mask == 0x6 (sets 1, 2).  Then @data[0] contains the
+ * size of set 1 and @data[1] contains the size of set 2.
+ *
+ * Users must allocate a buffer of the appropriate size (4 * number of
+ * sets queried) immediately following this structure.
+ */
+struct ethtool_sset_info {
+	uint32_t	cmd;
+	uint32_t	reserved;
+	uint64_t	sset_mask;
+	uint32_t	data[0];
+};
+
+/**
+ * enum ethtool_test_flags - flags definition of ethtool_test
+ * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise
+ *	only online tests.
+ * @ETH_TEST_FL_FAILED: Driver set this flag if test fails.
+ * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback
+ *	test.
+ * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test
+ */
+
+enum ethtool_test_flags {
+	ETH_TEST_FL_OFFLINE	= (1 << 0),
+	ETH_TEST_FL_FAILED	= (1 << 1),
+	ETH_TEST_FL_EXTERNAL_LB	= (1 << 2),
+	ETH_TEST_FL_EXTERNAL_LB_DONE	= (1 << 3),
+};
+
+/**
+ * struct ethtool_test - device self-test invocation
+ * @cmd: Command number = %ETHTOOL_TEST
+ * @flags: A bitmask of flags from &enum ethtool_test_flags.  Some
+ *	flags may be set by the user on entry; others may be set by
+ *	the driver on return.
+ * @len: On return, the number of test results
+ * @data: Array of test results
+ *
+ * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the
+ * number of test results that will be returned.  They must allocate a
+ * buffer of the appropriate size (8 * number of results) immediately
+ * following this structure.
+ */
+struct ethtool_test {
+	uint32_t	cmd;
+	uint32_t	flags;
+	uint32_t	reserved;
+	uint32_t	len;
+	uint64_t	data[0];
+};
+
+/**
+ * struct ethtool_stats - device-specific statistics
+ * @cmd: Command number = %ETHTOOL_GSTATS
+ * @n_stats: On return, the number of statistics
+ * @data: Array of statistics
+ *
+ * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the
+ * number of statistics that will be returned.  They must allocate a
+ * buffer of the appropriate size (8 * number of statistics)
+ * immediately following this structure.
+ */
+struct ethtool_stats {
+	uint32_t	cmd;
+	uint32_t	n_stats;
+	uint64_t	data[0];
+};
+
+/**
+ * struct ethtool_perm_addr - permanent hardware address
+ * @cmd: Command number = %ETHTOOL_GPERMADDR
+ * @size: On entry, the size of the buffer.  On return, the size of the
+ *	address.  The command fails if the buffer is too small.
+ * @data: Buffer for the address
+ *
+ * Users must allocate the buffer immediately following this structure.
+ * A buffer size of %MAX_ADDR_LEN should be sufficient for any address
+ * type.
+ */
+struct ethtool_perm_addr {
+	uint32_t	cmd;
+	uint32_t	size;
+	uint8_t	data[0];
+};
+
+/* boolean flags controlling per-interface behavior characteristics.
+ * When reading, the flag indicates whether or not a certain behavior
+ * is enabled/present.  When writing, the flag indicates whether
+ * or not the driver should turn on (set) or off (clear) a behavior.
+ *
+ * Some behaviors may be read-only (unconditionally absent or present).
+ * If such is the case, return EINVAL in the set-flags operation if the
+ * flag differs from the read-only value.
+ */
+enum ethtool_flags {
+	ETH_FLAG_TXVLAN		= (1 << 7),	/* TX VLAN offload enabled */
+	ETH_FLAG_RXVLAN		= (1 << 8),	/* RX VLAN offload enabled */
+	ETH_FLAG_LRO		= (1 << 15),	/* LRO is enabled */
+	ETH_FLAG_NTUPLE		= (1 << 27),	/* N-tuple filters enabled */
+	ETH_FLAG_RXHASH		= (1 << 28),
+};
+
+/* The following structures are for supporting RX network flow
+ * classification and RX n-tuple configuration. Note, all multibyte
+ * fields, e.g., ip4src, ip4dst, psrc, pdst, spi, etc.
are expected to + * be in network byte order. + */ + +/** + * struct ethtool_tcpip4_spec - flow specification for TCP/IPv4 etc. + * @ip4src: Source host + * @ip4dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tos: Type-of-service + * + * This can be used to specify a TCP/IPv4, UDP/IPv4 or SCTP/IPv4 flow. + */ +struct ethtool_tcpip4_spec { + uint32_t ip4src; + uint32_t ip4dst; + uint16_t psrc; + uint16_t pdst; + uint8_t tos; +}; + +/** + * struct ethtool_ah_espip4_spec - flow specification for IPsec/IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @spi: Security parameters index + * @tos: Type-of-service + * + * This can be used to specify an IPsec transport or tunnel over IPv4. + */ +struct ethtool_ah_espip4_spec { + uint32_t ip4src; + uint32_t ip4dst; + uint32_t spi; + uint8_t tos; +}; + +#define ETH_RX_NFC_IP4 1 + +/** + * struct ethtool_usrip4_spec - general flow specification for IPv4 + * @ip4src: Source host + * @ip4dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tos: Type-of-service + * @ip_ver: Value must be %ETH_RX_NFC_IP4; mask must be 0 + * @proto: Transport protocol number; mask must be 0 + */ +struct ethtool_usrip4_spec { + uint32_t ip4src; + uint32_t ip4dst; + uint32_t l4_4_bytes; + uint8_t tos; + uint8_t ip_ver; + uint8_t proto; +}; + +/** + * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc. + * @ip6src: Source host + * @ip6dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tclass: Traffic Class + * + * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow. + */ +struct ethtool_tcpip6_spec { + uint32_t ip6src[4]; + uint32_t ip6dst[4]; + uint16_t psrc; + uint16_t pdst; + uint8_t tclass; +}; + +/** + * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @spi: Security parameters index + * @tclass: Traffic Class + * + * This can be used to specify an IPsec transport or tunnel over IPv6. 
+ */
+struct ethtool_ah_espip6_spec {
+ uint32_t ip6src[4];
+ uint32_t ip6dst[4];
+ uint32_t spi;
+ uint8_t tclass;
+};
+
+/**
+ * struct ethtool_usrip6_spec - general flow specification for IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @l4_4_bytes: First 4 bytes of transport (layer 4) header
+ * @tclass: Traffic Class
+ * @l4_proto: Transport protocol number (nexthdr after any Extension Headers)
+ */
+struct ethtool_usrip6_spec {
+ uint32_t ip6src[4];
+ uint32_t ip6dst[4];
+ uint32_t l4_4_bytes;
+ uint8_t tclass;
+ uint8_t l4_proto;
+};
+
+union ethtool_flow_union {
+ struct ethtool_tcpip4_spec tcp_ip4_spec;
+ struct ethtool_tcpip4_spec udp_ip4_spec;
+ struct ethtool_tcpip4_spec sctp_ip4_spec;
+ struct ethtool_ah_espip4_spec ah_ip4_spec;
+ struct ethtool_ah_espip4_spec esp_ip4_spec;
+ struct ethtool_usrip4_spec usr_ip4_spec;
+ struct ethtool_tcpip6_spec tcp_ip6_spec;
+ struct ethtool_tcpip6_spec udp_ip6_spec;
+ struct ethtool_tcpip6_spec sctp_ip6_spec;
+ struct ethtool_ah_espip6_spec ah_ip6_spec;
+ struct ethtool_ah_espip6_spec esp_ip6_spec;
+ struct ethtool_usrip6_spec usr_ip6_spec;
+ struct eth_header ether_spec;
+ uint8_t hdata[52];
+};
+
+/**
+ * struct ethtool_flow_ext - additional RX flow fields
+ * @h_dest: destination MAC address
+ * @vlan_etype: VLAN EtherType
+ * @vlan_tci: VLAN tag control information
+ * @data: user defined data
+ *
+ * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT
+ * is set in &struct ethtool_rx_flow_spec @flow_type.
+ * @h_dest is valid if %FLOW_MAC_EXT is set.
+ */
+struct ethtool_flow_ext {
+ uint8_t padding[2];
+ unsigned char h_dest[ETH_ALEN];
+ uint16_t vlan_etype;
+ uint16_t vlan_tci;
+ uint32_t data[2];
+};
+
+/**
+ * struct ethtool_rx_flow_spec - classification rule for RX flows
+ * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
+ * @h_u: Flow fields to match (dependent on @flow_type)
+ * @h_ext: Additional fields to match
+ * @m_u: Masks for flow field bits to be matched
+ * @m_ext: Masks for additional field bits to be matched
+ * Note, all additional fields must be ignored unless @flow_type
+ * includes the %FLOW_EXT or %FLOW_MAC_EXT flag
+ * (see &struct ethtool_flow_ext description).
+ * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC
+ * if packets should be discarded
+ * @location: Location of rule in the table. Locations must be
+ * numbered such that a flow matching multiple rules will be
+ * classified according to the first (lowest numbered) rule.
+ */
+struct ethtool_rx_flow_spec {
+ uint32_t flow_type;
+ union ethtool_flow_union h_u;
+ struct ethtool_flow_ext h_ext;
+ union ethtool_flow_union m_u;
+ struct ethtool_flow_ext m_ext;
+ uint64_t ring_cookie;
+ uint32_t location;
+};
+
+/* How rings are laid out when accessing virtual functions or
+ * offloaded queues is device specific. To allow users to do flow
+ * steering and specify these queues, the ring cookie is partitioned
+ * into a 32-bit queue index with an 8-bit virtual function id.
+ * This also leaves 3 bytes for further specifiers. It is possible
+ * future devices may support more than 256 virtual functions if
+ * devices start supporting PCIe w/ARI. However, at the moment no
+ * such devices are known, so no space is reserved for this here.
+ * If a future patch consumes the next byte it should be aware of
+ * this possibility.
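
As a worked illustration of this layout, using the masks and helpers that immediately follow, a cookie selecting queue 5 in VF field 2 could be built and taken apart as sketched below. Whether the VF field holds the VF number directly or an offset of one is driver policy and not addressed here:

    #include <stdint.h>
    #include <assert.h>

    /* Sketch only: compose a ring_cookie from a queue index and a VF
     * field value, then recover both via the accessors defined below.
     */
    static uint64_t make_ring_cookie(uint64_t queue, uint64_t vf)
    {
        return (queue & ETHTOOL_RX_FLOW_SPEC_RING) |
               ((vf << ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF) &
                ETHTOOL_RX_FLOW_SPEC_RING_VF);
    }

    static void ring_cookie_demo(void)
    {
        uint64_t cookie = make_ring_cookie(5, 2);
        assert(ethtool_get_flow_spec_ring(cookie) == 5);
        assert(ethtool_get_flow_spec_ring_vf(cookie) == 2);
    }
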
+ */ +#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL +#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32 +static inline uint64_t ethtool_get_flow_spec_ring(uint64_t ring_cookie) +{ + return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie; +}; + +static inline uint64_t ethtool_get_flow_spec_ring_vf(uint64_t ring_cookie) +{ + return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >> + ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF; +}; + +/** + * struct ethtool_rxnfc - command to get or set RX flow classification rules + * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH, + * %ETHTOOL_GRXRINGS, %ETHTOOL_GRXCLSRLCNT, %ETHTOOL_GRXCLSRULE, + * %ETHTOOL_GRXCLSRLALL, %ETHTOOL_SRXCLSRLDEL or %ETHTOOL_SRXCLSRLINS + * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW + * @data: Command-dependent value + * @fs: Flow classification rule + * @rule_cnt: Number of rules to be affected + * @rule_locs: Array of used rule locations + * + * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating + * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following + * structure fields must not be used. + * + * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues + * on return. + * + * For %ETHTOOL_GRXCLSRLCNT, @rule_cnt is set to the number of defined + * rules on return. If @data is non-zero on return then it is the + * size of the rule table, plus the flag %RX_CLS_LOC_SPECIAL if the + * driver supports any special location values. If that flag is not + * set in @data then special location values should not be used. + * + * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an + * existing rule on entry and @fs contains the rule on return. + * + * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the + * user buffer for @rule_locs on entry. On return, @data is the size + * of the rule table, @rule_cnt is the number of defined rules, and + * @rule_locs contains the locations of the defined rules. Drivers + * must use the second parameter to get_rxnfc() instead of @rule_locs. + * + * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. + * @fs.@location either specifies the location to use or is a special + * location value with %RX_CLS_LOC_SPECIAL flag set. On return, + * @fs.@location is the actual rule location. + * + * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an + * existing rule on entry. + * + * A driver supporting the special location values for + * %ETHTOOL_SRXCLSRLINS may add the rule at any suitable unused + * location, and may remove a rule at a later location (lower + * priority) that matches exactly the same set of flows. The special + * values are %RX_CLS_LOC_ANY, selecting any location; + * %RX_CLS_LOC_FIRST, selecting the first suitable location (maximum + * priority); and %RX_CLS_LOC_LAST, selecting the last suitable + * location (minimum priority). Additional special values may be + * defined in future and drivers must return -%EINVAL for any + * unrecognised value. + */ +struct ethtool_rxnfc { + uint32_t cmd; + uint32_t flow_type; + uint64_t data; + struct ethtool_rx_flow_spec fs; + uint32_t rule_cnt; + uint32_t rule_locs[0]; +}; + + +/** + * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection + * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR + * @size: On entry, the array size of the user buffer, which may be zero. 
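
The %ETHTOOL_GRXCLSRLCNT/%ETHTOOL_GRXCLSRLALL description above implies a two-step dump. A minimal userspace sketch, assuming 'fd' is an AF_INET socket and eliding error handling:

    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <net/if.h>          /* struct ifreq, IFNAMSIZ */
    #include <linux/sockios.h>   /* SIOCETHTOOL */

    /* Sketch only: dump the locations of all RX classification rules. */
    static struct ethtool_rxnfc *dump_rule_locs(int fd, const char *ifname)
    {
        struct ifreq ifr;
        struct ethtool_rxnfc count = { .cmd = ETHTOOL_GRXCLSRLCNT };
        struct ethtool_rxnfc *all;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);

        ifr.ifr_data = (char *)&count;
        ioctl(fd, SIOCETHTOOL, &ifr);        /* step 1: count the rules */

        all = calloc(1, sizeof(*all) + count.rule_cnt * sizeof(uint32_t));
        all->cmd = ETHTOOL_GRXCLSRLALL;
        all->rule_cnt = count.rule_cnt;      /* array size of rule_locs */

        ifr.ifr_data = (char *)all;
        ioctl(fd, SIOCETHTOOL, &ifr);        /* step 2: fetch the locations */
        return all;                          /* caller frees */
    }
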
+ * On return from %ETHTOOL_GRXFHINDIR, the array size of the hardware + * indirection table. + * @ring_index: RX ring/queue index for each hash value + * + * For %ETHTOOL_GRXFHINDIR, a @size of zero means that only the size + * should be returned. For %ETHTOOL_SRXFHINDIR, a @size of zero means + * the table should be reset to default values. This last feature + * is not supported by the original implementations. + */ +struct ethtool_rxfh_indir { + uint32_t cmd; + uint32_t size; + uint32_t ring_index[0]; +}; + +/** + * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. + * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH + * @rss_context: RSS context identifier. + * @indir_size: On entry, the array size of the user buffer for the + * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), + * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH, + * the array size of the hardware indirection table. + * @key_size: On entry, the array size of the user buffer for the hash key, + * which may be zero. On return from %ETHTOOL_GRSSH, the size of the + * hardware hash key. + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. + * @rsvd: Reserved for future extensions. + * @rss_config: RX ring/queue index for each hash value i.e., indirection table + * of @indir_size uint32_t elements, followed by hash key of @key_size + * bytes. + * + * For %ETHTOOL_GRSSH, a @indir_size and key_size of zero means that only the + * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of + * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested + * and a @indir_size of zero means the indir table should be reset to default + * values. An hfunc of zero means that hash function setting is not requested. + */ +struct ethtool_rxfh { + uint32_t cmd; + uint32_t rss_context; + uint32_t indir_size; + uint32_t key_size; + uint8_t hfunc; + uint8_t rsvd8[3]; + uint32_t rsvd32; + uint32_t rss_config[0]; +}; +#define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff + +/** + * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter + * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW + * @h_u: Flow field values to match (dependent on @flow_type) + * @m_u: Masks for flow field value bits to be ignored + * @vlan_tag: VLAN tag to match + * @vlan_tag_mask: Mask for VLAN tag bits to be ignored + * @data: Driver-dependent data to match + * @data_mask: Mask for driver-dependent data bits to be ignored + * @action: RX ring/queue index to deliver to (non-negative) or other action + * (negative, e.g. %ETHTOOL_RXNTUPLE_ACTION_DROP) + * + * For flow types %TCP_V4_FLOW, %UDP_V4_FLOW and %SCTP_V4_FLOW, where + * a field value and mask are both zero this is treated as if all mask + * bits are set i.e. the field is ignored. 
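
The same size-query-then-fetch pattern applies to %ETHTOOL_GRSSH (defined further below), with the variable-length @rss_config laid out as the indirection table followed by the hash key. A sketch, where do_ethtool() is a hypothetical wrapper around the SIOCETHTOOL ioctl plumbing shown in the previous sketch and errors are elided:

    #include <stdlib.h>
    #include <stdint.h>

    /* Sketch only: fetch the RSS indirection table and hash key. */
    static struct ethtool_rxfh *get_rss_config(int fd, const char *ifname)
    {
        struct ethtool_rxfh sizes = { .cmd = ETHTOOL_GRSSH };
        struct ethtool_rxfh *rss;

        do_ethtool(fd, ifname, &sizes);  /* fills indir_size and key_size */

        rss = calloc(1, sizeof(*rss) +
                     sizes.indir_size * sizeof(uint32_t) + sizes.key_size);
        rss->cmd = ETHTOOL_GRSSH;
        rss->indir_size = sizes.indir_size;
        rss->key_size = sizes.key_size;

        do_ethtool(fd, ifname, rss);
        /* rss->rss_config[0..indir_size-1] holds the indirection table;
         * the key_size bytes of hash key follow immediately after. */
        return rss;
    }
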
+ */ +struct ethtool_rx_ntuple_flow_spec { + uint32_t flow_type; + union { + struct ethtool_tcpip4_spec tcp_ip4_spec; + struct ethtool_tcpip4_spec udp_ip4_spec; + struct ethtool_tcpip4_spec sctp_ip4_spec; + struct ethtool_ah_espip4_spec ah_ip4_spec; + struct ethtool_ah_espip4_spec esp_ip4_spec; + struct ethtool_usrip4_spec usr_ip4_spec; + struct eth_header ether_spec; + uint8_t hdata[72]; + } h_u, m_u; + + uint16_t vlan_tag; + uint16_t vlan_tag_mask; + uint64_t data; + uint64_t data_mask; + + int32_t action; +#define ETHTOOL_RXNTUPLE_ACTION_DROP (-1) /* drop packet */ +#define ETHTOOL_RXNTUPLE_ACTION_CLEAR (-2) /* clear filter */ +}; + +/** + * struct ethtool_rx_ntuple - command to set or clear RX flow filter + * @cmd: Command number - %ETHTOOL_SRXNTUPLE + * @fs: Flow filter specification + */ +struct ethtool_rx_ntuple { + uint32_t cmd; + struct ethtool_rx_ntuple_flow_spec fs; +}; + +#define ETHTOOL_FLASH_MAX_FILENAME 128 +enum ethtool_flash_op_type { + ETHTOOL_FLASH_ALL_REGIONS = 0, +}; + +/* for passing firmware flashing related parameters */ +struct ethtool_flash { + uint32_t cmd; + uint32_t region; + char data[ETHTOOL_FLASH_MAX_FILENAME]; +}; + +/** + * struct ethtool_dump - used for retrieving, setting device dump + * @cmd: Command number - %ETHTOOL_GET_DUMP_FLAG, %ETHTOOL_GET_DUMP_DATA, or + * %ETHTOOL_SET_DUMP + * @version: FW version of the dump, filled in by driver + * @flag: driver dependent flag for dump setting, filled in by driver during + * get and filled in by ethtool for set operation. + * flag must be initialized by macro ETH_FW_DUMP_DISABLE value when + * firmware dump is disabled. + * @len: length of dump data, used as the length of the user buffer on entry to + * %ETHTOOL_GET_DUMP_DATA and this is returned as dump length by driver + * for %ETHTOOL_GET_DUMP_FLAG command + * @data: data collected for get dump data operation + */ +struct ethtool_dump { + uint32_t cmd; + uint32_t version; + uint32_t flag; + uint32_t len; + uint8_t data[0]; +}; + +#define ETH_FW_DUMP_DISABLE 0 + +/* for returning and changing feature sets */ + +/** + * struct ethtool_get_features_block - block with state of 32 features + * @available: mask of changeable features + * @requested: mask of features requested to be enabled if possible + * @active: mask of currently enabled features + * @never_changed: mask of features not changeable for any device + */ +struct ethtool_get_features_block { + uint32_t available; + uint32_t requested; + uint32_t active; + uint32_t never_changed; +}; + +/** + * struct ethtool_gfeatures - command to get state of device's features + * @cmd: command number = %ETHTOOL_GFEATURES + * @size: On entry, the number of elements in the features[] array; + * on return, the number of elements in features[] needed to hold + * all features + * @features: state of features + */ +struct ethtool_gfeatures { + uint32_t cmd; + uint32_t size; + struct ethtool_get_features_block features[0]; +}; + +/** + * struct ethtool_set_features_block - block with request for 32 features + * @valid: mask of features to be changed + * @requested: values of features to be changed + */ +struct ethtool_set_features_block { + uint32_t valid; + uint32_t requested; +}; + +/** + * struct ethtool_sfeatures - command to request change in device's features + * @cmd: command number = %ETHTOOL_SFEATURES + * @size: array size of the features[] array + * @features: feature change masks + */ +struct ethtool_sfeatures { + uint32_t cmd; + uint32_t size; + struct ethtool_set_features_block features[0]; +}; + +/** + * 
struct ethtool_ts_info - holds a device's timestamping and PHC association
+ * @cmd: command number = %ETHTOOL_GET_TS_INFO
+ * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags
+ * @phc_index: device index of the associated PHC, or -1 if there is none
+ * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values
+ * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values
+ *
+ * The bits in the 'tx_types' and 'rx_filters' fields correspond to
+ * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values,
+ * respectively. For example, if the device supports HWTSTAMP_TX_ON,
+ * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set.
+ *
+ * Drivers should only report the filters they actually support without
+ * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWTSTAMP request for
+ * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the
+ * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op.
+ */
+struct ethtool_ts_info {
+ uint32_t cmd;
+ uint32_t so_timestamping;
+ int32_t phc_index;
+ uint32_t tx_types;
+ uint32_t tx_reserved[3];
+ uint32_t rx_filters;
+ uint32_t rx_reserved[3];
+};
+
+/*
+ * %ETHTOOL_SFEATURES changes features present in features[].valid to the
+ * values of corresponding bits in features[].requested. Bits in .requested
+ * not set in .valid or not changeable are ignored.
+ *
+ * Returns %EINVAL when .valid contains undefined or never-changeable bits
+ * or size is not equal to required number of features words (32-bit blocks).
+ * Returns >= 0 if request was completed; bits set in the value mean:
+ * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not
+ * changeable (not present in %ETHTOOL_GFEATURES' features[].available);
+ * those bits were ignored.
+ * %ETHTOOL_F_WISH - some or all changes requested were recorded but the
+ * resulting state of bits masked by .valid is not equal to .requested.
+ * Probably there are other device-specific constraints on some features
+ * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered
+ * here as though ignored bits were cleared.
+ * %ETHTOOL_F_COMPAT - some or all changes requested were made by calling
+ * compatibility functions. Requested offload state cannot be properly
+ * managed by kernel.
+ *
+ * The meaning of the bits in the masks is obtained via %ETHTOOL_GSSET_INFO
+ * (number of bits in the arrays - always a multiple of 32) and the
+ * %ETHTOOL_GSTRINGS command for the ETH_SS_FEATURES string set. The first
+ * entry in the table corresponds to the least significant bit in the
+ * features[0] fields. Empty strings mark undefined features.
+ */
+enum ethtool_sfeatures_retval_bits {
+ ETHTOOL_F_UNSUPPORTED__BIT,
+ ETHTOOL_F_WISH__BIT,
+ ETHTOOL_F_COMPAT__BIT,
+};
+
+#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT)
+#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT)
+#define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT)
+
+#define MAX_NUM_QUEUE 4096
+
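Putting the feature pieces above together: a user asks for a feature change by passing the required number of 32-bit blocks and decoding the bits returned by the ioctl. The sketch below assumes the feature's bit index and the total word count were already obtained via %ETHTOOL_GSSET_INFO/%ETHTOOL_GSTRINGS for ETH_SS_FEATURES; do_ethtool() is the same hypothetical SIOCETHTOOL wrapper as in the earlier sketches:

    #include <stdlib.h>
    #include <stdint.h>

    /* Sketch: ask for feature bit 'idx' to be enabled and decode the
     * answer. 'nwords' is the number of 32-bit feature words implied by
     * the ETH_SS_FEATURES string count; any other size yields EINVAL.
     */
    static int try_enable_feature(int fd, const char *ifname,
                                  unsigned int idx, unsigned int nwords)
    {
        struct ethtool_sfeatures *sf;
        int ret;

        sf = calloc(1, sizeof(*sf) + nwords * sizeof(sf->features[0]));
        sf->cmd = ETHTOOL_SFEATURES;
        sf->size = nwords;
        sf->features[idx / 32].valid = 1U << (idx % 32);
        sf->features[idx / 32].requested = 1U << (idx % 32);

        ret = do_ethtool(fd, ifname, sf);
        free(sf);
        if (ret < 0) {
            return ret;              /* e.g. EINVAL for undefined bits */
        }
        if (ret & ETHTOOL_F_UNSUPPORTED) {
            return -1;               /* bit exists but is not changeable */
        }
        return 0;  /* ETHTOOL_F_WISH / ETHTOOL_F_COMPAT still mean progress */
    }
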
+/**
+ * struct ethtool_per_queue_op - apply a sub-command to the queues in the mask
+ * @cmd: ETHTOOL_PERQUEUE
+ * @sub_command: the sub-command to apply to each queue
+ * @queue_mask: Bitmap of the queues to which the sub-command applies
+ * @data: A complete command structure following for each of the queues addressed
+ */
+struct ethtool_per_queue_op {
+ uint32_t cmd;
+ uint32_t sub_command;
+ uint32_t queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
+ char data[];
+};
+
+/**
+ * struct ethtool_fecparam - Ethernet forward error correction (FEC) parameters
+ * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM
+ * @active_fec: FEC mode which is active on the port
+ * @fec: Bitmask of supported/configured FEC modes
+ * @reserved: Reserved for future extensions, e.g. a FEC bypass feature.
+ *
+ * Drivers should reject a non-zero setting of @autoneg when
+ * autonegotiation is disabled (or not supported) for the link.
+ */
+struct ethtool_fecparam {
+ uint32_t cmd;
+ /* bitmask of FEC modes */
+ uint32_t active_fec;
+ uint32_t fec;
+ uint32_t reserved;
+};
+
+/**
+ * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration
+ * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported
+ * @ETHTOOL_FEC_AUTO: Default/Best FEC mode provided by driver
+ * @ETHTOOL_FEC_OFF: No FEC Mode
+ * @ETHTOOL_FEC_RS: Reed-Solomon Forward Error Correction mode
+ * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon Forward Error Correction mode
+ */
+enum ethtool_fec_config_bits {
+ ETHTOOL_FEC_NONE_BIT,
+ ETHTOOL_FEC_AUTO_BIT,
+ ETHTOOL_FEC_OFF_BIT,
+ ETHTOOL_FEC_RS_BIT,
+ ETHTOOL_FEC_BASER_BIT,
+};
+
+#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT)
+#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT)
+#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT)
+#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT)
+#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT)
+
+/* CMDs currently supported */
+#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings.
+ * Please use ETHTOOL_GLINKSETTINGS
+ */
+#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings.
+ * Please use ETHTOOL_SLINKSETTINGS
+ */
+#define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info. */
+#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */
+#define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */
+#define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */
+#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */
+#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */
+#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */
+/* Get link status for host, i.e. whether the interface *and* the
+ * physical port (if there is one) are up (ethtool_value). */
+#define ETHTOOL_GLINK 0x0000000a
+#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */
+#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */
+#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */
+#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */
+#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */
+#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. */
+#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */
+#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters.
*/ +#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ +#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ +#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ +#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ +#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable + * (ethtool_value) */ +#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable + * (ethtool_value). */ +#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */ +#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ +#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ +#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ +#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ +#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ +#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */ +#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */ +#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */ +#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */ +#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */ +#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */ +#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */ +#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */ +#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */ + +#define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */ +#define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */ +#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */ +#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */ +#define ETHTOOL_GRXRINGS 0x0000002d /* Get RX rings available for LB */ +#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */ +#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */ +#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rule */ +#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */ +#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */ +#define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */ +#define ETHTOOL_RESET 0x00000034 /* Reset hardware */ +#define ETHTOOL_SRXNTUPLE 0x00000035 /* Add an n-tuple filter to device */ +#define ETHTOOL_GRXNTUPLE 0x00000036 /* deprecated */ +#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */ +#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */ +#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */ + +#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */ +#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */ +#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ +#define ETHTOOL_SCHANNELS 0x0000003d /* Set no of channels */ +#define ETHTOOL_SET_DUMP 0x0000003e /* Set dump settings */ +#define ETHTOOL_GET_DUMP_FLAG 0x0000003f /* Get dump settings */ +#define ETHTOOL_GET_DUMP_DATA 0x00000040 /* Get dump data */ +#define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */ +#define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */ +#define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */ +#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */ +#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */ + +#define 
ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ +#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ +#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ + +#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ + +#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ +#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ +#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ +#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ +#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ +#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ + +/* compatibility with older code */ +#define SPARC_ETH_GSET ETHTOOL_GSET +#define SPARC_ETH_SSET ETHTOOL_SSET + +/* Link mode bit indices */ +enum ethtool_link_mode_bit_indices { + ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0, + ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1, + ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2, + ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3, + ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5, + ETHTOOL_LINK_MODE_Autoneg_BIT = 6, + ETHTOOL_LINK_MODE_TP_BIT = 7, + ETHTOOL_LINK_MODE_AUI_BIT = 8, + ETHTOOL_LINK_MODE_MII_BIT = 9, + ETHTOOL_LINK_MODE_FIBRE_BIT = 10, + ETHTOOL_LINK_MODE_BNC_BIT = 11, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12, + ETHTOOL_LINK_MODE_Pause_BIT = 13, + ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14, + ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15, + ETHTOOL_LINK_MODE_Backplane_BIT = 16, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19, + ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20, + ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27, + ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28, + ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, + ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, + ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32, + ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33, + ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34, + ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35, + ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT = 36, + ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37, + ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38, + ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39, + ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40, + ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41, + ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42, + ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43, + ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44, + ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45, + ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46, + ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, + ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, + + ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, + ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, + ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, + + /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit + * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* + * macro for bits > 31. 
The only way to use indices > 31 is to
+ * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API.
+ */
+
+ __ETHTOOL_LINK_MODE_LAST
+ = ETHTOOL_LINK_MODE_FEC_BASER_BIT,
+};
+
+#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \
+ (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT))
+
+/* DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new SUPPORTED_* macro for bits > 31.
+ */
+#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define SUPPORTED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new SUPPORTED_* macro for bits > 31, see
+ * notice above.
+ */
+
+/*
+ * DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new ADVERTISED_* macro for bits > 31.
+ */ +#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new ADVERTISED_* macro for bits > 31, see + * notice above. + */ + +/* The following are all involved in forcing a particular link + * mode for the device for setting things. When getting the + * devices settings, these indicate the current mode and whether + * it was forced up into this mode or autonegotiated. + */ + +/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. + * Update drivers/net/phy/phy.c:phy_speed_to_str() and + * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values. 
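
For completeness, a small sketch of how these legacy masks combine; because they are plain 32-bit values, only link-mode bits 0-31 can ever be expressed this way, which is the reason for the repeated warnings above:

    #include <stdint.h>

    /* Sketch: a legacy advertising mask requesting autonegotiated
     * 100Mb/1000Mb full duplex. Higher link-mode bits require the
     * ETHTOOL_GLINKSETTINGS API instead.
     */
    static uint32_t legacy_adv_mask(void)
    {
        return ADVERTISED_Autoneg |
               ADVERTISED_100baseT_Full |
               ADVERTISED_1000baseT_Full;
    }
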
+ */ +#define SPEED_10 10 +#define SPEED_100 100 +#define SPEED_1000 1000 +#define SPEED_2500 2500 +#define SPEED_5000 5000 +#define SPEED_10000 10000 +#define SPEED_14000 14000 +#define SPEED_20000 20000 +#define SPEED_25000 25000 +#define SPEED_40000 40000 +#define SPEED_50000 50000 +#define SPEED_56000 56000 +#define SPEED_100000 100000 + +#define SPEED_UNKNOWN -1 + +static inline int ethtool_validate_speed(uint32_t speed) +{ + return speed <= INT_MAX || speed == SPEED_UNKNOWN; +} + +/* Duplex, half or full. */ +#define DUPLEX_HALF 0x00 +#define DUPLEX_FULL 0x01 +#define DUPLEX_UNKNOWN 0xff + +static inline int ethtool_validate_duplex(uint8_t duplex) +{ + switch (duplex) { + case DUPLEX_HALF: + case DUPLEX_FULL: + case DUPLEX_UNKNOWN: + return 1; + } + + return 0; +} + +/* Which connector port. */ +#define PORT_TP 0x00 +#define PORT_AUI 0x01 +#define PORT_MII 0x02 +#define PORT_FIBRE 0x03 +#define PORT_BNC 0x04 +#define PORT_DA 0x05 +#define PORT_NONE 0xef +#define PORT_OTHER 0xff + +/* Which transceiver to use. */ +#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */ +#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */ +#define XCVR_DUMMY1 0x02 +#define XCVR_DUMMY2 0x03 +#define XCVR_DUMMY3 0x04 + +/* Enable or disable autonegotiation. */ +#define AUTONEG_DISABLE 0x00 +#define AUTONEG_ENABLE 0x01 + +/* MDI or MDI-X status/control - if MDI/MDI_X/AUTO is set then + * the driver is required to renegotiate link + */ +#define ETH_TP_MDI_INVALID 0x00 /* status: unknown; control: unsupported */ +#define ETH_TP_MDI 0x01 /* status: MDI; control: force MDI */ +#define ETH_TP_MDI_X 0x02 /* status: MDI-X; control: force MDI-X */ +#define ETH_TP_MDI_AUTO 0x03 /* control: auto-select */ + +/* Wake-On-Lan options. */ +#define WAKE_PHY (1 << 0) +#define WAKE_UCAST (1 << 1) +#define WAKE_MCAST (1 << 2) +#define WAKE_BCAST (1 << 3) +#define WAKE_ARP (1 << 4) +#define WAKE_MAGIC (1 << 5) +#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */ + +/* L2-L4 network traffic flow types */ +#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ +#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ +#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ +#define AH_ESP_V4_FLOW 0x04 /* hash only */ +#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ +#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ +#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ +#define AH_ESP_V6_FLOW 0x08 /* hash only */ +#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ +#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ +#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ +#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ +#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#define IP_USER_FLOW IPV4_USER_FLOW +#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ +#define IPV4_FLOW 0x10 /* hash only */ +#define IPV6_FLOW 0x11 /* hash only */ +#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ +/* Flag to enable additional fields in struct ethtool_rx_flow_spec */ +#define FLOW_EXT 0x80000000 +#define FLOW_MAC_EXT 0x40000000 + +/* L3-L4 network traffic flow hash options */ +#define RXH_L2DA (1 << 1) +#define RXH_VLAN (1 << 2) +#define RXH_L3_PROTO (1 << 3) +#define RXH_IP_SRC (1 << 4) +#define RXH_IP_DST (1 << 5) +#define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ +#define RXH_L4_B_2_3 (1 << 7) /* dst port in case of 
TCP/UDP/SCTP */ +#define RXH_DISCARD (1 << 31) + +#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL + +/* Special RX classification rule insert location values */ +#define RX_CLS_LOC_SPECIAL 0x80000000 /* flag */ +#define RX_CLS_LOC_ANY 0xffffffff +#define RX_CLS_LOC_FIRST 0xfffffffe +#define RX_CLS_LOC_LAST 0xfffffffd + +/* EEPROM Standards for plug in modules */ +#define ETH_MODULE_SFF_8079 0x1 +#define ETH_MODULE_SFF_8079_LEN 256 +#define ETH_MODULE_SFF_8472 0x2 +#define ETH_MODULE_SFF_8472_LEN 512 +#define ETH_MODULE_SFF_8636 0x3 +#define ETH_MODULE_SFF_8636_LEN 256 +#define ETH_MODULE_SFF_8436 0x4 +#define ETH_MODULE_SFF_8436_LEN 256 + +/* Reset flags */ +/* The reset() operation must clear the flags for the components which + * were actually reset. On successful return, the flags indicate the + * components which were not reset, either because they do not exist + * in the hardware or because they cannot be reset independently. The + * driver must never reset any components that were not requested. + */ +enum ethtool_reset_flags { + /* These flags represent components dedicated to the interface + * the command is addressed to. Shift any flag left by + * ETH_RESET_SHARED_SHIFT to reset a shared component of the + * same type. + */ + ETH_RESET_MGMT = 1 << 0, /* Management processor */ + ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */ + ETH_RESET_DMA = 1 << 2, /* DMA engine */ + ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */ + ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */ + ETH_RESET_MAC = 1 << 5, /* Media access controller */ + ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */ + ETH_RESET_RAM = 1 << 7, /* RAM shared between + * multiple components */ + ETH_RESET_AP = 1 << 8, /* Application processor */ + + ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to + * this interface */ + ETH_RESET_ALL = 0xffffffff, /* All components used by this + * interface, even if shared */ +}; +#define ETH_RESET_SHARED_SHIFT 16 + + +/** + * struct ethtool_link_settings - link control and status + * + * IMPORTANT, Backward compatibility notice: When implementing new + * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and + * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link + * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS + * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in + * that case. Conversely, if %ETHTOOL_GLINKSETTINGS fails, use + * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link + * settings; do not use %ETHTOOL_SLINKSETTINGS if + * %ETHTOOL_GLINKSETTINGS failed: stick to + * %ETHTOOL_GSET/%ETHTOOL_SSET in that case. + * + * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS + * @speed: Link speed (Mbps) + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. 
If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @link_mode_masks_nwords: Number of 32-bit words for each of the + * supported, advertising, lp_advertising link mode bitmaps. For + * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user + * (>= 0); on return, if handshake in progress, negative if + * request size unsupported by kernel: absolute value indicates + * kernel expected size and all the other fields but cmd + * are 0; otherwise (handshake completed), strictly positive + * to indicate size used by kernel and cmd field stays + * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For + * %ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive + * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise + * refused. For drivers: ignore this field (use kernel's + * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will + * be overwritten by kernel. + * @supported: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features for which the interface + * supports autonegotiation or auto-detection. Read-only. + * @advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features that are advertised through + * autonegotiation or enabled for auto-detection. + * @lp_advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, and other + * link features that the link partner advertised through + * autonegotiation; 0 if unknown or not applicable. Read-only. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt + * are not available in %ethtool_link_settings. Until all drivers are + * converted to ignore them or to the new %ethtool_link_settings API, + * for both queries and changes, users should always try + * %ETHTOOL_GLINKSETTINGS first, and if it fails with -ENOTSUPP stick + * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it + * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and + * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing + * either %ethtool_cmd or %ethtool_link_settings). + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GLINKSETTINGS to get the current values before making specific + * changes and then applying them with %ETHTOOL_SLINKSETTINGS. 
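
A minimal sketch of the @link_mode_masks_nwords handshake described above, again with a hypothetical do_ethtool() wrapper standing in for the SIOCETHTOOL plumbing and error handling elided:

    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>

    /* Sketch only: probe with 0 words, let the kernel answer with
     * minus its expected count, then repeat with that count.
     */
    static struct ethtool_link_settings *get_link_settings(int fd,
                                                           const char *ifname)
    {
        struct ethtool_link_settings probe;
        struct ethtool_link_settings *ls;
        int nwords;

        memset(&probe, 0, sizeof(probe));
        probe.cmd = ETHTOOL_GLINKSETTINGS;
        probe.link_mode_masks_nwords = 0;   /* deliberately unsupported */
        do_ethtool(fd, ifname, &probe);

        if (probe.link_mode_masks_nwords >= 0) {
            return NULL;                    /* handshake did not complete */
        }
        nwords = -probe.link_mode_masks_nwords; /* kernel's expected size */

        ls = calloc(1, sizeof(*ls) + 3 * nwords * sizeof(uint32_t));
        ls->cmd = ETHTOOL_GLINKSETTINGS;
        ls->link_mode_masks_nwords = nwords;
        do_ethtool(fd, ifname, ls);   /* the three masks follow the struct */
        return ls;
    }
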
+ * + * Drivers that implement %get_link_ksettings and/or + * %set_link_ksettings should ignore the @cmd + * and @link_mode_masks_nwords fields (any change to them overwritten + * by kernel), and rely only on kernel's internal + * %__ETHTOOL_LINK_MODE_MASK_NBITS and + * %ethtool_link_mode_mask_t. Drivers that implement + * %set_link_ksettings() should validate all fields other than @cmd + * and @link_mode_masks_nwords that are not described as read-only or + * deprecated, and must ignore all fields described as read-only. + */ +struct ethtool_link_settings { + uint32_t cmd; + uint32_t speed; + uint8_t duplex; + uint8_t port; + uint8_t phy_address; + uint8_t autoneg; + uint8_t mdio_support; + uint8_t eth_tp_mdix; + uint8_t eth_tp_mdix_ctrl; + int8_t link_mode_masks_nwords; + uint8_t transceiver; + uint8_t reserved1[3]; + uint32_t reserved[7]; + uint32_t link_mode_masks[0]; + /* layout of link_mode_masks fields: + * uint32_t map_supported[link_mode_masks_nwords]; + * uint32_t map_advertising[link_mode_masks_nwords]; + * uint32_t map_lp_advertising[link_mode_masks_nwords]; + */ +}; +#endif /* _LINUX_ETHTOOL_H */ diff --git a/include/standard-headers/linux/kernel.h b/include/standard-headers/linux/kernel.h new file mode 100644 index 0000000000..1eeba2ef92 --- /dev/null +++ b/include/standard-headers/linux/kernel.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_KERNEL_H +#define _LINUX_KERNEL_H + +#include "standard-headers/linux/sysinfo.h" + +/* + * 'kernel.h' contains some often-used function prototypes etc + */ +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) + +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + +#endif /* _LINUX_KERNEL_H */ diff --git a/include/standard-headers/linux/sysinfo.h b/include/standard-headers/linux/sysinfo.h new file mode 100644 index 0000000000..e3c06aca81 --- /dev/null +++ b/include/standard-headers/linux/sysinfo.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_SYSINFO_H +#define _LINUX_SYSINFO_H + +#include "standard-headers/linux/types.h" + +#define SI_LOAD_SHIFT 16 +struct sysinfo { + long uptime; /* Seconds since boot */ + unsigned long loads[3]; /* 1, 5, and 15 minute load averages */ + unsigned long totalram; /* Total usable main memory size */ + unsigned long freeram; /* Available memory size */ + unsigned long sharedram; /* Amount of shared memory */ + unsigned long bufferram; /* Memory used by buffers */ + unsigned long totalswap; /* Total swap space size */ + unsigned long freeswap; /* swap space still available */ + uint16_t procs; /* Number of current processes */ + uint16_t pad; /* Explicit padding for m68k */ + unsigned long totalhigh; /* Total high memory size */ + unsigned long freehigh; /* Available high memory size */ + uint32_t mem_unit; /* Memory unit size in bytes */ + char _f[20-2*sizeof(unsigned long)-sizeof(uint32_t)]; /* Padding: libc5 uses this.. 
*/ +}; + +#endif /* _LINUX_SYSINFO_H */ diff --git a/migration/migration.c b/migration/migration.c index 623f373326..fc629e5965 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -155,6 +155,8 @@ MigrationIncomingState *migration_incoming_get_current(void) if (!once) { mis_current.state = MIGRATION_STATUS_NONE; memset(&mis_current, 0, sizeof(MigrationIncomingState)); + mis_current.postcopy_remote_fds = g_array_new(FALSE, TRUE, + sizeof(struct PostCopyFD)); qemu_mutex_init(&mis_current.rp_mutex); qemu_event_init(&mis_current.main_thread_load_event, false); @@ -180,6 +182,10 @@ void migration_incoming_state_destroy(void) qemu_fclose(mis->from_src_file); mis->from_src_file = NULL; } + if (mis->postcopy_remote_fds) { + g_array_free(mis->postcopy_remote_fds, TRUE); + mis->postcopy_remote_fds = NULL; + } qemu_event_reset(&mis->main_thread_load_event); } diff --git a/migration/migration.h b/migration/migration.h index a79540b99c..8d2f320c48 100644 --- a/migration/migration.h +++ b/migration/migration.h @@ -49,8 +49,12 @@ struct MigrationIncomingState { int userfault_event_fd; QEMUFile *to_src_file; QemuMutex rp_mutex; /* We send replies from multiple threads */ + /* RAMBlock of last request sent to source */ + RAMBlock *last_rb; void *postcopy_tmp_page; void *postcopy_tmp_zero_page; + /* PostCopyFD's for external userfaultfds & handlers of shared memory */ + GArray *postcopy_remote_fds; QEMUBH *bh; diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 032abfbf1a..efd77939af 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -23,6 +23,8 @@ #include "savevm.h" #include "postcopy-ram.h" #include "ram.h" +#include "qapi/error.h" +#include "qemu/notify.h" #include "sysemu/sysemu.h" #include "sysemu/balloon.h" #include "qemu/error-report.h" @@ -45,6 +47,33 @@ struct PostcopyDiscardState { unsigned int nsentcmds; }; +static NotifierWithReturnList postcopy_notifier_list; + +void postcopy_infrastructure_init(void) +{ + notifier_with_return_list_init(&postcopy_notifier_list); +} + +void postcopy_add_notifier(NotifierWithReturn *nn) +{ + notifier_with_return_list_add(&postcopy_notifier_list, nn); +} + +void postcopy_remove_notifier(NotifierWithReturn *n) +{ + notifier_with_return_remove(n); +} + +int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp) +{ + struct PostcopyNotifyData pnd; + pnd.reason = reason; + pnd.errp = errp; + + return notifier_with_return_list_notify(&postcopy_notifier_list, + &pnd); +} + /* Postcopy needs to detect accesses to pages that haven't yet been copied * across, and efficiently map new pages in, the techniques for doing this * are target OS specific. 
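
As a usage sketch for the notifier interface introduced above: a device that cannot support postcopy can register a NotifierWithReturn and object at probe time. The device name and functions below are illustrative only, not part of this patch:

    #include "qemu/osdep.h"
    #include "qemu/notify.h"
    #include "qapi/error.h"
    #include "migration/postcopy-ram.h"

    /* Illustrative only: refuse postcopy for a device that cannot
     * support it. The notifier's data is a PostcopyNotifyData; on
     * failure the callback must set data->errp and return negative.
     */
    static int mydev_postcopy_notifier(NotifierWithReturn *notifier,
                                       void *opaque)
    {
        struct PostcopyNotifyData *pnd = opaque;

        if (pnd->reason == POSTCOPY_NOTIFY_PROBE) {
            error_setg(pnd->errp, "mydev: postcopy is not supported");
            return -ENOSYS;
        }
        return 0;
    }

    static NotifierWithReturn mydev_postcopy_handler = {
        .notify = mydev_postcopy_notifier,
    };

    static void mydev_init(void)
    {
        postcopy_add_notifier(&mydev_postcopy_handler);
    }
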
@@ -186,12 +215,6 @@ static int test_ramblock_postcopiable(const char *block_name, void *host_addr, RAMBlock *rb = qemu_ram_block_by_name(block_name); size_t pagesize = qemu_ram_pagesize(rb); - if (qemu_ram_is_shared(rb)) { - error_report("Postcopy on shared RAM (%s) is not yet supported", - block_name); - return 1; - } - if (length % pagesize) { error_report("Postcopy requires RAM blocks to be a page size multiple," " block %s is 0x" RAM_ADDR_FMT " bytes with a " @@ -215,6 +238,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) struct uffdio_register reg_struct; struct uffdio_range range_struct; uint64_t feature_mask; + Error *local_err = NULL; if (qemu_target_page_size() > pagesize) { error_report("Target page size bigger than host page size"); @@ -228,6 +252,12 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis) goto out; } + /* Give devices a chance to object */ + if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) { + error_report_err(local_err); + goto out; + } + /* Version and features check */ if (!ufd_check_and_apply(ufd, mis)) { goto out; @@ -377,6 +407,13 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) trace_postcopy_ram_incoming_cleanup_entry(); if (mis->have_fault_thread) { + Error *local_err = NULL; + + if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) { + error_report_err(local_err); + return -1; + } + if (qemu_ram_foreach_block(cleanup_range, mis)) { return -1; } @@ -481,10 +518,63 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr, error_report("%s userfault: Region doesn't support COPY", __func__); return -1; } + if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) { + RAMBlock *rb = qemu_ram_block_by_name(block_name); + qemu_ram_set_uf_zeroable(rb); + } return 0; } +int postcopy_wake_shared(struct PostCopyFD *pcfd, + uint64_t client_addr, + RAMBlock *rb) +{ + size_t pagesize = qemu_ram_pagesize(rb); + struct uffdio_range range; + int ret; + trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb)); + range.start = client_addr & ~(pagesize - 1); + range.len = pagesize; + ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range); + if (ret) { + error_report("%s: Failed to wake: %zx in %s (%s)", + __func__, (size_t)client_addr, qemu_ram_get_idstr(rb), + strerror(errno)); + } + return ret; +} + +/* + * Callback from shared fault handlers to ask for a page, + * the page must be specified by a RAMBlock and an offset in that rb + * Note: Only for use by shared fault handlers (in fault thread) + */ +int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, + uint64_t client_addr, uint64_t rb_offset) +{ + size_t pagesize = qemu_ram_pagesize(rb); + uint64_t aligned_rbo = rb_offset & ~(pagesize - 1); + MigrationIncomingState *mis = migration_incoming_get_current(); + + trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb), + rb_offset); + if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) { + trace_postcopy_request_shared_page_present(pcfd->idstr, + qemu_ram_get_idstr(rb), rb_offset); + return postcopy_wake_shared(pcfd, client_addr, rb); + } + if (rb != mis->last_rb) { + mis->last_rb = rb; + migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), + aligned_rbo, pagesize); + } else { + /* Save some space */ + migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize); + } + return 0; +} + /* * Handle faults detected by the USERFAULT markings */ @@ -493,29 +583,44 @@ static void *postcopy_ram_fault_thread(void *opaque) MigrationIncomingState *mis = opaque; 
struct uffd_msg msg; int ret; + size_t index; RAMBlock *rb = NULL; - RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */ trace_postcopy_ram_fault_thread_entry(); + mis->last_rb = NULL; /* last RAMBlock we sent part of */ qemu_sem_post(&mis->fault_thread_sem); + struct pollfd *pfd; + size_t pfd_len = 2 + mis->postcopy_remote_fds->len; + + pfd = g_new0(struct pollfd, pfd_len); + + pfd[0].fd = mis->userfault_fd; + pfd[0].events = POLLIN; + pfd[1].fd = mis->userfault_event_fd; + pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ + trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd); + for (index = 0; index < mis->postcopy_remote_fds->len; index++) { + struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds, + struct PostCopyFD, index); + pfd[2 + index].fd = pcfd->fd; + pfd[2 + index].events = POLLIN; + trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr, + pcfd->fd); + } + while (true) { ram_addr_t rb_offset; - struct pollfd pfd[2]; + int poll_result; /* * We're mainly waiting for the kernel to give us a faulting HVA, * however we can be told to quit via userfault_quit_fd which is * an eventfd */ - pfd[0].fd = mis->userfault_fd; - pfd[0].events = POLLIN; - pfd[0].revents = 0; - pfd[1].fd = mis->userfault_event_fd; - pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ - pfd[1].revents = 0; - - if (poll(pfd, 2, -1 /* Wait forever */) == -1) { + + poll_result = poll(pfd, pfd_len, -1 /* Wait forever */); + if (poll_result == -1) { error_report("%s: userfault poll: %s", __func__, strerror(errno)); break; } @@ -535,57 +640,117 @@ static void *postcopy_ram_fault_thread(void *opaque) } } - ret = read(mis->userfault_fd, &msg, sizeof(msg)); - if (ret != sizeof(msg)) { - if (errno == EAGAIN) { - /* - * if a wake up happens on the other thread just after - * the poll, there is nothing to read. - */ - continue; + if (pfd[0].revents) { + poll_result--; + ret = read(mis->userfault_fd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (errno == EAGAIN) { + /* + * if a wake up happens on the other thread just after + * the poll, there is nothing to read. 
+ */ + continue; + } + if (ret < 0) { + error_report("%s: Failed to read full userfault " + "message: %s", + __func__, strerror(errno)); + break; + } else { + error_report("%s: Read %d bytes from userfaultfd " + "expected %zd", + __func__, ret, sizeof(msg)); + break; /* Lost alignment, don't know what we'd read next */ + } } - if (ret < 0) { - error_report("%s: Failed to read full userfault message: %s", - __func__, strerror(errno)); - break; - } else { - error_report("%s: Read %d bytes from userfaultfd expected %zd", - __func__, ret, sizeof(msg)); - break; /* Lost alignment, don't know what we'd read next */ + if (msg.event != UFFD_EVENT_PAGEFAULT) { + error_report("%s: Read unexpected event %ud from userfaultfd", + __func__, msg.event); + continue; /* It's not a page fault, shouldn't happen */ } - } - if (msg.event != UFFD_EVENT_PAGEFAULT) { - error_report("%s: Read unexpected event %ud from userfaultfd", - __func__, msg.event); - continue; /* It's not a page fault, shouldn't happen */ - } - rb = qemu_ram_block_from_host( - (void *)(uintptr_t)msg.arg.pagefault.address, - true, &rb_offset); - if (!rb) { - error_report("postcopy_ram_fault_thread: Fault outside guest: %" - PRIx64, (uint64_t)msg.arg.pagefault.address); - break; - } + rb = qemu_ram_block_from_host( + (void *)(uintptr_t)msg.arg.pagefault.address, + true, &rb_offset); + if (!rb) { + error_report("postcopy_ram_fault_thread: Fault outside guest: %" + PRIx64, (uint64_t)msg.arg.pagefault.address); + break; + } - rb_offset &= ~(qemu_ram_pagesize(rb) - 1); - trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, + rb_offset &= ~(qemu_ram_pagesize(rb) - 1); + trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, qemu_ram_get_idstr(rb), rb_offset); + /* + * Send the request to the source - we want to request one + * of our host page sizes (which is >= TPS) + */ + if (rb != mis->last_rb) { + mis->last_rb = rb; + migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), + rb_offset, qemu_ram_pagesize(rb)); + } else { + /* Save some space */ + migrate_send_rp_req_pages(mis, NULL, + rb_offset, qemu_ram_pagesize(rb)); + } + } - /* - * Send the request to the source - we want to request one - * of our host page sizes (which is >= TPS) - */ - if (rb != last_rb) { - last_rb = rb; - migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), - rb_offset, qemu_ram_pagesize(rb)); - } else { - /* Save some space */ - migrate_send_rp_req_pages(mis, NULL, - rb_offset, qemu_ram_pagesize(rb)); + /* Now handle any requests from external processes on shared memory */ + /* TODO: May need to handle devices deregistering during postcopy */ + for (index = 2; index < pfd_len && poll_result; index++) { + if (pfd[index].revents) { + struct PostCopyFD *pcfd = + &g_array_index(mis->postcopy_remote_fds, + struct PostCopyFD, index - 2); + + poll_result--; + if (pfd[index].revents & POLLERR) { + error_report("%s: POLLERR on poll %zd fd=%d", + __func__, index, pcfd->fd); + pfd[index].events = 0; + continue; + } + + ret = read(pcfd->fd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (errno == EAGAIN) { + /* + * if a wake up happens on the other thread just after + * the poll, there is nothing to read. 
+ */ + continue; + } + if (ret < 0) { + error_report("%s: Failed to read full userfault " + "message: %s (shared) revents=%d", + __func__, strerror(errno), + pfd[index].revents); + /* TODO: Could just disable this sharer */ + break; + } else { + error_report("%s: Read %d bytes from userfaultfd " + "expected %zd (shared)", + __func__, ret, sizeof(msg)); + /* TODO: Could just disable this sharer */ + break; /* Lost alignment, don't know what we'd read next */ + } + } + if (msg.event != UFFD_EVENT_PAGEFAULT) { + error_report("%s: Read unexpected event %ud " + "from userfaultfd (shared)", + __func__, msg.event); + continue; /* It's not a page fault, shouldn't happen */ + } + /* Call the device handler registered with us */ + ret = pcfd->handler(pcfd, &msg); + if (ret) { + error_report("%s: Failed to resolve shared fault on %zd/%s", + __func__, index, pcfd->idstr); + /* TODO: Fail? Disable this sharer? */ + } + } } } trace_postcopy_ram_fault_thread_exit();
@@ -667,6 +832,22 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr, return ret; } +int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset) +{ + int i; + MigrationIncomingState *mis = migration_incoming_get_current(); + GArray *pcrfds = mis->postcopy_remote_fds; + + for (i = 0; i < pcrfds->len; i++) { + struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i); + int ret = cur->waker(cur, rb, offset); + if (ret) { + return ret; + } + } + return 0; +} + /* * Place a host page (from) at (host) atomically * returns 0 on success
@@ -690,7 +871,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, } trace_postcopy_place_page(host); - return 0; + return postcopy_notify_shared_wake(rb, + qemu_ram_block_host_offset(rb, host)); } /*
@@ -700,17 +882,23 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, RAMBlock *rb) { + size_t pagesize = qemu_ram_pagesize(rb); trace_postcopy_place_page_zero(host); - if (qemu_ram_pagesize(rb) == getpagesize()) { - if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, getpagesize(), - rb)) { + /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE + * but it's not available for everything (e.g. hugetlb pages) + */ + if (qemu_ram_is_uf_zeroable(rb)) { + if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) { int e = errno; error_report("%s: %s zero host: %p", __func__, strerror(e), host); return -e; } + return postcopy_notify_shared_wake(rb, + qemu_ram_block_host_offset(rb, + host)); } else { /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */ if (!mis->postcopy_tmp_zero_page) { @@ -730,8 +918,6 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host, return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb); } - - return 0; } /*
@@ -784,6 +970,13 @@ int postcopy_ram_prepare_discard(MigrationIncomingState *mis) return -1; } +int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, + uint64_t client_addr, uint64_t rb_offset) +{ + assert(0); + return -1; +} + int postcopy_ram_enable_notify(MigrationIncomingState *mis) { assert(0); @@ -810,6 +1003,13 @@ void *postcopy_get_tmp_page(MigrationIncomingState *mis) return NULL; } +int postcopy_wake_shared(struct PostCopyFD *pcfd, + uint64_t client_addr, + RAMBlock *rb) +{ + assert(0); + return -1; +} #endif /* ------------------------------------------------------------------------- */
@@ -927,3 +1127,31 @@ PostcopyState postcopy_state_set(PostcopyState new_state) { return atomic_xchg(&incoming_postcopy_state, new_state); } + +/* Register a handler for external shared memory postcopy + * called on the destination. + */ +void postcopy_register_shared_ufd(struct PostCopyFD *pcfd) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + + mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds, + *pcfd); +} + +/* Unregister a handler for external shared memory postcopy + */ +void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd) +{ + guint i; + MigrationIncomingState *mis = migration_incoming_get_current(); + GArray *pcrfds = mis->postcopy_remote_fds; + + for (i = 0; i < pcrfds->len; i++) { + struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i); + if (cur->fd == pcfd->fd) { + mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i); + return; + } + } +}
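Taken together, the postcopy-ram.c changes above let an external shared-memory backend plug its own userfaultfd into QEMU's fault thread. The sketch below is not part of the patch: the ExampleBackend type, shared_waker() and lookup_shared_mapping() are hypothetical; only the PostCopyFD fields and the registration call come from the header that follows.

/* Hedged sketch of backend glue, under the assumptions stated above. */
#include "qemu/osdep.h"
#include <linux/userfaultfd.h>
#include "migration/postcopy-ram.h"

typedef struct ExampleBackend { int uffd; /* ... */ } ExampleBackend;
/* Hypothetical helpers the backend would provide itself: */
int lookup_shared_mapping(void *opaque, uint64_t client_addr,
                          RAMBlock **rb, uint64_t *rb_offset);
int shared_waker(struct PostCopyFD *pcfd, RAMBlock *rb, uint64_t offset);

static int shared_fault_handler(struct PostCopyFD *pcfd, void *ufd)
{
    struct uffd_msg *msg = ufd;
    uint64_t client_addr = msg->arg.pagefault.address;
    RAMBlock *rb;
    uint64_t rb_offset;

    /* Backend-specific: translate the client's address into QEMU's view */
    if (lookup_shared_mapping(pcfd->data, client_addr, &rb, &rb_offset)) {
        return -1; /* fault outside any region we shared */
    }
    /* Ask the migration code to fetch (or re-deliver) the page */
    return postcopy_request_shared_page(pcfd, rb, client_addr, rb_offset);
}

static void backend_enable_postcopy(ExampleBackend *b)
{
    struct PostCopyFD pcfd = {
        .fd = b->uffd,                 /* userfaultfd handed over by the client */
        .data = b,
        .handler = shared_fault_handler,
        .waker = shared_waker,         /* would issue UFFDIO_WAKE on b->uffd */
        .idstr = "example-backend",
    };

    /* postcopy_register_shared_ufd() copies the struct into a GArray,
     * so a stack-allocated PostCopyFD is fine here. */
    postcopy_register_shared_ufd(&pcfd);
}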
diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h index 14f6cadcbd..d900d9c34f 100644 --- a/migration/postcopy-ram.h +++ b/migration/postcopy-ram.h @@ -116,4 +116,77 @@ PostcopyState postcopy_state_set(PostcopyState new_state); void postcopy_fault_thread_notify(MigrationIncomingState *mis); +/* + * To be called once at the start before any device initialisation + */ +void postcopy_infrastructure_init(void); + +/* Add a notifier to a list to be called when checking whether the devices + * can support postcopy. + * Its data is a *PostcopyNotifyData. + * It should return 0 if OK, or a negative value on failure. + * On failure it must set the data->errp to an error. + */ +enum PostcopyNotifyReason { + POSTCOPY_NOTIFY_PROBE = 0, + POSTCOPY_NOTIFY_INBOUND_ADVISE, + POSTCOPY_NOTIFY_INBOUND_LISTEN, + POSTCOPY_NOTIFY_INBOUND_END, +}; + +struct PostcopyNotifyData { + enum PostcopyNotifyReason reason; + Error **errp; +}; + +void postcopy_add_notifier(NotifierWithReturn *nn); +void postcopy_remove_notifier(NotifierWithReturn *n); +/* Call the notifier list set by postcopy_add_notifier */ +int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp); + +struct PostCopyFD; + +/* ufd is a pointer to the struct uffd_msg. TODO: make this more portable! */ +typedef int (*pcfdhandler)(struct PostCopyFD *pcfd, void *ufd); +/* Notification to wake, either on place or on reception of + * a fault on something that's already arrived (race) + */ +typedef int (*pcfdwake)(struct PostCopyFD *pcfd, RAMBlock *rb, uint64_t offset); + +struct PostCopyFD { + int fd; + /* Data to pass to handler */ + void *data; + /* Handler to be called whenever we get a poll event */ + pcfdhandler handler; + /* Notification to wake shared client */ + pcfdwake waker; + /* A string to use in error messages */ + const char *idstr; +}; + +/* Register a userfaultfd owned by an external process for + * shared memory. + */ +void postcopy_register_shared_ufd(struct PostCopyFD *pcfd); +void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd); +/* Call each of the registered shared 'waker's, telling them of + * the availability of a block. + */ +int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset); +/* postcopy_wake_shared: Notify a client ufd that a page is available + * + * Returns 0 on success + * + * @pcfd: Structure with fd, handler and name as above + * @client_addr: Address in the client program, not QEMU + * @rb: The RAMBlock the page is in + */ +int postcopy_wake_shared(struct PostCopyFD *pcfd, uint64_t client_addr, + RAMBlock *rb); +/* Callback from shared fault handlers to ask for a page */ +int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb, + uint64_t client_addr, uint64_t offset); + #endif
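With this header in place, any subsystem can veto or prepare for postcopy through the notifier chain. A hedged sketch of a subscriber follows; example_supports_postcopy() and the device name are hypothetical, while the types, the errp contract and the registration call come from the header above.

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "migration/postcopy-ram.h"

/* Hypothetical capability probe for the example device */
bool example_supports_postcopy(void);

static int example_postcopy_notifier(NotifierWithReturn *n, void *opaque)
{
    struct PostcopyNotifyData *pnd = opaque;

    if (pnd->reason == POSTCOPY_NOTIFY_PROBE &&
        !example_supports_postcopy()) {
        /* Per the contract above: set data->errp and return negative */
        error_setg(pnd->errp, "example-device: postcopy is not supported");
        return -ENOENT;
    }
    return 0; /* other phases: nothing to do */
}

static NotifierWithReturn example_notifier = {
    .notify = example_postcopy_notifier,
};

static void example_register(void)
{
    postcopy_add_notifier(&example_notifier);
}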
diff --git a/migration/ram.c b/migration/ram.c index 590fceb7e9..0e90efa092 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -169,6 +169,11 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) rb->receivedmap); } +bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) +{ + return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); +} + void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) { set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
diff --git a/migration/ram.h b/migration/ram.h index 53f0021c51..5030be110a 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -60,6 +60,7 @@ int ram_postcopy_incoming_init(MigrationIncomingState *mis); void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); +bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr);
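The new byte-offset test on the receive bitmap is what lets a shared fault handler tell a page that still needs fetching apart from one that raced in already (compare the postcopy_request_shared_page_present trace point below). The sketch here is illustrative only: the stub postcopy_request_shared_page() above merely asserts, so this shows roughly what a full implementation could do, reusing migrate_send_rp_req_pages() as the non-shared path does.

/* Illustrative helper, written as if inside migration/postcopy-ram.c */
static int request_or_wake(struct PostCopyFD *pcfd, RAMBlock *rb,
                           uint64_t client_addr, uint64_t rb_offset)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (ramblock_recv_bitmap_test_byte_offset(rb, rb_offset)) {
        /* Page already arrived: only the client needs waking */
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    /* Otherwise ask the source for one host-page-sized chunk */
    return migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                     rb_offset, qemu_ram_pagesize(rb));
}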
diff --git a/migration/savevm.c b/migration/savevm.c index f417cef7d5..e2be02afe4 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -1395,6 +1395,7 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, { PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE); uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps; + Error *local_err = NULL; trace_loadvm_postcopy_handle_advise(); if (ps != POSTCOPY_INCOMING_NONE) { @@ -1460,6 +1461,11 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis, return -1; } + if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) { + error_report_err(local_err); + return -1; + } + if (ram_postcopy_incoming_init(mis)) { return -1; } @@ -1621,6 +1627,8 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) { PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING); trace_loadvm_postcopy_handle_listen(); + Error *local_err = NULL; + if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) { error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps); return -1; @@ -1646,6 +1654,11 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) } } + if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) { + error_report_err(local_err); + return -1; + } + if (mis->have_listen_thread) { error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread"); return -1;
diff --git a/migration/trace-events b/migration/trace-events index 314e1be6bc..a180d7b008 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -190,12 +190,18 @@ postcopy_place_page_zero(void *host_addr) "host=%p" postcopy_ram_enable_notify(void) "" postcopy_ram_fault_thread_entry(void) "" postcopy_ram_fault_thread_exit(void) "" +postcopy_ram_fault_thread_fds_core(int baseufd, int quitfd) "ufd: %d quitfd: %d" +postcopy_ram_fault_thread_fds_extra(size_t index, const char *name, int fd) "%zd/%s: %d" postcopy_ram_fault_thread_quit(void) "" postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx" postcopy_ram_incoming_cleanup_closeuf(void) "" postcopy_ram_incoming_cleanup_entry(void) "" postcopy_ram_incoming_cleanup_exit(void) "" postcopy_ram_incoming_cleanup_join(void) "" +postcopy_request_shared_page(const char *sharer, const char *rb, uint64_t rb_offset) "for %s in %s offset 0x%"PRIx64 +postcopy_request_shared_page_present(const char *sharer, const char *rb, uint64_t rb_offset) "%s already %s offset 0x%"PRIx64 +postcopy_wake_shared(uint64_t client_addr, const char *rb) "at 0x%"PRIx64" in %s" + save_xbzrle_page_skipping(void) "" save_xbzrle_page_overflow(void) "" ram_save_iterate_big_wait(uint64_t milliseconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
@@ -520,29 +520,34 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, static void numa_stat_memory_devices(NumaNodeMem node_mem[]) { - MemoryDeviceInfoList *info_list = NULL; - MemoryDeviceInfoList **prev = &info_list; + MemoryDeviceInfoList *info_list = qmp_pc_dimm_device_list(); MemoryDeviceInfoList *info; PCDIMMDeviceInfo *pcdimm_info; - qmp_pc_dimm_device_list(qdev_get_machine(), &prev); for (info = info_list; info; info = info->next) { MemoryDeviceInfo *value = info->value; if (value) { switch (value->type) { - case MEMORY_DEVICE_INFO_KIND_DIMM: { + case MEMORY_DEVICE_INFO_KIND_DIMM: pcdimm_info = value->u.dimm.data; + break; + + case MEMORY_DEVICE_INFO_KIND_NVDIMM: + pcdimm_info = value->u.nvdimm.data; + break; + + default: + pcdimm_info = NULL; + break; + } + + if (pcdimm_info) { node_mem[pcdimm_info->node].node_mem += pcdimm_info->size; if (pcdimm_info->hotpluggable && pcdimm_info->hotplugged) { node_mem[pcdimm_info->node].node_plugged_mem += pcdimm_info->size; } - break; - } - - default: - break; } } }
diff --git a/pc-bios/acpi-dsdt.aml b/pc-bios/acpi-dsdt.aml deleted file mode 100644 index 558c10f51c..0000000000 --- a/pc-bios/acpi-dsdt.aml +++ /dev/null Binary files differ
diff --git a/qapi/misc.json b/qapi/misc.json index c31fc983f3..5636f4a149 100644 --- a/qapi/misc.json +++ b/qapi/misc.json @@ -2878,7 +2878,11 @@ # # Since: 2.1 ## -{ 'union': 'MemoryDeviceInfo', 'data': {'dimm': 'PCDIMMDeviceInfo'} } +{ 'union': 'MemoryDeviceInfo', + 'data': { 'dimm': 'PCDIMMDeviceInfo', + 'nvdimm': 'PCDIMMDeviceInfo' + } +} ## # @query-memory-devices: @@ -731,12 +731,7 @@ void qmp_object_del(const char *id, Error **errp)
MemoryDeviceInfoList *qmp_query_memory_devices(Error **errp) { - MemoryDeviceInfoList *head = NULL; - MemoryDeviceInfoList **prev = &head; - - qmp_pc_dimm_device_list(qdev_get_machine(), &prev); - - return head; + return qmp_pc_dimm_device_list(); } ACPIOSTInfoList *qmp_query_acpi_ospm_status(Error **errp) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index da9030a01c..5b1d8dcdf4 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -40,6 +40,9 @@ cp_portable() { -e 'sys/' \ -e 'pvrdma_verbs' \ -e 'drm.h' \ + -e 'limits' \ + -e 'linux/kernel' \ + -e 'linux/sysinfo' \ > /dev/null then echo "Unexpected #include in input file $f". @@ -62,6 +65,10 @@ cp_portable() { -e '/sys\/ioctl.h/d' \ -e 's/SW_MAX/SW_MAX_/' \ -e 's/atomic_t/int/' \ + -e 's/__kernel_long_t/long/' \ + -e 's/__kernel_ulong_t/unsigned long/' \ + -e 's/struct ethhdr/struct eth_header/' \ + -e '/\#define _LINUX_ETHTOOL_H/a \\n\#include "net/eth.h"' \ "$f" > "$to/$header"; } @@ -151,7 +158,9 @@ rm -rf "$output/include/standard-headers/linux" mkdir -p "$output/include/standard-headers/linux" for i in "$tmpdir"/include/linux/*virtio*.h "$tmpdir/include/linux/input.h" \ "$tmpdir/include/linux/input-event-codes.h" \ - "$tmpdir/include/linux/pci_regs.h"; do + "$tmpdir/include/linux/pci_regs.h" \ + "$tmpdir/include/linux/ethtool.h" "$tmpdir/include/linux/kernel.h" \ + "$tmpdir/include/linux/sysinfo.h"; do cp_portable "$i" "$output/include/standard-headers/linux" done mkdir -p "$output/include/standard-headers/drm" diff --git a/stubs/qmp_pc_dimm.c b/stubs/qmp_pc_dimm.c index 9ddc4f619a..b6b2cca89e 100644 --- a/stubs/qmp_pc_dimm.c +++ b/stubs/qmp_pc_dimm.c @@ -2,9 +2,9 @@ #include "qom/object.h" #include "hw/mem/pc-dimm.h" -int qmp_pc_dimm_device_list(Object *obj, void *opaque) +MemoryDeviceInfoList *qmp_pc_dimm_device_list(void) { - return 0; + return NULL; } uint64_t get_plugged_memory_size(void) diff --git a/target/i386/monitor.c b/target/i386/monitor.c index 011419eba2..a890b3c2ab 100644 --- a/target/i386/monitor.c +++ b/target/i386/monitor.c @@ -696,6 +696,8 @@ void hmp_info_sev(Monitor *mon, const QDict *qdict) } else { monitor_printf(mon, "SEV is not enabled\n"); } + + qapi_free_SevInfo(info); } SevLaunchMeasureInfo *qmp_query_sev_launch_measure(Error **errp) diff --git a/target/m68k/translate.c b/target/m68k/translate.c index cef6f663ad..6beaf9ed66 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -123,8 +123,34 @@ typedef struct DisasContext { int done_mac; int writeback_mask; TCGv writeback[8]; +#define MAX_TO_RELEASE 8 + int release_count; + TCGv release[MAX_TO_RELEASE]; } DisasContext; +static void init_release_array(DisasContext *s) +{ +#ifdef CONFIG_DEBUG_TCG + memset(s->release, 0, sizeof(s->release)); +#endif + s->release_count = 0; +} + +static void do_release(DisasContext *s) +{ + int i; + for (i = 0; i < s->release_count; i++) { + tcg_temp_free(s->release[i]); + } + init_release_array(s); +} + +static TCGv mark_to_release(DisasContext *s, TCGv tmp) +{ + g_assert(s->release_count < MAX_TO_RELEASE); + return s->release[s->release_count++] = tmp; +} + static TCGv get_areg(DisasContext *s, unsigned regno) { if (s->writeback_mask & (1 << regno)) { @@ -347,7 +373,8 @@ static TCGv gen_ldst(DisasContext *s, int opsize, TCGv addr, TCGv val, gen_store(s, opsize, addr, val, index); return store_dummy; } else { - return gen_load(s, opsize, addr, what == EA_LOADS, index); + return mark_to_release(s, gen_load(s, opsize, addr, + what == EA_LOADS, 
index)); } } @@ -439,7 +466,7 @@ static TCGv gen_lea_indexed(CPUM68KState *env, DisasContext *s, TCGv base) } else { bd = 0; } - tmp = tcg_temp_new(); + tmp = mark_to_release(s, tcg_temp_new()); if ((ext & 0x44) == 0) { /* pre-index */ add = gen_addr_index(s, ext, tmp); @@ -449,7 +476,7 @@ static TCGv gen_lea_indexed(CPUM68KState *env, DisasContext *s, TCGv base) if ((ext & 0x80) == 0) { /* base not suppressed */ if (IS_NULL_QREG(base)) { - base = tcg_const_i32(offset + bd); + base = mark_to_release(s, tcg_const_i32(offset + bd)); bd = 0; } if (!IS_NULL_QREG(add)) { @@ -465,11 +492,11 @@ static TCGv gen_lea_indexed(CPUM68KState *env, DisasContext *s, TCGv base) add = tmp; } } else { - add = tcg_const_i32(bd); + add = mark_to_release(s, tcg_const_i32(bd)); } if ((ext & 3) != 0) { /* memory indirect */ - base = gen_load(s, OS_LONG, add, 0, IS_USER(s)); + base = mark_to_release(s, gen_load(s, OS_LONG, add, 0, IS_USER(s))); if ((ext & 0x44) == 4) { add = gen_addr_index(s, ext, tmp); tcg_gen_add_i32(tmp, add, base); @@ -494,7 +521,7 @@ static TCGv gen_lea_indexed(CPUM68KState *env, DisasContext *s, TCGv base) } } else { /* brief extension word format */ - tmp = tcg_temp_new(); + tmp = mark_to_release(s, tcg_temp_new()); add = gen_addr_index(s, ext, tmp); if (!IS_NULL_QREG(base)) { tcg_gen_add_i32(tmp, add, base); @@ -617,14 +644,14 @@ static void gen_flush_flags(DisasContext *s) s->cc_op = CC_OP_FLAGS; } -static inline TCGv gen_extend(TCGv val, int opsize, int sign) +static inline TCGv gen_extend(DisasContext *s, TCGv val, int opsize, int sign) { TCGv tmp; if (opsize == OS_LONG) { tmp = val; } else { - tmp = tcg_temp_new(); + tmp = mark_to_release(s, tcg_temp_new()); gen_ext(tmp, val, opsize, sign); } @@ -746,7 +773,7 @@ static TCGv gen_lea_mode(CPUM68KState *env, DisasContext *s, return NULL_QREG; } reg = get_areg(s, reg0); - tmp = tcg_temp_new(); + tmp = mark_to_release(s, tcg_temp_new()); if (reg0 == 7 && opsize == OS_BYTE && m68k_feature(s->env, M68K_FEATURE_M68000)) { tcg_gen_subi_i32(tmp, reg, 2); @@ -756,7 +783,7 @@ static TCGv gen_lea_mode(CPUM68KState *env, DisasContext *s, return tmp; case 5: /* Indirect displacement. */ reg = get_areg(s, reg0); - tmp = tcg_temp_new(); + tmp = mark_to_release(s, tcg_temp_new()); ext = read_im16(env, s); tcg_gen_addi_i32(tmp, reg, (int16_t)ext); return tmp; @@ -767,14 +794,14 @@ static TCGv gen_lea_mode(CPUM68KState *env, DisasContext *s, switch (reg0) { case 0: /* Absolute short. */ offset = (int16_t)read_im16(env, s); - return tcg_const_i32(offset); + return mark_to_release(s, tcg_const_i32(offset)); case 1: /* Absolute long. */ offset = read_im32(env, s); - return tcg_const_i32(offset); + return mark_to_release(s, tcg_const_i32(offset)); case 2: /* pc displacement */ offset = s->pc; offset += (int16_t)read_im16(env, s); - return tcg_const_i32(offset); + return mark_to_release(s, tcg_const_i32(offset)); case 3: /* pc index+displacement. */ return gen_lea_indexed(env, s, NULL_QREG); case 4: /* Immediate. */ @@ -811,7 +838,7 @@ static TCGv gen_ea_mode(CPUM68KState *env, DisasContext *s, int mode, int reg0, gen_partset_reg(opsize, reg, val); return store_dummy; } else { - return gen_extend(reg, opsize, what == EA_LOADS); + return gen_extend(s, reg, opsize, what == EA_LOADS); } case 1: /* Address register direct. 
*/ reg = get_areg(s, reg0); @@ -819,7 +846,7 @@ static TCGv gen_ea_mode(CPUM68KState *env, DisasContext *s, int mode, int reg0, tcg_gen_mov_i32(reg, val); return store_dummy; } else { - return gen_extend(reg, opsize, what == EA_LOADS); + return gen_extend(s, reg, opsize, what == EA_LOADS); } case 2: /* Indirect register */ reg = get_areg(s, reg0); @@ -900,7 +927,7 @@ static TCGv gen_ea_mode(CPUM68KState *env, DisasContext *s, int mode, int reg0, default: g_assert_not_reached(); } - return tcg_const_i32(offset); + return mark_to_release(s, tcg_const_i32(offset)); default: return NULL_QREG; } @@ -1759,8 +1786,8 @@ DISAS_INSN(abcd_reg) gen_flush_flags(s); /* !Z is sticky */ - src = gen_extend(DREG(insn, 0), OS_BYTE, 0); - dest = gen_extend(DREG(insn, 9), OS_BYTE, 0); + src = gen_extend(s, DREG(insn, 0), OS_BYTE, 0); + dest = gen_extend(s, DREG(insn, 9), OS_BYTE, 0); bcd_add(dest, src); gen_partset_reg(OS_BYTE, DREG(insn, 9), dest); @@ -1794,8 +1821,8 @@ DISAS_INSN(sbcd_reg) gen_flush_flags(s); /* !Z is sticky */ - src = gen_extend(DREG(insn, 0), OS_BYTE, 0); - dest = gen_extend(DREG(insn, 9), OS_BYTE, 0); + src = gen_extend(s, DREG(insn, 0), OS_BYTE, 0); + dest = gen_extend(s, DREG(insn, 9), OS_BYTE, 0); bcd_sub(dest, src); @@ -1856,7 +1883,7 @@ DISAS_INSN(addsub) add = (insn & 0x4000) != 0; opsize = insn_opsize(insn); - reg = gen_extend(DREG(insn, 9), opsize, 1); + reg = gen_extend(s, DREG(insn, 9), opsize, 1); dest = tcg_temp_new(); if (insn & 0x100) { SRC_EA(env, tmp, opsize, 1, &addr); @@ -2386,7 +2413,7 @@ DISAS_INSN(cas) return; } - cmp = gen_extend(DREG(ext, 0), opsize, 1); + cmp = gen_extend(s, DREG(ext, 0), opsize, 1); /* if <EA> == Dc then * <EA> = Du @@ -3055,7 +3082,7 @@ DISAS_INSN(or) int opsize; opsize = insn_opsize(insn); - reg = gen_extend(DREG(insn, 9), opsize, 0); + reg = gen_extend(s, DREG(insn, 9), opsize, 0); dest = tcg_temp_new(); if (insn & 0x100) { SRC_EA(env, src, opsize, 0, &addr); @@ -3120,8 +3147,8 @@ DISAS_INSN(subx_reg) opsize = insn_opsize(insn); - src = gen_extend(DREG(insn, 0), opsize, 1); - dest = gen_extend(DREG(insn, 9), opsize, 1); + src = gen_extend(s, DREG(insn, 0), opsize, 1); + dest = gen_extend(s, DREG(insn, 9), opsize, 1); gen_subx(s, src, dest, opsize); @@ -3176,7 +3203,7 @@ DISAS_INSN(cmp) opsize = insn_opsize(insn); SRC_EA(env, src, opsize, 1, NULL); - reg = gen_extend(DREG(insn, 9), opsize, 1); + reg = gen_extend(s, DREG(insn, 9), opsize, 1); gen_update_cc_cmp(s, reg, src, opsize); } @@ -3329,8 +3356,8 @@ DISAS_INSN(addx_reg) opsize = insn_opsize(insn); - dest = gen_extend(DREG(insn, 9), opsize, 1); - src = gen_extend(DREG(insn, 0), opsize, 1); + dest = gen_extend(s, DREG(insn, 9), opsize, 1); + src = gen_extend(s, DREG(insn, 0), opsize, 1); gen_addx(s, src, dest, opsize); @@ -3369,7 +3396,7 @@ static inline void shift_im(DisasContext *s, uint16_t insn, int opsize) int logical = insn & 8; int left = insn & 0x100; int bits = opsize_bytes(opsize) * 8; - TCGv reg = gen_extend(DREG(insn, 0), opsize, !logical); + TCGv reg = gen_extend(s, DREG(insn, 0), opsize, !logical); if (count == 0) { count = 8; @@ -3419,7 +3446,7 @@ static inline void shift_reg(DisasContext *s, uint16_t insn, int opsize) int logical = insn & 8; int left = insn & 0x100; int bits = opsize_bytes(opsize) * 8; - TCGv reg = gen_extend(DREG(insn, 0), opsize, !logical); + TCGv reg = gen_extend(s, DREG(insn, 0), opsize, !logical); TCGv s32; TCGv_i64 t64, s64; @@ -3556,7 +3583,7 @@ DISAS_INSN(shift_mem) while M68000 sets if the most significant bit is changed at any time during the shift 
operation */ if (!logical && m68k_feature(s->env, M68K_FEATURE_M68000)) { - src = gen_extend(src, OS_WORD, 1); + src = gen_extend(s, src, OS_WORD, 1); tcg_gen_xor_i32(QREG_CC_V, QREG_CC_N, src); } } else { @@ -3789,7 +3816,7 @@ DISAS_INSN(rotate8_im) TCGv shift; int tmp; - reg = gen_extend(DREG(insn, 0), OS_BYTE, 0); + reg = gen_extend(s, DREG(insn, 0), OS_BYTE, 0); tmp = (insn >> 9) & 7; if (tmp == 0) { @@ -3816,7 +3843,7 @@ DISAS_INSN(rotate16_im) TCGv shift; int tmp; - reg = gen_extend(DREG(insn, 0), OS_WORD, 0); + reg = gen_extend(s, DREG(insn, 0), OS_WORD, 0); tmp = (insn >> 9) & 7; if (tmp == 0) { tmp = 8; @@ -3876,7 +3903,7 @@ DISAS_INSN(rotate8_reg) TCGv t0, t1; int left = (insn & 0x100); - reg = gen_extend(DREG(insn, 0), OS_BYTE, 0); + reg = gen_extend(s, DREG(insn, 0), OS_BYTE, 0); src = DREG(insn, 9); /* shift in [0..63] */ t0 = tcg_temp_new_i32(); @@ -3911,7 +3938,7 @@ DISAS_INSN(rotate16_reg) TCGv t0, t1; int left = (insn & 0x100); - reg = gen_extend(DREG(insn, 0), OS_WORD, 0); + reg = gen_extend(s, DREG(insn, 0), OS_WORD, 0); src = DREG(insn, 9); /* shift in [0..63] */ t0 = tcg_temp_new_i32(); @@ -4353,7 +4380,7 @@ DISAS_INSN(chk) return; } SRC_EA(env, src, opsize, 1, NULL); - reg = gen_extend(DREG(insn, 9), opsize, 1); + reg = gen_extend(s, DREG(insn, 9), opsize, 1); gen_flush_flags(s); gen_helper_chk(cpu_env, reg, src); @@ -6033,6 +6060,7 @@ static void disas_m68k_insn(CPUM68KState * env, DisasContext *s) uint16_t insn = read_im16(env, s); opcode_table[insn](env, s, insn); do_writebacks(s); + do_release(s); } /* generate intermediate code for basic block 'tb'. */ @@ -6067,6 +6095,8 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb) max_insns = TCG_MAX_INSNS; } + init_release_array(dc); + gen_tb_start(tb); do { pc_offset = dc->pc - pc_start;
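The net effect of the release-array plumbing above: temporaries produced by the addressing-mode helpers are now tracked per instruction and freed centrally by do_release() after each opcode handler returns. A hypothetical handler shows the resulting convention; it is illustrative only, not an instruction from this patch.

DISAS_INSN(example)
{
    /* gen_extend() with OS_BYTE allocates a temp and tracks it via
     * mark_to_release(); the handler must not free it itself. */
    TCGv src = gen_extend(s, DREG(insn, 0), OS_BYTE, 0);
    /* Temps created directly are still the handler's own to free. */
    TCGv tmp = tcg_temp_new();

    tcg_gen_add_i32(tmp, src, src);
    tcg_gen_mov_i32(DREG(insn, 9), tmp);
    tcg_temp_free(tmp);
    /* src is reclaimed by do_release() in disas_m68k_insn() */
}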
diff --git a/tests/acpi-test-data/pc/APIC.dimmpxm b/tests/acpi-test-data/pc/APIC.dimmpxm new file mode 100644 index 0000000000..427bb08248 --- /dev/null +++ b/tests/acpi-test-data/pc/APIC.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/pc/DSDT.dimmpxm b/tests/acpi-test-data/pc/DSDT.dimmpxm new file mode 100644 index 0000000000..38661cb13e --- /dev/null +++ b/tests/acpi-test-data/pc/DSDT.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/pc/NFIT.dimmpxm b/tests/acpi-test-data/pc/NFIT.dimmpxm new file mode 100644 index 0000000000..2bfc6c51f3 --- /dev/null +++ b/tests/acpi-test-data/pc/NFIT.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/pc/SRAT.dimmpxm b/tests/acpi-test-data/pc/SRAT.dimmpxm new file mode 100644 index 0000000000..3b10a607d5 --- /dev/null +++ b/tests/acpi-test-data/pc/SRAT.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/pc/SSDT.dimmpxm b/tests/acpi-test-data/pc/SSDT.dimmpxm new file mode 100644 index 0000000000..8ba0e67cb7 --- /dev/null +++ b/tests/acpi-test-data/pc/SSDT.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/q35/APIC.dimmpxm b/tests/acpi-test-data/q35/APIC.dimmpxm new file mode 100644 index 0000000000..427bb08248 --- /dev/null +++ b/tests/acpi-test-data/q35/APIC.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/q35/DSDT.dimmpxm b/tests/acpi-test-data/q35/DSDT.dimmpxm new file mode 100644 index 0000000000..14904e8ea2 --- /dev/null +++ b/tests/acpi-test-data/q35/DSDT.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/q35/NFIT.dimmpxm b/tests/acpi-test-data/q35/NFIT.dimmpxm new file mode 100644 index 0000000000..2bfc6c51f3 --- /dev/null +++ b/tests/acpi-test-data/q35/NFIT.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/q35/SRAT.dimmpxm b/tests/acpi-test-data/q35/SRAT.dimmpxm new file mode 100644 index 0000000000..3b10a607d5 --- /dev/null +++ b/tests/acpi-test-data/q35/SRAT.dimmpxm Binary files differ
diff --git a/tests/acpi-test-data/q35/SSDT.dimmpxm b/tests/acpi-test-data/q35/SSDT.dimmpxm new file mode 100644 index 0000000000..8ba0e67cb7 --- /dev/null +++ b/tests/acpi-test-data/q35/SSDT.dimmpxm Binary files differ
diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index 65b271a173..bf3e193ae9 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -29,7 +29,8 @@ typedef struct { uint32_t rsdp_addr; AcpiRsdpDescriptor rsdp_table; AcpiRsdtDescriptorRev1 rsdt_table; - AcpiFadtDescriptorRev3 fadt_table; + uint32_t dsdt_addr; + uint32_t facs_addr; AcpiFacsDescriptorRev1 facs_table; uint32_t *rsdt_tables_addr; int rsdt_tables_nr; @@ -127,71 +128,18 @@ static void test_acpi_rsdt_table(test_data *data) data->rsdt_tables_nr = tables_nr; } -static void test_acpi_fadt_table(test_data *data) +static void fadt_fetch_facs_and_dsdt_ptrs(test_data *data) { - AcpiFadtDescriptorRev3 *fadt_table = &data->fadt_table; uint32_t addr; + AcpiTableHeader hdr; /* FADT table comes first */ addr = le32_to_cpu(data->rsdt_tables_addr[0]); - ACPI_READ_TABLE_HEADER(fadt_table, addr); - - ACPI_READ_FIELD(fadt_table->firmware_ctrl, addr); - ACPI_READ_FIELD(fadt_table->dsdt, addr); - ACPI_READ_FIELD(fadt_table->model, addr); - ACPI_READ_FIELD(fadt_table->reserved1, addr); - ACPI_READ_FIELD(fadt_table->sci_int, addr); - ACPI_READ_FIELD(fadt_table->smi_cmd, addr); - ACPI_READ_FIELD(fadt_table->acpi_enable, addr); - ACPI_READ_FIELD(fadt_table->acpi_disable, addr); - ACPI_READ_FIELD(fadt_table->S4bios_req, addr); - ACPI_READ_FIELD(fadt_table->reserved2, addr); - ACPI_READ_FIELD(fadt_table->pm1a_evt_blk, addr); - ACPI_READ_FIELD(fadt_table->pm1b_evt_blk, addr); - ACPI_READ_FIELD(fadt_table->pm1a_cnt_blk, addr); - ACPI_READ_FIELD(fadt_table->pm1b_cnt_blk, addr); - ACPI_READ_FIELD(fadt_table->pm2_cnt_blk, addr); - ACPI_READ_FIELD(fadt_table->pm_tmr_blk, addr); - ACPI_READ_FIELD(fadt_table->gpe0_blk, addr); - ACPI_READ_FIELD(fadt_table->gpe1_blk, addr); - ACPI_READ_FIELD(fadt_table->pm1_evt_len, addr); - ACPI_READ_FIELD(fadt_table->pm1_cnt_len, addr); - ACPI_READ_FIELD(fadt_table->pm2_cnt_len, addr); - ACPI_READ_FIELD(fadt_table->pm_tmr_len, addr); - ACPI_READ_FIELD(fadt_table->gpe0_blk_len, addr); - ACPI_READ_FIELD(fadt_table->gpe1_blk_len, addr); - ACPI_READ_FIELD(fadt_table->gpe1_base, addr); - ACPI_READ_FIELD(fadt_table->reserved3, addr); - ACPI_READ_FIELD(fadt_table->plvl2_lat, addr); - ACPI_READ_FIELD(fadt_table->plvl3_lat, addr); - ACPI_READ_FIELD(fadt_table->flush_size, addr); - ACPI_READ_FIELD(fadt_table->flush_stride, addr); - ACPI_READ_FIELD(fadt_table->duty_offset, addr); - ACPI_READ_FIELD(fadt_table->duty_width, addr); - ACPI_READ_FIELD(fadt_table->day_alrm, addr); - ACPI_READ_FIELD(fadt_table->mon_alrm, addr); - ACPI_READ_FIELD(fadt_table->century, addr); - ACPI_READ_FIELD(fadt_table->boot_flags, addr); - ACPI_READ_FIELD(fadt_table->reserved, addr); - ACPI_READ_FIELD(fadt_table->flags, addr); - ACPI_READ_GENERIC_ADDRESS(fadt_table->reset_register, addr); - ACPI_READ_FIELD(fadt_table->reset_value, addr); - ACPI_READ_FIELD(fadt_table->arm_boot_flags, addr); - ACPI_READ_FIELD(fadt_table->minor_revision, addr); - ACPI_READ_FIELD(fadt_table->x_facs, addr); - ACPI_READ_FIELD(fadt_table->x_dsdt, addr); - ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm1a_event_block, addr); -
ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm1b_event_block, addr); - ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm1a_control_block, addr); - ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm1b_control_block, addr); - ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm2_control_block, addr); - ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm_timer_block, addr); - ACPI_READ_GENERIC_ADDRESS(fadt_table->xgpe0_block, addr); - ACPI_READ_GENERIC_ADDRESS(fadt_table->xgpe1_block, addr); - - ACPI_ASSERT_CMP(fadt_table->signature, "FACP"); - g_assert(!acpi_calc_checksum((uint8_t *)fadt_table, - le32_to_cpu(fadt_table->length))); + ACPI_READ_TABLE_HEADER(&hdr, addr); + ACPI_ASSERT_CMP(hdr.signature, "FACP"); + + ACPI_READ_FIELD(data->facs_addr, addr); + ACPI_READ_FIELD(data->dsdt_addr, addr); } static void sanitize_fadt_ptrs(test_data *data) @@ -206,6 +154,12 @@ static void sanitize_fadt_ptrs(test_data *data) continue; } + /* check original FADT checksum before sanitizing table */ + g_assert(!(uint8_t)( + acpi_calc_checksum((uint8_t *)sdt, sizeof(AcpiTableHeader)) + + acpi_calc_checksum((uint8_t *)sdt->aml, sdt->aml_len) + )); + /* sdt->aml field offset := spec offset - header size */ memset(sdt->aml + 0, 0, 4); /* sanitize FIRMWARE_CTRL(36) ptr */ memset(sdt->aml + 4, 0, 4); /* sanitize DSDT(40) ptr */ @@ -226,7 +180,7 @@ static void sanitize_fadt_ptrs(test_data *data) static void test_acpi_facs_table(test_data *data) { AcpiFacsDescriptorRev1 *facs_table = &data->facs_table; - uint32_t addr = le32_to_cpu(data->fadt_table.firmware_ctrl); + uint32_t addr = le32_to_cpu(data->facs_addr); ACPI_READ_FIELD(facs_table->signature, addr); ACPI_READ_FIELD(facs_table->length, addr); @@ -265,7 +219,7 @@ static void fetch_table(AcpiSdtTable *sdt_table, uint32_t addr) static void test_acpi_dsdt_table(test_data *data) { AcpiSdtTable dsdt_table; - uint32_t addr = le32_to_cpu(data->fadt_table.dsdt); + uint32_t addr = le32_to_cpu(data->dsdt_addr); fetch_table(&dsdt_table, addr); ACPI_ASSERT_CMP(dsdt_table.header.signature, "DSDT"); @@ -674,7 +628,7 @@ static void test_acpi_one(const char *params, test_data *data) test_acpi_rsdp_address(data); test_acpi_rsdp_table(data); test_acpi_rsdt_table(data); - test_acpi_fadt_table(data); + fadt_fetch_facs_and_dsdt_ptrs(data); test_acpi_facs_table(data); test_acpi_dsdt_table(data); fetch_rsdt_referenced_tables(data); @@ -869,6 +823,42 @@ static void test_acpi_piix4_tcg_numamem(void) free_test_data(&data); } +static void test_acpi_tcg_dimm_pxm(const char *machine) +{ + test_data data; + + memset(&data, 0, sizeof(data)); + data.machine = machine; + data.variant = ".dimmpxm"; + test_acpi_one(" -machine nvdimm=on" + " -smp 4,sockets=4" + " -m 128M,slots=3,maxmem=1G" + " -numa node,mem=32M,nodeid=0" + " -numa node,mem=32M,nodeid=1" + " -numa node,mem=32M,nodeid=2" + " -numa node,mem=32M,nodeid=3" + " -numa cpu,node-id=0,socket-id=0" + " -numa cpu,node-id=1,socket-id=1" + " -numa cpu,node-id=2,socket-id=2" + " -numa cpu,node-id=3,socket-id=3" + " -object memory-backend-ram,id=ram0,size=128M" + " -object memory-backend-ram,id=nvm0,size=128M" + " -device pc-dimm,id=dimm0,memdev=ram0,node=1" + " -device nvdimm,id=dimm1,memdev=nvm0,node=2", + &data); + free_test_data(&data); +} + +static void test_acpi_q35_tcg_dimm_pxm(void) +{ + test_acpi_tcg_dimm_pxm(MACHINE_Q35); +} + +static void test_acpi_piix4_tcg_dimm_pxm(void) +{ + test_acpi_tcg_dimm_pxm(MACHINE_PC); +} + int main(int argc, char *argv[]) { const char *arch = qtest_get_arch(); @@ -893,6 +883,8 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/q35/memhp", 
test_acpi_q35_tcg_memhp); qtest_add_func("acpi/piix4/numamem", test_acpi_piix4_tcg_numamem); qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem); + qtest_add_func("acpi/piix4/dimmpxm", test_acpi_piix4_tcg_dimm_pxm); + qtest_add_func("acpi/q35/dimmpxm", test_acpi_q35_tcg_dimm_pxm); } ret = g_test_run(); boot_sector_cleanup(disk); diff --git a/trace-events b/trace-events index 22c7dd4ac6..ed71f44649 100644 --- a/trace-events +++ b/trace-events @@ -58,9 +58,10 @@ dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p" dma_blk_cb(void *dbs, int ret) "dbs=%p ret=%d" dma_map_wait(void *dbs) "dbs=%p" -# # exec.c +# exec.c find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx64 find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64 +ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d" # memory.c memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u" @@ -94,6 +94,7 @@ int main(int argc, char **argv) #include "audio/audio.h" #include "sysemu/cpus.h" #include "migration/colo.h" +#include "migration/postcopy-ram.h" #include "sysemu/kvm.h" #include "sysemu/hax.h" #include "qapi/qobject-input-visitor.h" @@ -3101,6 +3102,7 @@ int main(int argc, char **argv, char **envp) module_call_init(MODULE_INIT_OPTS); runstate_init(); + postcopy_infrastructure_init(); if (qcrypto_init(&err) < 0) { error_reportf_err(err, "cannot initialize crypto: "); |
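vl.c calls postcopy_infrastructure_init() immediately after runstate_init(), before any machine or device is created, so that device code may already call postcopy_add_notifier() while it is realized. The hunks above do not show the bodies of the notifier functions; a plausible sketch in terms of QEMU's existing NotifierWithReturn primitives, offered here only as an assumption, would be:

/* Sketch only -- the patch hunks above do not include these bodies. */
static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd = { .reason = reason, .errp = errp };

    /* Stops and returns nonzero as soon as one notifier fails */
    return notifier_with_return_list_notify(&postcopy_notifier_list, &pnd);
}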