From dda53ee93af90547912f0ea99ca85deba49fd364 Mon Sep 17 00:00:00 2001 From: Zihan Yang Date: Wed, 25 Apr 2018 17:52:23 +0800 Subject: hw/pci-host/q35: Replace hardcoded value with macro During SMRAM region initialization some addresses are hardcoded; replace them with macros to make the code clearer to readers. The previous patch missed one value and exceeded the 90-character line limit; v2 breaks up the long lines. Signed-off-by: Zihan Yang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci-host/q35.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index a36a1195e4..02f9576588 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -535,13 +535,15 @@ static void mch_realize(PCIDevice *d, Error **errp) /* if *disabled* show SMRAM to all CPUs */ memory_region_init_alias(&mch->smram_region, OBJECT(mch), "smram-region", - mch->pci_address_space, 0xa0000, 0x20000); - memory_region_add_subregion_overlap(mch->system_memory, 0xa0000, + mch->pci_address_space, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); + memory_region_add_subregion_overlap(mch->system_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, &mch->smram_region, 1); memory_region_set_enabled(&mch->smram_region, true); memory_region_init_alias(&mch->open_high_smram, OBJECT(mch), "smram-open-high", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_add_subregion_overlap(mch->system_memory, 0xfeda0000, &mch->open_high_smram, 1); memory_region_set_enabled(&mch->open_high_smram, false); @@ -550,11 +552,14 @@ static void mch_realize(PCIDevice *d, Error **errp) memory_region_init(&mch->smram, OBJECT(mch), "smram", 1ull << 32); memory_region_set_enabled(&mch->smram, true); memory_region_init_alias(&mch->low_smram, OBJECT(mch), "smram-low", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_set_enabled(&mch->low_smram, true); - memory_region_add_subregion(&mch->smram, 0xa0000, &mch->low_smram); + memory_region_add_subregion(&mch->smram, MCH_HOST_BRIDGE_SMRAM_C_BASE, + &mch->low_smram); memory_region_init_alias(&mch->high_smram, OBJECT(mch), "smram-high", - mch->ram_memory, 0xa0000, 0x20000); + mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE, + MCH_HOST_BRIDGE_SMRAM_C_SIZE); memory_region_set_enabled(&mch->high_smram, true); memory_region_add_subregion(&mch->smram, 0xfeda0000, &mch->high_smram); -- cgit v1.2.3 From 50a6fa8f939a3448967592e5136afcc063f9a25d Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 24 Apr 2018 08:38:57 +0200 Subject: allocate pci id for mdpy mdpy is a sample PCI device for vfio-mdev. It is not (yet) merged upstream; the patch is available here: https://www.kraxel.org/cgit/linux/commit/?h=vfio-sample-display&id=6fd86cff3d7df38ab89625b16fdd6434b1c18749 Cc: Alex Williamson Signed-off-by: Gerd Hoffmann Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin
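[Editor's note: for context, a sketch of what the reserved ID means for the kernel sample. mdpy emulates its own PCI config space, so it stores the Red Hat vendor ID (1b36) and the newly reserved device ID (000f) at the standard config-space offsets. This is a hedged illustration, not code from the linked patch; the mdpy_set_ids() helper and the vconfig buffer are hypothetical stand-ins.]

    /* Hedged sketch: how an mdev sample device could advertise the new ID.
     * PCI config space is little-endian; offsets come from linux/pci_regs.h
     * (PCI_VENDOR_ID is 0x00, PCI_DEVICE_ID is 0x02). */
    #include <linux/pci_regs.h>
    #include <linux/types.h>

    #define MDPY_PCI_VENDOR_ID 0x1b36   /* PCI_VENDOR_ID_REDHAT */
    #define MDPY_PCI_DEVICE_ID 0x000f   /* PCI_DEVICE_ID_REDHAT_MDPY */

    static void mdpy_set_ids(u8 *vconfig)   /* hypothetical helper */
    {
            vconfig[PCI_VENDOR_ID + 0] = MDPY_PCI_VENDOR_ID & 0xff;
            vconfig[PCI_VENDOR_ID + 1] = MDPY_PCI_VENDOR_ID >> 8;
            vconfig[PCI_DEVICE_ID + 0] = MDPY_PCI_DEVICE_ID & 0xff;
            vconfig[PCI_DEVICE_ID + 1] = MDPY_PCI_DEVICE_ID >> 8;
    }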
--- docs/specs/pci-ids.txt | 1 + include/hw/pci/pci.h | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt index bb99a0257e..4d53e5c7d9 100644 --- a/docs/specs/pci-ids.txt +++ b/docs/specs/pci-ids.txt @@ -62,6 +62,7 @@ PCI devices (other than virtio): 1b36:000a PCI-PCI bridge (multiseat) 1b36:000b PCIe Expander Bridge (-device pxb-pcie) 1b36:000d PCI xhci usb host adapter +1b36:000f mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c All these devices are documented in docs/specs. diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index a9c3ee5aa2..990d6fcbde 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -101,6 +101,7 @@ extern bool pci_available; #define PCI_DEVICE_ID_REDHAT_PCIE_RP 0x000c #define PCI_DEVICE_ID_REDHAT_XHCI 0x000d #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e +#define PCI_DEVICE_ID_REDHAT_MDPY 0x000f #define PCI_DEVICE_ID_REDHAT_QXL 0x0100 #define FMT_PCIBUS PRIx64 -- cgit v1.2.3 From b7b126442977dcfa5641e38b084bd13a0b366cff Mon Sep 17 00:00:00 2001 From: Jonathan Helman Date: Mon, 19 Mar 2018 15:28:49 -0700 Subject: virtio-balloon: add hugetlb page allocation counts QEMU should read and report the hugetlb page allocation counts exported by the following kernel patch: commit 4c3ca37c4a4394978fd0f005625f6064ed2b9a64 Author: Jonathan Helman Date: Mon Mar 19 11:00:35 2018 -0700 virtio_balloon: export hugetlb page allocation counts Export the number of successful and failed hugetlb page allocations via the virtio balloon driver. These 2 counts come directly from the vm_events HTLB_BUDDY_PGALLOC and HTLB_BUDDY_PGALLOC_FAIL. Signed-off-by: Jonathan Helman Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Jason Wang --- docs/virtio-balloon-stats.txt | 2 ++ hw/virtio/virtio-balloon.c | 2 ++ include/standard-headers/linux/virtio_balloon.h | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/virtio-balloon-stats.txt b/docs/virtio-balloon-stats.txt index 7a66d25da5..9985e1dffc 100644 --- a/docs/virtio-balloon-stats.txt +++ b/docs/virtio-balloon-stats.txt @@ -34,6 +34,8 @@ which will return a dictionary containing: - stat-total-memory - stat-available-memory - stat-disk-caches + - stat-htlb-pgalloc + - stat-htlb-pgfail o A key named last-update, which contains the last stats update timestamp in seconds.
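[Editor's note: for reference, the guest-side counterpart in the kernel patch quoted above fills these two stats from the vm_event counters. A minimal sketch of that hunk, assuming the driver's existing update_stat() helper and an events[] array filled by all_vm_events(); see the quoted commit for the exact code.]

    /* In update_balloon_stats(), alongside the existing counters
     * (illustrative fragment of the guest kernel driver): */
    #ifdef CONFIG_HUGETLB_PAGE
            update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
                        events[HTLB_BUDDY_PGALLOC]);
            update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
                        events[HTLB_BUDDY_PGALLOC_FAIL]);
    #endif

On the QEMU side, the new counters then appear as the two guest-stats keys added to the documentation above and wired up in the code changes that follow.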
Since this timestamp is generated by the host, diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index f456cea2e7..1f7a87f094 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -52,6 +52,8 @@ static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory", [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory", [VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches", + [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc", + [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail", [VIRTIO_BALLOON_S_NR] = NULL }; diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h index 7b0a41b8fc..e446805ae9 100644 --- a/include/standard-headers/linux/virtio_balloon.h +++ b/include/standard-headers/linux/virtio_balloon.h @@ -53,7 +53,9 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ #define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */ #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */ -#define VIRTIO_BALLOON_S_NR 8 +#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ +#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ +#define VIRTIO_BALLOON_S_NR 10 /* * Memory statistics structure. -- cgit v1.2.3 From ffcbbe722fcb3c162318cba1b94a115498e25acd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 27 Apr 2018 17:07:24 +0800 Subject: vhost: add trace for IOTLB miss Add some trace points for IOTLB translation in vhost. After vhost-user is set up, the only I/O path QEMU still participates in should be IOMMU translation, so it is worth tracking it with explicit timestamps: when needed, we can see how long a translation takes and whether anything gets stuck along the way. This may be useful for triaging vhost-user problems. Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin
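[Editor's note: as a reading aid, the step argument of the new trace point encodes where in vhost_device_iotlb_miss() the event fires. The legend below is my reading of the patch that follows, and the command line shows one way to enable the event, assuming a tracing-enabled build.]

    /*
     * trace_vhost_iotlb_miss(dev, 1) - translation request received
     * trace_vhost_iotlb_miss(dev, 3) - vhost_memory_region_lookup() failed
     * trace_vhost_iotlb_miss(dev, 4) - vhost_backend_update_device_iotlb()
     *                                  failed
     * trace_vhost_iotlb_miss(dev, 2) - finished without hitting an error
     *                                  path
     *
     * Enable at startup with e.g.:
     *   qemu-system-x86_64 -trace enable=vhost_iotlb_miss ...
     * The timestamp delta between step 1 and step 2 then gives the
     * per-miss translation latency.
     */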
--- hw/virtio/trace-events | 1 + hw/virtio/vhost.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index 1422ff03ab..07bcbe9e85 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -6,6 +6,7 @@ vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64 vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64 vhost_section(const char *name, int r) "%s:%d" +vhost_iotlb_miss(void *dev, int step) "%p step %d" # hw/virtio/vhost-user.c vhost_user_postcopy_end_entry(void) "" diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 9d5850a7d7..b08290036d 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -894,12 +894,15 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) rcu_read_lock(); + trace_vhost_iotlb_miss(dev, 1); + iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, iova, write); if (iotlb.target_as != NULL) { ret = vhost_memory_region_lookup(dev, iotlb.translated_addr, &uaddr, &len); if (ret) { + trace_vhost_iotlb_miss(dev, 3); error_report("Fail to lookup the translated address " "%"PRIx64, iotlb.translated_addr); goto out; @@ -911,10 +914,14 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write) ret = vhost_backend_update_device_iotlb(dev, iova, uaddr, len, iotlb.perm); if (ret) { + trace_vhost_iotlb_miss(dev, 4); error_report("Fail to update device iotlb"); goto out; } } + + trace_vhost_iotlb_miss(dev, 2); + out: rcu_read_unlock(); -- cgit v1.2.3 From ec7b10800c3c91afe6777ea13068ccf280a741ce Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 17 Apr 2018 21:42:21 +0300 Subject: update-linux-headers.sh: drop kvm_para.h hacks It turns out (as the follow-up patches make clear) that we do not really need any kvm_para macros on the host side for now, except on x86, where we need them unconditionally whether or not we run on KVM. Import the x86 asm/kvm_para.h into standard-headers; follow-up patches remove a bunch of code that used it. Signed-off-by: Michael S. Tsirkin --- scripts/update-linux-headers.sh | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index b65c03f0ae..4fa08d38c8 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -43,6 +43,7 @@ cp_portable() { -e 'limits' \ -e 'linux/kernel' \ -e 'linux/sysinfo' \ + -e 'asm-generic/kvm_para' \ > /dev/null then echo "Unexpected #include in input file $f".
@@ -98,13 +99,9 @@ for arch in $ARCHLIST; do rm -rf "$output/linux-headers/asm-$arch" mkdir -p "$output/linux-headers/asm-$arch" - for header in kvm.h kvm_para.h unistd.h; do + for header in kvm.h unistd.h; do cp "$tmpdir/include/asm/$header" "$output/linux-headers/asm-$arch" done - if [ $arch = powerpc ]; then - cp "$tmpdir/include/asm/epapr_hcalls.h" "$output/linux-headers/asm-powerpc/" - fi - rm -rf "$output/include/standard-headers/asm-$arch" mkdir -p "$output/include/standard-headers/asm-$arch" if [ $arch = s390 ]; then @@ -121,20 +118,17 @@ for arch in $ARCHLIST; do cp "$tmpdir/include/asm/unistd_32.h" "$output/linux-headers/asm-x86/" cp "$tmpdir/include/asm/unistd_x32.h" "$output/linux-headers/asm-x86/" cp "$tmpdir/include/asm/unistd_64.h" "$output/linux-headers/asm-x86/" + cp_portable "$tmpdir/include/asm/kvm_para.h" "$output/include/standard-headers/asm-$arch" fi done rm -rf "$output/linux-headers/linux" mkdir -p "$output/linux-headers/linux" -for header in kvm.h kvm_para.h vfio.h vfio_ccw.h vhost.h \ +for header in kvm.h vfio.h vfio_ccw.h vhost.h \ psci.h psp-sev.h userfaultfd.h; do cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" done -rm -rf "$output/linux-headers/asm-generic" -mkdir -p "$output/linux-headers/asm-generic" -for header in kvm_para.h; do - cp "$tmpdir/include/asm-generic/$header" "$output/linux-headers/asm-generic" -done + if [ -L "$linux/source" ]; then cp "$linux/source/COPYING" "$output/linux-headers" else -- cgit v1.2.3 From d49bd359d186a83fbb66a23dafd2d6664b79aa6a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 17 Apr 2018 21:45:56 +0300 Subject: include/standard-headers: add asm-x86/kvm_para.h Import asm-x86/kvm_para.h from linux where it can be easily used on Linux and non-Linux platforms. Signed-off-by: Michael S. Tsirkin --- include/standard-headers/asm-x86/kvm_para.h | 121 ++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 include/standard-headers/asm-x86/kvm_para.h diff --git a/include/standard-headers/asm-x86/kvm_para.h b/include/standard-headers/asm-x86/kvm_para.h new file mode 100644 index 0000000000..53a85ae3ed --- /dev/null +++ b/include/standard-headers/asm-x86/kvm_para.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _ASM_X86_KVM_PARA_H +#define _ASM_X86_KVM_PARA_H + +#include "standard-headers/linux/types.h" + +/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It + * should be used to determine that a VM is running under KVM. + */ +#define KVM_CPUID_SIGNATURE 0x40000000 + +/* This CPUID returns two feature bitmaps in eax, edx. Before enabling + * a particular paravirtualization, the appropriate feature bit should + * be checked in eax. The performance hint feature bit should be checked + * in edx. + */ +#define KVM_CPUID_FEATURES 0x40000001 +#define KVM_FEATURE_CLOCKSOURCE 0 +#define KVM_FEATURE_NOP_IO_DELAY 1 +#define KVM_FEATURE_MMU_OP 2 +/* This indicates that the new set of kvmclock msrs + * are available. The use of 0x11 and 0x12 is deprecated + */ +#define KVM_FEATURE_CLOCKSOURCE2 3 +#define KVM_FEATURE_ASYNC_PF 4 +#define KVM_FEATURE_STEAL_TIME 5 +#define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 +#define KVM_FEATURE_PV_TLB_FLUSH 9 +#define KVM_FEATURE_ASYNC_PF_VMEXIT 10 + +#define KVM_HINTS_DEDICATED 0 + +/* The last 8 bits are used to indicate how to interpret the flags field + * in pvclock structure. If no bits are set, all flags are ignored. 
+ */ +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 + +#define MSR_KVM_WALL_CLOCK 0x11 +#define MSR_KVM_SYSTEM_TIME 0x12 + +#define KVM_MSR_ENABLED 1 +/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ +#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 +#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 +#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 +#define MSR_KVM_STEAL_TIME 0x4b564d03 +#define MSR_KVM_PV_EOI_EN 0x4b564d04 + +struct kvm_steal_time { + uint64_t steal; + uint32_t version; + uint32_t flags; + uint8_t preempted; + uint8_t uint8_t_pad[3]; + uint32_t pad[11]; +}; + +#define KVM_VCPU_PREEMPTED (1 << 0) +#define KVM_VCPU_FLUSH_TLB (1 << 1) + +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + int64_t sec; + int64_t nsec; + uint64_t tsc; + uint32_t flags; + uint32_t pad[9]; +}; + +#define KVM_STEAL_ALIGNMENT_BITS 5 +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) + +#define KVM_MAX_MMU_OP_BATCH 32 + +#define KVM_ASYNC_PF_ENABLED (1 << 0) +#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) +#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) + +/* Operations for KVM_HC_MMU_OP */ +#define KVM_MMU_OP_WRITE_PTE 1 +#define KVM_MMU_OP_FLUSH_TLB 2 +#define KVM_MMU_OP_RELEASE_PT 3 + +/* Payload for KVM_HC_MMU_OP */ +struct kvm_mmu_op_header { + uint32_t op; + uint32_t pad; +}; + +struct kvm_mmu_op_write_pte { + struct kvm_mmu_op_header header; + uint64_t pte_phys; + uint64_t pte_val; +}; + +struct kvm_mmu_op_flush_tlb { + struct kvm_mmu_op_header header; +}; + +struct kvm_mmu_op_release_pt { + struct kvm_mmu_op_header header; + uint64_t pt_phys; +}; + +#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 +#define KVM_PV_REASON_PAGE_READY 2 + +struct kvm_vcpu_pv_apf_data { + uint32_t reason; + uint8_t pad[60]; + uint32_t enabled; +}; + +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 + +#endif /* _ASM_X86_KVM_PARA_H */ -- cgit v1.2.3 From 1814eab67398029a927847dfd29198a26aeaf7d8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 17 Apr 2018 21:47:50 +0300 Subject: x86/cpu: use standard-headers/asm-x86.kvm_para.h Switch to the header we imported from Linux, this allows us to drop a hack in kvm_i386.h. More code will be dropped in the next patch. Signed-off-by: Michael S. 
Tsirkin --- hw/i386/kvm/clock.c | 2 +- include/sysemu/kvm.h | 1 - target/i386/cpu.c | 4 +--- target/i386/cpu.h | 2 -- target/i386/kvm.c | 4 ++-- target/i386/kvm_i386.h | 6 ------ 6 files changed, 4 insertions(+), 15 deletions(-) diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 7dac319403..0bf1c60a06 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -26,7 +26,7 @@ #include "qapi/error.h" #include -#include +#include "standard-headers/asm-x86/kvm_para.h" #define TYPE_KVM_CLOCK "kvmclock" #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 23669c4d5a..0b64b8e067 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -22,7 +22,6 @@ #ifdef NEED_CPU_H # ifdef CONFIG_KVM # include -# include # define CONFIG_KVM_IS_POSSIBLE # endif #else diff --git a/target/i386/cpu.c b/target/i386/cpu.c index d95310ffd4..94260412e2 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -40,9 +40,7 @@ #include "qom/qom-qobject.h" #include "sysemu/arch_init.h" -#if defined(CONFIG_KVM) -#include -#endif +#include "standard-headers/asm-x86/kvm_para.h" #include "sysemu/sysemu.h" #include "hw/qdev-properties.h" diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 8ac13f6c2c..664504610e 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -688,8 +688,6 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) /* Speculation Control */ #define CPUID_7_0_EDX_SPEC_CTRL_SSBD (1U << 31) /* Speculative Store Bypass Disable */ -#define KVM_HINTS_DEDICATED (1U << 0) - #define CPUID_8000_0008_EBX_IBPB (1U << 12) /* Indirect Branch Prediction Barrier */ #define CPUID_XSAVE_XSAVEOPT (1U << 0) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 0c656a91a4..6511329d11 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -18,7 +18,7 @@ #include #include -#include +#include "standard-headers/asm-x86/kvm_para.h" #include "qemu-common.h" #include "cpu.h" @@ -387,7 +387,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, ret &= ~(1U << KVM_FEATURE_PV_UNHALT); } } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) { - ret |= KVM_HINTS_DEDICATED; + ret |= 1U << KVM_HINTS_DEDICATED; found = 1; } diff --git a/target/i386/kvm_i386.h b/target/i386/kvm_i386.h index 1de9876cd9..e5df24cad1 100644 --- a/target/i386/kvm_i386.h +++ b/target/i386/kvm_i386.h @@ -30,12 +30,6 @@ #define kvm_pic_in_kernel() 0 #define kvm_ioapic_in_kernel() 0 -/* These constants must never be used at runtime if kvm_enabled() is false. - * They exist so we don't need #ifdefs around KVM-specific code that already - * checks kvm_enabled() properly. - */ -#define KVM_CPUID_FEATURES 0 - #endif /* CONFIG_KVM */ bool kvm_allows_irq0_override(void); -- cgit v1.2.3 From fe7f9b8e6e095bc2842180fe72dd45158678eeb7 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 17 Apr 2018 21:51:25 +0300 Subject: linux-headers: drop kvm_para.h Unused now and can be removed. Signed-off-by: Michael S. 
Tsirkin --- linux-headers/asm-arm/kvm_para.h | 2 - linux-headers/asm-arm64/kvm_para.h | 1 - linux-headers/asm-generic/kvm_para.h | 4 -- linux-headers/asm-mips/kvm_para.h | 5 -- linux-headers/asm-powerpc/epapr_hcalls.h | 99 -------------------------- linux-headers/asm-powerpc/kvm_para.h | 98 ------------------------- linux-headers/asm-s390/kvm_para.h | 8 --- linux-headers/asm-x86/kvm_para.h | 118 ------------------------------- linux-headers/linux/kvm_para.h | 35 --------- 9 files changed, 370 deletions(-) delete mode 100644 linux-headers/asm-arm/kvm_para.h delete mode 100644 linux-headers/asm-arm64/kvm_para.h delete mode 100644 linux-headers/asm-generic/kvm_para.h delete mode 100644 linux-headers/asm-mips/kvm_para.h delete mode 100644 linux-headers/asm-powerpc/epapr_hcalls.h delete mode 100644 linux-headers/asm-powerpc/kvm_para.h delete mode 100644 linux-headers/asm-s390/kvm_para.h delete mode 100644 linux-headers/asm-x86/kvm_para.h delete mode 100644 linux-headers/linux/kvm_para.h diff --git a/linux-headers/asm-arm/kvm_para.h b/linux-headers/asm-arm/kvm_para.h deleted file mode 100644 index baacc4996d..0000000000 --- a/linux-headers/asm-arm/kvm_para.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#include diff --git a/linux-headers/asm-arm64/kvm_para.h b/linux-headers/asm-arm64/kvm_para.h deleted file mode 100644 index 14fab8f0b9..0000000000 --- a/linux-headers/asm-arm64/kvm_para.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/linux-headers/asm-generic/kvm_para.h b/linux-headers/asm-generic/kvm_para.h deleted file mode 100644 index 486f0af73c..0000000000 --- a/linux-headers/asm-generic/kvm_para.h +++ /dev/null @@ -1,4 +0,0 @@ -/* - * There isn't anything here, but the file must not be empty or patch - * will delete it. - */ diff --git a/linux-headers/asm-mips/kvm_para.h b/linux-headers/asm-mips/kvm_para.h deleted file mode 100644 index dbb2464f3b..0000000000 --- a/linux-headers/asm-mips/kvm_para.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifndef _ASM_MIPS_KVM_PARA_H -#define _ASM_MIPS_KVM_PARA_H - - -#endif /* _ASM_MIPS_KVM_PARA_H */ diff --git a/linux-headers/asm-powerpc/epapr_hcalls.h b/linux-headers/asm-powerpc/epapr_hcalls.h deleted file mode 100644 index 6cca559993..0000000000 --- a/linux-headers/asm-powerpc/epapr_hcalls.h +++ /dev/null @@ -1,99 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */ -/* - * ePAPR hcall interface - * - * Copyright 2008-2011 Freescale Semiconductor, Inc. - * - * Author: Timur Tabi - * - * This file is provided under a dual BSD/GPL license. When using or - * redistributing this file, you may do so under either license. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Freescale Semiconductor nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * - * ALTERNATIVELY, this software may be distributed under the terms of the - * GNU General Public License ("GPL") as published by the Free Software - * Foundation, either version 2 of that License or (at your option) any - * later version. - * - * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _ASM_POWERPC_EPAPR_HCALLS_H -#define _ASM_POWERPC_EPAPR_HCALLS_H - -#define EV_BYTE_CHANNEL_SEND 1 -#define EV_BYTE_CHANNEL_RECEIVE 2 -#define EV_BYTE_CHANNEL_POLL 3 -#define EV_INT_SET_CONFIG 4 -#define EV_INT_GET_CONFIG 5 -#define EV_INT_SET_MASK 6 -#define EV_INT_GET_MASK 7 -#define EV_INT_IACK 9 -#define EV_INT_EOI 10 -#define EV_INT_SEND_IPI 11 -#define EV_INT_SET_TASK_PRIORITY 12 -#define EV_INT_GET_TASK_PRIORITY 13 -#define EV_DOORBELL_SEND 14 -#define EV_MSGSND 15 -#define EV_IDLE 16 - -/* vendor ID: epapr */ -#define EV_LOCAL_VENDOR_ID 0 /* for private use */ -#define EV_EPAPR_VENDOR_ID 1 -#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ -#define EV_IBM_VENDOR_ID 3 /* IBM */ -#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ -#define EV_ENEA_VENDOR_ID 5 /* Enea */ -#define EV_WR_VENDOR_ID 6 /* Wind River Systems */ -#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ -#define EV_KVM_VENDOR_ID 42 /* KVM */ - -/* The max number of bytes that a byte channel can send or receive per call */ -#define EV_BYTE_CHANNEL_MAX_BYTES 16 - - -#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) -#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) - -/* epapr return codes */ -#define EV_SUCCESS 0 -#define EV_EPERM 1 /* Operation not permitted */ -#define EV_ENOENT 2 /* Entry Not Found */ -#define EV_EIO 3 /* I/O error occurred */ -#define EV_EAGAIN 4 /* The operation had insufficient - * resources to complete and should be - * retried - */ -#define EV_ENOMEM 5 /* There was insufficient memory to - * complete the operation */ -#define EV_EFAULT 6 /* Bad guest address */ -#define EV_ENODEV 7 /* No such device */ -#define EV_EINVAL 8 /* An argument supplied to the hcall - was out of range or invalid */ -#define EV_INTERNAL 9 /* An internal error occurred */ -#define EV_CONFIG 10 /* A configuration error was detected */ -#define EV_INVALID_STATE 11 /* The object is in an invalid state */ -#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ -#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ - -#endif /* _ASM_POWERPC_EPAPR_HCALLS_H */ diff --git a/linux-headers/asm-powerpc/kvm_para.h b/linux-headers/asm-powerpc/kvm_para.h deleted file mode 100644 index 9beb49cc10..0000000000 --- a/linux-headers/asm-powerpc/kvm_para.h +++ /dev/null @@ -1,98 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of 
the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * - * Copyright IBM Corp. 2008 - * - * Authors: Hollis Blanchard - */ - -#ifndef __POWERPC_KVM_PARA_H__ -#define __POWERPC_KVM_PARA_H__ - -#include - -/* - * Additions to this struct must only occur at the end, and should be - * accompanied by a KVM_MAGIC_FEAT flag to advertise that they are present - * (albeit not necessarily relevant to the current target hardware platform). - * - * Struct fields are always 32 or 64 bit aligned, depending on them being 32 - * or 64 bit wide respectively. - * - * See Documentation/virtual/kvm/ppc-pv.txt - */ -struct kvm_vcpu_arch_shared { - __u64 scratch1; - __u64 scratch2; - __u64 scratch3; - __u64 critical; /* Guest may not get interrupts if == r1 */ - __u64 sprg0; - __u64 sprg1; - __u64 sprg2; - __u64 sprg3; - __u64 srr0; - __u64 srr1; - __u64 dar; /* dear on BookE */ - __u64 msr; - __u32 dsisr; - __u32 int_pending; /* Tells the guest if we have an interrupt */ - __u32 sr[16]; - __u32 mas0; - __u32 mas1; - __u64 mas7_3; - __u64 mas2; - __u32 mas4; - __u32 mas6; - __u32 esr; - __u32 pir; - - /* - * SPRG4-7 are user-readable, so we can only keep these consistent - * between the shared area and the real registers when there's an - * intervening exit to KVM. This also applies to SPRG3 on some - * chips. - * - * This suffices for access by guest userspace, since in PR-mode - * KVM, an exit must occur when changing the guest's MSR[PR]. - * If the guest kernel writes to SPRG3-7 via the shared area, it - * must also use the shared area for reading while in kernel space. - */ - __u64 sprg4; - __u64 sprg5; - __u64 sprg6; - __u64 sprg7; -}; - -#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ - -#define KVM_HCALL_TOKEN(num) _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num) - -#include - -#define KVM_FEATURE_MAGIC_PAGE 1 - -/* Magic page flags from host to guest */ - -#define KVM_MAGIC_FEAT_SR (1 << 0) - -/* MASn, ESR, PIR, and high SPRGs */ -#define KVM_MAGIC_FEAT_MAS0_TO_SPRG7 (1 << 1) - -/* Magic page flags from guest to host */ - -#define MAGIC_PAGE_FLAG_NOT_MAPPED_NX (1 << 0) - - -#endif /* __POWERPC_KVM_PARA_H__ */ diff --git a/linux-headers/asm-s390/kvm_para.h b/linux-headers/asm-s390/kvm_para.h deleted file mode 100644 index b9ab584adf..0000000000 --- a/linux-headers/asm-s390/kvm_para.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * User API definitions for paravirtual devices on s390 - * - * Copyright IBM Corp. 2008 - * - * Author(s): Christian Borntraeger - */ diff --git a/linux-headers/asm-x86/kvm_para.h b/linux-headers/asm-x86/kvm_para.h deleted file mode 100644 index 4c58184395..0000000000 --- a/linux-headers/asm-x86/kvm_para.h +++ /dev/null @@ -1,118 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_X86_KVM_PARA_H -#define _ASM_X86_KVM_PARA_H - -#include -#include - -/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It - * should be used to determine that a VM is running under KVM. 
- */ -#define KVM_CPUID_SIGNATURE 0x40000000 - -/* This CPUID returns a feature bitmap in eax. Before enabling a particular - * paravirtualization, the appropriate feature bit should be checked. - */ -#define KVM_CPUID_FEATURES 0x40000001 -#define KVM_FEATURE_CLOCKSOURCE 0 -#define KVM_FEATURE_NOP_IO_DELAY 1 -#define KVM_FEATURE_MMU_OP 2 -/* This indicates that the new set of kvmclock msrs - * are available. The use of 0x11 and 0x12 is deprecated - */ -#define KVM_FEATURE_CLOCKSOURCE2 3 -#define KVM_FEATURE_ASYNC_PF 4 -#define KVM_FEATURE_STEAL_TIME 5 -#define KVM_FEATURE_PV_EOI 6 -#define KVM_FEATURE_PV_UNHALT 7 -#define KVM_FEATURE_PV_TLB_FLUSH 9 -#define KVM_FEATURE_ASYNC_PF_VMEXIT 10 - -/* The last 8 bits are used to indicate how to interpret the flags field - * in pvclock structure. If no bits are set, all flags are ignored. - */ -#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 - -#define MSR_KVM_WALL_CLOCK 0x11 -#define MSR_KVM_SYSTEM_TIME 0x12 - -#define KVM_MSR_ENABLED 1 -/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ -#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 -#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 -#define MSR_KVM_ASYNC_PF_EN 0x4b564d02 -#define MSR_KVM_STEAL_TIME 0x4b564d03 -#define MSR_KVM_PV_EOI_EN 0x4b564d04 - -struct kvm_steal_time { - __u64 steal; - __u32 version; - __u32 flags; - __u8 preempted; - __u8 u8_pad[3]; - __u32 pad[11]; -}; - -#define KVM_VCPU_PREEMPTED (1 << 0) -#define KVM_VCPU_FLUSH_TLB (1 << 1) - -#define KVM_CLOCK_PAIRING_WALLCLOCK 0 -struct kvm_clock_pairing { - __s64 sec; - __s64 nsec; - __u64 tsc; - __u32 flags; - __u32 pad[9]; -}; - -#define KVM_STEAL_ALIGNMENT_BITS 5 -#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) -#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) - -#define KVM_MAX_MMU_OP_BATCH 32 - -#define KVM_ASYNC_PF_ENABLED (1 << 0) -#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) -#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) - -/* Operations for KVM_HC_MMU_OP */ -#define KVM_MMU_OP_WRITE_PTE 1 -#define KVM_MMU_OP_FLUSH_TLB 2 -#define KVM_MMU_OP_RELEASE_PT 3 - -/* Payload for KVM_HC_MMU_OP */ -struct kvm_mmu_op_header { - __u32 op; - __u32 pad; -}; - -struct kvm_mmu_op_write_pte { - struct kvm_mmu_op_header header; - __u64 pte_phys; - __u64 pte_val; -}; - -struct kvm_mmu_op_flush_tlb { - struct kvm_mmu_op_header header; -}; - -struct kvm_mmu_op_release_pt { - struct kvm_mmu_op_header header; - __u64 pt_phys; -}; - -#define KVM_PV_REASON_PAGE_NOT_PRESENT 1 -#define KVM_PV_REASON_PAGE_READY 2 - -struct kvm_vcpu_pv_apf_data { - __u32 reason; - __u8 pad[60]; - __u32 enabled; -}; - -#define KVM_PV_EOI_BIT 0 -#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) -#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK -#define KVM_PV_EOI_DISABLED 0x0 - -#endif /* _ASM_X86_KVM_PARA_H */ diff --git a/linux-headers/linux/kvm_para.h b/linux-headers/linux/kvm_para.h deleted file mode 100644 index 8bcd0aa853..0000000000 --- a/linux-headers/linux/kvm_para.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_KVM_PARA_H -#define __LINUX_KVM_PARA_H - -/* - * This header file provides a method for making a hypercall to the host - * Architectures should define: - * - kvm_hypercall0, kvm_hypercall1... 
- * - kvm_arch_para_features - * - kvm_para_available - */ - -/* Return values for hypercalls */ -#define KVM_ENOSYS 1000 -#define KVM_EFAULT EFAULT -#define KVM_E2BIG E2BIG -#define KVM_EPERM EPERM -#define KVM_EOPNOTSUPP 95 - -#define KVM_HC_VAPIC_POLL_IRQ 1 -#define KVM_HC_MMU_OP 2 -#define KVM_HC_FEATURES 3 -#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 -#define KVM_HC_KICK_CPU 5 -#define KVM_HC_MIPS_GET_CLOCK_FREQ 6 -#define KVM_HC_MIPS_EXIT_VM 7 -#define KVM_HC_MIPS_CONSOLE_OUTPUT 8 -#define KVM_HC_CLOCK_PAIRING 9 - -/* - * hypercalls use architecture specific - */ -#include - -#endif /* __LINUX_KVM_PARA_H */ -- cgit v1.2.3 From 9882d3ef3c43fc89a0e1d43d7b52720092bef29b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 20 Mar 2018 23:01:04 +0200 Subject: update-linux-headers.sh: unistd.h, kvm consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework the update script slightly, add the unistd.h header and its dependencies on all architectures. This also removes the IA64 and MIPS from a KVM blacklist: Linux dropped IA64, and there was never a reason to exclude MIPS from kvm specifically - it was excluded due to dependency of its unistd.h on sgidefs.h, which we also import. Cc: Dr. David Alan Gilbert Signed-off-by: Michael S. Tsirkin Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Philippe Mathieu-Daudé --- scripts/update-linux-headers.sh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index 4fa08d38c8..947dec2852 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -84,11 +84,6 @@ for arch in $ARCHLIST; do continue fi - # Blacklist architectures which have KVM headers but are actually dead - if [ "$arch" = "ia64" -o "$arch" = "mips" ]; then - continue - fi - if [ "$arch" = x86 ]; then arch_var=SRCARCH else @@ -99,9 +94,14 @@ for arch in $ARCHLIST; do rm -rf "$output/linux-headers/asm-$arch" mkdir -p "$output/linux-headers/asm-$arch" - for header in kvm.h unistd.h; do + for header in kvm.h unistd.h bitsperlong.h; do cp "$tmpdir/include/asm/$header" "$output/linux-headers/asm-$arch" done + + if [ $arch = mips ]; then + cp "$tmpdir/include/asm/sgidefs.h" "$output/linux-headers/asm-mips/" + fi + rm -rf "$output/include/standard-headers/asm-$arch" mkdir -p "$output/include/standard-headers/asm-$arch" if [ $arch = s390 ]; then @@ -129,6 +129,12 @@ for header in kvm.h vfio.h vfio_ccw.h vhost.h \ cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" done +rm -rf "$output/linux-headers/asm-generic" +mkdir -p "$output/linux-headers/asm-generic" +for header in unistd.h bitsperlong.h; do + cp "$tmpdir/include/asm-generic/$header" "$output/linux-headers/asm-generic" +done + if [ -L "$linux/source" ]; then cp "$linux/source/COPYING" "$output/linux-headers" else -- cgit v1.2.3 From 43370036476707479a03a90e7e1c526dffbaad4b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 20 Mar 2018 23:01:05 +0200 Subject: linux-headers: add unistd.h on all arches This adds unistd.h on ARM64 and MIPS and their dependencies. Updated to Linux 4.17-rc2. Signed-off-by: Michael S. 
Tsirkin --- linux-headers/asm-arm/bitsperlong.h | 1 + linux-headers/asm-arm64/bitsperlong.h | 24 + linux-headers/asm-generic/bitsperlong.h | 16 + linux-headers/asm-generic/unistd.h | 781 ++++++++++++++++++++++++++++++++ linux-headers/asm-mips/bitsperlong.h | 9 + linux-headers/asm-mips/sgidefs.h | 45 ++ linux-headers/asm-mips/unistd.h | 44 +- linux-headers/asm-powerpc/bitsperlong.h | 13 + linux-headers/asm-s390/bitsperlong.h | 14 + linux-headers/asm-s390/unistd_32.h | 1 + linux-headers/asm-s390/unistd_64.h | 1 + linux-headers/asm-x86/bitsperlong.h | 14 + 12 files changed, 957 insertions(+), 6 deletions(-) create mode 100644 linux-headers/asm-arm/bitsperlong.h create mode 100644 linux-headers/asm-arm64/bitsperlong.h create mode 100644 linux-headers/asm-generic/bitsperlong.h create mode 100644 linux-headers/asm-generic/unistd.h create mode 100644 linux-headers/asm-mips/bitsperlong.h create mode 100644 linux-headers/asm-mips/sgidefs.h create mode 100644 linux-headers/asm-powerpc/bitsperlong.h create mode 100644 linux-headers/asm-s390/bitsperlong.h create mode 100644 linux-headers/asm-x86/bitsperlong.h diff --git a/linux-headers/asm-arm/bitsperlong.h b/linux-headers/asm-arm/bitsperlong.h new file mode 100644 index 0000000000..6dc0bb0c13 --- /dev/null +++ b/linux-headers/asm-arm/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/linux-headers/asm-arm64/bitsperlong.h b/linux-headers/asm-arm64/bitsperlong.h new file mode 100644 index 0000000000..485d60bee2 --- /dev/null +++ b/linux-headers/asm-arm64/bitsperlong.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2012 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#ifndef __ASM_BITSPERLONG_H +#define __ASM_BITSPERLONG_H + +#define __BITS_PER_LONG 64 + +#include + +#endif /* __ASM_BITSPERLONG_H */ diff --git a/linux-headers/asm-generic/bitsperlong.h b/linux-headers/asm-generic/bitsperlong.h new file mode 100644 index 0000000000..0aac245b6b --- /dev/null +++ b/linux-headers/asm-generic/bitsperlong.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_GENERIC_BITS_PER_LONG +#define __ASM_GENERIC_BITS_PER_LONG + +/* + * There seems to be no way of detecting this automatically from user + * space, so 64 bit architectures should override this in their + * bitsperlong.h. In particular, an architecture that supports + * both 32 and 64 bit user space must not rely on CONFIG_64BIT + * to decide it, but rather check a compiler provided macro. 
+ */ +#ifndef __BITS_PER_LONG +#define __BITS_PER_LONG 32 +#endif + +#endif /* __ASM_GENERIC_BITS_PER_LONG */ diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h new file mode 100644 index 0000000000..8bcb186c6f --- /dev/null +++ b/linux-headers/asm-generic/unistd.h @@ -0,0 +1,781 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#include + +/* + * This file contains the system call numbers, based on the + * layout of the x86-64 architecture, which embeds the + * pointer to the syscall in the table. + * + * As a basic principle, no duplication of functionality + * should be added, e.g. we don't use lseek when llseek + * is present. New architectures should use this file + * and implement the less feature-full calls in user space. + */ + +#ifndef __SYSCALL +#define __SYSCALL(x, y) +#endif + +#if __BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT) +#define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _32) +#else +#define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64) +#endif + +#ifdef __SYSCALL_COMPAT +#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _comp) +#define __SC_COMP_3264(_nr, _32, _64, _comp) __SYSCALL(_nr, _comp) +#else +#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _sys) +#define __SC_COMP_3264(_nr, _32, _64, _comp) __SC_3264(_nr, _32, _64) +#endif + +#define __NR_io_setup 0 +__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup) +#define __NR_io_destroy 1 +__SYSCALL(__NR_io_destroy, sys_io_destroy) +#define __NR_io_submit 2 +__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) +#define __NR_io_cancel 3 +__SYSCALL(__NR_io_cancel, sys_io_cancel) +#define __NR_io_getevents 4 +__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents) + +/* fs/xattr.c */ +#define __NR_setxattr 5 +__SYSCALL(__NR_setxattr, sys_setxattr) +#define __NR_lsetxattr 6 +__SYSCALL(__NR_lsetxattr, sys_lsetxattr) +#define __NR_fsetxattr 7 +__SYSCALL(__NR_fsetxattr, sys_fsetxattr) +#define __NR_getxattr 8 +__SYSCALL(__NR_getxattr, sys_getxattr) +#define __NR_lgetxattr 9 +__SYSCALL(__NR_lgetxattr, sys_lgetxattr) +#define __NR_fgetxattr 10 +__SYSCALL(__NR_fgetxattr, sys_fgetxattr) +#define __NR_listxattr 11 +__SYSCALL(__NR_listxattr, sys_listxattr) +#define __NR_llistxattr 12 +__SYSCALL(__NR_llistxattr, sys_llistxattr) +#define __NR_flistxattr 13 +__SYSCALL(__NR_flistxattr, sys_flistxattr) +#define __NR_removexattr 14 +__SYSCALL(__NR_removexattr, sys_removexattr) +#define __NR_lremovexattr 15 +__SYSCALL(__NR_lremovexattr, sys_lremovexattr) +#define __NR_fremovexattr 16 +__SYSCALL(__NR_fremovexattr, sys_fremovexattr) + +/* fs/dcache.c */ +#define __NR_getcwd 17 +__SYSCALL(__NR_getcwd, sys_getcwd) + +/* fs/cookies.c */ +#define __NR_lookup_dcookie 18 +__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie) + +/* fs/eventfd.c */ +#define __NR_eventfd2 19 +__SYSCALL(__NR_eventfd2, sys_eventfd2) + +/* fs/eventpoll.c */ +#define __NR_epoll_create1 20 +__SYSCALL(__NR_epoll_create1, sys_epoll_create1) +#define __NR_epoll_ctl 21 +__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl) +#define __NR_epoll_pwait 22 +__SC_COMP(__NR_epoll_pwait, sys_epoll_pwait, compat_sys_epoll_pwait) + +/* fs/fcntl.c */ +#define __NR_dup 23 +__SYSCALL(__NR_dup, sys_dup) +#define __NR_dup3 24 +__SYSCALL(__NR_dup3, sys_dup3) +#define __NR3264_fcntl 25 +__SC_COMP_3264(__NR3264_fcntl, sys_fcntl64, sys_fcntl, compat_sys_fcntl64) + +/* fs/inotify_user.c */ +#define __NR_inotify_init1 26 +__SYSCALL(__NR_inotify_init1, sys_inotify_init1) +#define 
__NR_inotify_add_watch 27 +__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) +#define __NR_inotify_rm_watch 28 +__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) + +/* fs/ioctl.c */ +#define __NR_ioctl 29 +__SC_COMP(__NR_ioctl, sys_ioctl, compat_sys_ioctl) + +/* fs/ioprio.c */ +#define __NR_ioprio_set 30 +__SYSCALL(__NR_ioprio_set, sys_ioprio_set) +#define __NR_ioprio_get 31 +__SYSCALL(__NR_ioprio_get, sys_ioprio_get) + +/* fs/locks.c */ +#define __NR_flock 32 +__SYSCALL(__NR_flock, sys_flock) + +/* fs/namei.c */ +#define __NR_mknodat 33 +__SYSCALL(__NR_mknodat, sys_mknodat) +#define __NR_mkdirat 34 +__SYSCALL(__NR_mkdirat, sys_mkdirat) +#define __NR_unlinkat 35 +__SYSCALL(__NR_unlinkat, sys_unlinkat) +#define __NR_symlinkat 36 +__SYSCALL(__NR_symlinkat, sys_symlinkat) +#define __NR_linkat 37 +__SYSCALL(__NR_linkat, sys_linkat) +#ifdef __ARCH_WANT_RENAMEAT +/* renameat is superseded with flags by renameat2 */ +#define __NR_renameat 38 +__SYSCALL(__NR_renameat, sys_renameat) +#endif /* __ARCH_WANT_RENAMEAT */ + +/* fs/namespace.c */ +#define __NR_umount2 39 +__SYSCALL(__NR_umount2, sys_umount) +#define __NR_mount 40 +__SC_COMP(__NR_mount, sys_mount, compat_sys_mount) +#define __NR_pivot_root 41 +__SYSCALL(__NR_pivot_root, sys_pivot_root) + +/* fs/nfsctl.c */ +#define __NR_nfsservctl 42 +__SYSCALL(__NR_nfsservctl, sys_ni_syscall) + +/* fs/open.c */ +#define __NR3264_statfs 43 +__SC_COMP_3264(__NR3264_statfs, sys_statfs64, sys_statfs, \ + compat_sys_statfs64) +#define __NR3264_fstatfs 44 +__SC_COMP_3264(__NR3264_fstatfs, sys_fstatfs64, sys_fstatfs, \ + compat_sys_fstatfs64) +#define __NR3264_truncate 45 +__SC_COMP_3264(__NR3264_truncate, sys_truncate64, sys_truncate, \ + compat_sys_truncate64) +#define __NR3264_ftruncate 46 +__SC_COMP_3264(__NR3264_ftruncate, sys_ftruncate64, sys_ftruncate, \ + compat_sys_ftruncate64) + +#define __NR_fallocate 47 +__SC_COMP(__NR_fallocate, sys_fallocate, compat_sys_fallocate) +#define __NR_faccessat 48 +__SYSCALL(__NR_faccessat, sys_faccessat) +#define __NR_chdir 49 +__SYSCALL(__NR_chdir, sys_chdir) +#define __NR_fchdir 50 +__SYSCALL(__NR_fchdir, sys_fchdir) +#define __NR_chroot 51 +__SYSCALL(__NR_chroot, sys_chroot) +#define __NR_fchmod 52 +__SYSCALL(__NR_fchmod, sys_fchmod) +#define __NR_fchmodat 53 +__SYSCALL(__NR_fchmodat, sys_fchmodat) +#define __NR_fchownat 54 +__SYSCALL(__NR_fchownat, sys_fchownat) +#define __NR_fchown 55 +__SYSCALL(__NR_fchown, sys_fchown) +#define __NR_openat 56 +__SC_COMP(__NR_openat, sys_openat, compat_sys_openat) +#define __NR_close 57 +__SYSCALL(__NR_close, sys_close) +#define __NR_vhangup 58 +__SYSCALL(__NR_vhangup, sys_vhangup) + +/* fs/pipe.c */ +#define __NR_pipe2 59 +__SYSCALL(__NR_pipe2, sys_pipe2) + +/* fs/quota.c */ +#define __NR_quotactl 60 +__SYSCALL(__NR_quotactl, sys_quotactl) + +/* fs/readdir.c */ +#define __NR_getdents64 61 +__SYSCALL(__NR_getdents64, sys_getdents64) + +/* fs/read_write.c */ +#define __NR3264_lseek 62 +__SC_3264(__NR3264_lseek, sys_llseek, sys_lseek) +#define __NR_read 63 +__SYSCALL(__NR_read, sys_read) +#define __NR_write 64 +__SYSCALL(__NR_write, sys_write) +#define __NR_readv 65 +__SC_COMP(__NR_readv, sys_readv, compat_sys_readv) +#define __NR_writev 66 +__SC_COMP(__NR_writev, sys_writev, compat_sys_writev) +#define __NR_pread64 67 +__SC_COMP(__NR_pread64, sys_pread64, compat_sys_pread64) +#define __NR_pwrite64 68 +__SC_COMP(__NR_pwrite64, sys_pwrite64, compat_sys_pwrite64) +#define __NR_preadv 69 +__SC_COMP(__NR_preadv, sys_preadv, compat_sys_preadv) +#define __NR_pwritev 70 
+__SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev) + +/* fs/sendfile.c */ +#define __NR3264_sendfile 71 +__SYSCALL(__NR3264_sendfile, sys_sendfile64) + +/* fs/select.c */ +#define __NR_pselect6 72 +__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6) +#define __NR_ppoll 73 +__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll) + +/* fs/signalfd.c */ +#define __NR_signalfd4 74 +__SC_COMP(__NR_signalfd4, sys_signalfd4, compat_sys_signalfd4) + +/* fs/splice.c */ +#define __NR_vmsplice 75 +__SC_COMP(__NR_vmsplice, sys_vmsplice, compat_sys_vmsplice) +#define __NR_splice 76 +__SYSCALL(__NR_splice, sys_splice) +#define __NR_tee 77 +__SYSCALL(__NR_tee, sys_tee) + +/* fs/stat.c */ +#define __NR_readlinkat 78 +__SYSCALL(__NR_readlinkat, sys_readlinkat) +#define __NR3264_fstatat 79 +__SC_3264(__NR3264_fstatat, sys_fstatat64, sys_newfstatat) +#define __NR3264_fstat 80 +__SC_3264(__NR3264_fstat, sys_fstat64, sys_newfstat) + +/* fs/sync.c */ +#define __NR_sync 81 +__SYSCALL(__NR_sync, sys_sync) +#define __NR_fsync 82 +__SYSCALL(__NR_fsync, sys_fsync) +#define __NR_fdatasync 83 +__SYSCALL(__NR_fdatasync, sys_fdatasync) +#ifdef __ARCH_WANT_SYNC_FILE_RANGE2 +#define __NR_sync_file_range2 84 +__SC_COMP(__NR_sync_file_range2, sys_sync_file_range2, \ + compat_sys_sync_file_range2) +#else +#define __NR_sync_file_range 84 +__SC_COMP(__NR_sync_file_range, sys_sync_file_range, \ + compat_sys_sync_file_range) +#endif + +/* fs/timerfd.c */ +#define __NR_timerfd_create 85 +__SYSCALL(__NR_timerfd_create, sys_timerfd_create) +#define __NR_timerfd_settime 86 +__SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \ + compat_sys_timerfd_settime) +#define __NR_timerfd_gettime 87 +__SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \ + compat_sys_timerfd_gettime) + +/* fs/utimes.c */ +#define __NR_utimensat 88 +__SC_COMP(__NR_utimensat, sys_utimensat, compat_sys_utimensat) + +/* kernel/acct.c */ +#define __NR_acct 89 +__SYSCALL(__NR_acct, sys_acct) + +/* kernel/capability.c */ +#define __NR_capget 90 +__SYSCALL(__NR_capget, sys_capget) +#define __NR_capset 91 +__SYSCALL(__NR_capset, sys_capset) + +/* kernel/exec_domain.c */ +#define __NR_personality 92 +__SYSCALL(__NR_personality, sys_personality) + +/* kernel/exit.c */ +#define __NR_exit 93 +__SYSCALL(__NR_exit, sys_exit) +#define __NR_exit_group 94 +__SYSCALL(__NR_exit_group, sys_exit_group) +#define __NR_waitid 95 +__SC_COMP(__NR_waitid, sys_waitid, compat_sys_waitid) + +/* kernel/fork.c */ +#define __NR_set_tid_address 96 +__SYSCALL(__NR_set_tid_address, sys_set_tid_address) +#define __NR_unshare 97 +__SYSCALL(__NR_unshare, sys_unshare) + +/* kernel/futex.c */ +#define __NR_futex 98 +__SC_COMP(__NR_futex, sys_futex, compat_sys_futex) +#define __NR_set_robust_list 99 +__SC_COMP(__NR_set_robust_list, sys_set_robust_list, \ + compat_sys_set_robust_list) +#define __NR_get_robust_list 100 +__SC_COMP(__NR_get_robust_list, sys_get_robust_list, \ + compat_sys_get_robust_list) + +/* kernel/hrtimer.c */ +#define __NR_nanosleep 101 +__SC_COMP(__NR_nanosleep, sys_nanosleep, compat_sys_nanosleep) + +/* kernel/itimer.c */ +#define __NR_getitimer 102 +__SC_COMP(__NR_getitimer, sys_getitimer, compat_sys_getitimer) +#define __NR_setitimer 103 +__SC_COMP(__NR_setitimer, sys_setitimer, compat_sys_setitimer) + +/* kernel/kexec.c */ +#define __NR_kexec_load 104 +__SC_COMP(__NR_kexec_load, sys_kexec_load, compat_sys_kexec_load) + +/* kernel/module.c */ +#define __NR_init_module 105 +__SYSCALL(__NR_init_module, sys_init_module) +#define __NR_delete_module 106 
+__SYSCALL(__NR_delete_module, sys_delete_module) + +/* kernel/posix-timers.c */ +#define __NR_timer_create 107 +__SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create) +#define __NR_timer_gettime 108 +__SC_COMP(__NR_timer_gettime, sys_timer_gettime, compat_sys_timer_gettime) +#define __NR_timer_getoverrun 109 +__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) +#define __NR_timer_settime 110 +__SC_COMP(__NR_timer_settime, sys_timer_settime, compat_sys_timer_settime) +#define __NR_timer_delete 111 +__SYSCALL(__NR_timer_delete, sys_timer_delete) +#define __NR_clock_settime 112 +__SC_COMP(__NR_clock_settime, sys_clock_settime, compat_sys_clock_settime) +#define __NR_clock_gettime 113 +__SC_COMP(__NR_clock_gettime, sys_clock_gettime, compat_sys_clock_gettime) +#define __NR_clock_getres 114 +__SC_COMP(__NR_clock_getres, sys_clock_getres, compat_sys_clock_getres) +#define __NR_clock_nanosleep 115 +__SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \ + compat_sys_clock_nanosleep) + +/* kernel/printk.c */ +#define __NR_syslog 116 +__SYSCALL(__NR_syslog, sys_syslog) + +/* kernel/ptrace.c */ +#define __NR_ptrace 117 +__SYSCALL(__NR_ptrace, sys_ptrace) + +/* kernel/sched/core.c */ +#define __NR_sched_setparam 118 +__SYSCALL(__NR_sched_setparam, sys_sched_setparam) +#define __NR_sched_setscheduler 119 +__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler) +#define __NR_sched_getscheduler 120 +__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler) +#define __NR_sched_getparam 121 +__SYSCALL(__NR_sched_getparam, sys_sched_getparam) +#define __NR_sched_setaffinity 122 +__SC_COMP(__NR_sched_setaffinity, sys_sched_setaffinity, \ + compat_sys_sched_setaffinity) +#define __NR_sched_getaffinity 123 +__SC_COMP(__NR_sched_getaffinity, sys_sched_getaffinity, \ + compat_sys_sched_getaffinity) +#define __NR_sched_yield 124 +__SYSCALL(__NR_sched_yield, sys_sched_yield) +#define __NR_sched_get_priority_max 125 +__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) +#define __NR_sched_get_priority_min 126 +__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) +#define __NR_sched_rr_get_interval 127 +__SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \ + compat_sys_sched_rr_get_interval) + +/* kernel/signal.c */ +#define __NR_restart_syscall 128 +__SYSCALL(__NR_restart_syscall, sys_restart_syscall) +#define __NR_kill 129 +__SYSCALL(__NR_kill, sys_kill) +#define __NR_tkill 130 +__SYSCALL(__NR_tkill, sys_tkill) +#define __NR_tgkill 131 +__SYSCALL(__NR_tgkill, sys_tgkill) +#define __NR_sigaltstack 132 +__SC_COMP(__NR_sigaltstack, sys_sigaltstack, compat_sys_sigaltstack) +#define __NR_rt_sigsuspend 133 +__SC_COMP(__NR_rt_sigsuspend, sys_rt_sigsuspend, compat_sys_rt_sigsuspend) +#define __NR_rt_sigaction 134 +__SC_COMP(__NR_rt_sigaction, sys_rt_sigaction, compat_sys_rt_sigaction) +#define __NR_rt_sigprocmask 135 +__SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask) +#define __NR_rt_sigpending 136 +__SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending) +#define __NR_rt_sigtimedwait 137 +__SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \ + compat_sys_rt_sigtimedwait) +#define __NR_rt_sigqueueinfo 138 +__SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \ + compat_sys_rt_sigqueueinfo) +#define __NR_rt_sigreturn 139 +__SC_COMP(__NR_rt_sigreturn, sys_rt_sigreturn, compat_sys_rt_sigreturn) + +/* kernel/sys.c */ +#define __NR_setpriority 140 +__SYSCALL(__NR_setpriority, sys_setpriority) +#define 
__NR_getpriority 141 +__SYSCALL(__NR_getpriority, sys_getpriority) +#define __NR_reboot 142 +__SYSCALL(__NR_reboot, sys_reboot) +#define __NR_setregid 143 +__SYSCALL(__NR_setregid, sys_setregid) +#define __NR_setgid 144 +__SYSCALL(__NR_setgid, sys_setgid) +#define __NR_setreuid 145 +__SYSCALL(__NR_setreuid, sys_setreuid) +#define __NR_setuid 146 +__SYSCALL(__NR_setuid, sys_setuid) +#define __NR_setresuid 147 +__SYSCALL(__NR_setresuid, sys_setresuid) +#define __NR_getresuid 148 +__SYSCALL(__NR_getresuid, sys_getresuid) +#define __NR_setresgid 149 +__SYSCALL(__NR_setresgid, sys_setresgid) +#define __NR_getresgid 150 +__SYSCALL(__NR_getresgid, sys_getresgid) +#define __NR_setfsuid 151 +__SYSCALL(__NR_setfsuid, sys_setfsuid) +#define __NR_setfsgid 152 +__SYSCALL(__NR_setfsgid, sys_setfsgid) +#define __NR_times 153 +__SC_COMP(__NR_times, sys_times, compat_sys_times) +#define __NR_setpgid 154 +__SYSCALL(__NR_setpgid, sys_setpgid) +#define __NR_getpgid 155 +__SYSCALL(__NR_getpgid, sys_getpgid) +#define __NR_getsid 156 +__SYSCALL(__NR_getsid, sys_getsid) +#define __NR_setsid 157 +__SYSCALL(__NR_setsid, sys_setsid) +#define __NR_getgroups 158 +__SYSCALL(__NR_getgroups, sys_getgroups) +#define __NR_setgroups 159 +__SYSCALL(__NR_setgroups, sys_setgroups) +#define __NR_uname 160 +__SYSCALL(__NR_uname, sys_newuname) +#define __NR_sethostname 161 +__SYSCALL(__NR_sethostname, sys_sethostname) +#define __NR_setdomainname 162 +__SYSCALL(__NR_setdomainname, sys_setdomainname) +#define __NR_getrlimit 163 +__SC_COMP(__NR_getrlimit, sys_getrlimit, compat_sys_getrlimit) +#define __NR_setrlimit 164 +__SC_COMP(__NR_setrlimit, sys_setrlimit, compat_sys_setrlimit) +#define __NR_getrusage 165 +__SC_COMP(__NR_getrusage, sys_getrusage, compat_sys_getrusage) +#define __NR_umask 166 +__SYSCALL(__NR_umask, sys_umask) +#define __NR_prctl 167 +__SYSCALL(__NR_prctl, sys_prctl) +#define __NR_getcpu 168 +__SYSCALL(__NR_getcpu, sys_getcpu) + +/* kernel/time.c */ +#define __NR_gettimeofday 169 +__SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday) +#define __NR_settimeofday 170 +__SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) +#define __NR_adjtimex 171 +__SC_COMP(__NR_adjtimex, sys_adjtimex, compat_sys_adjtimex) + +/* kernel/timer.c */ +#define __NR_getpid 172 +__SYSCALL(__NR_getpid, sys_getpid) +#define __NR_getppid 173 +__SYSCALL(__NR_getppid, sys_getppid) +#define __NR_getuid 174 +__SYSCALL(__NR_getuid, sys_getuid) +#define __NR_geteuid 175 +__SYSCALL(__NR_geteuid, sys_geteuid) +#define __NR_getgid 176 +__SYSCALL(__NR_getgid, sys_getgid) +#define __NR_getegid 177 +__SYSCALL(__NR_getegid, sys_getegid) +#define __NR_gettid 178 +__SYSCALL(__NR_gettid, sys_gettid) +#define __NR_sysinfo 179 +__SC_COMP(__NR_sysinfo, sys_sysinfo, compat_sys_sysinfo) + +/* ipc/mqueue.c */ +#define __NR_mq_open 180 +__SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open) +#define __NR_mq_unlink 181 +__SYSCALL(__NR_mq_unlink, sys_mq_unlink) +#define __NR_mq_timedsend 182 +__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, compat_sys_mq_timedsend) +#define __NR_mq_timedreceive 183 +__SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \ + compat_sys_mq_timedreceive) +#define __NR_mq_notify 184 +__SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify) +#define __NR_mq_getsetattr 185 +__SC_COMP(__NR_mq_getsetattr, sys_mq_getsetattr, compat_sys_mq_getsetattr) + +/* ipc/msg.c */ +#define __NR_msgget 186 +__SYSCALL(__NR_msgget, sys_msgget) +#define __NR_msgctl 187 +__SC_COMP(__NR_msgctl, sys_msgctl, 
compat_sys_msgctl) +#define __NR_msgrcv 188 +__SC_COMP(__NR_msgrcv, sys_msgrcv, compat_sys_msgrcv) +#define __NR_msgsnd 189 +__SC_COMP(__NR_msgsnd, sys_msgsnd, compat_sys_msgsnd) + +/* ipc/sem.c */ +#define __NR_semget 190 +__SYSCALL(__NR_semget, sys_semget) +#define __NR_semctl 191 +__SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl) +#define __NR_semtimedop 192 +__SC_COMP(__NR_semtimedop, sys_semtimedop, compat_sys_semtimedop) +#define __NR_semop 193 +__SYSCALL(__NR_semop, sys_semop) + +/* ipc/shm.c */ +#define __NR_shmget 194 +__SYSCALL(__NR_shmget, sys_shmget) +#define __NR_shmctl 195 +__SC_COMP(__NR_shmctl, sys_shmctl, compat_sys_shmctl) +#define __NR_shmat 196 +__SC_COMP(__NR_shmat, sys_shmat, compat_sys_shmat) +#define __NR_shmdt 197 +__SYSCALL(__NR_shmdt, sys_shmdt) + +/* net/socket.c */ +#define __NR_socket 198 +__SYSCALL(__NR_socket, sys_socket) +#define __NR_socketpair 199 +__SYSCALL(__NR_socketpair, sys_socketpair) +#define __NR_bind 200 +__SYSCALL(__NR_bind, sys_bind) +#define __NR_listen 201 +__SYSCALL(__NR_listen, sys_listen) +#define __NR_accept 202 +__SYSCALL(__NR_accept, sys_accept) +#define __NR_connect 203 +__SYSCALL(__NR_connect, sys_connect) +#define __NR_getsockname 204 +__SYSCALL(__NR_getsockname, sys_getsockname) +#define __NR_getpeername 205 +__SYSCALL(__NR_getpeername, sys_getpeername) +#define __NR_sendto 206 +__SYSCALL(__NR_sendto, sys_sendto) +#define __NR_recvfrom 207 +__SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) +#define __NR_setsockopt 208 +__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) +#define __NR_getsockopt 209 +__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) +#define __NR_shutdown 210 +__SYSCALL(__NR_shutdown, sys_shutdown) +#define __NR_sendmsg 211 +__SC_COMP(__NR_sendmsg, sys_sendmsg, compat_sys_sendmsg) +#define __NR_recvmsg 212 +__SC_COMP(__NR_recvmsg, sys_recvmsg, compat_sys_recvmsg) + +/* mm/filemap.c */ +#define __NR_readahead 213 +__SC_COMP(__NR_readahead, sys_readahead, compat_sys_readahead) + +/* mm/nommu.c, also with MMU */ +#define __NR_brk 214 +__SYSCALL(__NR_brk, sys_brk) +#define __NR_munmap 215 +__SYSCALL(__NR_munmap, sys_munmap) +#define __NR_mremap 216 +__SYSCALL(__NR_mremap, sys_mremap) + +/* security/keys/keyctl.c */ +#define __NR_add_key 217 +__SYSCALL(__NR_add_key, sys_add_key) +#define __NR_request_key 218 +__SYSCALL(__NR_request_key, sys_request_key) +#define __NR_keyctl 219 +__SC_COMP(__NR_keyctl, sys_keyctl, compat_sys_keyctl) + +/* arch/example/kernel/sys_example.c */ +#define __NR_clone 220 +__SYSCALL(__NR_clone, sys_clone) +#define __NR_execve 221 +__SC_COMP(__NR_execve, sys_execve, compat_sys_execve) + +#define __NR3264_mmap 222 +__SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap) +/* mm/fadvise.c */ +#define __NR3264_fadvise64 223 +__SC_COMP(__NR3264_fadvise64, sys_fadvise64_64, compat_sys_fadvise64_64) + +/* mm/, CONFIG_MMU only */ +#ifndef __ARCH_NOMMU +#define __NR_swapon 224 +__SYSCALL(__NR_swapon, sys_swapon) +#define __NR_swapoff 225 +__SYSCALL(__NR_swapoff, sys_swapoff) +#define __NR_mprotect 226 +__SYSCALL(__NR_mprotect, sys_mprotect) +#define __NR_msync 227 +__SYSCALL(__NR_msync, sys_msync) +#define __NR_mlock 228 +__SYSCALL(__NR_mlock, sys_mlock) +#define __NR_munlock 229 +__SYSCALL(__NR_munlock, sys_munlock) +#define __NR_mlockall 230 +__SYSCALL(__NR_mlockall, sys_mlockall) +#define __NR_munlockall 231 +__SYSCALL(__NR_munlockall, sys_munlockall) +#define __NR_mincore 232 +__SYSCALL(__NR_mincore, sys_mincore) +#define __NR_madvise 233 +__SYSCALL(__NR_madvise, 
sys_madvise) +#define __NR_remap_file_pages 234 +__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) +#define __NR_mbind 235 +__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) +#define __NR_get_mempolicy 236 +__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) +#define __NR_set_mempolicy 237 +__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) +#define __NR_migrate_pages 238 +__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) +#define __NR_move_pages 239 +__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) +#endif + +#define __NR_rt_tgsigqueueinfo 240 +__SC_COMP(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo, \ + compat_sys_rt_tgsigqueueinfo) +#define __NR_perf_event_open 241 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) +#define __NR_accept4 242 +__SYSCALL(__NR_accept4, sys_accept4) +#define __NR_recvmmsg 243 +__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg) + +/* + * Architectures may provide up to 16 syscalls of their own + * starting with this value. + */ +#define __NR_arch_specific_syscall 244 + +#define __NR_wait4 260 +__SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4) +#define __NR_prlimit64 261 +__SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_fanotify_init 262 +__SYSCALL(__NR_fanotify_init, sys_fanotify_init) +#define __NR_fanotify_mark 263 +__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) +#define __NR_name_to_handle_at 264 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 265 +__SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \ + compat_sys_open_by_handle_at) +#define __NR_clock_adjtime 266 +__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime) +#define __NR_syncfs 267 +__SYSCALL(__NR_syncfs, sys_syncfs) +#define __NR_setns 268 +__SYSCALL(__NR_setns, sys_setns) +#define __NR_sendmmsg 269 +__SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg) +#define __NR_process_vm_readv 270 +__SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \ + compat_sys_process_vm_readv) +#define __NR_process_vm_writev 271 +__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \ + compat_sys_process_vm_writev) +#define __NR_kcmp 272 +__SYSCALL(__NR_kcmp, sys_kcmp) +#define __NR_finit_module 273 +__SYSCALL(__NR_finit_module, sys_finit_module) +#define __NR_sched_setattr 274 +__SYSCALL(__NR_sched_setattr, sys_sched_setattr) +#define __NR_sched_getattr 275 +__SYSCALL(__NR_sched_getattr, sys_sched_getattr) +#define __NR_renameat2 276 +__SYSCALL(__NR_renameat2, sys_renameat2) +#define __NR_seccomp 277 +__SYSCALL(__NR_seccomp, sys_seccomp) +#define __NR_getrandom 278 +__SYSCALL(__NR_getrandom, sys_getrandom) +#define __NR_memfd_create 279 +__SYSCALL(__NR_memfd_create, sys_memfd_create) +#define __NR_bpf 280 +__SYSCALL(__NR_bpf, sys_bpf) +#define __NR_execveat 281 +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) +#define __NR_userfaultfd 282 +__SYSCALL(__NR_userfaultfd, sys_userfaultfd) +#define __NR_membarrier 283 +__SYSCALL(__NR_membarrier, sys_membarrier) +#define __NR_mlock2 284 +__SYSCALL(__NR_mlock2, sys_mlock2) +#define __NR_copy_file_range 285 +__SYSCALL(__NR_copy_file_range, sys_copy_file_range) +#define __NR_preadv2 286 +__SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2) +#define __NR_pwritev2 287 +__SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2) +#define __NR_pkey_mprotect 288 +__SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect) +#define __NR_pkey_alloc 289 +__SYSCALL(__NR_pkey_alloc, 
sys_pkey_alloc) +#define __NR_pkey_free 290 +__SYSCALL(__NR_pkey_free, sys_pkey_free) +#define __NR_statx 291 +__SYSCALL(__NR_statx, sys_statx) + +#undef __NR_syscalls +#define __NR_syscalls 292 + +/* + * 32 bit systems traditionally used different + * syscalls for off_t and loff_t arguments, while + * 64 bit systems only need the off_t version. + * For new 32 bit platforms, there is no need to + * implement the old 32 bit off_t syscalls, so + * they take different names. + * Here we map the numbers so that both versions + * use the same syscall table layout. + */ +#if __BITS_PER_LONG == 64 && !defined(__SYSCALL_COMPAT) +#define __NR_fcntl __NR3264_fcntl +#define __NR_statfs __NR3264_statfs +#define __NR_fstatfs __NR3264_fstatfs +#define __NR_truncate __NR3264_truncate +#define __NR_ftruncate __NR3264_ftruncate +#define __NR_lseek __NR3264_lseek +#define __NR_sendfile __NR3264_sendfile +#define __NR_newfstatat __NR3264_fstatat +#define __NR_fstat __NR3264_fstat +#define __NR_mmap __NR3264_mmap +#define __NR_fadvise64 __NR3264_fadvise64 +#ifdef __NR3264_stat +#define __NR_stat __NR3264_stat +#define __NR_lstat __NR3264_lstat +#endif +#else +#define __NR_fcntl64 __NR3264_fcntl +#define __NR_statfs64 __NR3264_statfs +#define __NR_fstatfs64 __NR3264_fstatfs +#define __NR_truncate64 __NR3264_truncate +#define __NR_ftruncate64 __NR3264_ftruncate +#define __NR_llseek __NR3264_lseek +#define __NR_sendfile64 __NR3264_sendfile +#define __NR_fstatat64 __NR3264_fstatat +#define __NR_fstat64 __NR3264_fstat +#define __NR_mmap2 __NR3264_mmap +#define __NR_fadvise64_64 __NR3264_fadvise64 +#ifdef __NR3264_stat +#define __NR_stat64 __NR3264_stat +#define __NR_lstat64 __NR3264_lstat +#endif +#endif diff --git a/linux-headers/asm-mips/bitsperlong.h b/linux-headers/asm-mips/bitsperlong.h new file mode 100644 index 0000000000..7268380d8d --- /dev/null +++ b/linux-headers/asm-mips/bitsperlong.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_MIPS_BITSPERLONG_H +#define __ASM_MIPS_BITSPERLONG_H + +#define __BITS_PER_LONG _MIPS_SZLONG + +#include <asm-generic/bitsperlong.h> + +#endif /* __ASM_MIPS_BITSPERLONG_H */ diff --git a/linux-headers/asm-mips/sgidefs.h b/linux-headers/asm-mips/sgidefs.h new file mode 100644 index 0000000000..26143e3b7c --- /dev/null +++ b/linux-headers/asm-mips/sgidefs.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1996, 1999, 2001 Ralf Baechle + * Copyright (C) 1999 Silicon Graphics, Inc. + * Copyright (C) 2001 MIPS Technologies, Inc. + */ +#ifndef __ASM_SGIDEFS_H +#define __ASM_SGIDEFS_H + +/* + * Using a Linux compiler for building Linux seems logic but not to + * everybody. + */ +#ifndef __linux__ +#error Use a Linux compiler or give up. +#endif + +/* + * Definitions for the ISA levels + * + * With the introduction of MIPS32 / MIPS64 instruction sets definitions + * MIPS ISAs are no longer subsets of each other. Therefore comparisons + * on these symbols except with == may result in unexpected results and + * are forbidden!
+ */ +#define _MIPS_ISA_MIPS1 1 +#define _MIPS_ISA_MIPS2 2 +#define _MIPS_ISA_MIPS3 3 +#define _MIPS_ISA_MIPS4 4 +#define _MIPS_ISA_MIPS5 5 +#define _MIPS_ISA_MIPS32 6 +#define _MIPS_ISA_MIPS64 7 + +/* + * Subprogram calling convention + */ +#define _MIPS_SIM_ABI32 1 +#define _MIPS_SIM_NABI32 2 +#define _MIPS_SIM_ABI64 3 + +#endif /* __ASM_SGIDEFS_H */ diff --git a/linux-headers/asm-mips/unistd.h b/linux-headers/asm-mips/unistd.h index 2a2020938e..9bfef7f764 100644 --- a/linux-headers/asm-mips/unistd.h +++ b/linux-headers/asm-mips/unistd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -377,16 +378,27 @@ #define __NR_memfd_create (__NR_Linux + 354) #define __NR_bpf (__NR_Linux + 355) #define __NR_execveat (__NR_Linux + 356) +#define __NR_userfaultfd (__NR_Linux + 357) +#define __NR_membarrier (__NR_Linux + 358) +#define __NR_mlock2 (__NR_Linux + 359) +#define __NR_copy_file_range (__NR_Linux + 360) +#define __NR_preadv2 (__NR_Linux + 361) +#define __NR_pwritev2 (__NR_Linux + 362) +#define __NR_pkey_mprotect (__NR_Linux + 363) +#define __NR_pkey_alloc (__NR_Linux + 364) +#define __NR_pkey_free (__NR_Linux + 365) +#define __NR_statx (__NR_Linux + 366) + /* * Offset of the last Linux o32 flavoured syscall */ -#define __NR_Linux_syscalls 356 +#define __NR_Linux_syscalls 366 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ #define __NR_O32_Linux 4000 -#define __NR_O32_Linux_syscalls 356 +#define __NR_O32_Linux_syscalls 366 #if _MIPS_SIM == _MIPS_SIM_ABI64 @@ -711,16 +723,26 @@ #define __NR_memfd_create (__NR_Linux + 314) #define __NR_bpf (__NR_Linux + 315) #define __NR_execveat (__NR_Linux + 316) +#define __NR_userfaultfd (__NR_Linux + 317) +#define __NR_membarrier (__NR_Linux + 318) +#define __NR_mlock2 (__NR_Linux + 319) +#define __NR_copy_file_range (__NR_Linux + 320) +#define __NR_preadv2 (__NR_Linux + 321) +#define __NR_pwritev2 (__NR_Linux + 322) +#define __NR_pkey_mprotect (__NR_Linux + 323) +#define __NR_pkey_alloc (__NR_Linux + 324) +#define __NR_pkey_free (__NR_Linux + 325) +#define __NR_statx (__NR_Linux + 326) /* * Offset of the last Linux 64-bit flavoured syscall */ -#define __NR_Linux_syscalls 316 +#define __NR_Linux_syscalls 326 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ #define __NR_64_Linux 5000 -#define __NR_64_Linux_syscalls 316 +#define __NR_64_Linux_syscalls 326 #if _MIPS_SIM == _MIPS_SIM_NABI32 @@ -1049,15 +1071,25 @@ #define __NR_memfd_create (__NR_Linux + 318) #define __NR_bpf (__NR_Linux + 319) #define __NR_execveat (__NR_Linux + 320) +#define __NR_userfaultfd (__NR_Linux + 321) +#define __NR_membarrier (__NR_Linux + 322) +#define __NR_mlock2 (__NR_Linux + 323) +#define __NR_copy_file_range (__NR_Linux + 324) +#define __NR_preadv2 (__NR_Linux + 325) +#define __NR_pwritev2 (__NR_Linux + 326) +#define __NR_pkey_mprotect (__NR_Linux + 327) +#define __NR_pkey_alloc (__NR_Linux + 328) +#define __NR_pkey_free (__NR_Linux + 329) +#define __NR_statx (__NR_Linux + 330) /* * Offset of the last N32 flavoured syscall */ -#define __NR_Linux_syscalls 320 +#define __NR_Linux_syscalls 330 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */ #define __NR_N32_Linux 6000 -#define __NR_N32_Linux_syscalls 320 +#define __NR_N32_Linux_syscalls 330 #endif /* _ASM_UNISTD_H */ diff --git a/linux-headers/asm-powerpc/bitsperlong.h b/linux-headers/asm-powerpc/bitsperlong.h new file mode 100644 index 0000000000..46ece3ecff --- 
/dev/null +++ b/linux-headers/asm-powerpc/bitsperlong.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_POWERPC_BITSPERLONG_H +#define __ASM_POWERPC_BITSPERLONG_H + +#if defined(__powerpc64__) +# define __BITS_PER_LONG 64 +#else +# define __BITS_PER_LONG 32 +#endif + +#include <asm-generic/bitsperlong.h> + +#endif /* __ASM_POWERPC_BITSPERLONG_H */ diff --git a/linux-headers/asm-s390/bitsperlong.h b/linux-headers/asm-s390/bitsperlong.h new file mode 100644 index 0000000000..cceaf47b02 --- /dev/null +++ b/linux-headers/asm-s390/bitsperlong.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_S390_BITSPERLONG_H +#define __ASM_S390_BITSPERLONG_H + +#ifndef __s390x__ +#define __BITS_PER_LONG 32 +#else +#define __BITS_PER_LONG 64 +#endif + +#include <asm-generic/bitsperlong.h> + +#endif /* __ASM_S390_BITSPERLONG_H */ + diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 1ae66a263b..d0f97cd0a4 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -360,5 +360,6 @@ #define __NR_s390_guarded_storage 378 #define __NR_statx 379 #define __NR_s390_sthyi 380 +#define __NR_kexec_file_load 381 #endif /* _ASM_S390_UNISTD_32_H */ diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index 8aa9d046a9..23ffb97746 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -327,5 +327,6 @@ #define __NR_s390_guarded_storage 378 #define __NR_statx 379 #define __NR_s390_sthyi 380 +#define __NR_kexec_file_load 381 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/bitsperlong.h b/linux-headers/asm-x86/bitsperlong.h new file mode 100644 index 0000000000..5d72c84588 --- /dev/null +++ b/linux-headers/asm-x86/bitsperlong.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __ASM_X86_BITSPERLONG_H +#define __ASM_X86_BITSPERLONG_H + +#if defined(__x86_64__) && !defined(__ILP32__) +# define __BITS_PER_LONG 64 +#else +# define __BITS_PER_LONG 32 +#endif + +#include <asm-generic/bitsperlong.h> + +#endif /* __ASM_X86_BITSPERLONG_H */ + -- cgit v1.2.3 From 85e99cf80bec66adf39f948db824a0216daaa486 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 20 Mar 2018 23:01:06 +0200 Subject: linux-headers: add kvm header for mips kvm header for MIPS was manually excluded from auto-updates. Update it now to 4.17-rc2. Signed-off-by: Michael S. Tsirkin --- linux-headers/asm-mips/kvm.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/linux-headers/asm-mips/kvm.h b/linux-headers/asm-mips/kvm.h index 6985eb59b0..edcf717c43 100644 --- a/linux-headers/asm-mips/kvm.h +++ b/linux-headers/asm-mips/kvm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -19,6 +20,10 @@ * Some parts derived from the x86 version of this file. */ +#define __KVM_HAVE_READONLY_MEM + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + /* * for KVM_GET_REGS and KVM_SET_REGS * @@ -52,9 +57,14 @@ struct kvm_fpu { * Register set = 0: GP registers from kvm_regs (see definitions below). * * Register set = 1: CP0 registers. - * bits[15..8] - Must be zero. - * bits[7..3] - Register 'rd' index. - * bits[2..0] - Register 'sel' index. + * bits[15..8] - COP0 register set. + * + * COP0 register set = 0: Main CP0 registers.
+ * bits[7..3] - Register 'rd' index. + * bits[2..0] - Register 'sel' index. + * + * COP0 register set = 1: MAARs. + * bits[7..0] - MAAR index. * * Register set = 2: KVM specific registers (see definitions below). * @@ -112,6 +122,15 @@ struct kvm_fpu { #define KVM_REG_MIPS_PC (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 34) +/* + * KVM_REG_MIPS_CP0 - Coprocessor 0 registers. + */ + +#define KVM_REG_MIPS_MAAR (KVM_REG_MIPS_CP0 | (1 << 8)) +#define KVM_REG_MIPS_CP0_MAAR(n) (KVM_REG_MIPS_MAAR | \ + KVM_REG_SIZE_U64 | (n)) + + /* * KVM_REG_MIPS_KVM - KVM specific control registers. */ -- cgit v1.2.3 From 703878e2e0975123b1a89a006c0204c469333278 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 12 Apr 2018 23:12:27 +0800 Subject: vhost-user: add Net prefix to internal state structure We are going to introduce a shared vhost user state which will be named as 'VhostUserState'. So add 'Net' prefix to the existing internal state structure in the vhost-user netdev to avoid conflict. Signed-off-by: Tiwei Bie Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- net/vhost-user.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/net/vhost-user.c b/net/vhost-user.c index e0f16c895b..fa28aad12d 100644 --- a/net/vhost-user.c +++ b/net/vhost-user.c @@ -20,38 +20,38 @@ #include "qemu/option.h" #include "trace.h" -typedef struct VhostUserState { +typedef struct NetVhostUserState { NetClientState nc; CharBackend chr; /* only queue index 0 */ VHostNetState *vhost_net; guint watch; uint64_t acked_features; bool started; -} VhostUserState; +} NetVhostUserState; VHostNetState *vhost_user_get_vhost_net(NetClientState *nc) { - VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER); return s->vhost_net; } uint64_t vhost_user_get_acked_features(NetClientState *nc) { - VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER); return s->acked_features; } static void vhost_user_stop(int queues, NetClientState *ncs[]) { - VhostUserState *s; + NetVhostUserState *s; int i; for (i = 0; i < queues; i++) { assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER); - s = DO_UPCAST(VhostUserState, nc, ncs[i]); + s = DO_UPCAST(NetVhostUserState, nc, ncs[i]); if (s->vhost_net) { /* save acked features */ @@ -68,7 +68,7 @@ static int vhost_user_start(int queues, NetClientState *ncs[], CharBackend *be) { VhostNetOptions options; struct vhost_net *net = NULL; - VhostUserState *s; + NetVhostUserState *s; int max_queues; int i; @@ -77,7 +77,7 @@ static int vhost_user_start(int queues, NetClientState *ncs[], CharBackend *be) for (i = 0; i < queues; i++) { assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER); - s = DO_UPCAST(VhostUserState, nc, ncs[i]); + s = DO_UPCAST(NetVhostUserState, nc, ncs[i]); options.net_backend = ncs[i]; options.opaque = be; @@ -123,7 +123,7 @@ static ssize_t vhost_user_receive(NetClientState *nc, const uint8_t *buf, without GUEST_ANNOUNCE capability. 
*/ if (size == 60) { - VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); int r; static int display_rarp_failure = 1; char mac_addr[6]; @@ -146,7 +146,7 @@ static ssize_t vhost_user_receive(NetClientState *nc, const uint8_t *buf, static void vhost_user_cleanup(NetClientState *nc) { - VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc); if (s->vhost_net) { vhost_net_cleanup(s->vhost_net); @@ -180,7 +180,7 @@ static bool vhost_user_has_ufo(NetClientState *nc) static NetClientInfo net_vhost_user_info = { .type = NET_CLIENT_DRIVER_VHOST_USER, - .size = sizeof(VhostUserState), + .size = sizeof(NetVhostUserState), .receive = vhost_user_receive, .cleanup = vhost_user_cleanup, .has_vnet_hdr = vhost_user_has_vnet_hdr, @@ -190,7 +190,7 @@ static NetClientInfo net_vhost_user_info = { static gboolean net_vhost_user_watch(GIOChannel *chan, GIOCondition cond, void *opaque) { - VhostUserState *s = opaque; + NetVhostUserState *s = opaque; qemu_chr_fe_disconnect(&s->chr); @@ -203,7 +203,7 @@ static void chr_closed_bh(void *opaque) { const char *name = opaque; NetClientState *ncs[MAX_QUEUE_NUM]; - VhostUserState *s; + NetVhostUserState *s; Error *err = NULL; int queues; @@ -212,7 +212,7 @@ static void chr_closed_bh(void *opaque) MAX_QUEUE_NUM); assert(queues < MAX_QUEUE_NUM); - s = DO_UPCAST(VhostUserState, nc, ncs[0]); + s = DO_UPCAST(NetVhostUserState, nc, ncs[0]); qmp_set_link(name, false, &err); vhost_user_stop(queues, ncs); @@ -229,7 +229,7 @@ static void net_vhost_user_event(void *opaque, int event) { const char *name = opaque; NetClientState *ncs[MAX_QUEUE_NUM]; - VhostUserState *s; + NetVhostUserState *s; Chardev *chr; Error *err = NULL; int queues; @@ -239,7 +239,7 @@ static void net_vhost_user_event(void *opaque, int event) MAX_QUEUE_NUM); assert(queues < MAX_QUEUE_NUM); - s = DO_UPCAST(VhostUserState, nc, ncs[0]); + s = DO_UPCAST(NetVhostUserState, nc, ncs[0]); chr = qemu_chr_fe_get_driver(&s->chr); trace_vhost_user_event(chr->label, event); switch (event) { @@ -283,7 +283,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device, { Error *err = NULL; NetClientState *nc, *nc0 = NULL; - VhostUserState *s; + NetVhostUserState *s; int i; assert(name); @@ -296,7 +296,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device, nc->queue_index = i; if (!nc0) { nc0 = nc; - s = DO_UPCAST(VhostUserState, nc, nc); + s = DO_UPCAST(NetVhostUserState, nc, nc); if (!qemu_chr_fe_init(&s->chr, chr, &err)) { error_report_err(err); return -1; @@ -305,7 +305,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device, } - s = DO_UPCAST(VhostUserState, nc, nc0); + s = DO_UPCAST(NetVhostUserState, nc, nc0); do { if (qemu_chr_fe_wait_connected(&s->chr, &err) < 0) { error_report_err(err); -- cgit v1.2.3 From 1f3a4519b1c107b5db2434b30638353978366b4d Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 12 Apr 2018 23:12:29 +0800 Subject: vhost-user: support receiving file descriptors in slave_read Signed-off-by: Tiwei Bie Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
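The diff below replaces the plain read() in slave_read() with recvmsg(), so that a file descriptor passed by the slave as SCM_RIGHTS ancillary data can be collected along with the message header. The pattern, as a self-contained sketch (not QEMU code; recv_one_fd() is a made-up helper and error handling is reduced to the essentials):

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Receive one payload byte plus an optional fd passed as SCM_RIGHTS
     * ancillary data; returns the fd, or -1 if none arrived.  The caller
     * owns the fd and must close() it if it cannot be consumed. */
    static int recv_one_fd(int sock, char *byte)
    {
        struct iovec iov = { .iov_base = byte, .iov_len = 1 };
        char control[CMSG_SPACE(sizeof(int))];
        struct msghdr msgh = {
            .msg_iov = &iov,
            .msg_iovlen = 1,
            .msg_control = control,
            .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cmsg;
        int fd = -1;

        if (recvmsg(sock, &msgh, 0) <= 0 || (msgh.msg_flags & MSG_CTRUNC)) {
            return -1;  /* short read or truncated control data */
        }
        for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
             cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
            if (cmsg->cmsg_level == SOL_SOCKET &&
                cmsg->cmsg_type == SCM_RIGHTS) {
                memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
                break;
            }
        }
        return fd;
    }

The details that matter, in the patch as in the sketch: size the control buffer with CMSG_SPACE(), check MSG_CTRUNC, and close an unconsumed descriptor instead of leaking it.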
Tsirkin --- hw/virtio/vhost-user.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 38da8692bb..85d8fd270b 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -852,14 +852,44 @@ static void slave_read(void *opaque) VhostUserHeader hdr = { 0, }; VhostUserPayload payload = { 0, }; int size, ret = 0; + struct iovec iov; + struct msghdr msgh; + int fd = -1; + char control[CMSG_SPACE(sizeof(fd))]; + struct cmsghdr *cmsg; + size_t fdsize; + + memset(&msgh, 0, sizeof(msgh)); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); /* Read header */ - size = read(u->slave_fd, &hdr, VHOST_USER_HDR_SIZE); + iov.iov_base = &hdr; + iov.iov_len = VHOST_USER_HDR_SIZE; + + size = recvmsg(u->slave_fd, &msgh, 0); if (size != VHOST_USER_HDR_SIZE) { error_report("Failed to read from slave."); goto err; } + if (msgh.msg_flags & MSG_CTRUNC) { + error_report("Truncated message."); + goto err; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + fdsize = cmsg->cmsg_len - CMSG_LEN(0); + memcpy(&fd, CMSG_DATA(cmsg), fdsize); + break; + } + } + if (hdr.size > VHOST_USER_PAYLOAD_SIZE) { error_report("Failed to read msg header." " Size %d exceeds the maximum %zu.", hdr.size, @@ -883,9 +913,15 @@ static void slave_read(void *opaque) break; default: error_report("Received unexpected msg type."); + if (fd != -1) { + close(fd); + } ret = -EINVAL; } + /* Message handlers need to make sure that fd will be consumed. */ + fd = -1; + /* * REPLY_ACK feature handling. Other reply types has to be managed * directly in their request handlers. @@ -918,6 +954,9 @@ err: qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL); close(u->slave_fd); u->slave_fd = -1; + if (fd != -1) { + close(fd); + } return; } -- cgit v1.2.3 From 6f80e6170ede13605817e5c0ca73db0de7bdf261 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Thu, 12 Apr 2018 23:12:30 +0800 Subject: virtio: support setting memory region based host notifier This patch introduces the support for setting memory region based host notifiers for virtio device. This is helpful when using a hardware accelerator for a virtio device, because hardware heavily depends on the notification, this will allow the guest driver in the VM to notify the hardware directly. Signed-off-by: Tiwei Bie Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
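The patch below adds a set_host_notifier_mr() hook to the transport and a virtio_queue_set_host_notifier_mr() wrapper around it. One plausible use by a hardware-backed backend, sketched under stated assumptions (install_doorbell() is a made-up name; the mmap of the doorbell page and all cleanup are elided):

    /* Wrap the device's mmap()ed doorbell page in a MemoryRegion and
     * splice it over the queue's notify area, so that guest notification
     * writes reach the hardware directly. */
    static int install_doorbell(VirtIODevice *vdev, int queue_idx,
                                void *doorbell, uint64_t notify_size)
    {
        MemoryRegion *mr = g_new0(MemoryRegion, 1);

        memory_region_init_ram_device_ptr(mr, OBJECT(vdev), "doorbell",
                                          notify_size, doorbell);
        /* Fails with -1 unless the transport is modern virtio-pci and
         * notify_size matches the per-queue notify area multiplier. */
        return virtio_queue_set_host_notifier_mr(vdev, queue_idx, mr, true);
    }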
Tsirkin --- hw/virtio/virtio-pci.c | 22 ++++++++++++++++++++++ hw/virtio/virtio.c | 13 +++++++++++++ include/hw/virtio/virtio-bus.h | 2 ++ include/hw/virtio/virtio.h | 2 ++ 4 files changed, 39 insertions(+) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 1e8ab7bbc5..5eb0c323ca 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1037,6 +1037,27 @@ assign_error: return r; } +static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n, + MemoryRegion *mr, bool assign) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + int offset; + + if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) || + virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) { + return -1; + } + + if (assign) { + offset = virtio_pci_queue_mem_mult(proxy) * n; + memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1); + } else { + memory_region_del_subregion(&proxy->notify.mr, mr); + } + + return 0; +} + static void virtio_pci_vmstate_change(DeviceState *d, bool running) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); @@ -2652,6 +2673,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data) k->has_extra_state = virtio_pci_has_extra_state; k->query_guest_notifiers = virtio_pci_query_guest_notifiers; k->set_guest_notifiers = virtio_pci_set_guest_notifiers; + k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr; k->vmstate_change = virtio_pci_vmstate_change; k->pre_plugged = virtio_pci_pre_plugged; k->device_plugged = virtio_pci_device_plugged; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 006d3d1148..1debb0147b 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2454,6 +2454,19 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) return &vq->host_notifier; } +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign) +{ + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (k->set_host_notifier_mr) { + return k->set_host_notifier_mr(qbus->parent, n, mr, assign); + } + + return -1; +} + void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name) { g_free(vdev->bus_name); diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h index ced3d2d2b0..7fec9dc929 100644 --- a/include/hw/virtio/virtio-bus.h +++ b/include/hw/virtio/virtio-bus.h @@ -52,6 +52,8 @@ typedef struct VirtioBusClass { bool (*has_extra_state)(DeviceState *d); bool (*query_guest_notifiers)(DeviceState *d); int (*set_guest_notifiers)(DeviceState *d, int nvqs, bool assign); + int (*set_host_notifier_mr)(DeviceState *d, int n, + MemoryRegion *mr, bool assign); void (*vmstate_change)(DeviceState *d, bool running); /* * Expose the features the transport layer supports before diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 098bdaaea3..9c1fa07d6d 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -239,6 +239,8 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align); void virtio_queue_notify(VirtIODevice *vdev, int n); uint16_t virtio_queue_vector(VirtIODevice *vdev, int n); void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector); +int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n, + MemoryRegion *mr, bool assign); int virtio_set_status(VirtIODevice *vdev, uint8_t val); void virtio_reset(void *opaque); void virtio_update_irq(VirtIODevice *vdev); -- cgit v1.2.3 From 9952e807fd016c95f50372536f1ac65a601be6e4 Mon Sep 17 00:00:00 
2001 From: "Dr. David Alan Gilbert" Date: Wed, 2 May 2018 11:55:52 +0100 Subject: vhost-user+postcopy: Use qemu_set_nonblock Use qemu_set_nonblock rather than a simple fcntl; cleaner and I have no reason to change other flags. Reported-by: Peter Maydell Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 85d8fd270b..41cbd8a455 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1115,7 +1115,7 @@ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp) error_setg(errp, "%s: Failed to get ufd", __func__); return -1; } - fcntl(ufd, F_SETFL, O_NONBLOCK); + qemu_set_nonblock(ufd); /* register ufd with userfault thread */ u->postcopy_fd.fd = ufd; -- cgit v1.2.3 From e3638151da2a314dda3ed862cf707b8b66aca4cb Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 4 May 2018 10:53:46 +0100 Subject: libvhost-user: Send messages with no data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The response to a VHOST_USER_POSTCOPY_ADVISE contains a fd but doesn't actually contain any data. FIx vu_message_write so that it doesn't do a 0-byte write() call, since this was ending up with rc=0 that was confusing the error handling code. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/libvhost-user/libvhost-user.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index beeed0c43f..54e643d871 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -323,13 +323,15 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) rc = sendmsg(conn_fd, &msg, 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); - do { - if (vmsg->data) { - rc = write(conn_fd, vmsg->data, vmsg->size); - } else { - rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); - } - } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + if (vmsg->size) { + do { + if (vmsg->data) { + rc = write(conn_fd, vmsg->data, vmsg->size); + } else { + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); + } + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + } if (rc <= 0) { vu_panic(dev, "Error while writing: %s", strerror(errno)); -- cgit v1.2.3 From ebf2a499a5c43d5eaee2b70ab5ae655af49d935c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 11 May 2018 21:59:40 -0700 Subject: hw/virtio: Fix brace Werror with clang 6.0.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The warning is hw/virtio/vhost-user.c:1319:26: error: suggest braces around initialization of subobject [-Werror,-Wmissing-braces] VhostUserMsg msg = { 0 }; ^ {} While the original code is correct, and technically exactly correct as per ISO C89, both GCC and Clang support plain empty set of braces as an extension. Cc: Michael S. Tsirkin Signed-off-by: Richard Henderson Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Blake Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
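For reference, { 0 } zero-initializes the whole object in ISO C; clang only warns because the 0 lexically initializes the first member of the first subobject. A minimal reproducer with hypothetical types:

    struct hdr { int request; int flags; };
    struct msg { struct hdr h; int payload; };

    struct msg m1 = { 0 };      /* well-defined, but clang 6 suggests
                                 * braces around the 'h' subobject */
    struct msg m2 = { { 0 } };  /* fully braced, warning-free ISO C89 */
    struct msg m3 = { };        /* GCC/Clang extension; what the one-line
                                 * change below switches to */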
Tsirkin --- hw/virtio/vhost-user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 41cbd8a455..ca554d4ff1 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1355,7 +1355,7 @@ static bool vhost_user_requires_shm_log(struct vhost_dev *dev) static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) { - VhostUserMsg msg = { 0 }; + VhostUserMsg msg = { }; assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); -- cgit v1.2.3 From 7d405b2ffbcbee4c622deb6c0ab4f5649de7fc2b Mon Sep 17 00:00:00 2001 From: Changpeng Liu Date: Sat, 19 May 2018 06:20:46 +0800 Subject: contrib/vhost-user-blk: enable protocol feature for vhost-user-blk This patch reports the protocol feature that is only advertised by QEMU if the device implements the config ops. Signed-off-by: Changpeng Liu Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/vhost-user-blk/vhost-user-blk.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 67dac8155a..a6a132a492 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -311,6 +311,12 @@ vub_get_features(VuDev *dev) 1ull << VHOST_USER_F_PROTOCOL_FEATURES; } +static uint64_t +vub_get_protocol_features(VuDev *dev) +{ + return 1ull << VHOST_USER_PROTOCOL_F_CONFIG; +} + static int vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len) { @@ -373,6 +379,7 @@ vub_set_config(VuDev *vu_dev, const uint8_t *data, static const VuDevIface vub_iface = { .get_features = vub_get_features, .queue_set_started = vub_queue_set_started, + .get_protocol_features = vub_get_protocol_features, .get_config = vub_get_config, .set_config = vub_set_config, }; -- cgit v1.2.3 From 1a97a478e69534f70ebbed1f27c315dbdc23f94f Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Mon, 21 May 2018 10:32:00 -0600 Subject: nvdimm: fix typo in label-size definition Signed-off-by: Ross Zwisler Fixes: commit da6789c27c2e ("nvdimm: add a macro for property "label-size"") Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov Cc: Haozhong Zhang Cc: Michael S. Tsirkin Cc: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/mem/nvdimm.c | 2 +- include/hw/mem/nvdimm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index acb656b672..4087aca25e 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -89,7 +89,7 @@ static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) static void nvdimm_init(Object *obj) { - object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", + object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); object_property_add_bool(obj, NVDIMM_UNARMED_PROP, diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 7fd87c4e1c..74c60332e1 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -48,7 +48,7 @@ #define NVDIMM_GET_CLASS(obj) OBJECT_GET_CLASS(NVDIMMClass, (obj), \ TYPE_NVDIMM) -#define NVDIMM_LABLE_SIZE_PROP "label-size" +#define NVDIMM_LABEL_SIZE_PROP "label-size" #define NVDIMM_UNARMED_PROP "unarmed" struct NVDIMMDevice { -- cgit v1.2.3 From 36d2d52bdb45f5b753a61fdaf0fe7891f1f5b61d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:09 +0800 Subject: intel-iommu: send PSI always even if across PDEs SECURITY IMPLICATION: without this patch, any guest with both assigned device and a vIOMMU might encounter stale IO page mappings even if guest has already unmapped the page, which may lead to guest memory corruption. The stale mappings will only be limited to the guest's own memory range, so it should not affect the host memory or other guests on the host. During IOVA page table walking, there is a special case when the PSI covers one whole PDE (Page Directory Entry, which contains 512 Page Table Entries) or more. In the past, we skip that entry and we don't notify the IOMMU notifiers. This is not correct. We should send UNMAP notification to registered UNMAP notifiers in this case. For UNMAP only notifiers, this might cause IOTLBs cached in the devices even if they were already invalid. For MAP/UNMAP notifiers like vfio-pci, this will cause stale page mappings. This special case doesn't trigger often, but it is very easy to be triggered by nested device assignments, since in that case we'll possibly map the whole L2 guest RAM region into the device's IOVA address space (several GBs at least), which is far bigger than normal kernel driver usages of the device (tens of MBs normally). Without this patch applied to L1 QEMU, nested device assignment to L2 guests will dump some errors like: qemu-system-x86_64: VFIO_MAP_DMA: -17 qemu-system-x86_64: vfio_dma_map(0x557305420c30, 0xad000, 0x1000, 0x7f89a920d000) = -17 (File exists) CC: QEMU Stable Acked-by: Jason Wang [peterx: rewrite the commit message] Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
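Concretely, delivering the missing UNMAP notification for one whole invalidated PDE (512 PTEs, i.e. 2MB of IOVA space) means handing the hook an IOMMUTLBEntry whose permission is IOMMU_NONE and whose mask spans the full range. A sketch of that shape, assuming a 2MB-aligned iova:

    /* Tell registered notifiers that an entire PDE worth of IOVA space
     * is no longer mapped; translated_addr is meaningless for an unmap,
     * so it is zeroed. */
    IOMMUTLBEntry entry = {
        .target_as       = &address_space_memory,
        .iova            = iova & ~((1ULL << 21) - 1),
        .translated_addr = 0,
        .addr_mask       = (1ULL << 21) - 1,
        .perm            = IOMMU_NONE,
    };
    memory_region_notify_iommu(&vtd_as->iommu, entry);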
Tsirkin --- hw/i386/intel_iommu.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index fb31de9416..b359efd6f9 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -722,6 +722,15 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); +static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level, + vtd_page_walk_hook hook_fn, void *private) +{ + assert(hook_fn); + trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr, + entry->addr_mask, entry->perm); + return hook_fn(entry, private); +} + /** * vtd_page_walk_level - walk over specific level for IOVA range * @@ -781,28 +790,37 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; + entry.target_as = &address_space_memory; + entry.iova = iova & subpage_mask; + entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + entry.addr_mask = ~subpage_mask; + if (vtd_is_last_slpte(slpte, level)) { - entry.target_as = &address_space_memory; - entry.iova = iova & subpage_mask; /* NOTE: this is only meaningful if entry_valid == true */ entry.translated_addr = vtd_get_slpte_addr(slpte, aw); - entry.addr_mask = ~subpage_mask; - entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); if (!entry_valid && !notify_unmap) { trace_vtd_page_walk_skip_perm(iova, iova_next); goto next; } - trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr, - entry.addr_mask, entry.perm); - if (hook_fn) { - ret = hook_fn(&entry, private); - if (ret < 0) { - return ret; - } + ret = vtd_page_walk_one(&entry, level, hook_fn, private); + if (ret < 0) { + return ret; } } else { if (!entry_valid) { - trace_vtd_page_walk_skip_perm(iova, iova_next); + if (notify_unmap) { + /* + * The whole entry is invalid; unmap it all. + * Translated address is meaningless, zero it. + */ + entry.translated_addr = 0x0; + ret = vtd_page_walk_one(&entry, level, hook_fn, private); + if (ret < 0) { + return ret; + } + } else { + trace_vtd_page_walk_skip_perm(iova, iova_next); + } goto next; } ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova, -- cgit v1.2.3 From b4a4ba0d68f50f218ee3957b6638dbee32a5eeef Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:10 +0800 Subject: intel-iommu: remove IntelIOMMUNotifierNode That is not really necessary. Removing that node struct and put the list entry directly into VTDAddressSpace. It simplfies the code a lot. Since at it, rename the old notifiers_list into vtd_as_with_notifiers. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
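The refactoring, reduced to its essentials: the list entry moves into VTDAddressSpace itself, so registration and removal become allocation-free one-liners instead of a g_malloc0() plus a linear search for the matching wrapper node (a sketch using QEMU's intrusive QLIST macros):

    /* Before: one separately allocated node per registered AS. */
    struct IntelIOMMUNotifierNode {
        VTDAddressSpace *vtd_as;
        QLIST_ENTRY(IntelIOMMUNotifierNode) next;
    };

    /* After: the entry lives inside the object being listed. */
    struct VTDAddressSpace {
        /* ... existing fields ... */
        QLIST_ENTRY(VTDAddressSpace) next;
    };

    QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);  /* register */
    QLIST_REMOVE(vtd_as, next);                                  /* unregister */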
Tsirkin --- hw/i386/intel_iommu.c | 41 +++++++++++------------------------------ include/hw/i386/intel_iommu.h | 9 ++------- 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index b359efd6f9..3df90457f8 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1248,10 +1248,10 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s) static void vtd_iommu_replay_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; - QLIST_FOREACH(node, &s->notifiers_list, next) { - memory_region_iommu_replay_all(&node->vtd_as->iommu); + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { + memory_region_iommu_replay_all(&vtd_as->iommu); } } @@ -1372,7 +1372,6 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s) static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) { - IntelIOMMUNotifierNode *node; VTDContextEntry ce; VTDAddressSpace *vtd_as; @@ -1381,8 +1380,7 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { @@ -1402,12 +1400,11 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) { - IntelIOMMUNotifierNode *node; + VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; - QLIST_FOREACH(node, &(s->notifiers_list), next) { - VTDAddressSpace *vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { @@ -2344,8 +2341,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; - IntelIOMMUNotifierNode *node = NULL; - IntelIOMMUNotifierNode *next_node = NULL; if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) { error_report("We need to set caching-mode=1 for intel-iommu to enable " @@ -2354,21 +2349,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, } if (old == IOMMU_NOTIFIER_NONE) { - node = g_malloc0(sizeof(*node)); - node->vtd_as = vtd_as; - QLIST_INSERT_HEAD(&s->notifiers_list, node, next); - return; - } - - /* update notifier node with new flags */ - QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) { - if (node->vtd_as == vtd_as) { - if (new == IOMMU_NOTIFIER_NONE) { - QLIST_REMOVE(node, next); - g_free(node); - } - return; - } + QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); + } else if (new == IOMMU_NOTIFIER_NONE) { + QLIST_REMOVE(vtd_as, next); } } @@ -2838,12 +2821,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) static void vtd_address_space_unmap_all(IntelIOMMUState *s) { - IntelIOMMUNotifierNode *node; VTDAddressSpace *vtd_as; IOMMUNotifier *n; - QLIST_FOREACH(node, &s->notifiers_list, next) { - vtd_as = node->vtd_as; + QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { vtd_address_space_unmap(vtd_as, n); } @@ -3088,7 +3069,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) return; } - 
QLIST_INIT(&s->notifiers_list); + QLIST_INIT(&s->vtd_as_with_notifiers); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 45ec8919b6..032e33bcb2 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -67,7 +67,6 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry; typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress; typedef struct VTDIrq VTDIrq; typedef struct VTD_MSIMessage VTD_MSIMessage; -typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode; /* Context-Entry */ struct VTDContextEntry { @@ -93,6 +92,7 @@ struct VTDAddressSpace { MemoryRegion iommu_ir; /* Interrupt region: 0xfeeXXXXX */ IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; + QLIST_ENTRY(VTDAddressSpace) next; }; struct VTDBus { @@ -253,11 +253,6 @@ struct VTD_MSIMessage { /* When IR is enabled, all MSI/MSI-X data bits should be zero */ #define VTD_IR_MSI_DATA (0) -struct IntelIOMMUNotifierNode { - VTDAddressSpace *vtd_as; - QLIST_ENTRY(IntelIOMMUNotifierNode) next; -}; - /* The iommu (DMAR) device state struct */ struct IntelIOMMUState { X86IOMMUState x86_iommu; @@ -295,7 +290,7 @@ struct IntelIOMMUState { GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by PCIBus* reference */ VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */ /* list of registered notifiers */ - QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list; + QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers; /* interrupt remapping */ bool intr_enabled; /* Whether guest enabled IR */ -- cgit v1.2.3 From 1d9efa73e12ddf361ea997c2d532cc4afa6674d1 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:11 +0800 Subject: intel-iommu: add iommu lock SECURITY IMPLICATION: this patch fixes a potential race when multiple threads access the IOMMU IOTLB cache. Add a per-iommu big lock to protect IOMMU status. Currently the only thing to be protected is the IOTLB/context cache, since that can be accessed even without BQL, e.g., in IO dataplane. Note that we don't need to protect device page tables since that's fully controlled by the guest kernel. However there is still possibility that malicious drivers will program the device to not obey the rule. In that case QEMU can't really do anything useful, instead the guest itself will be responsible for all uncertainties. CC: QEMU Stable Reported-by: Fam Zheng Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
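The locking convention the diff below applies throughout: thin lock/unlock helpers plus *_locked variants for callers that already hold the mutex, so no path ever takes the lock recursively. In miniature:

    static inline void vtd_iommu_lock(IntelIOMMUState *s)
    {
        qemu_mutex_lock(&s->iommu_lock);
    }

    static inline void vtd_iommu_unlock(IntelIOMMUState *s)
    {
        qemu_mutex_unlock(&s->iommu_lock);
    }

    /* Must be called with the IOMMU lock held. */
    static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
    {
        g_hash_table_remove_all(s->iotlb);
    }

    static void vtd_reset_iotlb(IntelIOMMUState *s)
    {
        vtd_iommu_lock(s);
        vtd_reset_iotlb_locked(s);
        vtd_iommu_unlock(s);
    }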
Tsirkin --- hw/i386/intel_iommu.c | 56 ++++++++++++++++++++++++++++++++++++------- include/hw/i386/intel_iommu.h | 6 +++++ 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 3df90457f8..8d4069de9f 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -128,6 +128,16 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, return new_val; } +static inline void vtd_iommu_lock(IntelIOMMUState *s) +{ + qemu_mutex_lock(&s->iommu_lock); +} + +static inline void vtd_iommu_unlock(IntelIOMMUState *s) +{ + qemu_mutex_unlock(&s->iommu_lock); +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -172,9 +182,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, } /* Reset all the gen of VTDAddressSpace to zero and set the gen of - * IntelIOMMUState to 1. + * IntelIOMMUState to 1. Must be called with IOMMU lock held. */ -static void vtd_reset_context_cache(IntelIOMMUState *s) +static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; VTDBus *vtd_bus; @@ -197,12 +207,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s) s->context_cache_gen = 1; } -static void vtd_reset_iotlb(IntelIOMMUState *s) +/* Must be called with IOMMU lock held. */ +static void vtd_reset_iotlb_locked(IntelIOMMUState *s) { assert(s->iotlb); g_hash_table_remove_all(s->iotlb); } +static void vtd_reset_iotlb(IntelIOMMUState *s) +{ + vtd_iommu_lock(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); +} + static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { @@ -215,6 +233,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; } +/* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, hwaddr addr) { @@ -235,6 +254,7 @@ out: return entry; } +/* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, uint8_t access_flags, uint32_t level) @@ -246,7 +266,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) { trace_vtd_iotlb_reset("iotlb exceeds size limit"); - vtd_reset_iotlb(s); + vtd_reset_iotlb_locked(s); } entry->gfn = gfn; @@ -1106,7 +1126,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, IntelIOMMUState *s = vtd_as->iommu_state; VTDContextEntry ce; uint8_t bus_num = pci_bus_num(bus); - VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry; + VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; uint32_t level; uint16_t source_id = vtd_make_source_id(bus_num, devfn); @@ -1123,6 +1143,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, */ assert(!vtd_is_interrupt_addr(addr)); + vtd_iommu_lock(s); + + cc_entry = &vtd_as->context_cache_entry; + /* Try to fetch slpte form IOTLB */ iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); if (iotlb_entry) { @@ -1182,7 +1206,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, * IOMMU region can be swapped back. 
*/ vtd_pt_enable_fast_path(s, source_id); - + vtd_iommu_unlock(s); return true; } @@ -1203,6 +1227,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte, access_flags, level); out: + vtd_iommu_unlock(s); entry->iova = addr & page_mask; entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask; entry->addr_mask = ~page_mask; @@ -1210,6 +1235,7 @@ out: return true; error: + vtd_iommu_unlock(s); entry->iova = 0; entry->translated_addr = 0; entry->addr_mask = 0; @@ -1258,10 +1284,13 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) static void vtd_context_global_invalidate(IntelIOMMUState *s) { trace_vtd_inv_desc_cc_global(); + /* Protects context cache */ + vtd_iommu_lock(s); s->context_cache_gen++; if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) { - vtd_reset_context_cache(s); + vtd_reset_context_cache_locked(s); } + vtd_iommu_unlock(s); vtd_switch_address_space_all(s); /* * From VT-d spec 6.5.2.1, a global context entry invalidation @@ -1313,7 +1342,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), VTD_PCI_FUNC(devfn_it)); + vtd_iommu_lock(s); vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); /* * Do switch address space when needed, in case if the * device passthrough bit is switched. @@ -1377,8 +1408,10 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) trace_vtd_inv_desc_iotlb_domain(domain_id); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain, &domain_id); + vtd_iommu_unlock(s); QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), @@ -1426,7 +1459,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, info.domain_id = domain_id; info.addr = addr; info.mask = ~((1 << am) - 1); + vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); + vtd_iommu_unlock(s); vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); } @@ -2929,8 +2964,10 @@ static void vtd_init(IntelIOMMUState *s) s->cap |= VTD_CAP_CM; } - vtd_reset_context_cache(s); - vtd_reset_iotlb(s); + vtd_iommu_lock(s); + vtd_reset_context_cache_locked(s); + vtd_reset_iotlb_locked(s); + vtd_iommu_unlock(s); /* Define registers with default values and bit semantics */ vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0); @@ -3070,6 +3107,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) } QLIST_INIT(&s->vtd_as_with_notifiers); + qemu_mutex_init(&s->iommu_lock); memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 032e33bcb2..016e74bedb 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -300,6 +300,12 @@ struct IntelIOMMUState { OnOffAuto intr_eim; /* Toggle for EIM cabability */ bool buggy_eim; /* Force buggy EIM unless eim=off */ uint8_t aw_bits; /* Host/IOVA address width (in bits) */ + + /* + * Protects IOMMU states in general. Currently it protects the + * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace. 
+ */ + QemuMutex iommu_lock; }; /* Find the VTD Address space associated with the given bus pointer, -- cgit v1.2.3 From 4f8a62a933a79094e44bc1b16b63bb23e62d67b4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:12 +0800 Subject: intel-iommu: only do page walk for MAP notifiers For UNMAP-only IOMMU notifiers, we don't need to walk the page tables. Fasten that procedure by skipping the page table walk. That should boost performance for UNMAP-only notifiers like vhost. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 44 ++++++++++++++++++++++++++++++++++++++----- include/hw/i386/intel_iommu.h | 2 ++ 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 8d4069de9f..38ccc741f9 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -138,6 +138,12 @@ static inline void vtd_iommu_unlock(IntelIOMMUState *s) qemu_mutex_unlock(&s->iommu_lock); } +/* Whether the address space needs to notify new mappings */ +static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) +{ + return as->notifier_flags & IOMMU_NOTIFIER_MAP; +} + /* GHashTable functions */ static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) { @@ -1436,14 +1442,36 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, VTDAddressSpace *vtd_as; VTDContextEntry ce; int ret; + hwaddr size = (1 << am) * VTD_PAGE_SIZE; QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true, s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* + * As long as we have MAP notifications registered in + * any of our IOMMU notifiers, we need to sync the + * shadow page table. + */ + vtd_page_walk(&ce, addr, addr + size, + vtd_page_invalidate_notify_hook, + (void *)&vtd_as->iommu, true, s->aw_bits); + } else { + /* + * For UNMAP-only notifiers, we don't need to walk the + * page tables. We just deliver the PSI down to + * invalidate caches. 
+ */ + IOMMUTLBEntry entry = { + .target_as = &address_space_memory, + .iova = addr, + .translated_addr = 0, + .addr_mask = size - 1, + .perm = IOMMU_NONE, + }; + memory_region_notify_iommu(&vtd_as->iommu, entry); + } } } } @@ -2383,6 +2411,9 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, exit(1); } + /* Update per-address-space notifier flags */ + vtd_as->notifier_flags = new; + if (old == IOMMU_NOTIFIER_NONE) { QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next); } else if (new == IOMMU_NOTIFIER_NONE) { @@ -2891,8 +2922,11 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) PCI_FUNC(vtd_as->devfn), VTD_CONTEXT_ENTRY_DID(ce.hi), ce.hi, ce.lo); - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, - s->aw_bits); + if (vtd_as_has_map_notifier(vtd_as)) { + /* This is required only for MAP typed notifiers */ + vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, + s->aw_bits); + } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn)); diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 016e74bedb..156f35e919 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -93,6 +93,8 @@ struct VTDAddressSpace { IntelIOMMUState *iommu_state; VTDContextCacheEntry context_cache_entry; QLIST_ENTRY(VTDAddressSpace) next; + /* Superset of notifier flags that this address space has */ + IOMMUNotifierFlag notifier_flags; }; struct VTDBus { -- cgit v1.2.3 From fe215b0cbb8c1f4b4af0a64aa5c02042080dd537 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:13 +0800 Subject: intel-iommu: introduce vtd_page_walk_info During the recursive page walking of IOVA page tables, some stack variables are constant variables and never changed during the whole page walking procedure. Isolate them into a struct so that we don't need to pass those contants down the stack every time and multiple times. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
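The shape of the change, reduced to prototypes: everything that stays constant across one walk moves into a struct passed by pointer, and only genuinely per-level state remains in the recursive signature:

    /* Before: five constant arguments ride along on every level. */
    static int walk_level(dma_addr_t addr, uint64_t start, uint64_t end,
                          vtd_page_walk_hook hook_fn, void *private,
                          uint32_t level, bool read, bool write,
                          bool notify_unmap, uint8_t aw);

    /* After: one pointer to the per-walk constants. */
    typedef struct {
        vtd_page_walk_hook hook_fn;
        void *private;
        bool notify_unmap;
        uint8_t aw;
    } vtd_page_walk_info;

    static int walk_level(dma_addr_t addr, uint64_t start, uint64_t end,
                          uint32_t level, bool read, bool write,
                          vtd_page_walk_info *info);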
Tsirkin --- hw/i386/intel_iommu.c | 84 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 38ccc741f9..e247269659 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -748,9 +748,27 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write, typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); +/** + * Constant information used during page walking + * + * @hook_fn: hook func to be called when detected page + * @private: private data to be passed into hook func + * @notify_unmap: whether we should notify invalid entries + * @aw: maximum address width + */ +typedef struct { + vtd_page_walk_hook hook_fn; + void *private; + bool notify_unmap; + uint8_t aw; +} vtd_page_walk_info; + static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level, - vtd_page_walk_hook hook_fn, void *private) + vtd_page_walk_info *info) { + vtd_page_walk_hook hook_fn = info->hook_fn; + void *private = info->private; + assert(hook_fn); trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr, entry->addr_mask, entry->perm); @@ -763,17 +781,13 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level, * @addr: base GPA addr to start the walk * @start: IOVA range start address * @end: IOVA range end address (start <= addr < end) - * @hook_fn: hook func to be called when detected page - * @private: private data to be passed into hook func * @read: whether parent level has read permission * @write: whether parent level has write permission - * @notify_unmap: whether we should notify invalid entries - * @aw: maximum address width + * @info: constant information for the page walk */ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, - uint64_t end, vtd_page_walk_hook hook_fn, - void *private, uint32_t level, bool read, - bool write, bool notify_unmap, uint8_t aw) + uint64_t end, uint32_t level, bool read, + bool write, vtd_page_walk_info *info) { bool read_cur, write_cur, entry_valid; uint32_t offset; @@ -823,24 +837,24 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, if (vtd_is_last_slpte(slpte, level)) { /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, aw); - if (!entry_valid && !notify_unmap) { + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + if (!entry_valid && !info->notify_unmap) { trace_vtd_page_walk_skip_perm(iova, iova_next); goto next; } - ret = vtd_page_walk_one(&entry, level, hook_fn, private); + ret = vtd_page_walk_one(&entry, level, info); if (ret < 0) { return ret; } } else { if (!entry_valid) { - if (notify_unmap) { + if (info->notify_unmap) { /* * The whole entry is invalid; unmap it all. * Translated address is meaningless, zero it. 
 */ entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, level, hook_fn, private); + ret = vtd_page_walk_one(&entry, level, info); if (ret < 0) { return ret; } @@ -849,10 +863,9 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, } goto next; } - ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova, - MIN(iova_next, end), hook_fn, private, - level - 1, read_cur, write_cur, - notify_unmap, aw); + ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), + iova, MIN(iova_next, end), level - 1, + read_cur, write_cur, info); if (ret < 0) { return ret; } @@ -871,28 +884,24 @@ next: * @ce: context entry to walk upon * @start: IOVA address to start the walk * @end: IOVA range end address (start <= addr < end) - * @hook_fn: the hook that to be called for each detected area - * @private: private data for the hook function - * @aw: maximum address width + * @info: page walking information struct */ static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end, - vtd_page_walk_hook hook_fn, void *private, - bool notify_unmap, uint8_t aw) + vtd_page_walk_info *info) { dma_addr_t addr = vtd_ce_get_slpt_base(ce); uint32_t level = vtd_ce_get_level(ce); - if (!vtd_iova_range_check(start, ce, aw)) { + if (!vtd_iova_range_check(start, ce, info->aw)) { return -VTD_FR_ADDR_BEYOND_MGAW; } - if (!vtd_iova_range_check(end, ce, aw)) { + if (!vtd_iova_range_check(end, ce, info->aw)) { /* Fix end so that it reaches the maximum */ - end = vtd_iova_limit(ce, aw); + end = vtd_iova_limit(ce, info->aw); } - return vtd_page_walk_level(addr, start, end, hook_fn, private, - level, true, true, notify_unmap, aw); + return vtd_page_walk_level(addr, start, end, level, true, true, info); } /* Map a device to its corresponding domain (context-entry) */ @@ -1449,14 +1458,19 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { if (vtd_as_has_map_notifier(vtd_as)) { + vtd_page_walk_info info = { + .hook_fn = vtd_page_invalidate_notify_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + }; + /* * As long as we have MAP notifications registered in * any of our IOMMU notifiers, we need to sync the * shadow page table. */ - vtd_page_walk(&ce, addr, addr + size, - vtd_page_invalidate_notify_hook, - (void *)&vtd_as->iommu, true, s->aw_bits); + vtd_page_walk(&ce, addr, addr + size, &info); } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2924,8 +2938,14 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) ce.hi, ce.lo); if (vtd_as_has_map_notifier(vtd_as)) { /* This is required only for MAP typed notifiers */ - vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false, - s->aw_bits); + vtd_page_walk_info info = { + .hook_fn = vtd_replay_hook, + .private = (void *)n, + .notify_unmap = false, + .aw = s->aw_bits, + }; + + vtd_page_walk(&ce, 0, ~0ULL, &info); } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), -- cgit v1.2.3 From 2f764fa87d2a81812b313dd6d998e10126292653 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:14 +0800 Subject: intel-iommu: pass in address space during page walk We pass in the VTDAddressSpace too. It'll be used in the follow-up patches. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/i386/intel_iommu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index e247269659..a882894f49 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -754,9 +754,11 @@ typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); * @hook_fn: hook func to be called when detected page * @private: private data to be passed into hook func * @notify_unmap: whether we should notify invalid entries + * @as: VT-d address space of the device * @aw: maximum address width */ typedef struct { + VTDAddressSpace *as; vtd_page_walk_hook hook_fn; void *private; bool notify_unmap; @@ -1463,6 +1465,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, .private = (void *)&vtd_as->iommu, .notify_unmap = true, .aw = s->aw_bits, + .as = vtd_as, }; /* @@ -2943,6 +2946,7 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) .private = (void *)n, .notify_unmap = false, .aw = s->aw_bits, + .as = vtd_as, }; vtd_page_walk(&ce, 0, ~0ULL, &info); -- cgit v1.2.3 From d118c06ebbee2d23ddf873cae4a809311aa61310 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:15 +0800 Subject: intel-iommu: trace domain id during page walk This patch only modifies the trace points. Previously we were tracing page walk levels, which are redundant since we already have the page mask (size). Now we trace something much more useful: the domain ID of the page walk. That can be very useful when we trace more than one device on the same system, so that we can know which map belongs to which domain. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 16 ++++++++++------ hw/i386/trace-events | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index a882894f49..61bb3d31e7 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -756,6 +756,7 @@ typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private); * @notify_unmap: whether we should notify invalid entries * @as: VT-d address space of the device * @aw: maximum address width + * @domain_id: domain ID of the page walk */ typedef struct { VTDAddressSpace *as; @@ -763,17 +764,18 @@ typedef struct { void *private; bool notify_unmap; uint8_t aw; + uint16_t domain_id; } vtd_page_walk_info; -static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level, - vtd_page_walk_info *info) +static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) { vtd_page_walk_hook hook_fn = info->hook_fn; void *private = info->private; assert(hook_fn); - trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr, - entry->addr_mask, entry->perm); + trace_vtd_page_walk_one(info->domain_id, entry->iova, + entry->translated_addr, entry->addr_mask, + entry->perm); return hook_fn(entry, private); } @@ -844,7 +846,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, trace_vtd_page_walk_skip_perm(iova, iova_next); goto next; } - ret = vtd_page_walk_one(&entry, level, info); + ret = vtd_page_walk_one(&entry, info); if (ret < 0) { return ret; } @@ -856,7 +858,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, * The whole entry is invalid; unmap it all. * Translated address is meaningless, zero it.
 */ entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, level, info); + ret = vtd_page_walk_one(&entry, info); if (ret < 0) { return ret; } @@ -1466,6 +1468,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, .notify_unmap = true, .aw = s->aw_bits, .as = vtd_as, + .domain_id = domain_id, }; /* @@ -2947,6 +2950,7 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) .notify_unmap = false, .aw = s->aw_bits, .as = vtd_as, + .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi), }; vtd_page_walk(&ce, 0, ~0ULL, &info); diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 22d44648af..ca23ba9fad 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -39,7 +39,7 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry" vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 -vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" -- cgit v1.2.3 From eecf5eedbdc0fc04f39abcf3afeedfbf21b25ca4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:16 +0800 Subject: util: implement simple iova tree Introduce a simple iova tree implementation based on GTree. CC: QEMU Stable Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 6 +++ include/qemu/iova-tree.h | 134 +++++++++++++++++++++++++++++++++++++++++++++++ util/Makefile.objs | 1 + util/iova-tree.c | 114 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 255 insertions(+) create mode 100644 include/qemu/iova-tree.h create mode 100644 util/iova-tree.c diff --git a/MAINTAINERS b/MAINTAINERS index e187b1f18f..f07fcee72e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1783,6 +1783,12 @@ F: include/sysemu/replay.h F: docs/replay.txt F: stubs/replay.c +IOVA Tree +M: Peter Xu +S: Maintained +F: include/qemu/iova-tree.h +F: util/iova-tree.c + Usermode Emulation ------------------ Overall diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h new file mode 100644 index 0000000000..b061932097 --- /dev/null +++ b/include/qemu/iova-tree.h @@ -0,0 +1,134 @@ +/* + * A very simple iova tree implementation based on GTree. + * + * Copyright 2018 Red Hat, Inc. + * + * Authors: + * Peter Xu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ */ +#ifndef IOVA_TREE_H +#define IOVA_TREE_H + +/* + * Currently the iova tree only allows keeping range + * information, and no extra user data is allowed for each element. A + * benefit is that we can merge adjacent ranges internally within the + * tree. It can save a lot of memory when the ranges are split but + * mostly contiguous. + * + * Note that the current implementation does not provide any thread + * protection. Callers of the iova tree are responsible + * for thread safety. + */ + +#include "qemu/osdep.h" +#include "exec/memory.h" +#include "exec/hwaddr.h" + +#define IOVA_OK (0) +#define IOVA_ERR_INVALID (-1) /* Invalid parameters */ +#define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ + +typedef struct IOVATree IOVATree; +typedef struct DMAMap { + hwaddr iova; + hwaddr translated_addr; + hwaddr size; /* Inclusive */ + IOMMUAccessFlags perm; +} QEMU_PACKED DMAMap; +typedef gboolean (*iova_tree_iterator)(DMAMap *map); + +/** + * iova_tree_new: + * + * Create a new iova tree. + * + * Returns: the tree pointer when succeeded, or NULL if error. + */ +IOVATree *iova_tree_new(void); + +/** + * iova_tree_insert: + * + * @tree: the iova tree to insert into + * @map: the mapping to insert + * + * Insert an iova range into the tree. If the range overlaps + * an existing one, IOVA_ERR_OVERLAP will be returned. + * + * Return: 0 if succeeded, or <0 if error. + */ +int iova_tree_insert(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_remove: + * + * @tree: the iova tree to remove the range from + * @map: the map range to remove + * + * Remove mappings from the tree that are covered by the map range + * provided. The range does not need to be exactly what was inserted; + * all the mappings that are included in the provided range will be + * removed from the tree. Here map->translated_addr is meaningless. + * + * Return: 0 if succeeded, or <0 if error. + */ +int iova_tree_remove(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_find: + * + * @tree: the iova tree to search from + * @map: the mapping to search + * + * Search for a mapping in the iova tree that overlaps with the + * mapping range specified. Only the first found mapping will be + * returned. + * + * Return: DMAMap pointer if found, or NULL if not found. Note that + * the returned DMAMap pointer is maintained internally. The user should + * only read the content and never modify or free it. Also, the + * user is responsible for making sure the pointer is valid (say, no + * concurrent deletion in progress). + */ +DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map); + +/** + * iova_tree_find_address: + * + * @tree: the iova tree to search from + * @iova: the iova address to find + * + * Similar to iova_tree_find(), but it tries to find the mapping with + * range iova=iova & size=0. + * + * Return: same as iova_tree_find(). + */ +DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova); + +/** + * iova_tree_foreach: + * + * @tree: the iova tree to iterate on + * @iterator: the iterator for the mappings; return true to stop + * + * Iterate over the iova tree. + * + * Return: None. + */ +void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); + +/** + * iova_tree_destroy: + * + * @tree: the iova tree to destroy + * + * Destroy an existing iova tree. + * + * Return: None.
+ */ +void iova_tree_destroy(IOVATree *tree); + +#endif diff --git a/util/Makefile.objs b/util/Makefile.objs index 728c3541db..e1c3fed4dc 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -47,4 +47,5 @@ util-obj-y += qht.o util-obj-y += range.o util-obj-y += stats64.o util-obj-y += systemd.o +util-obj-y += iova-tree.o util-obj-$(CONFIG_LINUX) += vfio-helpers.o diff --git a/util/iova-tree.c b/util/iova-tree.c new file mode 100644 index 0000000000..2d9cebfc89 --- /dev/null +++ b/util/iova-tree.c @@ -0,0 +1,114 @@ +/* + * IOVA tree implementation based on GTree. + * + * Copyright 2018 Red Hat, Inc. + * + * Authors: + * Peter Xu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + */ + +#include <glib.h> +#include "qemu/iova-tree.h" + +struct IOVATree { + GTree *tree; +}; + +static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) +{ + const DMAMap *m1 = a, *m2 = b; + + if (m1->iova > m2->iova + m2->size) { + return 1; + } + + if (m1->iova + m1->size < m2->iova) { + return -1; + } + + /* Overlapped */ + return 0; +} + +IOVATree *iova_tree_new(void) +{ + IOVATree *iova_tree = g_new0(IOVATree, 1); + + /* We don't have values actually, no need to free */ + iova_tree->tree = g_tree_new_full(iova_tree_compare, NULL, g_free, NULL); + + return iova_tree; +} + +DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map) +{ + return g_tree_lookup(tree->tree, map); +} + +DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova) +{ + DMAMap map = { .iova = iova, .size = 0 }; + + return iova_tree_find(tree, &map); +} + +static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range) +{ + /* Key and value share the same range data */ + g_tree_insert(gtree, range, range); +} + +int iova_tree_insert(IOVATree *tree, DMAMap *map) +{ + DMAMap *new; + + if (map->iova + map->size < map->iova || map->perm == IOMMU_NONE) { + return IOVA_ERR_INVALID; + } + + /* We don't allow inserting a range that overlaps with existing ones */ + if (iova_tree_find(tree, map)) { + return IOVA_ERR_OVERLAP; + } + + new = g_new0(DMAMap, 1); + memcpy(new, map, sizeof(*new)); + iova_tree_insert_internal(tree->tree, new); + + return IOVA_OK; +} + +static gboolean iova_tree_traverse(gpointer key, gpointer value, + gpointer data) +{ + iova_tree_iterator iterator = data; + DMAMap *map = key; + + g_assert(key == value); + + return iterator(map); +} + +void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator) +{ + g_tree_foreach(tree->tree, iova_tree_traverse, iterator); +} + +int iova_tree_remove(IOVATree *tree, DMAMap *map) +{ + DMAMap *overlap; + + while ((overlap = iova_tree_find(tree, map))) { + g_tree_remove(tree->tree, overlap); + } + + return IOVA_OK; +} + +void iova_tree_destroy(IOVATree *tree) +{ + g_tree_destroy(tree->tree); + g_free(tree); +} -- cgit v1.2.3 From 63b88968f139b6a77f2f81e6f1eedf70c0170a85 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 18 May 2018 15:25:17 +0800 Subject: intel-iommu: rework the page walk logic This patch fixes a small window during which the DMA page table might be incomplete or invalid when the guest sends domain/context invalidations to a device. This can cause random DMA errors for assigned devices. This is a major change to the VT-d shadow page walking logic. It includes but is not limited to: - For each VTDAddressSpace, we now maintain which IOVA ranges we have mapped and which we have not. With that information, we only send MAP or UNMAP notifications when necessary.
For example, we don't send MAP notifications if we know we have already mapped the range, and we don't send UNMAP notifications if we know we never mapped the range at all. - Introduce vtd_sync_shadow_page_table[_range] APIs so that we can call them from anywhere to resync the shadow page table for a device. - When we receive a domain/context invalidation, we should not run the replay logic; instead we use the new sync shadow page table API to resync the whole shadow page table without unmapping the whole region. After this change, we'll only do the page walk once for each domain invalidation (before this, it could run multiple times, depending on the number of notifiers per address space). While at it, the page walking logic is also refactored to be simpler. CC: QEMU Stable Reported-by: Jintack Lim Tested-by: Jintack Lim Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 213 ++++++++++++++++++++++++++++++------------ hw/i386/trace-events | 3 +- include/hw/i386/intel_iommu.h | 2 + 3 files changed, 159 insertions(+), 59 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 61bb3d31e7..b5a09b7908 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -769,10 +769,77 @@ typedef struct { static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info) { + VTDAddressSpace *as = info->as; vtd_page_walk_hook hook_fn = info->hook_fn; void *private = info->private; + DMAMap target = { + .iova = entry->iova, + .size = entry->addr_mask, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + }; + DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + + if (entry->perm == IOMMU_NONE && !info->notify_unmap) { + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } assert(hook_fn); + + /* Update local IOVA mapped ranges */ + if (entry->perm) { + if (mapped) { + /* If it's exactly the same translation, skip */ + if (!memcmp(mapped, &target, sizeof(target))) { + trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask, + entry->translated_addr); + return 0; + } else { + /* + * Translation changed. Normally this should not + * happen, but it can happen with buggy guest + * OSes. Note that there will be a small window where + * we don't have the map at all. But that's the best + * effort we can do. The ideal way to emulate this is to + * atomically modify the PTE to follow what has + * changed, but we can't. One example is that the vfio + * driver only has VFIO_IOMMU_[UN]MAP_DMA but no + * interface to modify a mapping (meanwhile it seems + * meaningless to even provide one). Anyway, let's + * mark this as a TODO in case one day we'll have + * a better solution.
+ */ + IOMMUAccessFlags cache_perm = entry->perm; + int ret; + + /* Emulate an UNMAP */ + entry->perm = IOMMU_NONE; + trace_vtd_page_walk_one(info->domain_id, + entry->iova, + entry->translated_addr, + entry->addr_mask, + entry->perm); + ret = hook_fn(entry, private); + if (ret) { + return ret; + } + /* Drop any existing mapping */ + iova_tree_remove(as->iova_tree, &target); + /* Recover the correct permission */ + entry->perm = cache_perm; + } + } + iova_tree_insert(as->iova_tree, &target); + } else { + if (!mapped) { + /* Skip since we didn't map this range at all */ + trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); + return 0; + } + iova_tree_remove(as->iova_tree, &target); + } + trace_vtd_page_walk_one(info->domain_id, entry->iova, entry->translated_addr, entry->addr_mask, entry->perm); @@ -834,45 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start, */ entry_valid = read_cur | write_cur; - entry.target_as = &address_space_memory; - entry.iova = iova & subpage_mask; - entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); - entry.addr_mask = ~subpage_mask; - - if (vtd_is_last_slpte(slpte, level)) { - /* NOTE: this is only meaningful if entry_valid == true */ - entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); - if (!entry_valid && !info->notify_unmap) { - trace_vtd_page_walk_skip_perm(iova, iova_next); - goto next; - } - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - if (!entry_valid) { - if (info->notify_unmap) { - /* - * The whole entry is invalid; unmap it all. - * Translated address is meaningless, zero it. - */ - entry.translated_addr = 0x0; - ret = vtd_page_walk_one(&entry, info); - if (ret < 0) { - return ret; - } - } else { - trace_vtd_page_walk_skip_perm(iova, iova_next); - } - goto next; - } + if (!vtd_is_last_slpte(slpte, level) && entry_valid) { + /* + * This is a valid PDE (or even bigger than PDE). We need + * to walk one further level. + */ ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw), iova, MIN(iova_next, end), level - 1, read_cur, write_cur, info); - if (ret < 0) { - return ret; - } + } else { + /* + * This means we are either: + * + * (1) the real page entry (either 4K page, or huge page) + * (2) the whole range is invalid + * + * In either case, we send an IOTLB notification down. + */ + entry.target_as = &address_space_memory; + entry.iova = iova & subpage_mask; + entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur); + entry.addr_mask = ~subpage_mask; + /* NOTE: this is only meaningful if entry_valid == true */ + entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw); + ret = vtd_page_walk_one(&entry, info); + } + + if (ret < 0) { + return ret; } next: @@ -964,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry, + void *private) +{ + memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); + return 0; +} + +/* If context entry is NULL, we'll try to fetch it on our own. 
 */ +static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, + VTDContextEntry *ce, + hwaddr addr, hwaddr size) +{ + IntelIOMMUState *s = vtd_as->iommu_state; + vtd_page_walk_info info = { + .hook_fn = vtd_sync_shadow_page_hook, + .private = (void *)&vtd_as->iommu, + .notify_unmap = true, + .aw = s->aw_bits, + .as = vtd_as, + }; + VTDContextEntry ce_cache; + int ret; + + if (ce) { + /* If the caller provided context entry, use it */ + ce_cache = *ce; + } else { + /* If the caller didn't provide ce, try to fetch */ + ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), + vtd_as->devfn, &ce_cache); + if (ret) { + /* + * This should not really happen, but in case it happens, + * we just skip the sync for this time. After all, we don't + * even have the root table pointer! + */ + trace_vtd_err("Detected invalid context entry when " + "trying to sync shadow page table"); + return 0; + } + } + + info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi); + + return vtd_page_walk(&ce_cache, addr, addr + size, &info); +} + +static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +{ + return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX); +} + /* * Fetch translation type for specific device. Returns <0 if error * happens, otherwise return the shifted type to check against @@ -1296,7 +1404,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) VTDAddressSpace *vtd_as; QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } @@ -1371,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, vtd_switch_address_space(vtd_as); /* * So a device is moving out of (or moving into) a - * domain, a replay() suites here to notify all the - * IOMMU_NOTIFIER_MAP registers about this change. + * domain, resync the shadow page table. * This won't bring bad even if we have no such * notifier registered - the IOMMU notification * framework will skip MAP notifications if that * happened. */ - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } @@ -1436,18 +1543,11 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { - memory_region_iommu_replay_all(&vtd_as->iommu); + vtd_sync_shadow_page_table(vtd_as); } } } -static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry, - void *private) -{ - memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry); - return 0; -} - static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, uint8_t am) @@ -1462,21 +1562,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, vtd_as->devfn, &ce); if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) { if (vtd_as_has_map_notifier(vtd_as)) { - vtd_page_walk_info info = { - .hook_fn = vtd_page_invalidate_notify_hook, - .private = (void *)&vtd_as->iommu, - .notify_unmap = true, - .aw = s->aw_bits, - .as = vtd_as, - .domain_id = domain_id, - }; - /* * As long as we have MAP notifications registered in * any of our IOMMU notifiers, we need to sync the * shadow page table.
*/ - vtd_page_walk(&ce, addr, addr + size, &info); + vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size); } else { /* * For UNMAP-only notifiers, we don't need to walk the @@ -2806,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) vtd_dev_as->devfn = (uint8_t)devfn; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; + vtd_dev_as->iova_tree = iova_tree_new(); /* * Memory region relationships looks like (Address range shows @@ -2858,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) hwaddr start = n->start; hwaddr end = n->end; IntelIOMMUState *s = as->iommu_state; + DMAMap map; /* * Note: all the codes in this function has a assumption that IOVA @@ -2902,6 +2995,10 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) VTD_PCI_FUNC(as->devfn), entry.iova, size); + map.iova = entry.iova; + map.size = entry.addr_mask; + iova_tree_remove(as->iova_tree, &map); + memory_region_notify_one(n, &entry); } diff --git a/hw/i386/trace-events b/hw/i386/trace-events index ca23ba9fad..e14d06ec83 100644 --- a/hw/i386/trace-events +++ b/hw/i386/trace-events @@ -40,8 +40,9 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64 vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d" +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64 +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read" -vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty" vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 156f35e919..fbfedcb1c0 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -27,6 +27,7 @@ #include "hw/i386/ioapic.h" #include "hw/pci/msi.h" #include "hw/sysbus.h" +#include "qemu/iova-tree.h" #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" #define INTEL_IOMMU_DEVICE(obj) \ @@ -95,6 +96,7 @@ struct VTDAddressSpace { QLIST_ENTRY(VTDAddressSpace) next; /* Superset of notifier flags that this address space has */ IOMMUNotifierFlag notifier_flags; + IOVATree *iova_tree; /* Traces mapped IOVA ranges */ }; struct VTDBus { -- cgit v1.2.3
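A note on the IOVATree API that this series introduces: the DMAMap size field is inclusive, so a 4K page at iova 0x1000 is described by size = 0xfff, not 0x1000. The sketch below exercises the API end to end. It is a minimal illustration assuming only the declarations from include/qemu/iova-tree.h shown above; the function name iova_tree_example and all addresses are made up for the demo.

#include "qemu/osdep.h"
#include "qemu/iova-tree.h"

/* Illustrative only: exercise the IOVATree API from the series above. */
static void iova_tree_example(void)
{
    IOVATree *tree = iova_tree_new();
    DMAMap map = {
        .iova = 0x100000,
        .size = 0xfff,               /* inclusive: covers 0x100000..0x100fff */
        .translated_addr = 0x800000,
        .perm = IOMMU_RW,
    };

    /* The first insert succeeds; an overlapping second one is refused */
    g_assert(iova_tree_insert(tree, &map) == IOVA_OK);
    g_assert(iova_tree_insert(tree, &map) == IOVA_ERR_OVERLAP);

    /* A point lookup anywhere inside the range finds the stored mapping */
    g_assert(iova_tree_find_address(tree, 0x100800) != NULL);

    /* Removal takes a range; every mapping it covers is dropped */
    iova_tree_remove(tree, &map);
    g_assert(iova_tree_find_address(tree, 0x100800) == NULL);

    iova_tree_destroy(tree);
}

The inclusive size is also why vtd_page_walk_one can populate DMAMap.size directly from entry->addr_mask: an addr_mask of 0xfff and an inclusive size of 0xfff describe the same 4K extent, so no off-by-one adjustment is needed when the shadow page table ranges are recorded in the per-VTDAddressSpace iova_tree.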