aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2022-02-19 15:24:12 +0000
committerPeter Maydell <peter.maydell@linaro.org>2022-02-19 15:24:12 +0000
commit242f2cae782d433d69d195e14564b6437ec9f7e6 (patch)
tree637e34022e9979c74bdd570d3b9a58f14db76fc9
parent439346ce8fa32095433a9abb2aa3564d11283372 (diff)
parent45b04ef48dbbeb18d93c2631bf5584ac493de749 (diff)
Merge remote-tracking branch 'remotes/dgilbert-gitlab/tags/pull-virtiofs-20220217b' into staging
V3: virtiofs pull 2022-02-17 Security label improvements from Vivek - includes a fix for building against new kernel headers [V3: checkpatch style fixes] [V2: Fix building on old Linux] Blocking flock disable from Sebastian SYNCFS support from Greg Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com> # gpg: Signature made Thu 17 Feb 2022 17:24:25 GMT # gpg: using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7 # gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full] # Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7 * remotes/dgilbert-gitlab/tags/pull-virtiofs-20220217b: virtiofsd: Add basic support for FUSE_SYNCFS request virtiofsd: Add an option to enable/disable security label virtiofsd: Create new file using O_TMPFILE and set security context virtiofsd: Create new file with security context virtiofsd: Add helpers to work with /proc/self/task/tid/attr/fscreate virtiofsd: Move core file creation code in separate function virtiofsd, fuse_lowlevel.c: Add capability to parse security context virtiofsd: Extend size of fuse_conn_info->capable and ->want fields virtiofsd: Parse extended "struct fuse_init_in" linux-headers: Update headers to v5.17-rc1 virtiofsd: Fix breakage due to fuse_init_in size change virtiofsd: Do not support blocking flock Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--docs/tools/virtiofsd.rst32
-rw-r--r--include/standard-headers/asm-x86/kvm_para.h1
-rw-r--r--include/standard-headers/drm/drm_fourcc.h11
-rw-r--r--include/standard-headers/linux/ethtool.h1
-rw-r--r--include/standard-headers/linux/fuse.h60
-rw-r--r--include/standard-headers/linux/pci_regs.h142
-rw-r--r--include/standard-headers/linux/virtio_gpio.h72
-rw-r--r--include/standard-headers/linux/virtio_i2c.h47
-rw-r--r--include/standard-headers/linux/virtio_iommu.h8
-rw-r--r--include/standard-headers/linux/virtio_pcidev.h65
-rw-r--r--include/standard-headers/linux/virtio_scmi.h24
-rw-r--r--linux-headers/asm-generic/unistd.h5
-rw-r--r--linux-headers/asm-mips/unistd_n32.h2
-rw-r--r--linux-headers/asm-mips/unistd_n64.h2
-rw-r--r--linux-headers/asm-mips/unistd_o32.h2
-rw-r--r--linux-headers/asm-powerpc/unistd_32.h2
-rw-r--r--linux-headers/asm-powerpc/unistd_64.h2
-rw-r--r--linux-headers/asm-riscv/bitsperlong.h14
-rw-r--r--linux-headers/asm-riscv/mman.h1
-rw-r--r--linux-headers/asm-riscv/unistd.h44
-rw-r--r--linux-headers/asm-s390/unistd_32.h2
-rw-r--r--linux-headers/asm-s390/unistd_64.h2
-rw-r--r--linux-headers/asm-x86/kvm.h16
-rw-r--r--linux-headers/asm-x86/unistd_32.h1
-rw-r--r--linux-headers/asm-x86/unistd_64.h1
-rw-r--r--linux-headers/asm-x86/unistd_x32.h1
-rw-r--r--linux-headers/linux/kvm.h17
-rw-r--r--tools/virtiofsd/fuse_common.h9
-rw-r--r--tools/virtiofsd/fuse_i.h7
-rw-r--r--tools/virtiofsd/fuse_lowlevel.c180
-rw-r--r--tools/virtiofsd/fuse_lowlevel.h13
-rw-r--r--tools/virtiofsd/helper.c1
-rw-r--r--tools/virtiofsd/passthrough_ll.c467
-rw-r--r--tools/virtiofsd/passthrough_seccomp.c1
34 files changed, 1123 insertions, 132 deletions
diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst
index 07ac0be551..0c0560203c 100644
--- a/docs/tools/virtiofsd.rst
+++ b/docs/tools/virtiofsd.rst
@@ -104,6 +104,13 @@ Options
* posix_acl|no_posix_acl -
Enable/disable posix acl support. Posix ACLs are disabled by default.
+ * security_label|no_security_label -
+ Enable/disable security label support. Security labels are disabled by
+ default. This allows the client to send a MAC label for a file during
+ file creation. Typically this is expected to be an SELinux security
+ label. The server will try to set that label on the newly created file
+ atomically wherever possible.
+
.. option:: --socket-path=PATH
Listen on vhost-user UNIX domain socket at PATH.
@@ -348,6 +355,31 @@ client arguments or lists returned from the host. This stops
the client seeing any 'security.' attributes on the server and
stops it setting any.
+SELinux support
+---------------
+One can enable support for SELinux by running virtiofsd with the option
+"-o security_label". However, this will try to save the guest's security
+context in the xattr security.selinux on the host, and it might fail if the
+host's SELinux policy does not permit virtiofsd to do this operation.
+
+Hence, it is preferable to remap the guest's "security.selinux" xattr to,
+say, "trusted.virtiofs.security.selinux" on the host.
+
+"-o xattrmap=:map:security.selinux:trusted.virtiofs.:"
+
+This makes sure that the guest's and the host's SELinux xattrs on the same
+file remain separate and do not interfere with each other. It also allows
+both host and guest to implement their own separate SELinux policies.
+
+Setting a trusted xattr on the host requires CAP_SYS_ADMIN. So one will need
+to add this capability to the daemon.
+
+"-o modcaps=+sys_admin"
+
+Giving CAP_SYS_ADMIN increases the risk to the system. virtiofsd is then more
+powerful and, if it gets compromised, it can do a lot of damage to the host
+system. So keep this trade-off in mind while making a decision.
+
Examples
--------
diff --git a/include/standard-headers/asm-x86/kvm_para.h b/include/standard-headers/asm-x86/kvm_para.h
index 204cfb8640..f0235e58a1 100644
--- a/include/standard-headers/asm-x86/kvm_para.h
+++ b/include/standard-headers/asm-x86/kvm_para.h
@@ -8,6 +8,7 @@
* should be used to determine that a VM is running under KVM.
*/
#define KVM_CPUID_SIGNATURE 0x40000000
+#define KVM_SIGNATURE "KVMKVMKVM\0\0\0"
/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
* a particular paravirtualization, the appropriate feature bit should
diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h
index 2c025cb4fe..4888f85f69 100644
--- a/include/standard-headers/drm/drm_fourcc.h
+++ b/include/standard-headers/drm/drm_fourcc.h
@@ -313,6 +313,13 @@ extern "C" {
*/
#define DRM_FORMAT_P016 fourcc_code('P', '0', '1', '6') /* 2x2 subsampled Cr:Cb plane 16 bits per channel */
+/* 2 plane YCbCr420.
+ * 3 10 bit components and 2 padding bits packed into 4 bytes.
+ * index 0 = Y plane, [31:0] x:Y2:Y1:Y0 2:10:10:10 little endian
+ * index 1 = Cr:Cb plane, [63:0] x:Cr2:Cb2:Cr1:x:Cb1:Cr0:Cb0 [2:10:10:10:2:10:10:10] little endian
+ */
+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
+
/* 3 plane non-subsampled (444) YCbCr
* 16 bits per component, but only 10 bits are used and 6 bits are padded
* index 0: Y plane, [15:0] Y:x [10:6] little endian
@@ -853,6 +860,10 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier)
* and UV. Some SAND-using hardware stores UV in a separate tiled
* image from Y to reduce the column height, which is not supported
* with these modifiers.
+ *
+ * The DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT modifier is also
+ * supported for DRM_FORMAT_P030 where the columns remain as 128 bytes
+ * wide, but as this is a 10 bpp format that translates to 96 pixels.
*/
#define DRM_FORMAT_MOD_BROADCOM_SAND32_COL_HEIGHT(v) \
diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h
index 688eb8dc39..38d5a4cd6e 100644
--- a/include/standard-headers/linux/ethtool.h
+++ b/include/standard-headers/linux/ethtool.h
@@ -231,6 +231,7 @@ enum tunable_id {
ETHTOOL_RX_COPYBREAK,
ETHTOOL_TX_COPYBREAK,
ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */
+ ETHTOOL_TX_COPYBREAK_BUF_SIZE,
/*
* Add your fresh new tunable attribute above and remember to update
* tunable_strings[] in net/ethtool/common.c
diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h
index 23ea31708b..bda06258be 100644
--- a/include/standard-headers/linux/fuse.h
+++ b/include/standard-headers/linux/fuse.h
@@ -184,6 +184,16 @@
*
* 7.34
* - add FUSE_SYNCFS
+ *
+ * 7.35
+ * - add FOPEN_NOFLUSH
+ *
+ * 7.36
+ * - extend fuse_init_in with reserved fields, add FUSE_INIT_EXT init flag
+ * - add flags2 to fuse_init_in and fuse_init_out
+ * - add FUSE_SECURITY_CTX init flag
+ * - add security context to create, mkdir, symlink, and mknod requests
+ * - add FUSE_HAS_INODE_DAX, FUSE_ATTR_DAX
*/
#ifndef _LINUX_FUSE_H
@@ -215,7 +225,7 @@
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 34
+#define FUSE_KERNEL_MINOR_VERSION 36
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@@ -286,12 +296,14 @@ struct fuse_file_lock {
* FOPEN_NONSEEKABLE: the file is not seekable
* FOPEN_CACHE_DIR: allow caching this directory
* FOPEN_STREAM: the file is stream-like (no file position at all)
+ * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
*/
#define FOPEN_DIRECT_IO (1 << 0)
#define FOPEN_KEEP_CACHE (1 << 1)
#define FOPEN_NONSEEKABLE (1 << 2)
#define FOPEN_CACHE_DIR (1 << 3)
#define FOPEN_STREAM (1 << 4)
+#define FOPEN_NOFLUSH (1 << 5)
/**
* INIT request/reply flags
@@ -332,6 +344,11 @@ struct fuse_file_lock {
* write/truncate sgid is killed only if file has group
* execute permission. (Same as Linux VFS behavior).
* FUSE_SETXATTR_EXT: Server supports extended struct fuse_setxattr_in
+ * FUSE_INIT_EXT: extended fuse_init_in request
+ * FUSE_INIT_RESERVED: reserved, do not use
+ * FUSE_SECURITY_CTX: add security context to create, mkdir, symlink, and
+ * mknod
+ * FUSE_HAS_INODE_DAX: use per inode DAX
*/
#define FUSE_ASYNC_READ (1 << 0)
#define FUSE_POSIX_LOCKS (1 << 1)
@@ -363,6 +380,11 @@ struct fuse_file_lock {
#define FUSE_SUBMOUNTS (1 << 27)
#define FUSE_HANDLE_KILLPRIV_V2 (1 << 28)
#define FUSE_SETXATTR_EXT (1 << 29)
+#define FUSE_INIT_EXT (1 << 30)
+#define FUSE_INIT_RESERVED (1 << 31)
+/* bits 32..63 get shifted down 32 bits into the flags2 field */
+#define FUSE_SECURITY_CTX (1ULL << 32)
+#define FUSE_HAS_INODE_DAX (1ULL << 33)
/**
* CUSE INIT request/reply flags
@@ -445,8 +467,10 @@ struct fuse_file_lock {
* fuse_attr flags
*
* FUSE_ATTR_SUBMOUNT: Object is a submount root
+ * FUSE_ATTR_DAX: Enable DAX for this file in per inode DAX mode
*/
#define FUSE_ATTR_SUBMOUNT (1 << 0)
+#define FUSE_ATTR_DAX (1 << 1)
/**
* Open flags
@@ -732,6 +756,8 @@ struct fuse_init_in {
uint32_t minor;
uint32_t max_readahead;
uint32_t flags;
+ uint32_t flags2;
+ uint32_t unused[11];
};
#define FUSE_COMPAT_INIT_OUT_SIZE 8
@@ -748,7 +774,8 @@ struct fuse_init_out {
uint32_t time_gran;
uint16_t max_pages;
uint16_t map_alignment;
- uint32_t unused[8];
+ uint32_t flags2;
+ uint32_t unused[7];
};
#define CUSE_INIT_INFO_MAX 4096
@@ -856,9 +883,12 @@ struct fuse_dirent {
char name[];
};
-#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
-#define FUSE_DIRENT_ALIGN(x) \
+/* Align variable length records to 64bit boundary */
+#define FUSE_REC_ALIGN(x) \
(((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
+
+#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
+#define FUSE_DIRENT_ALIGN(x) FUSE_REC_ALIGN(x)
#define FUSE_DIRENT_SIZE(d) \
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
@@ -975,4 +1005,26 @@ struct fuse_syncfs_in {
uint64_t padding;
};
+/*
+ * For each security context, send fuse_secctx with size of security context
+ * fuse_secctx will be followed by security context name and this in turn
+ * will be followed by actual context label.
+ * fuse_secctx, name, context
+ */
+struct fuse_secctx {
+ uint32_t size;
+ uint32_t padding;
+};
+
+/*
+ * Contains the information about how many fuse_secctx structures are being
+ * sent and what's the total size of all security contexts (including
+ * size of fuse_secctx_header).
+ *
+ */
+struct fuse_secctx_header {
+ uint32_t size;
+ uint32_t nr_secctx;
+};
+
#endif /* _LINUX_FUSE_H */
diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
index ff6ccbc6ef..bee1a9ed6e 100644
--- a/include/standard-headers/linux/pci_regs.h
+++ b/include/standard-headers/linux/pci_regs.h
@@ -301,23 +301,23 @@
#define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */
#define PCI_SID_CHASSIS_NR 3 /* Chassis Number */
-/* Message Signalled Interrupt registers */
+/* Message Signaled Interrupt registers */
-#define PCI_MSI_FLAGS 2 /* Message Control */
+#define PCI_MSI_FLAGS 0x02 /* Message Control */
#define PCI_MSI_FLAGS_ENABLE 0x0001 /* MSI feature enabled */
#define PCI_MSI_FLAGS_QMASK 0x000e /* Maximum queue size available */
#define PCI_MSI_FLAGS_QSIZE 0x0070 /* Message queue size configured */
#define PCI_MSI_FLAGS_64BIT 0x0080 /* 64-bit addresses allowed */
#define PCI_MSI_FLAGS_MASKBIT 0x0100 /* Per-vector masking capable */
#define PCI_MSI_RFU 3 /* Rest of capability flags */
-#define PCI_MSI_ADDRESS_LO 4 /* Lower 32 bits */
-#define PCI_MSI_ADDRESS_HI 8 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
-#define PCI_MSI_DATA_32 8 /* 16 bits of data for 32-bit devices */
-#define PCI_MSI_MASK_32 12 /* Mask bits register for 32-bit devices */
-#define PCI_MSI_PENDING_32 16 /* Pending intrs for 32-bit devices */
-#define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */
-#define PCI_MSI_MASK_64 16 /* Mask bits register for 64-bit devices */
-#define PCI_MSI_PENDING_64 20 /* Pending intrs for 64-bit devices */
+#define PCI_MSI_ADDRESS_LO 0x04 /* Lower 32 bits */
+#define PCI_MSI_ADDRESS_HI 0x08 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
+#define PCI_MSI_DATA_32 0x08 /* 16 bits of data for 32-bit devices */
+#define PCI_MSI_MASK_32 0x0c /* Mask bits register for 32-bit devices */
+#define PCI_MSI_PENDING_32 0x10 /* Pending intrs for 32-bit devices */
+#define PCI_MSI_DATA_64 0x0c /* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_64 0x10 /* Mask bits register for 64-bit devices */
+#define PCI_MSI_PENDING_64 0x14 /* Pending intrs for 64-bit devices */
/* MSI-X registers (in MSI-X capability) */
#define PCI_MSIX_FLAGS 2 /* Message Control */
@@ -335,10 +335,10 @@
/* MSI-X Table entry format (in memory mapped by a BAR) */
#define PCI_MSIX_ENTRY_SIZE 16
-#define PCI_MSIX_ENTRY_LOWER_ADDR 0 /* Message Address */
-#define PCI_MSIX_ENTRY_UPPER_ADDR 4 /* Message Upper Address */
-#define PCI_MSIX_ENTRY_DATA 8 /* Message Data */
-#define PCI_MSIX_ENTRY_VECTOR_CTRL 12 /* Vector Control */
+#define PCI_MSIX_ENTRY_LOWER_ADDR 0x0 /* Message Address */
+#define PCI_MSIX_ENTRY_UPPER_ADDR 0x4 /* Message Upper Address */
+#define PCI_MSIX_ENTRY_DATA 0x8 /* Message Data */
+#define PCI_MSIX_ENTRY_VECTOR_CTRL 0xc /* Vector Control */
#define PCI_MSIX_ENTRY_CTRL_MASKBIT 0x00000001
/* CompactPCI Hotswap Register */
@@ -470,7 +470,7 @@
/* PCI Express capability registers */
-#define PCI_EXP_FLAGS 2 /* Capabilities register */
+#define PCI_EXP_FLAGS 0x02 /* Capabilities register */
#define PCI_EXP_FLAGS_VERS 0x000f /* Capability version */
#define PCI_EXP_FLAGS_TYPE 0x00f0 /* Device/Port type */
#define PCI_EXP_TYPE_ENDPOINT 0x0 /* Express Endpoint */
@@ -484,7 +484,7 @@
#define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */
#define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */
#define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */
-#define PCI_EXP_DEVCAP 4 /* Device capabilities */
+#define PCI_EXP_DEVCAP 0x04 /* Device capabilities */
#define PCI_EXP_DEVCAP_PAYLOAD 0x00000007 /* Max_Payload_Size */
#define PCI_EXP_DEVCAP_PHANTOM 0x00000018 /* Phantom functions */
#define PCI_EXP_DEVCAP_EXT_TAG 0x00000020 /* Extended tags */
@@ -497,7 +497,7 @@
#define PCI_EXP_DEVCAP_PWR_VAL 0x03fc0000 /* Slot Power Limit Value */
#define PCI_EXP_DEVCAP_PWR_SCL 0x0c000000 /* Slot Power Limit Scale */
#define PCI_EXP_DEVCAP_FLR 0x10000000 /* Function Level Reset */
-#define PCI_EXP_DEVCTL 8 /* Device Control */
+#define PCI_EXP_DEVCTL 0x08 /* Device Control */
#define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */
#define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */
#define PCI_EXP_DEVCTL_FERE 0x0004 /* Fatal Error Reporting Enable */
@@ -522,7 +522,7 @@
#define PCI_EXP_DEVCTL_READRQ_2048B 0x4000 /* 2048 Bytes */
#define PCI_EXP_DEVCTL_READRQ_4096B 0x5000 /* 4096 Bytes */
#define PCI_EXP_DEVCTL_BCR_FLR 0x8000 /* Bridge Configuration Retry / FLR */
-#define PCI_EXP_DEVSTA 10 /* Device Status */
+#define PCI_EXP_DEVSTA 0x0a /* Device Status */
#define PCI_EXP_DEVSTA_CED 0x0001 /* Correctable Error Detected */
#define PCI_EXP_DEVSTA_NFED 0x0002 /* Non-Fatal Error Detected */
#define PCI_EXP_DEVSTA_FED 0x0004 /* Fatal Error Detected */
@@ -530,7 +530,7 @@
#define PCI_EXP_DEVSTA_AUXPD 0x0010 /* AUX Power Detected */
#define PCI_EXP_DEVSTA_TRPND 0x0020 /* Transactions Pending */
#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12 /* v1 endpoints without link end here */
-#define PCI_EXP_LNKCAP 12 /* Link Capabilities */
+#define PCI_EXP_LNKCAP 0x0c /* Link Capabilities */
#define PCI_EXP_LNKCAP_SLS 0x0000000f /* Supported Link Speeds */
#define PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001 /* LNKCAP2 SLS Vector bit 0 */
#define PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */
@@ -549,7 +549,7 @@
#define PCI_EXP_LNKCAP_DLLLARC 0x00100000 /* Data Link Layer Link Active Reporting Capable */
#define PCI_EXP_LNKCAP_LBNC 0x00200000 /* Link Bandwidth Notification Capability */
#define PCI_EXP_LNKCAP_PN 0xff000000 /* Port Number */
-#define PCI_EXP_LNKCTL 16 /* Link Control */
+#define PCI_EXP_LNKCTL 0x10 /* Link Control */
#define PCI_EXP_LNKCTL_ASPMC 0x0003 /* ASPM Control */
#define PCI_EXP_LNKCTL_ASPM_L0S 0x0001 /* L0s Enable */
#define PCI_EXP_LNKCTL_ASPM_L1 0x0002 /* L1 Enable */
@@ -562,7 +562,7 @@
#define PCI_EXP_LNKCTL_HAWD 0x0200 /* Hardware Autonomous Width Disable */
#define PCI_EXP_LNKCTL_LBMIE 0x0400 /* Link Bandwidth Management Interrupt Enable */
#define PCI_EXP_LNKCTL_LABIE 0x0800 /* Link Autonomous Bandwidth Interrupt Enable */
-#define PCI_EXP_LNKSTA 18 /* Link Status */
+#define PCI_EXP_LNKSTA 0x12 /* Link Status */
#define PCI_EXP_LNKSTA_CLS 0x000f /* Current Link Speed */
#define PCI_EXP_LNKSTA_CLS_2_5GB 0x0001 /* Current Link Speed 2.5GT/s */
#define PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */
@@ -582,7 +582,7 @@
#define PCI_EXP_LNKSTA_LBMS 0x4000 /* Link Bandwidth Management Status */
#define PCI_EXP_LNKSTA_LABS 0x8000 /* Link Autonomous Bandwidth Status */
#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 20 /* v1 endpoints with link end here */
-#define PCI_EXP_SLTCAP 20 /* Slot Capabilities */
+#define PCI_EXP_SLTCAP 0x14 /* Slot Capabilities */
#define PCI_EXP_SLTCAP_ABP 0x00000001 /* Attention Button Present */
#define PCI_EXP_SLTCAP_PCP 0x00000002 /* Power Controller Present */
#define PCI_EXP_SLTCAP_MRLSP 0x00000004 /* MRL Sensor Present */
@@ -595,7 +595,7 @@
#define PCI_EXP_SLTCAP_EIP 0x00020000 /* Electromechanical Interlock Present */
#define PCI_EXP_SLTCAP_NCCS 0x00040000 /* No Command Completed Support */
#define PCI_EXP_SLTCAP_PSN 0xfff80000 /* Physical Slot Number */
-#define PCI_EXP_SLTCTL 24 /* Slot Control */
+#define PCI_EXP_SLTCTL 0x18 /* Slot Control */
#define PCI_EXP_SLTCTL_ABPE 0x0001 /* Attention Button Pressed Enable */
#define PCI_EXP_SLTCTL_PFDE 0x0002 /* Power Fault Detected Enable */
#define PCI_EXP_SLTCTL_MRLSCE 0x0004 /* MRL Sensor Changed Enable */
@@ -617,7 +617,7 @@
#define PCI_EXP_SLTCTL_EIC 0x0800 /* Electromechanical Interlock Control */
#define PCI_EXP_SLTCTL_DLLSCE 0x1000 /* Data Link Layer State Changed Enable */
#define PCI_EXP_SLTCTL_IBPD_DISABLE 0x4000 /* In-band PD disable */
-#define PCI_EXP_SLTSTA 26 /* Slot Status */
+#define PCI_EXP_SLTSTA 0x1a /* Slot Status */
#define PCI_EXP_SLTSTA_ABP 0x0001 /* Attention Button Pressed */
#define PCI_EXP_SLTSTA_PFD 0x0002 /* Power Fault Detected */
#define PCI_EXP_SLTSTA_MRLSC 0x0004 /* MRL Sensor Changed */
@@ -627,15 +627,15 @@
#define PCI_EXP_SLTSTA_PDS 0x0040 /* Presence Detect State */
#define PCI_EXP_SLTSTA_EIS 0x0080 /* Electromechanical Interlock Status */
#define PCI_EXP_SLTSTA_DLLSC 0x0100 /* Data Link Layer State Changed */
-#define PCI_EXP_RTCTL 28 /* Root Control */
+#define PCI_EXP_RTCTL 0x1c /* Root Control */
#define PCI_EXP_RTCTL_SECEE 0x0001 /* System Error on Correctable Error */
#define PCI_EXP_RTCTL_SENFEE 0x0002 /* System Error on Non-Fatal Error */
#define PCI_EXP_RTCTL_SEFEE 0x0004 /* System Error on Fatal Error */
#define PCI_EXP_RTCTL_PMEIE 0x0008 /* PME Interrupt Enable */
#define PCI_EXP_RTCTL_CRSSVE 0x0010 /* CRS Software Visibility Enable */
-#define PCI_EXP_RTCAP 30 /* Root Capabilities */
+#define PCI_EXP_RTCAP 0x1e /* Root Capabilities */
#define PCI_EXP_RTCAP_CRSVIS 0x0001 /* CRS Software Visibility capability */
-#define PCI_EXP_RTSTA 32 /* Root Status */
+#define PCI_EXP_RTSTA 0x20 /* Root Status */
#define PCI_EXP_RTSTA_PME 0x00010000 /* PME status */
#define PCI_EXP_RTSTA_PENDING 0x00020000 /* PME pending */
/*
@@ -646,7 +646,7 @@
* Use pcie_capability_read_word() and similar interfaces to use them
* safely.
*/
-#define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */
+#define PCI_EXP_DEVCAP2 0x24 /* Device Capabilities 2 */
#define PCI_EXP_DEVCAP2_COMP_TMOUT_DIS 0x00000010 /* Completion Timeout Disable supported */
#define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */
#define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */
@@ -658,7 +658,7 @@
#define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */
#define PCI_EXP_DEVCAP2_OBFF_WAKE 0x00080000 /* Re-use WAKE# for OBFF */
#define PCI_EXP_DEVCAP2_EE_PREFIX 0x00200000 /* End-End TLP Prefix */
-#define PCI_EXP_DEVCTL2 40 /* Device Control 2 */
+#define PCI_EXP_DEVCTL2 0x28 /* Device Control 2 */
#define PCI_EXP_DEVCTL2_COMP_TIMEOUT 0x000f /* Completion Timeout Value */
#define PCI_EXP_DEVCTL2_COMP_TMOUT_DIS 0x0010 /* Completion Timeout Disable */
#define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */
@@ -670,9 +670,9 @@
#define PCI_EXP_DEVCTL2_OBFF_MSGA_EN 0x2000 /* Enable OBFF Message type A */
#define PCI_EXP_DEVCTL2_OBFF_MSGB_EN 0x4000 /* Enable OBFF Message type B */
#define PCI_EXP_DEVCTL2_OBFF_WAKE_EN 0x6000 /* OBFF using WAKE# signaling */
-#define PCI_EXP_DEVSTA2 42 /* Device Status 2 */
-#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints without link end here */
-#define PCI_EXP_LNKCAP2 44 /* Link Capabilities 2 */
+#define PCI_EXP_DEVSTA2 0x2a /* Device Status 2 */
+#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 0x2c /* end of v2 EPs w/o link */
+#define PCI_EXP_LNKCAP2 0x2c /* Link Capabilities 2 */
#define PCI_EXP_LNKCAP2_SLS_2_5GB 0x00000002 /* Supported Speed 2.5GT/s */
#define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5GT/s */
#define PCI_EXP_LNKCAP2_SLS_8_0GB 0x00000008 /* Supported Speed 8GT/s */
@@ -680,7 +680,7 @@
#define PCI_EXP_LNKCAP2_SLS_32_0GB 0x00000020 /* Supported Speed 32GT/s */
#define PCI_EXP_LNKCAP2_SLS_64_0GB 0x00000040 /* Supported Speed 64GT/s */
#define PCI_EXP_LNKCAP2_CROSSLINK 0x00000100 /* Crosslink supported */
-#define PCI_EXP_LNKCTL2 48 /* Link Control 2 */
+#define PCI_EXP_LNKCTL2 0x30 /* Link Control 2 */
#define PCI_EXP_LNKCTL2_TLS 0x000f
#define PCI_EXP_LNKCTL2_TLS_2_5GT 0x0001 /* Supported Speed 2.5GT/s */
#define PCI_EXP_LNKCTL2_TLS_5_0GT 0x0002 /* Supported Speed 5GT/s */
@@ -691,12 +691,12 @@
#define PCI_EXP_LNKCTL2_ENTER_COMP 0x0010 /* Enter Compliance */
#define PCI_EXP_LNKCTL2_TX_MARGIN 0x0380 /* Transmit Margin */
#define PCI_EXP_LNKCTL2_HASD 0x0020 /* HW Autonomous Speed Disable */
-#define PCI_EXP_LNKSTA2 50 /* Link Status 2 */
-#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 52 /* v2 endpoints with link end here */
-#define PCI_EXP_SLTCAP2 52 /* Slot Capabilities 2 */
+#define PCI_EXP_LNKSTA2 0x32 /* Link Status 2 */
+#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 0x34 /* end of v2 EPs w/ link */
+#define PCI_EXP_SLTCAP2 0x34 /* Slot Capabilities 2 */
#define PCI_EXP_SLTCAP2_IBPD 0x00000001 /* In-band PD Disable Supported */
-#define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */
-#define PCI_EXP_SLTSTA2 58 /* Slot Status 2 */
+#define PCI_EXP_SLTCTL2 0x38 /* Slot Control 2 */
+#define PCI_EXP_SLTSTA2 0x3a /* Slot Status 2 */
/* Extended Capabilities (PCI-X 2.0 and Express) */
#define PCI_EXT_CAP_ID(header) (header & 0x0000ffff)
@@ -742,7 +742,7 @@
#define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
/* Advanced Error Reporting */
-#define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */
+#define PCI_ERR_UNCOR_STATUS 0x04 /* Uncorrectable Error Status */
#define PCI_ERR_UNC_UND 0x00000001 /* Undefined */
#define PCI_ERR_UNC_DLP 0x00000010 /* Data Link Protocol */
#define PCI_ERR_UNC_SURPDN 0x00000020 /* Surprise Down */
@@ -760,11 +760,11 @@
#define PCI_ERR_UNC_MCBTLP 0x00800000 /* MC blocked TLP */
#define PCI_ERR_UNC_ATOMEG 0x01000000 /* Atomic egress blocked */
#define PCI_ERR_UNC_TLPPRE 0x02000000 /* TLP prefix blocked */
-#define PCI_ERR_UNCOR_MASK 8 /* Uncorrectable Error Mask */
+#define PCI_ERR_UNCOR_MASK 0x08 /* Uncorrectable Error Mask */
/* Same bits as above */
-#define PCI_ERR_UNCOR_SEVER 12 /* Uncorrectable Error Severity */
+#define PCI_ERR_UNCOR_SEVER 0x0c /* Uncorrectable Error Severity */
/* Same bits as above */
-#define PCI_ERR_COR_STATUS 16 /* Correctable Error Status */
+#define PCI_ERR_COR_STATUS 0x10 /* Correctable Error Status */
#define PCI_ERR_COR_RCVR 0x00000001 /* Receiver Error Status */
#define PCI_ERR_COR_BAD_TLP 0x00000040 /* Bad TLP Status */
#define PCI_ERR_COR_BAD_DLLP 0x00000080 /* Bad DLLP Status */
@@ -773,20 +773,20 @@
#define PCI_ERR_COR_ADV_NFAT 0x00002000 /* Advisory Non-Fatal */
#define PCI_ERR_COR_INTERNAL 0x00004000 /* Corrected Internal */
#define PCI_ERR_COR_LOG_OVER 0x00008000 /* Header Log Overflow */
-#define PCI_ERR_COR_MASK 20 /* Correctable Error Mask */
+#define PCI_ERR_COR_MASK 0x14 /* Correctable Error Mask */
/* Same bits as above */
-#define PCI_ERR_CAP 24 /* Advanced Error Capabilities */
-#define PCI_ERR_CAP_FEP(x) ((x) & 31) /* First Error Pointer */
+#define PCI_ERR_CAP 0x18 /* Advanced Error Capabilities & Ctrl*/
+#define PCI_ERR_CAP_FEP(x) ((x) & 0x1f) /* First Error Pointer */
#define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */
#define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */
#define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */
#define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */
-#define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */
-#define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */
+#define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */
+#define PCI_ERR_ROOT_COMMAND 0x2c /* Root Error Command */
#define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Correctable Err Reporting Enable */
#define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002 /* Non-Fatal Err Reporting Enable */
#define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004 /* Fatal Err Reporting Enable */
-#define PCI_ERR_ROOT_STATUS 48
+#define PCI_ERR_ROOT_STATUS 0x30
#define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */
#define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002 /* Multiple ERR_COR */
#define PCI_ERR_ROOT_UNCOR_RCV 0x00000004 /* ERR_FATAL/NONFATAL */
@@ -795,52 +795,52 @@
#define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */
#define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */
#define PCI_ERR_ROOT_AER_IRQ 0xf8000000 /* Advanced Error Interrupt Message Number */
-#define PCI_ERR_ROOT_ERR_SRC 52 /* Error Source Identification */
+#define PCI_ERR_ROOT_ERR_SRC 0x34 /* Error Source Identification */
/* Virtual Channel */
-#define PCI_VC_PORT_CAP1 4
+#define PCI_VC_PORT_CAP1 0x04
#define PCI_VC_CAP1_EVCC 0x00000007 /* extended VC count */
#define PCI_VC_CAP1_LPEVCC 0x00000070 /* low prio extended VC count */
#define PCI_VC_CAP1_ARB_SIZE 0x00000c00
-#define PCI_VC_PORT_CAP2 8
+#define PCI_VC_PORT_CAP2 0x08
#define PCI_VC_CAP2_32_PHASE 0x00000002
#define PCI_VC_CAP2_64_PHASE 0x00000004
#define PCI_VC_CAP2_128_PHASE 0x00000008
#define PCI_VC_CAP2_ARB_OFF 0xff000000
-#define PCI_VC_PORT_CTRL 12
+#define PCI_VC_PORT_CTRL 0x0c
#define PCI_VC_PORT_CTRL_LOAD_TABLE 0x00000001
-#define PCI_VC_PORT_STATUS 14
+#define PCI_VC_PORT_STATUS 0x0e
#define PCI_VC_PORT_STATUS_TABLE 0x00000001
-#define PCI_VC_RES_CAP 16
+#define PCI_VC_RES_CAP 0x10
#define PCI_VC_RES_CAP_32_PHASE 0x00000002
#define PCI_VC_RES_CAP_64_PHASE 0x00000004
#define PCI_VC_RES_CAP_128_PHASE 0x00000008
#define PCI_VC_RES_CAP_128_PHASE_TB 0x00000010
#define PCI_VC_RES_CAP_256_PHASE 0x00000020
#define PCI_VC_RES_CAP_ARB_OFF 0xff000000
-#define PCI_VC_RES_CTRL 20
+#define PCI_VC_RES_CTRL 0x14
#define PCI_VC_RES_CTRL_LOAD_TABLE 0x00010000
#define PCI_VC_RES_CTRL_ARB_SELECT 0x000e0000
#define PCI_VC_RES_CTRL_ID 0x07000000
#define PCI_VC_RES_CTRL_ENABLE 0x80000000
-#define PCI_VC_RES_STATUS 26
+#define PCI_VC_RES_STATUS 0x1a
#define PCI_VC_RES_STATUS_TABLE 0x00000001
#define PCI_VC_RES_STATUS_NEGO 0x00000002
#define PCI_CAP_VC_BASE_SIZEOF 0x10
-#define PCI_CAP_VC_PER_VC_SIZEOF 0x0C
+#define PCI_CAP_VC_PER_VC_SIZEOF 0x0c
/* Power Budgeting */
-#define PCI_PWR_DSR 4 /* Data Select Register */
-#define PCI_PWR_DATA 8 /* Data Register */
+#define PCI_PWR_DSR 0x04 /* Data Select Register */
+#define PCI_PWR_DATA 0x08 /* Data Register */
#define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */
#define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */
#define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */
#define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
#define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */
#define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */
-#define PCI_PWR_CAP 12 /* Capability */
+#define PCI_PWR_CAP 0x0c /* Capability */
#define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */
-#define PCI_EXT_CAP_PWR_SIZEOF 16
+#define PCI_EXT_CAP_PWR_SIZEOF 0x10
/* Root Complex Event Collector Endpoint Association */
#define PCI_RCEC_RCIEP_BITMAP 4 /* Associated Bitmap for RCiEPs */
@@ -964,7 +964,7 @@
#define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */
#define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */
#define PCI_SRIOV_VFM_AV 0x3 /* Active.Available */
-#define PCI_EXT_CAP_SRIOV_SIZEOF 64
+#define PCI_EXT_CAP_SRIOV_SIZEOF 0x40
#define PCI_LTR_MAX_SNOOP_LAT 0x4
#define PCI_LTR_MAX_NOSNOOP_LAT 0x6
@@ -1017,12 +1017,12 @@
#define PCI_TPH_LOC_NONE 0x000 /* no location */
#define PCI_TPH_LOC_CAP 0x200 /* in capability */
#define PCI_TPH_LOC_MSIX 0x400 /* in MSI-X */
-#define PCI_TPH_CAP_ST_MASK 0x07FF0000 /* st table mask */
-#define PCI_TPH_CAP_ST_SHIFT 16 /* st table shift */
-#define PCI_TPH_BASE_SIZEOF 12 /* size with no st table */
+#define PCI_TPH_CAP_ST_MASK 0x07FF0000 /* ST table mask */
+#define PCI_TPH_CAP_ST_SHIFT 16 /* ST table shift */
+#define PCI_TPH_BASE_SIZEOF 0xc /* size with no ST table */
/* Downstream Port Containment */
-#define PCI_EXP_DPC_CAP 4 /* DPC Capability */
+#define PCI_EXP_DPC_CAP 0x04 /* DPC Capability */
#define PCI_EXP_DPC_IRQ 0x001F /* Interrupt Message Number */
#define PCI_EXP_DPC_CAP_RP_EXT 0x0020 /* Root Port Extensions */
#define PCI_EXP_DPC_CAP_POISONED_TLP 0x0040 /* Poisoned TLP Egress Blocking Supported */
@@ -1030,19 +1030,19 @@
#define PCI_EXP_DPC_RP_PIO_LOG_SIZE 0x0F00 /* RP PIO Log Size */
#define PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000 /* ERR_COR signal on DL_Active supported */
-#define PCI_EXP_DPC_CTL 6 /* DPC control */
+#define PCI_EXP_DPC_CTL 0x06 /* DPC control */
#define PCI_EXP_DPC_CTL_EN_FATAL 0x0001 /* Enable trigger on ERR_FATAL message */
#define PCI_EXP_DPC_CTL_EN_NONFATAL 0x0002 /* Enable trigger on ERR_NONFATAL message */
#define PCI_EXP_DPC_CTL_INT_EN 0x0008 /* DPC Interrupt Enable */
-#define PCI_EXP_DPC_STATUS 8 /* DPC Status */
+#define PCI_EXP_DPC_STATUS 0x08 /* DPC Status */
#define PCI_EXP_DPC_STATUS_TRIGGER 0x0001 /* Trigger Status */
#define PCI_EXP_DPC_STATUS_TRIGGER_RSN 0x0006 /* Trigger Reason */
#define PCI_EXP_DPC_STATUS_INTERRUPT 0x0008 /* Interrupt Status */
#define PCI_EXP_DPC_RP_BUSY 0x0010 /* Root Port Busy */
#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x0060 /* Trig Reason Extension */
-#define PCI_EXP_DPC_SOURCE_ID 10 /* DPC Source Identifier */
+#define PCI_EXP_DPC_SOURCE_ID 0x0A /* DPC Source Identifier */
#define PCI_EXP_DPC_RP_PIO_STATUS 0x0C /* RP PIO Status */
#define PCI_EXP_DPC_RP_PIO_MASK 0x10 /* RP PIO Mask */
@@ -1086,7 +1086,11 @@
/* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */
#define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */
+#define PCI_DVSEC_HEADER1_VID(x) ((x) & 0xffff)
+#define PCI_DVSEC_HEADER1_REV(x) (((x) >> 16) & 0xf)
+#define PCI_DVSEC_HEADER1_LEN(x) (((x) >> 20) & 0xfff)
#define PCI_DVSEC_HEADER2 0x8 /* Designated Vendor-Specific Header2 */
+#define PCI_DVSEC_HEADER2_ID(x) ((x) & 0xffff)
/* Data Link Feature */
#define PCI_DLF_CAP 0x04 /* Capabilities Register */
diff --git a/include/standard-headers/linux/virtio_gpio.h b/include/standard-headers/linux/virtio_gpio.h
new file mode 100644
index 0000000000..2b5cf06349
--- /dev/null
+++ b/include/standard-headers/linux/virtio_gpio.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _LINUX_VIRTIO_GPIO_H
+#define _LINUX_VIRTIO_GPIO_H
+
+#include "standard-headers/linux/types.h"
+
+/* Virtio GPIO Feature bits */
+#define VIRTIO_GPIO_F_IRQ 0
+
+/* Virtio GPIO request types */
+#define VIRTIO_GPIO_MSG_GET_NAMES 0x0001
+#define VIRTIO_GPIO_MSG_GET_DIRECTION 0x0002
+#define VIRTIO_GPIO_MSG_SET_DIRECTION 0x0003
+#define VIRTIO_GPIO_MSG_GET_VALUE 0x0004
+#define VIRTIO_GPIO_MSG_SET_VALUE 0x0005
+#define VIRTIO_GPIO_MSG_IRQ_TYPE 0x0006
+
+/* Possible values of the status field */
+#define VIRTIO_GPIO_STATUS_OK 0x0
+#define VIRTIO_GPIO_STATUS_ERR 0x1
+
+/* Direction types */
+#define VIRTIO_GPIO_DIRECTION_NONE 0x00
+#define VIRTIO_GPIO_DIRECTION_OUT 0x01
+#define VIRTIO_GPIO_DIRECTION_IN 0x02
+
+/* Virtio GPIO IRQ types */
+#define VIRTIO_GPIO_IRQ_TYPE_NONE 0x00
+#define VIRTIO_GPIO_IRQ_TYPE_EDGE_RISING 0x01
+#define VIRTIO_GPIO_IRQ_TYPE_EDGE_FALLING 0x02
+#define VIRTIO_GPIO_IRQ_TYPE_EDGE_BOTH 0x03
+#define VIRTIO_GPIO_IRQ_TYPE_LEVEL_HIGH 0x04
+#define VIRTIO_GPIO_IRQ_TYPE_LEVEL_LOW 0x08
+
+struct virtio_gpio_config {
+ uint16_t ngpio;
+ uint8_t padding[2];
+ uint32_t gpio_names_size;
+};
+
+/* Virtio GPIO Request / Response */
+struct virtio_gpio_request {
+ uint16_t type;
+ uint16_t gpio;
+ uint32_t value;
+};
+
+struct virtio_gpio_response {
+ uint8_t status;
+ uint8_t value;
+};
+
+struct virtio_gpio_response_get_names {
+ uint8_t status;
+ uint8_t value[];
+};
+
+/* Virtio GPIO IRQ Request / Response */
+struct virtio_gpio_irq_request {
+ uint16_t gpio;
+};
+
+struct virtio_gpio_irq_response {
+ uint8_t status;
+};
+
+/* Possible values of the interrupt status field */
+#define VIRTIO_GPIO_IRQ_STATUS_INVALID 0x0
+#define VIRTIO_GPIO_IRQ_STATUS_VALID 0x1
+
+#endif /* _LINUX_VIRTIO_GPIO_H */
diff --git a/include/standard-headers/linux/virtio_i2c.h b/include/standard-headers/linux/virtio_i2c.h
new file mode 100644
index 0000000000..09fa907793
--- /dev/null
+++ b/include/standard-headers/linux/virtio_i2c.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */
+/*
+ * Definitions for virtio I2C Adapter
+ *
+ * Copyright (c) 2021 Intel Corporation. All rights reserved.
+ */
+
+#ifndef _LINUX_VIRTIO_I2C_H
+#define _LINUX_VIRTIO_I2C_H
+
+#include "standard-headers/linux/const.h"
+#include "standard-headers/linux/types.h"
+
+/* Virtio I2C Feature bits */
+#define VIRTIO_I2C_F_ZERO_LENGTH_REQUEST 0
+
+/* The bit 0 of the @virtio_i2c_out_hdr.@flags, used to group the requests */
+#define VIRTIO_I2C_FLAGS_FAIL_NEXT _BITUL(0)
+
+/* The bit 1 of the @virtio_i2c_out_hdr.@flags, used to mark a buffer as read */
+#define VIRTIO_I2C_FLAGS_M_RD _BITUL(1)
+
+/**
+ * struct virtio_i2c_out_hdr - the virtio I2C message OUT header
+ * @addr: the controlled device address
+ * @padding: used to pad to full dword
+ * @flags: used for feature extensibility
+ */
+struct virtio_i2c_out_hdr {
+ uint16_t addr;
+ uint16_t padding;
+ uint32_t flags;
+};
+
+/**
+ * struct virtio_i2c_in_hdr - the virtio I2C message IN header
+ * @status: the processing result from the backend
+ */
+struct virtio_i2c_in_hdr {
+ uint8_t status;
+};
+
+/* The final status written by the device */
+#define VIRTIO_I2C_MSG_OK 0
+#define VIRTIO_I2C_MSG_ERR 1
+
+#endif /* _LINUX_VIRTIO_I2C_H */
diff --git a/include/standard-headers/linux/virtio_iommu.h b/include/standard-headers/linux/virtio_iommu.h
index b9443b83a1..366379c2f0 100644
--- a/include/standard-headers/linux/virtio_iommu.h
+++ b/include/standard-headers/linux/virtio_iommu.h
@@ -16,6 +16,7 @@
#define VIRTIO_IOMMU_F_BYPASS 3
#define VIRTIO_IOMMU_F_PROBE 4
#define VIRTIO_IOMMU_F_MMIO 5
+#define VIRTIO_IOMMU_F_BYPASS_CONFIG 6
struct virtio_iommu_range_64 {
uint64_t start;
@@ -36,6 +37,8 @@ struct virtio_iommu_config {
struct virtio_iommu_range_32 domain_range;
/* Probe buffer size */
uint32_t probe_size;
+ uint8_t bypass;
+ uint8_t reserved[3];
};
/* Request types */
@@ -66,11 +69,14 @@ struct virtio_iommu_req_tail {
uint8_t reserved[3];
};
+#define VIRTIO_IOMMU_ATTACH_F_BYPASS (1 << 0)
+
struct virtio_iommu_req_attach {
struct virtio_iommu_req_head head;
uint32_t domain;
uint32_t endpoint;
- uint8_t reserved[8];
+ uint32_t flags;
+ uint8_t reserved[4];
struct virtio_iommu_req_tail tail;
};
diff --git a/include/standard-headers/linux/virtio_pcidev.h b/include/standard-headers/linux/virtio_pcidev.h
new file mode 100644
index 0000000000..bdf1d062da
--- /dev/null
+++ b/include/standard-headers/linux/virtio_pcidev.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/*
+ * Copyright (C) 2021 Intel Corporation
+ * Author: Johannes Berg <johannes@sipsolutions.net>
+ */
+#ifndef _LINUX_VIRTIO_PCIDEV_H
+#define _LINUX_VIRTIO_PCIDEV_H
+#include "standard-headers/linux/types.h"
+
+/**
+ * enum virtio_pcidev_ops - virtual PCI device operations
+ * @VIRTIO_PCIDEV_OP_RESERVED: reserved to catch errors
+ * @VIRTIO_PCIDEV_OP_CFG_READ: read config space, size is 1, 2, 4 or 8;
+ * the @data field should be filled in by the device (in little endian).
+ * @VIRTIO_PCIDEV_OP_CFG_WRITE: write config space, size is 1, 2, 4 or 8;
+ * the @data field contains the data to write (in little endian).
+ * @VIRTIO_PCIDEV_OP_MMIO_READ: read BAR mem/pio, size can be variable;
+ * the @data field should be filled in by the device (in little endian).
+ * @VIRTIO_PCIDEV_OP_MMIO_WRITE: write BAR mem/pio, size can be variable;
+ * the @data field contains the data to write (in little endian).
+ * @VIRTIO_PCIDEV_OP_MMIO_MEMSET: memset MMIO, size is variable but
+ * the @data field only has one byte (unlike @VIRTIO_PCIDEV_OP_MMIO_WRITE)
+ * @VIRTIO_PCIDEV_OP_INT: legacy INTx# pin interrupt, the addr field is 1-4 for
+ * the number
+ * @VIRTIO_PCIDEV_OP_MSI: MSI(-X) interrupt, this message basically transports
+ * the 16- or 32-bit write that would otherwise be done into memory,
+ * analogous to the write messages (@VIRTIO_PCIDEV_OP_MMIO_WRITE) above
+ * @VIRTIO_PCIDEV_OP_PME: Dummy message whose content is ignored (and should be
+ * all zeroes) to signal the PME# pin.
+ */
+enum virtio_pcidev_ops {
+ VIRTIO_PCIDEV_OP_RESERVED = 0,
+ VIRTIO_PCIDEV_OP_CFG_READ,
+ VIRTIO_PCIDEV_OP_CFG_WRITE,
+ VIRTIO_PCIDEV_OP_MMIO_READ,
+ VIRTIO_PCIDEV_OP_MMIO_WRITE,
+ VIRTIO_PCIDEV_OP_MMIO_MEMSET,
+ VIRTIO_PCIDEV_OP_INT,
+ VIRTIO_PCIDEV_OP_MSI,
+ VIRTIO_PCIDEV_OP_PME,
+};
+
+/**
+ * struct virtio_pcidev_msg - virtio PCI device operation
+ * @op: the operation to do
+ * @bar: the bar (only with BAR read/write messages)
+ * @reserved: reserved
+ * @size: the size of the read/write (in bytes)
+ * @addr: the address to read/write
+ * @data: the data, normally @size long, but just one byte for
+ * %VIRTIO_PCIDEV_OP_MMIO_MEMSET
+ *
+ * Note: the fields are all in native (CPU) endian, however, the
+ * @data values will often be in little endian (see the ops above.)
+ */
+struct virtio_pcidev_msg {
+ uint8_t op;
+ uint8_t bar;
+ uint16_t reserved;
+ uint32_t size;
+ uint64_t addr;
+ uint8_t data[];
+};
+
+#endif /* _LINUX_VIRTIO_PCIDEV_H */
diff --git a/include/standard-headers/linux/virtio_scmi.h b/include/standard-headers/linux/virtio_scmi.h
new file mode 100644
index 0000000000..8f2c305aea
--- /dev/null
+++ b/include/standard-headers/linux/virtio_scmi.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/*
+ * Copyright (C) 2020-2021 OpenSynergy GmbH
+ * Copyright (C) 2021 ARM Ltd.
+ */
+
+#ifndef _LINUX_VIRTIO_SCMI_H
+#define _LINUX_VIRTIO_SCMI_H
+
+#include "standard-headers/linux/virtio_types.h"
+
+/* Device implements some SCMI notifications, or delayed responses. */
+#define VIRTIO_SCMI_F_P2A_CHANNELS 0
+
+/* Device implements any SCMI statistics shared memory region */
+#define VIRTIO_SCMI_F_SHARED_MEMORY 1
+
+/* Virtqueues */
+
+#define VIRTIO_SCMI_VQ_TX 0 /* cmdq */
+#define VIRTIO_SCMI_VQ_RX 1 /* eventq */
+#define VIRTIO_SCMI_VQ_MAX_CNT 2
+
+#endif /* _LINUX_VIRTIO_SCMI_H */
diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h
index 4557a8b608..1c48b0ae3b 100644
--- a/linux-headers/asm-generic/unistd.h
+++ b/linux-headers/asm-generic/unistd.h
@@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
#define __NR_futex_waitv 449
__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
+#define __NR_set_mempolicy_home_node 450
+__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
+
#undef __NR_syscalls
-#define __NR_syscalls 450
+#define __NR_syscalls 451
/*
* 32 bit systems traditionally used different
diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h
index 4b3e7ad1ec..1f14a6fad3 100644
--- a/linux-headers/asm-mips/unistd_n32.h
+++ b/linux-headers/asm-mips/unistd_n32.h
@@ -377,5 +377,7 @@
#define __NR_landlock_add_rule (__NR_Linux + 445)
#define __NR_landlock_restrict_self (__NR_Linux + 446)
#define __NR_process_mrelease (__NR_Linux + 448)
+#define __NR_futex_waitv (__NR_Linux + 449)
+#define __NR_set_mempolicy_home_node (__NR_Linux + 450)
#endif /* _ASM_UNISTD_N32_H */
diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h
index 488d9298d9..e5a8ebec78 100644
--- a/linux-headers/asm-mips/unistd_n64.h
+++ b/linux-headers/asm-mips/unistd_n64.h
@@ -353,5 +353,7 @@
#define __NR_landlock_add_rule (__NR_Linux + 445)
#define __NR_landlock_restrict_self (__NR_Linux + 446)
#define __NR_process_mrelease (__NR_Linux + 448)
+#define __NR_futex_waitv (__NR_Linux + 449)
+#define __NR_set_mempolicy_home_node (__NR_Linux + 450)
#endif /* _ASM_UNISTD_N64_H */
diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h
index f47399870a..871d57168f 100644
--- a/linux-headers/asm-mips/unistd_o32.h
+++ b/linux-headers/asm-mips/unistd_o32.h
@@ -423,5 +423,7 @@
#define __NR_landlock_add_rule (__NR_Linux + 445)
#define __NR_landlock_restrict_self (__NR_Linux + 446)
#define __NR_process_mrelease (__NR_Linux + 448)
+#define __NR_futex_waitv (__NR_Linux + 449)
+#define __NR_set_mempolicy_home_node (__NR_Linux + 450)
#endif /* _ASM_UNISTD_O32_H */
diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h
index 11d54696dc..585c7fefbc 100644
--- a/linux-headers/asm-powerpc/unistd_32.h
+++ b/linux-headers/asm-powerpc/unistd_32.h
@@ -430,6 +430,8 @@
#define __NR_landlock_add_rule 445
#define __NR_landlock_restrict_self 446
#define __NR_process_mrelease 448
+#define __NR_futex_waitv 449
+#define __NR_set_mempolicy_home_node 450
#endif /* _ASM_UNISTD_32_H */
diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h
index cf740bab13..350f7ec0ac 100644
--- a/linux-headers/asm-powerpc/unistd_64.h
+++ b/linux-headers/asm-powerpc/unistd_64.h
@@ -402,6 +402,8 @@
#define __NR_landlock_add_rule 445
#define __NR_landlock_restrict_self 446
#define __NR_process_mrelease 448
+#define __NR_futex_waitv 449
+#define __NR_set_mempolicy_home_node 450
#endif /* _ASM_UNISTD_64_H */
diff --git a/linux-headers/asm-riscv/bitsperlong.h b/linux-headers/asm-riscv/bitsperlong.h
new file mode 100644
index 0000000000..cc5c45a9ce
--- /dev/null
+++ b/linux-headers/asm-riscv/bitsperlong.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Copyright (C) 2015 Regents of the University of California
+ */
+
+#ifndef _ASM_RISCV_BITSPERLONG_H
+#define _ASM_RISCV_BITSPERLONG_H
+
+#define __BITS_PER_LONG (__SIZEOF_POINTER__ * 8)
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* _ASM_RISCV_BITSPERLONG_H */
diff --git a/linux-headers/asm-riscv/mman.h b/linux-headers/asm-riscv/mman.h
new file mode 100644
index 0000000000..8eebf89f5a
--- /dev/null
+++ b/linux-headers/asm-riscv/mman.h
@@ -0,0 +1 @@
+#include <asm-generic/mman.h>
diff --git a/linux-headers/asm-riscv/unistd.h b/linux-headers/asm-riscv/unistd.h
new file mode 100644
index 0000000000..8062996c2d
--- /dev/null
+++ b/linux-headers/asm-riscv/unistd.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2018 David Abdurachmanov <david.abdurachmanov@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifdef __LP64__
+#define __ARCH_WANT_NEW_STAT
+#define __ARCH_WANT_SET_GET_RLIMIT
+#endif /* __LP64__ */
+
+#define __ARCH_WANT_SYS_CLONE3
+
+#include <asm-generic/unistd.h>
+
+/*
+ * Allows the instruction cache to be flushed from userspace. Despite RISC-V
+ * having a direct 'fence.i' instruction available to userspace (which we
+ * can't trap!), that's not actually viable when running on Linux because the
+ * kernel might schedule a process on another hart. There is no way for
+ * userspace to handle this without invoking the kernel (as it doesn't know the
+ * thread->hart mappings), so we've defined a RISC-V specific system call to
+ * flush the instruction cache.
+ *
+ * __NR_riscv_flush_icache is defined to flush the instruction cache over an
+ * address range, with the flush applying to either all threads or just the
+ * caller. We don't currently do anything with the address range, that's just
+ * in there for forwards compatibility.
+ */
+#ifndef __NR_riscv_flush_icache
+#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15)
+#endif
+__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache)
diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h
index 8f97d98128..8e644d65f5 100644
--- a/linux-headers/asm-s390/unistd_32.h
+++ b/linux-headers/asm-s390/unistd_32.h
@@ -420,5 +420,7 @@
#define __NR_landlock_add_rule 445
#define __NR_landlock_restrict_self 446
#define __NR_process_mrelease 448
+#define __NR_futex_waitv 449
+#define __NR_set_mempolicy_home_node 450
#endif /* _ASM_S390_UNISTD_32_H */
diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h
index 021ffc30e6..51da542fec 100644
--- a/linux-headers/asm-s390/unistd_64.h
+++ b/linux-headers/asm-s390/unistd_64.h
@@ -368,5 +368,7 @@
#define __NR_landlock_add_rule 445
#define __NR_landlock_restrict_self 446
#define __NR_process_mrelease 448
+#define __NR_futex_waitv 449
+#define __NR_set_mempolicy_home_node 450
#endif /* _ASM_S390_UNISTD_64_H */
diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h
index 5a776a08f7..2da3316bb5 100644
--- a/linux-headers/asm-x86/kvm.h
+++ b/linux-headers/asm-x86/kvm.h
@@ -373,9 +373,23 @@ struct kvm_debugregs {
__u64 reserved[9];
};
-/* for KVM_CAP_XSAVE */
+/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
struct kvm_xsave {
+ /*
+ * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+ * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * respectively, when invoked on the vm file descriptor.
+ *
+ * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * will always be at least 4096. Currently, it is only greater
+ * than 4096 if a dynamic feature has been enabled with
+ * ``arch_prctl()``, but this may change in the future.
+ *
+ * The offsets of the state save areas in struct kvm_xsave follow
+ * the contents of CPUID leaf 0xD on the host.
+ */
__u32 region[1024];
+ __u32 extra[0];
};
#define KVM_MAX_XCRS 16
diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h
index 9c9ffe312b..87e1e977af 100644
--- a/linux-headers/asm-x86/unistd_32.h
+++ b/linux-headers/asm-x86/unistd_32.h
@@ -440,6 +440,7 @@
#define __NR_memfd_secret 447
#define __NR_process_mrelease 448
#define __NR_futex_waitv 449
+#define __NR_set_mempolicy_home_node 450
#endif /* _ASM_UNISTD_32_H */
diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h
index 084f1eef9c..147a78d623 100644
--- a/linux-headers/asm-x86/unistd_64.h
+++ b/linux-headers/asm-x86/unistd_64.h
@@ -362,6 +362,7 @@
#define __NR_memfd_secret 447
#define __NR_process_mrelease 448
#define __NR_futex_waitv 449
+#define __NR_set_mempolicy_home_node 450
#endif /* _ASM_UNISTD_64_H */
diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h
index a2441affc2..27098db7fb 100644
--- a/linux-headers/asm-x86/unistd_x32.h
+++ b/linux-headers/asm-x86/unistd_x32.h
@@ -315,6 +315,7 @@
#define __NR_memfd_secret (__X32_SYSCALL_BIT + 447)
#define __NR_process_mrelease (__X32_SYSCALL_BIT + 448)
#define __NR_futex_waitv (__X32_SYSCALL_BIT + 449)
+#define __NR_set_mempolicy_home_node (__X32_SYSCALL_BIT + 450)
#define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512)
#define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513)
#define __NR_ioctl (__X32_SYSCALL_BIT + 514)
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 02c5e7b7bb..00af3bc333 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1130,6 +1130,9 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_BINARY_STATS_FD 203
#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
#define KVM_CAP_ARM_MTE 205
+#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
+#define KVM_CAP_VM_GPA_BITS 207
+#define KVM_CAP_XSAVE2 208
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1161,11 +1164,20 @@ struct kvm_irq_routing_hv_sint {
__u32 sint;
};
+struct kvm_irq_routing_xen_evtchn {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+};
+
+#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
+
/* gsi routing entry types */
#define KVM_IRQ_ROUTING_IRQCHIP 1
#define KVM_IRQ_ROUTING_MSI 2
#define KVM_IRQ_ROUTING_S390_ADAPTER 3
#define KVM_IRQ_ROUTING_HV_SINT 4
+#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
struct kvm_irq_routing_entry {
__u32 gsi;
@@ -1177,6 +1189,7 @@ struct kvm_irq_routing_entry {
struct kvm_irq_routing_msi msi;
struct kvm_irq_routing_s390_adapter adapter;
struct kvm_irq_routing_hv_sint hv_sint;
+ struct kvm_irq_routing_xen_evtchn xen_evtchn;
__u32 pad[8];
} u;
};
@@ -1207,6 +1220,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
struct kvm_xen_hvm_config {
__u32 flags;
@@ -1609,6 +1623,9 @@ struct kvm_enc_region {
#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3)
#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4)
+/* Available with KVM_CAP_XSAVE2 */
+#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
+
struct kvm_s390_pv_sec_parm {
__u64 origin;
__u64 length;
diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h
index 0c2665b977..bf46954dab 100644
--- a/tools/virtiofsd/fuse_common.h
+++ b/tools/virtiofsd/fuse_common.h
@@ -378,6 +378,11 @@ struct fuse_file_info {
#define FUSE_CAP_SETXATTR_EXT (1 << 29)
/**
+ * Indicates that file server supports creating file security context
+ */
+#define FUSE_CAP_SECURITY_CTX (1ULL << 32)
+
+/**
* Ioctl flags
*
* FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
@@ -439,7 +444,7 @@ struct fuse_conn_info {
/**
* Capability flags that the kernel supports (read-only)
*/
- unsigned capable;
+ uint64_t capable;
/**
* Capability flags that the filesystem wants to enable.
@@ -447,7 +452,7 @@ struct fuse_conn_info {
* libfuse attempts to initialize this field with
* reasonable default values before calling the init() handler.
*/
- unsigned want;
+ uint64_t want;
/**
* Maximum number of pending "background" requests. A
diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h
index 492e002181..a5572fa4ae 100644
--- a/tools/virtiofsd/fuse_i.h
+++ b/tools/virtiofsd/fuse_i.h
@@ -15,6 +15,12 @@
struct fv_VuDev;
struct fv_QueueInfo;
+struct fuse_security_context {
+ const char *name;
+ uint32_t ctxlen;
+ const void *ctx;
+};
+
struct fuse_req {
struct fuse_session *se;
uint64_t unique;
@@ -35,6 +41,7 @@ struct fuse_req {
} u;
struct fuse_req *next;
struct fuse_req *prev;
+ struct fuse_security_context secctx;
};
struct fuse_notify_req {
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index e4679c73ab..752928741d 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -886,11 +886,63 @@ static void do_readlink(fuse_req_t req, fuse_ino_t nodeid,
}
}
+/*
+ * Parse the optional security-context trailer of a create-style request
+ * and record it in req->secctx.
+ *
+ * Consumed from @iter, in order: a struct fuse_secctx_header, then (only
+ * if nr_secctx is non-zero) a struct fuse_secctx, a NUL-terminated name
+ * string, and fsecctx->size bytes of context data.
+ *
+ * The name and context pointers returned by the iterator are stored in
+ * @req as-is; nothing is copied here.
+ *
+ * Returns 0 on success or when no context was sent (req->secctx is left
+ * untouched in that case), and -EINVAL on truncated or unsupported input.
+ * Note the negative sign: the fuse_reply_err() call sites visible in this
+ * file pass positive errno values.
+ */
+static int parse_secctx_fill_req(fuse_req_t req, struct fuse_mbuf_iter *iter)
+{
+ struct fuse_secctx_header *fsecctx_header;
+ struct fuse_secctx *fsecctx;
+ const void *secctx;
+ const char *name;
+
+ fsecctx_header = fuse_mbuf_iter_advance(iter, sizeof(*fsecctx_header));
+ if (!fsecctx_header) {
+ return -EINVAL;
+ }
+
+ /*
+ * As of now maximum of one security context is supported. It can
+ * change in future though.
+ */
+ if (fsecctx_header->nr_secctx > 1) {
+ return -EINVAL;
+ }
+
+ /* No security context sent. Maybe no LSM supports it */
+ if (!fsecctx_header->nr_secctx) {
+ return 0;
+ }
+
+ fsecctx = fuse_mbuf_iter_advance(iter, sizeof(*fsecctx));
+ if (!fsecctx) {
+ return -EINVAL;
+ }
+
+ /* struct fsecctx with zero sized context is not expected */
+ if (!fsecctx->size) {
+ return -EINVAL;
+ }
+ /* The context name precedes the context payload in the buffer */
+ name = fuse_mbuf_iter_advance_str(iter);
+ if (!name) {
+ return -EINVAL;
+ }
+
+ /* Payload length is taken from the fuse_secctx record read above */
+ secctx = fuse_mbuf_iter_advance(iter, fsecctx->size);
+ if (!secctx) {
+ return -EINVAL;
+ }
+
+ /* Only publish into the request once every piece parsed successfully */
+ req->secctx.name = name;
+ req->secctx.ctx = secctx;
+ req->secctx.ctxlen = fsecctx->size;
+ return 0;
+}
+
static void do_mknod(fuse_req_t req, fuse_ino_t nodeid,
struct fuse_mbuf_iter *iter)
{
struct fuse_mknod_in *arg;
const char *name;
+ bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX;
+ int err;
arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
name = fuse_mbuf_iter_advance_str(iter);
@@ -901,6 +953,14 @@ static void do_mknod(fuse_req_t req, fuse_ino_t nodeid,
req->ctx.umask = arg->umask;
+ if (secctx_enabled) {
+ err = parse_secctx_fill_req(req, iter);
+ if (err) {
+ fuse_reply_err(req, -err);
+ return;
+ }
+ }
+
if (req->se->op.mknod) {
req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev);
} else {
@@ -913,6 +973,8 @@ static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid,
{
struct fuse_mkdir_in *arg;
const char *name;
+ bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX;
+ int err;
arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
name = fuse_mbuf_iter_advance_str(iter);
@@ -923,6 +985,14 @@ static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid,
req->ctx.umask = arg->umask;
+ if (secctx_enabled) {
+ err = parse_secctx_fill_req(req, iter);
+ if (err) {
+ fuse_reply_err(req, -err); /* helper returns -errno; reply wants +errno */
+ return;
+ }
+ }
+
if (req->se->op.mkdir) {
req->se->op.mkdir(req, nodeid, name, arg->mode);
} else {
@@ -969,12 +1039,22 @@ static void do_symlink(fuse_req_t req, fuse_ino_t nodeid,
{
const char *name = fuse_mbuf_iter_advance_str(iter);
const char *linkname = fuse_mbuf_iter_advance_str(iter);
+ bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX;
+ int err;
if (!name || !linkname) {
fuse_reply_err(req, EINVAL);
return;
}
+ if (secctx_enabled) {
+ err = parse_secctx_fill_req(req, iter);
+ if (err) {
+ fuse_reply_err(req, -err); /* helper returns -errno; reply wants +errno */
+ return;
+ }
+ }
+
if (req->se->op.symlink) {
req->se->op.symlink(req, linkname, nodeid, name);
} else {
@@ -1048,6 +1128,8 @@ static void do_link(fuse_req_t req, fuse_ino_t nodeid,
static void do_create(fuse_req_t req, fuse_ino_t nodeid,
struct fuse_mbuf_iter *iter)
{
+ bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX;
+
if (req->se->op.create) {
struct fuse_create_in *arg;
struct fuse_file_info fi;
@@ -1060,6 +1142,15 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid,
return;
}
+ if (secctx_enabled) {
+ int err;
+ err = parse_secctx_fill_req(req, iter);
+ if (err) {
+ fuse_reply_err(req, -err); /* helper returns -errno; reply wants +errno */
+ return;
+ }
+ }
+
memset(&fi, 0, sizeof(fi));
fi.flags = arg->flags;
fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID;
@@ -1876,15 +1967,30 @@ static void do_lseek(fuse_req_t req, fuse_ino_t nodeid,
}
}
+/*
+ * FUSE_SYNCFS request handler: hand the request to the filesystem's
+ * syncfs callback when one is registered, otherwise reply ENOSYS.
+ * The payload iterator is accepted for signature compatibility with the
+ * other handlers but is not read here.
+ */
+static void do_syncfs(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ if (req->se->op.syncfs) {
+ req->se->op.syncfs(req, nodeid);
+ } else {
+ fuse_reply_err(req, ENOSYS);
+ }
+}
+
static void do_init(fuse_req_t req, fuse_ino_t nodeid,
struct fuse_mbuf_iter *iter)
{
size_t compat_size = offsetof(struct fuse_init_in, max_readahead);
+ size_t compat2_size = offsetof(struct fuse_init_in, flags) +
+ sizeof(uint32_t);
+ /* Fuse structure extended with minor version 36 */
+ size_t compat3_size = endof(struct fuse_init_in, unused);
struct fuse_init_in *arg;
struct fuse_init_out outarg;
struct fuse_session *se = req->se;
size_t bufsize = se->bufsize;
size_t outargsize = sizeof(outarg);
+ uint64_t flags = 0;
(void)nodeid;
@@ -1897,15 +2003,29 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
/* ...and now consume the new fields. */
if (arg->major == 7 && arg->minor >= 6) {
- if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) {
+ if (!fuse_mbuf_iter_advance(iter, compat2_size - compat_size)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+ flags |= arg->flags;
+ }
+
+ /*
+ * fuse_init_in was extended again with minor version 36. Read only the
+ * currently known size so future extensions and header rebases do not
+ * break us. Test the local flags: arg->flags is unread before 7.6.
+ */
+ if (sizeof(*arg) > compat2_size && (flags & FUSE_INIT_EXT)) {
+ if (!fuse_mbuf_iter_advance(iter, compat3_size - compat2_size)) {
fuse_reply_err(req, EINVAL);
return;
}
+ flags |= (uint64_t) arg->flags2 << 32;
}
fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor);
if (arg->major == 7 && arg->minor >= 6) {
- fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags);
+ fuse_log(FUSE_LOG_DEBUG, "flags=0x%016llx\n", flags);
fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", arg->max_readahead);
}
se->conn.proto_major = arg->major;
@@ -1933,70 +2053,73 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
if (arg->max_readahead < se->conn.max_readahead) {
se->conn.max_readahead = arg->max_readahead;
}
- if (arg->flags & FUSE_ASYNC_READ) {
+ if (flags & FUSE_ASYNC_READ) {
se->conn.capable |= FUSE_CAP_ASYNC_READ;
}
- if (arg->flags & FUSE_POSIX_LOCKS) {
+ if (flags & FUSE_POSIX_LOCKS) {
se->conn.capable |= FUSE_CAP_POSIX_LOCKS;
}
- if (arg->flags & FUSE_ATOMIC_O_TRUNC) {
+ if (flags & FUSE_ATOMIC_O_TRUNC) {
se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC;
}
- if (arg->flags & FUSE_EXPORT_SUPPORT) {
+ if (flags & FUSE_EXPORT_SUPPORT) {
se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT;
}
- if (arg->flags & FUSE_DONT_MASK) {
+ if (flags & FUSE_DONT_MASK) {
se->conn.capable |= FUSE_CAP_DONT_MASK;
}
- if (arg->flags & FUSE_FLOCK_LOCKS) {
+ if (flags & FUSE_FLOCK_LOCKS) {
se->conn.capable |= FUSE_CAP_FLOCK_LOCKS;
}
- if (arg->flags & FUSE_AUTO_INVAL_DATA) {
+ if (flags & FUSE_AUTO_INVAL_DATA) {
se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA;
}
- if (arg->flags & FUSE_DO_READDIRPLUS) {
+ if (flags & FUSE_DO_READDIRPLUS) {
se->conn.capable |= FUSE_CAP_READDIRPLUS;
}
- if (arg->flags & FUSE_READDIRPLUS_AUTO) {
+ if (flags & FUSE_READDIRPLUS_AUTO) {
se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO;
}
- if (arg->flags & FUSE_ASYNC_DIO) {
+ if (flags & FUSE_ASYNC_DIO) {
se->conn.capable |= FUSE_CAP_ASYNC_DIO;
}
- if (arg->flags & FUSE_WRITEBACK_CACHE) {
+ if (flags & FUSE_WRITEBACK_CACHE) {
se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE;
}
- if (arg->flags & FUSE_NO_OPEN_SUPPORT) {
+ if (flags & FUSE_NO_OPEN_SUPPORT) {
se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT;
}
- if (arg->flags & FUSE_PARALLEL_DIROPS) {
+ if (flags & FUSE_PARALLEL_DIROPS) {
se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS;
}
- if (arg->flags & FUSE_POSIX_ACL) {
+ if (flags & FUSE_POSIX_ACL) {
se->conn.capable |= FUSE_CAP_POSIX_ACL;
}
- if (arg->flags & FUSE_HANDLE_KILLPRIV) {
+ if (flags & FUSE_HANDLE_KILLPRIV) {
se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV;
}
- if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) {
+ if (flags & FUSE_NO_OPENDIR_SUPPORT) {
se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT;
}
- if (!(arg->flags & FUSE_MAX_PAGES)) {
+ if (!(flags & FUSE_MAX_PAGES)) {
size_t max_bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() +
FUSE_BUFFER_HEADER_SIZE;
if (bufsize > max_bufsize) {
bufsize = max_bufsize;
}
}
- if (arg->flags & FUSE_SUBMOUNTS) {
+ if (flags & FUSE_SUBMOUNTS) {
se->conn.capable |= FUSE_CAP_SUBMOUNTS;
}
- if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) {
+ if (flags & FUSE_HANDLE_KILLPRIV_V2) {
se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2;
}
- if (arg->flags & FUSE_SETXATTR_EXT) {
+ if (flags & FUSE_SETXATTR_EXT) {
se->conn.capable |= FUSE_CAP_SETXATTR_EXT;
}
+ if (flags & FUSE_SECURITY_CTX) {
+ se->conn.capable |= FUSE_CAP_SECURITY_CTX;
+ }
#ifdef HAVE_SPLICE
#ifdef HAVE_VMSPLICE
se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE;
@@ -2051,7 +2174,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
if (se->conn.want & (~se->conn.capable)) {
fuse_log(FUSE_LOG_ERR,
"fuse: error: filesystem requested capabilities "
- "0x%x that are not supported by kernel, aborting.\n",
+ "0x%llx that are not supported by kernel, aborting.\n",
se->conn.want & (~se->conn.capable));
fuse_reply_err(req, EPROTO);
se->error = -EPROTO;
@@ -2062,7 +2185,7 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) {
se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE;
}
- if (arg->flags & FUSE_MAX_PAGES) {
+ if (flags & FUSE_MAX_PAGES) {
outarg.flags |= FUSE_MAX_PAGES;
outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1;
}
@@ -2136,8 +2259,14 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
outarg.flags |= FUSE_SETXATTR_EXT;
}
+ if (se->conn.want & FUSE_CAP_SECURITY_CTX) {
+ /* bits 32..63 get shifted down 32 bits into the flags2 field */
+ outarg.flags2 |= FUSE_SECURITY_CTX >> 32;
+ }
+
fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor);
- fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags);
+ fuse_log(FUSE_LOG_DEBUG, " flags2=0x%08x flags=0x%08x\n", outarg.flags2,
+ outarg.flags);
fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead);
fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write);
fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", outarg.max_background);
@@ -2280,6 +2409,7 @@ static struct {
[FUSE_RENAME2] = { do_rename2, "RENAME2" },
[FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" },
[FUSE_LSEEK] = { do_lseek, "LSEEK" },
+ [FUSE_SYNCFS] = { do_syncfs, "SYNCFS" },
};
#define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0]))
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index c55c0ca2fc..b889dae4de 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -1226,6 +1226,19 @@ struct fuse_lowlevel_ops {
*/
void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
struct fuse_file_info *fi);
+
+ /**
+ * Synchronize file system content
+ *
+ * If this request is answered with an error code of ENOSYS,
+ * this is treated as success and future calls to syncfs() will
+ * succeed automatically without being sent to the filesystem
+ * process.
+ *
+ * @param req request handle
+ * @param ino the inode number
+ */
+ void (*syncfs)(fuse_req_t req, fuse_ino_t ino);
};
/**
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
index a8295d975a..e226fc590f 100644
--- a/tools/virtiofsd/helper.c
+++ b/tools/virtiofsd/helper.c
@@ -187,6 +187,7 @@ void fuse_cmdline_help(void)
" default: no_allow_direct_io\n"
" -o announce_submounts Announce sub-mount points to the guest\n"
" -o posix_acl/no_posix_acl Enable/Disable posix_acl. (default: disabled)\n"
+ " -o security_label/no_security_label Enable/Disable security label. (default: disabled)\n"
);
}
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index b3d0674f6d..dfa2fc250d 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -173,10 +173,15 @@ struct lo_data {
/* An O_PATH file descriptor to /proc/self/fd/ */
int proc_self_fd;
+ /* An O_PATH file descriptor to /proc/self/task/ */
+ int proc_self_task;
int user_killpriv_v2, killpriv_v2;
/* If set, virtiofsd is responsible for setting umask during creation */
bool change_umask;
int user_posix_acl, posix_acl;
+ /* Keeps track if /proc/<pid>/attr/fscreate should be used or not */
+ bool use_fscreate;
+ int user_security_label;
};
static const struct fuse_opt lo_opts[] = {
@@ -211,6 +216,8 @@ static const struct fuse_opt lo_opts[] = {
{ "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
{ "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 },
{ "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 },
+ { "security_label", offsetof(struct lo_data, user_security_label), 1 },
+ { "no_security_label", offsetof(struct lo_data, user_security_label), 0 },
FUSE_OPT_END
};
static bool use_syslog = false;
@@ -230,6 +237,11 @@ static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
static int xattr_map_client(const struct lo_data *lo, const char *client_name,
char **out_name);
+/*
+ * Change the current working directory to @fd and assert that fchdir()
+ * returned 0. Wrapped in do { } while (0) so that the expansion behaves
+ * as a single statement.
+ */
+#define FCHDIR_NOFAIL(fd) do { \
+ int fchdir_res = fchdir(fd); \
+ assert(fchdir_res == 0); \
+ } while (0)
+
static bool is_dot_or_dotdot(const char *name)
{
return name[0] == '.' &&
@@ -257,6 +269,70 @@ static struct lo_data *lo_data(fuse_req_t req)
}
/*
+ * Tries to figure out if /proc/<pid>/attr/fscreate is usable or not. With
+ * selinux=0, read from fscreate returns -EINVAL.
+ *
+ * The probe opens <tid>/attr/fscreate for the calling thread, relative to
+ * lo->proc_self_task, and attempts a single read from it.
+ *
+ * Returns true if both the open and the read succeed, false otherwise.
+ *
+ * TODO: Link with libselinux and use is_selinux_enabled() instead down
+ * the line. It probably will be more reliable indicator.
+ */
+static bool is_fscreate_usable(struct lo_data *lo)
+{
+    char procname[64];
+    int fscreate_fd;
+    /* read() returns ssize_t; an unsigned type cannot represent -1 */
+    ssize_t bytes_read;
+
+    snprintf(procname, sizeof(procname), "%ld/attr/fscreate",
+             syscall(SYS_gettid));
+    fscreate_fd = openat(lo->proc_self_task, procname, O_RDWR);
+    if (fscreate_fd == -1) {
+        return false;
+    }
+
+    /* procname doubles as a scratch buffer; the data read is discarded */
+    bytes_read = read(fscreate_fd, procname, sizeof(procname));
+    close(fscreate_fd);
+    return bytes_read != -1;
+}
+
+/* Helpers to set/reset fscreate */
+
+/*
+ * Open <tid>/attr/fscreate for the calling thread, relative to
+ * lo->proc_self_task, and write the @ctxlen bytes at @ctx into it.
+ *
+ * On success returns 0 and stores the still-open descriptor in *fd.
+ * On failure returns the errno value of the failing openat() or write(),
+ * closes the descriptor if it had been opened, and leaves *fd untouched.
+ */
+static int open_set_proc_fscreate(struct lo_data *lo, const void *ctx,
+                                  size_t ctxlen, int *fd)
+{
+    char procname[64];
+    int fscreate_fd, err;
+    /* write() returns ssize_t; an unsigned type cannot represent -1 */
+    ssize_t written;
+
+    snprintf(procname, sizeof(procname), "%ld/attr/fscreate",
+             syscall(SYS_gettid));
+    fscreate_fd = openat(lo->proc_self_task, procname, O_WRONLY);
+    if (fscreate_fd == -1) {
+        return errno;
+    }
+
+    written = write(fscreate_fd, ctx, ctxlen);
+    if (written == -1) {
+        /* Save errno before close() has a chance to overwrite it */
+        err = errno;
+        close(fscreate_fd);
+        return err;
+    }
+
+    *fd = fscreate_fd;
+    return 0;
+}
+
+/*
+ * Issue a zero-length write on @fd (intended to reset fscreate), log a
+ * warning if that write fails, and then close @fd.
+ */
+static void close_reset_proc_fscreate(int fd)
+{
+    ssize_t rc = write(fd, NULL, 0);
+
+    if (rc == -1) {
+        fuse_log(FUSE_LOG_WARNING, "Failed to reset fscreate. err=%d\n", errno);
+    }
+    close(fd);
+}
+
+/*
* Load capng's state from our saved state if the current thread
* hadn't previously been loaded.
* returns 0 on success
@@ -735,6 +811,17 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn)
fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n");
conn->want &= ~FUSE_CAP_POSIX_ACL;
}
+
+ if (lo->user_security_label == 1) {
+ if (!(conn->capable & FUSE_CAP_SECURITY_CTX)) {
+ fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable security label."
+ " kernel does not support FUSE_SECURITY_CTX capability.\n");
+ }
+ conn->want |= FUSE_CAP_SECURITY_CTX;
+ } else {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling security label\n");
+ conn->want &= ~FUSE_CAP_SECURITY_CTX;
+ }
}
static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
@@ -1284,16 +1371,103 @@ static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask,
}
}
+/*
+ * Set the request's security context as xattr @secctx_name on the already
+ * created entry @name inside @dir.
+ *
+ * Returns 0 if the request carries no security context or the xattr was
+ * set, otherwise the errno of the failing openat() or setxattr().
+ */
+static int do_mknod_symlink_secctx(fuse_req_t req, struct lo_inode *dir,
+                                   const char *name, const char *secctx_name)
+{
+    struct lo_data *lo = lo_data(req);
+    char fd_str[64];
+    int entry_fd;
+    int ret = 0;
+
+    if (req->secctx.ctxlen == 0) {
+        return 0;
+    }
+
+    /* Open newly created element with O_PATH */
+    entry_fd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
+    if (entry_fd == -1) {
+        return errno;
+    }
+
+    /* The entry is addressed by its fd number relative to proc_self_fd */
+    sprintf(fd_str, "%i", entry_fd);
+    FCHDIR_NOFAIL(lo->proc_self_fd);
+    /* Set security context. This is not atomic w.r.t file creation */
+    if (setxattr(fd_str, secctx_name, req->secctx.ctx, req->secctx.ctxlen,
+                 0)) {
+        ret = errno;
+    }
+    FCHDIR_NOFAIL(lo->root.fd);
+    close(entry_fd);
+    return ret;
+}
+
+/*
+ * Create the entry @name in @dir through mknod_wrapper() while running
+ * under the credentials installed by lo_change_cred(), and apply the
+ * request's security context if it carries one.
+ *
+ * The context is applied in one of two ways: written to fscreate before the
+ * entry is created (xattr name not remapped and lo->use_fscreate set), or
+ * set afterwards by do_mknod_symlink_secctx(). In the second case the new
+ * entry is unlinked again if setting the context fails.
+ *
+ * Returns 0 on success, otherwise a non-zero errno-style error value.
+ */
+static int do_mknod_symlink(fuse_req_t req, struct lo_inode *dir,
+ const char *name, mode_t mode, dev_t rdev,
+ const char *link)
+{
+ int err, fscreate_fd = -1;
+ const char *secctx_name = req->secctx.name;
+ struct lo_cred old = {};
+ struct lo_data *lo = lo_data(req);
+ char *mapped_name = NULL;
+ bool secctx_enabled = req->secctx.ctxlen;
+ bool do_fscreate = false;
+
+ /* With an xattr map configured, translate the context's xattr name */
+ if (secctx_enabled && lo->xattrmap) {
+ err = xattr_map_client(lo, req->secctx.name, &mapped_name);
+ if (err < 0) {
+ return -err;
+ }
+ secctx_name = mapped_name;
+ }
+
+ /*
+ * If security xattr has not been remapped and selinux is enabled on
+ * host, set fscreate and no need to do a setxattr() after file creation
+ */
+ if (secctx_enabled && !mapped_name && lo->use_fscreate) {
+ do_fscreate = true;
+ err = open_set_proc_fscreate(lo, req->secctx.ctx, req->secctx.ctxlen,
+ &fscreate_fd);
+ if (err) {
+ goto out;
+ }
+ }
+
+ /* The umask flag is passed as false when creating a symlink */
+ err = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode));
+ if (err) {
+ goto out;
+ }
+
+ err = mknod_wrapper(dir->fd, name, link, mode, rdev);
+ /* Capture errno before lo_restore_cred() can overwrite it */
+ err = err == -1 ? errno : 0;
+ lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode));
+ if (err) {
+ goto out;
+ }
+
+ /* fscreate was not used: set the context on the entry after creation */
+ if (!do_fscreate) {
+ err = do_mknod_symlink_secctx(req, dir, name, secctx_name);
+ if (err) {
+ unlinkat(dir->fd, name, S_ISDIR(mode) ? AT_REMOVEDIR : 0);
+ }
+ }
+out:
+ /* fscreate_fd is only != -1 if open_set_proc_fscreate() succeeded */
+ if (fscreate_fd != -1) {
+ close_reset_proc_fscreate(fscreate_fd);
+ }
+ g_free(mapped_name);
+ return err;
+}
+
static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
const char *name, mode_t mode, dev_t rdev,
const char *link)
{
- int res;
int saverr;
struct lo_data *lo = lo_data(req);
struct lo_inode *dir;
struct fuse_entry_param e;
- struct lo_cred old = {};
if (is_empty(name)) {
fuse_reply_err(req, ENOENT);
@@ -1311,21 +1485,11 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
return;
}
- saverr = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode));
+ saverr = do_mknod_symlink(req, dir, name, mode, rdev, link);
if (saverr) {
goto out;
}
- res = mknod_wrapper(dir->fd, name, link, mode, rdev);
-
- saverr = errno;
-
- lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode));
-
- if (res == -1) {
- goto out;
- }
-
saverr = lo_do_lookup(req, parent, name, &e, NULL);
if (saverr) {
goto out;
@@ -2001,6 +2165,190 @@ static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
return 0;
}
+/*
+ * Open a new file below @parent_inode while running under the credentials
+ * installed by lo_change_cred(); no security context is applied here.
+ *
+ * With @tmpfile set, the file is opened with O_TMPFILE (O_CREAT and O_EXCL
+ * cleared, O_RDWR added if the access mode was O_RDONLY). Otherwise
+ * O_CREAT | O_EXCL is added to fi->flags.
+ *
+ * On success returns 0 and stores the descriptor in *open_fd. On failure
+ * returns the error from lo_change_cred() or the errno of openat() and
+ * leaves *open_fd untouched.
+ */
+static int do_create_nosecctx(fuse_req_t req, struct lo_inode *parent_inode,
+                              const char *name, mode_t mode,
+                              struct fuse_file_info *fi, int *open_fd,
+                              bool tmpfile)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_cred saved_cred = {};
+    int oflags = fi->flags;
+    int new_fd, ret;
+
+    if (!tmpfile) {
+        oflags |= O_CREAT | O_EXCL;
+    } else {
+        /*
+         * Don't use O_EXCL as we want to link file later. Also reset O_CREAT
+         * otherwise openat() returns -EINVAL.
+         */
+        oflags = (oflags | O_TMPFILE) & ~(O_CREAT | O_EXCL);
+
+        /* O_TMPFILE needs either O_RDWR or O_WRONLY */
+        if ((oflags & O_ACCMODE) == O_RDONLY) {
+            oflags |= O_RDWR;
+        }
+    }
+
+    ret = lo_change_cred(req, &saved_cred, lo->change_umask);
+    if (ret) {
+        return ret;
+    }
+
+    /* Try to create a new file but don't open existing files */
+    new_fd = openat(parent_inode->fd, name, oflags, mode);
+    /* errno must be sampled before the credentials are restored */
+    ret = (new_fd == -1) ? errno : 0;
+    lo_restore_cred(&saved_cred, lo->change_umask);
+    if (ret) {
+        return ret;
+    }
+
+    *open_fd = new_fd;
+    return 0;
+}
+
+/*
+ * Create and open @name below @parent_inode with the request's security
+ * context written to fscreate for the duration of the creation.
+ *
+ * fscreate is set through open_set_proc_fscreate() before the file is
+ * created and reset through close_reset_proc_fscreate() afterwards, whether
+ * or not the creation succeeded.
+ *
+ * On success returns 0 and stores the descriptor in *open_fd; otherwise
+ * returns the error of the failing step and leaves *open_fd untouched.
+ */
+static int do_create_secctx_fscreate(fuse_req_t req,
+                                     struct lo_inode *parent_inode,
+                                     const char *name, mode_t mode,
+                                     struct fuse_file_info *fi, int *open_fd)
+{
+    struct lo_data *lo = lo_data(req);
+    int attr_fd = -1;
+    int new_fd = -1;
+    int ret;
+
+    ret = open_set_proc_fscreate(lo, req->secctx.ctx, req->secctx.ctxlen,
+                                 &attr_fd);
+    if (ret) {
+        return ret;
+    }
+
+    ret = do_create_nosecctx(req, parent_inode, name, mode, fi, &new_fd,
+                             false);
+    close_reset_proc_fscreate(attr_fd);
+    if (ret) {
+        return ret;
+    }
+
+    *open_fd = new_fd;
+    return 0;
+}
+
+/*
+ * Create @name below @parent_inode by opening an O_TMPFILE file in the
+ * parent directory, setting the request's security context on it as xattr
+ * @secctx_name, and then linking it into place with linkat().
+ *
+ * On success returns 0 and stores the descriptor in *open_fd. On failure
+ * returns the error of the failing step; a descriptor that was already
+ * opened is closed and *open_fd is left untouched.
+ */
+static int do_create_secctx_tmpfile(fuse_req_t req,
+                                    struct lo_inode *parent_inode,
+                                    const char *name, mode_t mode,
+                                    struct fuse_file_info *fi,
+                                    const char *secctx_name, int *open_fd)
+{
+    struct lo_data *lo = lo_data(req);
+    char fd_str[64];
+    int tmp_fd = -1;
+    int ret;
+
+    ret = do_create_nosecctx(req, parent_inode, ".", mode, fi, &tmp_fd, true);
+    if (ret) {
+        return ret;
+    }
+
+    if (fsetxattr(tmp_fd, secctx_name, req->secctx.ctx, req->secctx.ctxlen,
+                  0)) {
+        ret = errno;
+    } else {
+        /* Security context set on file. Link it in place */
+        sprintf(fd_str, "%d", tmp_fd);
+        FCHDIR_NOFAIL(lo->proc_self_fd);
+        if (linkat(AT_FDCWD, fd_str, parent_inode->fd, name,
+                   AT_SYMLINK_FOLLOW) == -1) {
+            ret = errno;
+        }
+        FCHDIR_NOFAIL(lo->root.fd);
+    }
+
+    if (ret) {
+        close(tmp_fd);
+        return ret;
+    }
+
+    *open_fd = tmp_fd;
+    return 0;
+}
+
+/*
+ * Create and open @name below @parent_inode, then set the request's
+ * security context on it as xattr @secctx_name. The two steps are separate,
+ * so the file briefly exists without the context.
+ *
+ * On success returns 0 and stores the descriptor in *open_fd. If the file
+ * cannot be created, the creation error is returned. If the xattr cannot be
+ * set, the file is closed and unlinked and the fsetxattr() errno is
+ * returned. *open_fd is left untouched on failure.
+ */
+static int do_create_secctx_noatomic(fuse_req_t req,
+                                     struct lo_inode *parent_inode,
+                                     const char *name, mode_t mode,
+                                     struct fuse_file_info *fi,
+                                     const char *secctx_name, int *open_fd)
+{
+    int new_fd = -1;
+    int ret;
+
+    ret = do_create_nosecctx(req, parent_inode, name, mode, fi, &new_fd,
+                             false);
+    if (ret) {
+        return ret;
+    }
+
+    /* Set security context. This is not atomic w.r.t file creation */
+    ret = fsetxattr(new_fd, secctx_name, req->secctx.ctx, req->secctx.ctxlen,
+                    0) == -1 ? errno : 0;
+    if (ret) {
+        /* Undo the creation so no unlabelled file is left behind */
+        close(new_fd);
+        unlinkat(parent_inode->fd, name, 0);
+        return ret;
+    }
+
+    *open_fd = new_fd;
+    return 0;
+}
+
+/*
+ * Create and open @name below @parent_inode, choosing how the request's
+ * security context (if any) is applied.
+ *
+ * Without a security context the file is simply created. With one:
+ *  - if the xattr name has not been remapped and lo->use_fscreate is set,
+ *    fscreate is used to set the context before file creation;
+ *  - otherwise regular files use the O_TMPFILE method, and when that
+ *    reports EOPNOTSUPP the non-atomic method is used instead;
+ *  - everything else uses the non-atomic create-then-setxattr method.
+ *
+ * On success returns 0 with the descriptor stored in *open_fd, otherwise a
+ * non-zero errno-style error value.
+ */
+static int do_lo_create(fuse_req_t req, struct lo_inode *parent_inode,
+                        const char *name, mode_t mode,
+                        struct fuse_file_info *fi, int *open_fd)
+{
+    struct lo_data *lo = lo_data(req);
+    const char *xattr_name = req->secctx.name;
+    char *remapped = NULL;
+    int ret;
+
+    if (!req->secctx.ctxlen) {
+        return do_create_nosecctx(req, parent_inode, name, mode, fi, open_fd,
+                                  false);
+    }
+
+    if (lo->xattrmap) {
+        ret = xattr_map_client(lo, req->secctx.name, &remapped);
+        if (ret < 0) {
+            return -ret;
+        }
+        xattr_name = remapped;
+    }
+
+    if (!remapped && lo->use_fscreate) {
+        ret = do_create_secctx_fscreate(req, parent_inode, name, mode, fi,
+                                        open_fd);
+    } else {
+        bool need_fallback = true;
+
+        if (S_ISREG(mode)) {
+            ret = do_create_secctx_tmpfile(req, parent_inode, name, mode, fi,
+                                           xattr_name, open_fd);
+            /*
+             * If filesystem does not support O_TMPFILE, fallback to
+             * non-atomic method.
+             */
+            need_fallback = (ret == EOPNOTSUPP);
+        }
+
+        if (need_fallback) {
+            ret = do_create_secctx_noatomic(req, parent_inode, name, mode, fi,
+                                            xattr_name, open_fd);
+        }
+    }
+
+    g_free(remapped);
+    return ret;
+}
+
static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
mode_t mode, struct fuse_file_info *fi)
{
@@ -2010,7 +2358,6 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
struct lo_inode *inode = NULL;
struct fuse_entry_param e;
int err;
- struct lo_cred old = {};
fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
" kill_priv=%d\n", parent, name, fi->kill_priv);
@@ -2026,18 +2373,9 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
return;
}
- err = lo_change_cred(req, &old, lo->change_umask);
- if (err) {
- goto out;
- }
-
update_open_flags(lo->writeback, lo->allow_direct_io, fi);
- /* Try to create a new file but don't open existing files */
- fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode);
- err = fd == -1 ? errno : 0;
-
- lo_restore_cred(&old, lo->change_umask);
+ err = do_lo_create(req, parent_inode, name, mode, fi, &fd);
/* Ignore the error if file exists and O_EXCL was not given */
if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
@@ -2467,6 +2805,15 @@ static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
int res;
(void)ino;
+ if (!(op & LOCK_NB)) {
+ /*
+ * Blocking flock can deadlock as there is only one thread
+ * serving the queue.
+ */
+ fuse_reply_err(req, EOPNOTSUPP);
+ return;
+ }
+
res = flock(lo_fi_fd(req, fi), op);
fuse_reply_err(req, res == -1 ? errno : 0);
@@ -2842,11 +3189,6 @@ static int xattr_map_server(const struct lo_data *lo, const char *server_name,
return -ENODATA;
}
-#define FCHDIR_NOFAIL(fd) do { \
- int fchdir_res = fchdir(fd); \
- assert(fchdir_res == 0); \
- } while (0)
-
static bool block_xattr(struct lo_data *lo, const char *name)
{
/*
@@ -3357,6 +3699,49 @@ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
}
}
+/*
+ * Open @inode read-only through lo_inode_open() and call syncfs() on the
+ * resulting descriptor.
+ *
+ * Returns 0 on success, the negated lo_inode_open() result if the open
+ * fails, or the errno of syncfs().
+ */
+static int lo_do_syncfs(struct lo_data *lo, struct lo_inode *inode)
+{
+    int sync_fd, err;
+
+    fuse_log(FUSE_LOG_DEBUG, "lo_do_syncfs(ino=%" PRIu64 ")\n",
+             inode->fuse_ino);
+
+    sync_fd = lo_inode_open(lo, inode, O_RDONLY);
+    if (sync_fd < 0) {
+        return -sync_fd;
+    }
+
+    /* Sample errno before close() can overwrite it */
+    err = syncfs(sync_fd) < 0 ? errno : 0;
+    close(sync_fd);
+    return err;
+}
+
+/*
+ * SYNCFS request handler: look up the inode for @ino, sync it with
+ * lo_do_syncfs() and reply to @req with the result. Replies EBADF when the
+ * inode cannot be found.
+ */
+static void lo_syncfs(fuse_req_t req, fuse_ino_t ino)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode *inode = lo_inode(req, ino);
+    int result = EBADF;
+
+    if (inode) {
+        result = lo_do_syncfs(lo, inode);
+        lo_inode_put(lo, &inode);
+    }
+
+    /*
+     * If submounts aren't announced, the client only sends a request to
+     * sync the root inode. TODO: Track submounts internally and iterate
+     * over them as well.
+     */
+
+    fuse_reply_err(req, result);
+}
+
static void lo_destroy(void *userdata)
{
struct lo_data *lo = (struct lo_data *)userdata;
@@ -3417,6 +3802,7 @@ static struct fuse_lowlevel_ops lo_oper = {
.copy_file_range = lo_copy_file_range,
#endif
.lseek = lo_lseek,
+ .syncfs = lo_syncfs,
.destroy = lo_destroy,
};
@@ -3508,6 +3894,15 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
exit(1);
}
+ /* Get the /proc/self/task descriptor */
+ lo->proc_self_task = open("/proc/self/task/", O_PATH);
+ if (lo->proc_self_task == -1) {
+ fuse_log(FUSE_LOG_ERR, "open(/proc/self/task, O_PATH): %m\n");
+ exit(1);
+ }
+
+ lo->use_fscreate = is_fscreate_usable(lo);
+
/*
* We only need /proc/self/fd. Prevent ".." from accessing parent
* directories of /proc/self/fd by bind-mounting it over /proc. Since / was
@@ -3724,6 +4119,14 @@ static void setup_chroot(struct lo_data *lo)
exit(1);
}
+    /*
+     * Get the /proc/self/task descriptor. The check must test the
+     * descriptor that was just opened (proc_self_task), not proc_self_fd.
+     */
+    lo->proc_self_task = open("/proc/self/task", O_PATH);
+    if (lo->proc_self_task == -1) {
+        fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/task\", O_PATH): %m\n");
+        exit(1);
+    }
+
+ lo->use_fscreate = is_fscreate_usable(lo);
+
/*
* Make the shared directory the file system root so that FUSE_OPEN
* (lo_open()) cannot escape the shared directory by opening a symlink.
@@ -3909,6 +4312,10 @@ static void fuse_lo_data_cleanup(struct lo_data *lo)
close(lo->proc_self_fd);
}
+ if (lo->proc_self_task >= 0) {
+ close(lo->proc_self_task);
+ }
+
if (lo->root.fd >= 0) {
close(lo->root.fd);
}
@@ -3936,8 +4343,10 @@ int main(int argc, char *argv[])
.posix_lock = 0,
.allow_direct_io = 0,
.proc_self_fd = -1,
+ .proc_self_task = -1,
.user_killpriv_v2 = -1,
.user_posix_acl = -1,
+ .user_security_label = -1,
};
struct lo_map_elem *root_elem;
struct lo_map_elem *reserve_elem;
diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c
index 2bc0127b69..888295c073 100644
--- a/tools/virtiofsd/passthrough_seccomp.c
+++ b/tools/virtiofsd/passthrough_seccomp.c
@@ -111,6 +111,7 @@ static const int syscall_allowlist[] = {
SCMP_SYS(set_robust_list),
SCMP_SYS(setxattr),
SCMP_SYS(symlinkat),
+ SCMP_SYS(syncfs),
SCMP_SYS(time), /* Rarely needed, except on static builds */
SCMP_SYS(tgkill),
SCMP_SYS(unlinkat),