diff options
Diffstat (limited to 'hw')
55 files changed, 1186 insertions, 235 deletions
diff --git a/hw/9pfs/9p-local.c b/hw/9pfs/9p-local.c index 210d9e7705..d42ce6d8b8 100644 --- a/hw/9pfs/9p-local.c +++ b/hw/9pfs/9p-local.c @@ -32,10 +32,12 @@ #include "qemu/error-report.h" #include "qemu/option.h" #include <libgen.h> +#ifdef CONFIG_LINUX #include <linux/fs.h> #ifdef CONFIG_LINUX_MAGIC_H #include <linux/magic.h> #endif +#endif #include <sys/ioctl.h> #ifndef XFS_SUPER_MAGIC @@ -560,6 +562,15 @@ again: if (!entry) { return NULL; } +#ifdef CONFIG_DARWIN + int off; + off = telldir(fs->dir.stream); + /* If telldir fails, fail the entire readdir call */ + if (off < 0) { + return NULL; + } + entry->d_seekoff = off; +#endif if (ctx->export_flags & V9FS_SM_MAPPED) { entry->d_type = DT_UNKNOWN; @@ -671,7 +682,7 @@ static int local_mknod(FsContext *fs_ctx, V9fsPath *dir_path, if (fs_ctx->export_flags & V9FS_SM_MAPPED || fs_ctx->export_flags & V9FS_SM_MAPPED_FILE) { - err = mknodat(dirfd, name, fs_ctx->fmode | S_IFREG, 0); + err = qemu_mknodat(dirfd, name, fs_ctx->fmode | S_IFREG, 0); if (err == -1) { goto out; } @@ -686,7 +697,7 @@ static int local_mknod(FsContext *fs_ctx, V9fsPath *dir_path, } } else if (fs_ctx->export_flags & V9FS_SM_PASSTHROUGH || fs_ctx->export_flags & V9FS_SM_NONE) { - err = mknodat(dirfd, name, credp->fc_mode, credp->fc_rdev); + err = qemu_mknodat(dirfd, name, credp->fc_mode, credp->fc_rdev); if (err == -1) { goto out; } @@ -779,16 +790,20 @@ static int local_fstat(FsContext *fs_ctx, int fid_type, mode_t tmp_mode; dev_t tmp_dev; - if (fgetxattr(fd, "user.virtfs.uid", &tmp_uid, sizeof(uid_t)) > 0) { + if (qemu_fgetxattr(fd, "user.virtfs.uid", + &tmp_uid, sizeof(uid_t)) > 0) { stbuf->st_uid = le32_to_cpu(tmp_uid); } - if (fgetxattr(fd, "user.virtfs.gid", &tmp_gid, sizeof(gid_t)) > 0) { + if (qemu_fgetxattr(fd, "user.virtfs.gid", + &tmp_gid, sizeof(gid_t)) > 0) { stbuf->st_gid = le32_to_cpu(tmp_gid); } - if (fgetxattr(fd, "user.virtfs.mode", &tmp_mode, sizeof(mode_t)) > 0) { + if (qemu_fgetxattr(fd, "user.virtfs.mode", + &tmp_mode, sizeof(mode_t)) > 0) { stbuf->st_mode = le32_to_cpu(tmp_mode); } - if (fgetxattr(fd, "user.virtfs.rdev", &tmp_dev, sizeof(dev_t)) > 0) { + if (qemu_fgetxattr(fd, "user.virtfs.rdev", + &tmp_dev, sizeof(dev_t)) > 0) { stbuf->st_rdev = le64_to_cpu(tmp_dev); } } else if (fs_ctx->export_flags & V9FS_SM_MAPPED_FILE) { diff --git a/hw/9pfs/9p-proxy.c b/hw/9pfs/9p-proxy.c index 09bd9f1464..8b4b5cf7dc 100644 --- a/hw/9pfs/9p-proxy.c +++ b/hw/9pfs/9p-proxy.c @@ -123,10 +123,16 @@ static void prstatfs_to_statfs(struct statfs *stfs, ProxyStatFS *prstfs) stfs->f_bavail = prstfs->f_bavail; stfs->f_files = prstfs->f_files; stfs->f_ffree = prstfs->f_ffree; +#ifdef CONFIG_DARWIN + /* f_namelen and f_frsize do not exist on Darwin */ + stfs->f_fsid.val[0] = prstfs->f_fsid[0] & 0xFFFFFFFFU; + stfs->f_fsid.val[1] = prstfs->f_fsid[1] >> 32 & 0xFFFFFFFFU; +#else stfs->f_fsid.__val[0] = prstfs->f_fsid[0] & 0xFFFFFFFFU; stfs->f_fsid.__val[1] = prstfs->f_fsid[1] >> 32 & 0xFFFFFFFFU; stfs->f_namelen = prstfs->f_namelen; stfs->f_frsize = prstfs->f_frsize; +#endif } /* Converts proxy_stat structure to VFS stat structure */ @@ -143,12 +149,24 @@ static void prstat_to_stat(struct stat *stbuf, ProxyStat *prstat) stbuf->st_size = prstat->st_size; stbuf->st_blksize = prstat->st_blksize; stbuf->st_blocks = prstat->st_blocks; + stbuf->st_atime = prstat->st_atim_sec; + stbuf->st_mtime = prstat->st_mtim_sec; + stbuf->st_ctime = prstat->st_ctim_sec; +#ifdef CONFIG_DARWIN + stbuf->st_atimespec.tv_sec = prstat->st_atim_sec; + stbuf->st_mtimespec.tv_sec = prstat->st_mtim_sec; + stbuf->st_ctimespec.tv_sec = prstat->st_ctim_sec; + stbuf->st_atimespec.tv_nsec = prstat->st_atim_nsec; + stbuf->st_mtimespec.tv_nsec = prstat->st_mtim_nsec; + stbuf->st_ctimespec.tv_nsec = prstat->st_ctim_nsec; +#else stbuf->st_atim.tv_sec = prstat->st_atim_sec; + stbuf->st_mtim.tv_sec = prstat->st_mtim_sec; + stbuf->st_ctim.tv_sec = prstat->st_ctim_sec; stbuf->st_atim.tv_nsec = prstat->st_atim_nsec; - stbuf->st_mtime = prstat->st_mtim_sec; stbuf->st_mtim.tv_nsec = prstat->st_mtim_nsec; - stbuf->st_ctime = prstat->st_ctim_sec; stbuf->st_ctim.tv_nsec = prstat->st_ctim_nsec; +#endif } /* @@ -688,7 +706,21 @@ static off_t proxy_telldir(FsContext *ctx, V9fsFidOpenState *fs) static struct dirent *proxy_readdir(FsContext *ctx, V9fsFidOpenState *fs) { - return readdir(fs->dir.stream); + struct dirent *entry; + entry = readdir(fs->dir.stream); +#ifdef CONFIG_DARWIN + if (!entry) { + return NULL; + } + int td; + td = telldir(fs->dir.stream); + /* If telldir fails, fail the entire readdir call */ + if (td < 0) { + return NULL; + } + entry->d_seekoff = td; +#endif + return entry; } static void proxy_seekdir(FsContext *ctx, V9fsFidOpenState *fs, off_t off) diff --git a/hw/9pfs/9p-synth.c b/hw/9pfs/9p-synth.c index 7a7cd5c5ba..b3080e415b 100644 --- a/hw/9pfs/9p-synth.c +++ b/hw/9pfs/9p-synth.c @@ -234,7 +234,11 @@ static void synth_direntry(V9fsSynthNode *node, offsetof(struct dirent, d_name) + sz); memcpy(entry->d_name, node->name, sz); entry->d_ino = node->attr->inode; +#ifdef CONFIG_DARWIN + entry->d_seekoff = off + 1; +#else entry->d_off = off + 1; +#endif } static struct dirent *synth_get_dentry(V9fsSynthNode *dir, @@ -439,7 +443,9 @@ static int synth_statfs(FsContext *s, V9fsPath *fs_path, stbuf->f_bsize = 512; stbuf->f_blocks = 0; stbuf->f_files = synth_node_count; +#ifndef CONFIG_DARWIN stbuf->f_namelen = NAME_MAX; +#endif return 0; } diff --git a/hw/9pfs/9p-util-darwin.c b/hw/9pfs/9p-util-darwin.c new file mode 100644 index 0000000000..bec0253474 --- /dev/null +++ b/hw/9pfs/9p-util-darwin.c @@ -0,0 +1,97 @@ +/* + * 9p utilities (Darwin Implementation) + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/xattr.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "9p-util.h" + +ssize_t fgetxattrat_nofollow(int dirfd, const char *filename, const char *name, + void *value, size_t size) +{ + int ret; + int fd = openat_file(dirfd, filename, + O_RDONLY | O_PATH_9P_UTIL | O_NOFOLLOW, 0); + if (fd == -1) { + return -1; + } + ret = fgetxattr(fd, name, value, size, 0, 0); + close_preserve_errno(fd); + return ret; +} + +ssize_t flistxattrat_nofollow(int dirfd, const char *filename, + char *list, size_t size) +{ + int ret; + int fd = openat_file(dirfd, filename, + O_RDONLY | O_PATH_9P_UTIL | O_NOFOLLOW, 0); + if (fd == -1) { + return -1; + } + ret = flistxattr(fd, list, size, 0); + close_preserve_errno(fd); + return ret; +} + +ssize_t fremovexattrat_nofollow(int dirfd, const char *filename, + const char *name) +{ + int ret; + int fd = openat_file(dirfd, filename, O_PATH_9P_UTIL | O_NOFOLLOW, 0); + if (fd == -1) { + return -1; + } + ret = fremovexattr(fd, name, 0); + close_preserve_errno(fd); + return ret; +} + +int fsetxattrat_nofollow(int dirfd, const char *filename, const char *name, + void *value, size_t size, int flags) +{ + int ret; + int fd = openat_file(dirfd, filename, O_PATH_9P_UTIL | O_NOFOLLOW, 0); + if (fd == -1) { + return -1; + } + ret = fsetxattr(fd, name, value, size, 0, flags); + close_preserve_errno(fd); + return ret; +} + +/* + * As long as mknodat is not available on macOS, this workaround + * using pthread_fchdir_np is needed. + * + * Radar filed with Apple for implementing mknodat: + * rdar://FB9862426 (https://openradar.appspot.com/FB9862426) + */ +#if defined CONFIG_PTHREAD_FCHDIR_NP + +int qemu_mknodat(int dirfd, const char *filename, mode_t mode, dev_t dev) +{ + int preserved_errno, err; + if (!pthread_fchdir_np) { + error_report_once("pthread_fchdir_np() not available on this version of macOS"); + return -ENOTSUP; + } + if (pthread_fchdir_np(dirfd) < 0) { + return -1; + } + err = mknod(filename, mode, dev); + preserved_errno = errno; + /* Stop using the thread-local cwd */ + pthread_fchdir_np(-1); + if (err < 0) { + errno = preserved_errno; + } + return err; +} + +#endif diff --git a/hw/9pfs/9p-util.c b/hw/9pfs/9p-util-linux.c index 3221d9b498..db451b0784 100644 --- a/hw/9pfs/9p-util.c +++ b/hw/9pfs/9p-util-linux.c @@ -1,5 +1,5 @@ /* - * 9p utilities + * 9p utilities (Linux Implementation) * * Copyright IBM, Corp. 2017 * @@ -61,4 +61,10 @@ int fsetxattrat_nofollow(int dirfd, const char *filename, const char *name, ret = lsetxattr(proc_path, name, value, size, flags); g_free(proc_path); return ret; + +} + +int qemu_mknodat(int dirfd, const char *filename, mode_t mode, dev_t dev) +{ + return mknodat(dirfd, filename, mode, dev); } diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index 546f46dc7d..97e681e167 100644 --- a/hw/9pfs/9p-util.h +++ b/hw/9pfs/9p-util.h @@ -19,6 +19,23 @@ #define O_PATH_9P_UTIL 0 #endif +#ifdef CONFIG_DARWIN +#define qemu_fgetxattr(...) fgetxattr(__VA_ARGS__, 0, 0) +#define qemu_lgetxattr(...) getxattr(__VA_ARGS__, 0, XATTR_NOFOLLOW) +#define qemu_llistxattr(...) listxattr(__VA_ARGS__, XATTR_NOFOLLOW) +#define qemu_lremovexattr(...) removexattr(__VA_ARGS__, XATTR_NOFOLLOW) +static inline int qemu_lsetxattr(const char *path, const char *name, + const void *value, size_t size, int flags) { + return setxattr(path, name, value, size, 0, flags | XATTR_NOFOLLOW); +} +#else +#define qemu_fgetxattr fgetxattr +#define qemu_lgetxattr lgetxattr +#define qemu_llistxattr llistxattr +#define qemu_lremovexattr lremovexattr +#define qemu_lsetxattr lsetxattr +#endif + static inline void close_preserve_errno(int fd) { int serrno = errno; @@ -37,10 +54,13 @@ static inline int openat_file(int dirfd, const char *name, int flags, { int fd, serrno, ret; +#ifndef CONFIG_DARWIN again: +#endif fd = openat(dirfd, name, flags | O_NOFOLLOW | O_NOCTTY | O_NONBLOCK, mode); if (fd == -1) { +#ifndef CONFIG_DARWIN if (errno == EPERM && (flags & O_NOATIME)) { /* * The client passed O_NOATIME but we lack permissions to honor it. @@ -53,6 +73,7 @@ again: flags &= ~O_NOATIME; goto again; } +#endif return -1; } @@ -78,4 +99,61 @@ ssize_t flistxattrat_nofollow(int dirfd, const char *filename, ssize_t fremovexattrat_nofollow(int dirfd, const char *filename, const char *name); +/* + * Darwin has d_seekoff, which appears to function similarly to d_off. + * However, it does not appear to be supported on all file systems, + * so ensure it is manually injected earlier and call here when + * needed. + */ +static inline off_t qemu_dirent_off(struct dirent *dent) +{ +#ifdef CONFIG_DARWIN + return dent->d_seekoff; +#else + return dent->d_off; +#endif +} + +/** + * qemu_dirent_dup() - Duplicate directory entry @dent. + * + * @dent: original directory entry to be duplicated + * Return: duplicated directory entry which should be freed with g_free() + * + * It is highly recommended to use this function instead of open coding + * duplication of dirent objects, because the actual struct dirent + * size may be bigger or shorter than sizeof(struct dirent) and correct + * handling is platform specific (see gitlab issue #841). + */ +static inline struct dirent *qemu_dirent_dup(struct dirent *dent) +{ + size_t sz = 0; +#if defined _DIRENT_HAVE_D_RECLEN + /* Avoid use of strlen() if platform supports d_reclen. */ + sz = dent->d_reclen; +#endif + /* + * Test sz for zero even if d_reclen is available + * because some drivers may set d_reclen to zero. + */ + if (sz == 0) { + /* Fallback to the most portable way. */ + sz = offsetof(struct dirent, d_name) + + strlen(dent->d_name) + 1; + } + return g_memdup(dent, sz); +} + +/* + * As long as mknodat is not available on macOS, this workaround + * using pthread_fchdir_np is needed. qemu_mknodat is defined in + * os-posix.c. pthread_fchdir_np is weakly linked here as a guard + * in case it disappears in future macOS versions, because it is + * is a private API. + */ +#if defined CONFIG_DARWIN && defined CONFIG_PTHREAD_FCHDIR_NP +int pthread_fchdir_np(int fd) __attribute__((weak_import)); +#endif +int qemu_mknodat(int dirfd, const char *filename, mode_t mode, dev_t dev); + #endif diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 15b3f4d385..a6d6b3f835 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -27,12 +27,17 @@ #include "virtio-9p.h" #include "fsdev/qemu-fsdev.h" #include "9p-xattr.h" +#include "9p-util.h" #include "coth.h" #include "trace.h" #include "migration/blocker.h" #include "qemu/xxhash.h" #include <math.h> +#ifdef CONFIG_LINUX #include <linux/limits.h> +#else +#include <limits.h> +#endif int open_fd_hw; int total_open_fd; @@ -133,11 +138,20 @@ static int dotl_to_open_flags(int flags) { P9_DOTL_NONBLOCK, O_NONBLOCK } , { P9_DOTL_DSYNC, O_DSYNC }, { P9_DOTL_FASYNC, FASYNC }, +#ifndef CONFIG_DARWIN + { P9_DOTL_NOATIME, O_NOATIME }, + /* + * On Darwin, we could map to F_NOCACHE, which is + * similar, but doesn't quite have the same + * semantics. However, we don't support O_DIRECT + * even on linux at the moment, so we just ignore + * it here. + */ { P9_DOTL_DIRECT, O_DIRECT }, +#endif { P9_DOTL_LARGEFILE, O_LARGEFILE }, { P9_DOTL_DIRECTORY, O_DIRECTORY }, { P9_DOTL_NOFOLLOW, O_NOFOLLOW }, - { P9_DOTL_NOATIME, O_NOATIME }, { P9_DOTL_SYNC, O_SYNC }, }; @@ -166,10 +180,12 @@ static int get_dotl_openflags(V9fsState *s, int oflags) */ flags = dotl_to_open_flags(oflags); flags &= ~(O_NOCTTY | O_ASYNC | O_CREAT); +#ifndef CONFIG_DARWIN /* * Ignore direct disk access hint until the server supports it. */ flags &= ~O_DIRECT; +#endif return flags; } @@ -612,8 +628,8 @@ static inline uint64_t mirror64bit(uint64_t value) ((uint64_t)mirror8bit((value >> 56) & 0xff)); } -/** - * @brief Parameter k for the Exponential Golomb algorihm to be used. +/* + * Parameter k for the Exponential Golomb algorihm to be used. * * The smaller this value, the smaller the minimum bit count for the Exp. * Golomb generated affixes will be (at lowest index) however for the @@ -626,28 +642,30 @@ static inline uint64_t mirror64bit(uint64_t value) * should be small, for a large amount of devices k might be increased * instead. The default of k=0 should be fine for most users though. * - * @b IMPORTANT: In case this ever becomes a runtime parameter; the value of + * IMPORTANT: In case this ever becomes a runtime parameter; the value of * k should not change as long as guest is still running! Because that would * cause completely different inode numbers to be generated on guest. */ #define EXP_GOLOMB_K 0 /** - * @brief Exponential Golomb algorithm for arbitrary k (including k=0). + * expGolombEncode() - Exponential Golomb algorithm for arbitrary k + * (including k=0). + * + * @n: natural number (or index) of the prefix to be generated + * (1, 2, 3, ...) + * @k: parameter k of Exp. Golomb algorithm to be used + * (see comment on EXP_GOLOMB_K macro for details about k) + * Return: prefix for given @n and @k * - * The Exponential Golomb algorithm generates @b prefixes (@b not suffixes!) + * The Exponential Golomb algorithm generates prefixes (NOT suffixes!) * with growing length and with the mathematical property of being * "prefix-free". The latter means the generated prefixes can be prepended * in front of arbitrary numbers and the resulting concatenated numbers are * guaranteed to be always unique. * * This is a minor adjustment to the original Exp. Golomb algorithm in the - * sense that lowest allowed index (@param n) starts with 1, not with zero. - * - * @param n - natural number (or index) of the prefix to be generated - * (1, 2, 3, ...) - * @param k - parameter k of Exp. Golomb algorithm to be used - * (see comment on EXP_GOLOMB_K macro for details about k) + * sense that lowest allowed index (@n) starts with 1, not with zero. */ static VariLenAffix expGolombEncode(uint64_t n, int k) { @@ -661,7 +679,9 @@ static VariLenAffix expGolombEncode(uint64_t n, int k) } /** - * @brief Converts a suffix into a prefix, or a prefix into a suffix. + * invertAffix() - Converts a suffix into a prefix, or a prefix into a suffix. + * @affix: either suffix or prefix to be inverted + * Return: inversion of passed @affix * * Simply mirror all bits of the affix value, for the purpose to preserve * respectively the mathematical "prefix-free" or "suffix-free" property @@ -685,16 +705,16 @@ static VariLenAffix invertAffix(const VariLenAffix *affix) } /** - * @brief Generates suffix numbers with "suffix-free" property. + * affixForIndex() - Generates suffix numbers with "suffix-free" property. + * @index: natural number (or index) of the suffix to be generated + * (1, 2, 3, ...) + * Return: Suffix suitable to assemble unique number. * * This is just a wrapper function on top of the Exp. Golomb algorithm. * * Since the Exp. Golomb algorithm generates prefixes, but we need suffixes, * this function converts the Exp. Golomb prefixes into appropriate suffixes * which are still suitable for generating unique numbers. - * - * @param n - natural number (or index) of the suffix to be generated - * (1, 2, 3, ...) */ static VariLenAffix affixForIndex(uint64_t index) { @@ -794,8 +814,8 @@ static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev) return val->prefix_bits; } -/** - * @brief Slow / full mapping host inode nr -> guest inode nr. +/* + * Slow / full mapping host inode nr -> guest inode nr. * * This function performs a slower and much more costly remapping of an * original file inode number on host to an appropriate different inode @@ -807,7 +827,7 @@ static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev) * qid_path_suffixmap() failed. In practice this slow / full mapping is not * expected ever to be used at all though. * - * @see qid_path_suffixmap() for details + * See qid_path_suffixmap() for details * */ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf, @@ -848,8 +868,8 @@ static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf, return 0; } -/** - * @brief Quick mapping host inode nr -> guest inode nr. +/* + * Quick mapping host inode nr -> guest inode nr. * * This function performs quick remapping of an original file inode number * on host to an appropriate different inode number on guest. This remapping @@ -1265,12 +1285,15 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path, /** - * Convert host filesystem's block size into an appropriate block size for - * 9p client (guest OS side). The value returned suggests an "optimum" block - * size for 9p I/O, i.e. to maximize performance. + * blksize_to_iounit() - Block size exposed to 9p client. + * Return: block size * * @pdu: 9p client request * @blksize: host filesystem's block size + * + * Convert host filesystem's block size into an appropriate block size for + * 9p client (guest OS side). The value returned suggests an "optimum" block + * size for 9p I/O, i.e. to maximize performance. */ static int32_t blksize_to_iounit(const V9fsPDU *pdu, int32_t blksize) { @@ -1309,11 +1332,17 @@ static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf, v9lstat->st_blksize = stat_to_iounit(pdu, stbuf); v9lstat->st_blocks = stbuf->st_blocks; v9lstat->st_atime_sec = stbuf->st_atime; - v9lstat->st_atime_nsec = stbuf->st_atim.tv_nsec; v9lstat->st_mtime_sec = stbuf->st_mtime; - v9lstat->st_mtime_nsec = stbuf->st_mtim.tv_nsec; v9lstat->st_ctime_sec = stbuf->st_ctime; +#ifdef CONFIG_DARWIN + v9lstat->st_atime_nsec = stbuf->st_atimespec.tv_nsec; + v9lstat->st_mtime_nsec = stbuf->st_mtimespec.tv_nsec; + v9lstat->st_ctime_nsec = stbuf->st_ctimespec.tv_nsec; +#else + v9lstat->st_atime_nsec = stbuf->st_atim.tv_nsec; + v9lstat->st_mtime_nsec = stbuf->st_mtim.tv_nsec; v9lstat->st_ctime_nsec = stbuf->st_ctim.tv_nsec; +#endif /* Currently we only support BASIC fields in stat */ v9lstat->st_result_mask = P9_STATS_BASIC; @@ -2271,7 +2300,7 @@ static int coroutine_fn v9fs_do_readdir_with_stat(V9fsPDU *pdu, count += len; v9fs_stat_free(&v9stat); v9fs_path_free(&path); - saved_dir_pos = dent->d_off; + saved_dir_pos = qemu_dirent_off(dent); } v9fs_readdir_unlock(&fidp->fs.dir); @@ -2376,10 +2405,11 @@ out_nofid: } /** - * Returns size required in Rreaddir response for the passed dirent @p name. + * v9fs_readdir_response_size() - Returns size required in Rreaddir response + * for the passed dirent @name. * - * @param name - directory entry's name (i.e. file name, directory name) - * @returns required size in bytes + * @name: directory entry's name (i.e. file name, directory name) + * Return: required size in bytes */ size_t v9fs_readdir_response_size(V9fsString *name) { @@ -2410,6 +2440,7 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, V9fsString name; int len, err = 0; int32_t count = 0; + off_t off; struct dirent *dent; struct stat *st; struct V9fsDirEnt *entries = NULL; @@ -2470,12 +2501,13 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, qid.version = 0; } + off = qemu_dirent_off(dent); v9fs_string_init(&name); v9fs_string_sprintf(&name, "%s", dent->d_name); /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */ len = pdu_marshal(pdu, 11 + count, "Qqbs", - &qid, dent->d_off, + &qid, off, dent->d_type, &name); v9fs_string_free(&name); @@ -3515,9 +3547,15 @@ static int v9fs_fill_statfs(V9fsState *s, V9fsPDU *pdu, struct statfs *stbuf) f_bavail = stbuf->f_bavail / bsize_factor; f_files = stbuf->f_files; f_ffree = stbuf->f_ffree; +#ifdef CONFIG_DARWIN + fsid_val = (unsigned int)stbuf->f_fsid.val[0] | + (unsigned long long)stbuf->f_fsid.val[1] << 32; + f_namelen = NAME_MAX; +#else fsid_val = (unsigned int) stbuf->f_fsid.__val[0] | (unsigned long long)stbuf->f_fsid.__val[1] << 32; f_namelen = stbuf->f_namelen; +#endif return pdu_marshal(pdu, offset, "ddqqqqqqd", f_type, f_bsize, f_blocks, f_bfree, @@ -3919,7 +3957,7 @@ static void coroutine_fn v9fs_xattrcreate(void *opaque) rflags |= XATTR_REPLACE; } - if (size > XATTR_SIZE_MAX) { + if (size > P9_XATTR_SIZE_MAX) { err = -E2BIG; goto out_nofid; } diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h index 1567b67841..af2635fae9 100644 --- a/hw/9pfs/9p.h +++ b/hw/9pfs/9p.h @@ -100,8 +100,8 @@ typedef enum P9ProtoVersion { V9FS_PROTO_2000L = 0x02, } P9ProtoVersion; -/** - * @brief Minimum message size supported by this 9pfs server. +/* + * Minimum message size supported by this 9pfs server. * * A client establishes a session by sending a Tversion request along with a * 'msize' parameter which suggests the server a maximum message size ever to be @@ -231,7 +231,7 @@ static inline void v9fs_readdir_init(P9ProtoVersion proto_version, V9fsDir *dir) } } -/** +/* * Type for 9p fs drivers' (a.k.a. 9p backends) result of readdir requests, * which is a chained list of directory entries. */ @@ -289,8 +289,8 @@ typedef enum AffixType_t { AffixType_Suffix, /* A.k.a. postfix. */ } AffixType_t; -/** - * @brief Unique affix of variable length. +/* + * Unique affix of variable length. * * An affix is (currently) either a suffix or a prefix, which is either * going to be prepended (prefix) or appended (suffix) with some other @@ -304,7 +304,7 @@ typedef struct VariLenAffix { AffixType_t type; /* Whether this affix is a suffix or a prefix. */ uint64_t value; /* Actual numerical value of this affix. */ /* - * Lenght of the affix, that is how many (of the lowest) bits of @c value + * Lenght of the affix, that is how many (of the lowest) bits of ``value`` * must be used for appending/prepending this affix to its final resulting, * unique number. */ @@ -479,4 +479,22 @@ struct V9fsTransport { void (*push_and_notify)(V9fsPDU *pdu); }; +#if defined(XATTR_SIZE_MAX) +/* Linux */ +#define P9_XATTR_SIZE_MAX XATTR_SIZE_MAX +#elif defined(CONFIG_DARWIN) +/* + * Darwin doesn't seem to define a maximum xattr size in its user + * space header, so manually configure it across platforms as 64k. + * + * Having no limit at all can lead to QEMU crashing during large g_malloc() + * calls. Because QEMU does not currently support macOS guests, the below + * preliminary solution only works due to its being a reflection of the limit of + * Linux guests. + */ +#define P9_XATTR_SIZE_MAX 65536 +#else +#error Missing definition for P9_XATTR_SIZE_MAX for this host system +#endif + #endif diff --git a/hw/9pfs/codir.c b/hw/9pfs/codir.c index c0873bde16..75148bc985 100644 --- a/hw/9pfs/codir.c +++ b/hw/9pfs/codir.c @@ -22,6 +22,8 @@ #include "qemu/coroutine.h" #include "qemu/main-loop.h" #include "coth.h" +#include "9p-xattr.h" +#include "9p-util.h" /* * Intended to be called from bottom-half (e.g. background I/O thread) @@ -166,7 +168,7 @@ static int do_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, } size += len; - saved_dir_pos = dent->d_off; + saved_dir_pos = qemu_dirent_off(dent); } /* restore (last) saved position */ @@ -182,14 +184,25 @@ out: } /** - * @brief Reads multiple directory entries in one rush. + * v9fs_co_readdir_many() - Reads multiple directory entries in one rush. + * + * @pdu: the causing 9p (T_readdir) client request + * @fidp: already opened directory where readdir shall be performed on + * @entries: output for directory entries (must not be NULL) + * @offset: initial position inside the directory the function shall + * seek to before retrieving the directory entries + * @maxsize: maximum result message body size (in bytes) + * @dostat: whether a stat() should be performed and returned for + * each directory entry + * Return: resulting response message body size (in bytes) on success, + * negative error code otherwise * * Retrieves the requested (max. amount of) directory entries from the fs * driver. This function must only be called by the main IO thread (top half). * Internally this function call will be dispatched to a background IO thread * (bottom half) where it is eventually executed by the fs driver. * - * @discussion Acquiring multiple directory entries in one rush from the fs + * Acquiring multiple directory entries in one rush from the fs * driver, instead of retrieving each directory entry individually, is very * beneficial from performance point of view. Because for every fs driver * request latency is added, which in practice could lead to overall @@ -197,20 +210,9 @@ out: * directory) if every directory entry was individually requested from fs * driver. * - * @note You must @b ALWAYS call @c v9fs_free_dirents(entries) after calling + * NOTE: You must ALWAYS call v9fs_free_dirents(entries) after calling * v9fs_co_readdir_many(), both on success and on error cases of this - * function, to avoid memory leaks once @p entries are no longer needed. - * - * @param pdu - the causing 9p (T_readdir) client request - * @param fidp - already opened directory where readdir shall be performed on - * @param entries - output for directory entries (must not be NULL) - * @param offset - initial position inside the directory the function shall - * seek to before retrieving the directory entries - * @param maxsize - maximum result message body size (in bytes) - * @param dostat - whether a stat() should be performed and returned for - * each directory entry - * @returns resulting response message body size (in bytes) on success, - * negative error code otherwise + * function, to avoid memory leaks once @entries are no longer needed. */ int coroutine_fn v9fs_co_readdir_many(V9fsPDU *pdu, V9fsFidState *fidp, struct V9fsDirEnt **entries, diff --git a/hw/9pfs/coth.h b/hw/9pfs/coth.h index f83c7dda7b..1a1edbdc2a 100644 --- a/hw/9pfs/coth.h +++ b/hw/9pfs/coth.h @@ -19,7 +19,7 @@ #include "qemu/coroutine.h" #include "9p.h" -/** +/* * we want to use bottom half because we want to make sure the below * sequence of events. * @@ -29,7 +29,7 @@ * we cannot swap step 1 and 2, because that would imply worker thread * can enter coroutine while step1 is still running * - * @b PERFORMANCE @b CONSIDERATIONS: As a rule of thumb, keep in mind + * PERFORMANCE CONSIDERATIONS: As a rule of thumb, keep in mind * that hopping between threads adds @b latency! So when handling a * 9pfs request, avoid calling v9fs_co_run_in_worker() too often, because * this might otherwise sum up to a significant, huge overall latency for diff --git a/hw/9pfs/meson.build b/hw/9pfs/meson.build index 99be5d9119..12443b6ad5 100644 --- a/hw/9pfs/meson.build +++ b/hw/9pfs/meson.build @@ -4,7 +4,6 @@ fs_ss.add(files( '9p-posix-acl.c', '9p-proxy.c', '9p-synth.c', - '9p-util.c', '9p-xattr-user.c', '9p-xattr.c', '9p.c', @@ -14,6 +13,8 @@ fs_ss.add(files( 'coth.c', 'coxattr.c', )) +fs_ss.add(when: 'CONFIG_LINUX', if_true: files('9p-util-linux.c')) +fs_ss.add(when: 'CONFIG_DARWIN', if_true: files('9p-util-darwin.c')) fs_ss.add(when: 'CONFIG_XEN', if_true: files('xen-9p-backend.c')) softmmu_ss.add_all(when: 'CONFIG_FSDEV_9P', if_true: fs_ss) diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c index 8966e16320..1773cf55f1 100644 --- a/hw/acpi/aml-build.c +++ b/hw/acpi/aml-build.c @@ -2152,7 +2152,13 @@ void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f, build_append_int_noprefix(tbl, 0, 1); /* DAY_ALRM */ build_append_int_noprefix(tbl, 0, 1); /* MON_ALRM */ build_append_int_noprefix(tbl, f->rtc_century, 1); /* CENTURY */ - build_append_int_noprefix(tbl, 0, 2); /* IAPC_BOOT_ARCH */ + /* IAPC_BOOT_ARCH */ + if (f->rev == 1) { + build_append_int_noprefix(tbl, 0, 2); + } else { + /* since ACPI v2.0 */ + build_append_int_noprefix(tbl, f->iapc_boot_arch, 2); + } build_append_int_noprefix(tbl, 0, 1); /* Reserved */ build_append_int_noprefix(tbl, f->flags, 4); /* Flags */ diff --git a/hw/acpi/erst.c b/hw/acpi/erst.c index c0a23cf467..de509c2b48 100644 --- a/hw/acpi/erst.c +++ b/hw/acpi/erst.c @@ -80,11 +80,6 @@ #define UEFI_CPER_RECORD_MIN_SIZE 128U #define UEFI_CPER_RECORD_LENGTH_OFFSET 20U #define UEFI_CPER_RECORD_ID_OFFSET 96U -#define IS_UEFI_CPER_RECORD(ptr) \ - (((ptr)[0] == 'C') && \ - ((ptr)[1] == 'P') && \ - ((ptr)[2] == 'E') && \ - ((ptr)[3] == 'R')) /* * NOTE that when accessing CPER fields within a record, memcpy() diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 6befd23e16..6351bd3424 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -32,6 +32,7 @@ #include "hw/pci/pci_bridge.h" #include "hw/pci/pci_host.h" #include "hw/pci/pcie_port.h" +#include "hw/pci-bridge/xio3130_downstream.h" #include "hw/i386/acpi-build.h" #include "hw/acpi/acpi.h" #include "hw/pci/pci_bus.h" @@ -336,6 +337,8 @@ void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, { PCIDevice *pdev = PCI_DEVICE(dev); int slot = PCI_SLOT(pdev->devfn); + PCIDevice *bridge; + PCIBus *bus; int bsel; /* Don't send event when device is enabled during qemu machine creation: @@ -365,7 +368,14 @@ void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, return; } - bsel = acpi_pcihp_get_bsel(pci_get_bus(pdev)); + bus = pci_get_bus(pdev); + bridge = pci_bridge_get_device(bus); + if (object_dynamic_cast(OBJECT(bridge), TYPE_PCIE_ROOT_PORT) || + object_dynamic_cast(OBJECT(bridge), TYPE_XIO3130_DOWNSTREAM)) { + pcie_cap_slot_enable_power(bridge); + } + + bsel = acpi_pcihp_get_bsel(bus); g_assert(bsel >= 0); s->acpi_pcihp_pci_status[bsel].up |= (1U << slot); acpi_send_event(DEVICE(hotplug_dev), ACPI_PCI_HOTPLUG_STATUS); diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 46bf7ceddf..46a42502bc 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2102,6 +2102,10 @@ static void machvirt_init(MachineState *machine) object_property_set_bool(cpuobj, "pmu", false, NULL); } + if (vmc->no_tcg_lpa2 && object_property_find(cpuobj, "lpa2")) { + object_property_set_bool(cpuobj, "lpa2", false, NULL); + } + if (object_property_find(cpuobj, "reset-cbar")) { object_property_set_int(cpuobj, "reset-cbar", vms->memmap[VIRT_CPUPERIPHS].base, @@ -3020,8 +3024,11 @@ DEFINE_VIRT_MACHINE_AS_LATEST(7, 0) static void virt_machine_6_2_options(MachineClass *mc) { + VirtMachineClass *vmc = VIRT_MACHINE_CLASS(OBJECT_CLASS(mc)); + virt_machine_7_0_options(mc); compat_props_add(mc->compat_props, hw_compat_6_2, hw_compat_6_2_len); + vmc->no_tcg_lpa2 = true; } DEFINE_VIRT_MACHINE(6, 2) diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c index 860787580a..2785b9e849 100644 --- a/hw/block/dataplane/xen-block.c +++ b/hw/block/dataplane/xen-block.c @@ -21,6 +21,7 @@ #include "qemu/osdep.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" +#include "qemu/memalign.h" #include "qapi/error.h" #include "hw/xen/xen_common.h" #include "hw/block/xen_blkif.h" diff --git a/hw/block/fdc.c b/hw/block/fdc.c index 21d18ac2e3..347875a0cd 100644 --- a/hw/block/fdc.c +++ b/hw/block/fdc.c @@ -32,6 +32,7 @@ #include "qapi/error.h" #include "qemu/error-report.h" #include "qemu/timer.h" +#include "qemu/memalign.h" #include "hw/irq.h" #include "hw/isa/isa.h" #include "hw/qdev-properties.h" diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index ebd47aa26f..4ad4d7286c 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -38,6 +38,7 @@ #include "hw/nvram/fw_cfg.h" #include "hw/acpi/bios-linker-loader.h" #include "hw/isa/isa.h" +#include "hw/input/i8042.h" #include "hw/block/fdc.h" #include "hw/acpi/memory_hotplug.h" #include "sysemu/tpm.h" @@ -192,6 +193,13 @@ static void init_common_fadt_data(MachineState *ms, Object *o, .address = object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK, NULL) }, }; + + /* + * ACPI v2, Table 5-10 - Fixed ACPI Description Table Boot Architecture + * Flags, bit offset 1 - 8042. + */ + fadt.iapc_boot_arch = iapc_boot_arch_8042(); + *data = fadt; } diff --git a/hw/i386/acpi-microvm.c b/hw/i386/acpi-microvm.c index 68ca7e7fc2..fb09185cbd 100644 --- a/hw/i386/acpi-microvm.c +++ b/hw/i386/acpi-microvm.c @@ -37,6 +37,7 @@ #include "hw/pci/pcie_host.h" #include "hw/usb/xhci.h" #include "hw/virtio/virtio-mmio.h" +#include "hw/input/i8042.h" #include "acpi-common.h" #include "acpi-microvm.h" @@ -187,6 +188,11 @@ static void acpi_build_microvm(AcpiBuildTables *tables, .address = GED_MMIO_BASE_REGS + ACPI_GED_REG_RESET, }, .reset_val = ACPI_GED_RESET_VALUE, + /* + * ACPI v2, Table 5-10 - Fixed ACPI Description Table Boot Architecture + * Flags, bit offset 1 - 8042. + */ + .iapc_boot_arch = iapc_boot_arch_8042(), }; table_offsets = g_array_new(false, true /* clear */, diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 4c6c016388..32471a44cb 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -3030,6 +3030,13 @@ static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; + /* TODO: add support for VFIO and vhost users */ + if (s->snoop_control) { + error_setg_errno(errp, -ENOTSUP, + "Snoop Control with vhost or VFIO is not supported"); + return -ENOTSUP; + } + /* Update per-address-space notifier flags */ vtd_as->notifier_flags = new; @@ -3113,6 +3120,7 @@ static Property vtd_properties[] = { VTD_HOST_ADDRESS_WIDTH), DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), + DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), DEFINE_PROP_END_OF_LIST(), }; @@ -3643,7 +3651,7 @@ static void vtd_init(IntelIOMMUState *s) vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, x86_iommu->dt_supported); - if (s->scalable_mode) { + if (s->scalable_mode || s->snoop_control) { vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; @@ -3674,6 +3682,10 @@ static void vtd_init(IntelIOMMUState *s) s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; } + if (s->snoop_control) { + s->ecap |= VTD_ECAP_SC; + } + vtd_reset_caches(s); /* Define registers with default values and bit semantics */ diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index a6c788049b..1ff13b40f9 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -188,6 +188,7 @@ #define VTD_ECAP_IR (1ULL << 3) #define VTD_ECAP_EIM (1ULL << 4) #define VTD_ECAP_PT (1ULL << 6) +#define VTD_ECAP_SC (1ULL << 7) #define VTD_ECAP_MHMV (15ULL << 20) #define VTD_ECAP_SRS (1ULL << 31) #define VTD_ECAP_SMTS (1ULL << 43) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index c8696ac01e..fd55fc725c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -318,8 +318,6 @@ GlobalProperty pc_compat_2_0[] = { { "pci-serial-4x", "prog_if", "0" }, { "virtio-net-pci", "guest_announce", "off" }, { "ICH9-LPC", "memory-hotplug-support", "off" }, - { "xio3130-downstream", COMPAT_PROP_PCP, "off" }, - { "ioh3420", COMPAT_PROP_PCP, "off" }, }; const size_t pc_compat_2_0_len = G_N_ELEMENTS(pc_compat_2_0); @@ -1014,7 +1012,8 @@ static const MemoryRegionOps ioportF0_io_ops = { }, }; -static void pc_superio_init(ISABus *isa_bus, bool create_fdctrl, bool no_vmport) +static void pc_superio_init(ISABus *isa_bus, bool create_fdctrl, + bool create_i8042, bool no_vmport) { int i; DriveInfo *fd[MAX_FD]; @@ -1036,6 +1035,10 @@ static void pc_superio_init(ISABus *isa_bus, bool create_fdctrl, bool no_vmport) } } + if (!create_i8042) { + return; + } + i8042 = isa_create_simple(isa_bus, "i8042"); if (!no_vmport) { isa_create_simple(isa_bus, TYPE_VMPORT); @@ -1131,7 +1134,8 @@ void pc_basic_device_init(struct PCMachineState *pcms, i8257_dma_init(isa_bus, 0); /* Super I/O */ - pc_superio_init(isa_bus, create_fdctrl, pcms->vmport != ON_OFF_AUTO_ON); + pc_superio_init(isa_bus, create_fdctrl, pcms->i8042_enabled, + pcms->vmport != ON_OFF_AUTO_ON); } void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus) @@ -1512,6 +1516,20 @@ static void pc_machine_set_hpet(Object *obj, bool value, Error **errp) pcms->hpet_enabled = value; } +static bool pc_machine_get_i8042(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->i8042_enabled; +} + +static void pc_machine_set_i8042(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->i8042_enabled = value; +} + static bool pc_machine_get_default_bus_bypass_iommu(Object *obj, Error **errp) { PCMachineState *pcms = PC_MACHINE(obj); @@ -1641,6 +1659,7 @@ static void pc_machine_initfn(Object *obj) pcms->smbus_enabled = true; pcms->sata_enabled = true; pcms->pit_enabled = true; + pcms->i8042_enabled = true; pcms->max_fw_size = 8 * MiB; #ifdef CONFIG_HPET pcms->hpet_enabled = true; @@ -1777,6 +1796,9 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) object_class_property_set_description(oc, "hpet", "Enable/disable high precision event timer emulation"); + object_class_property_add_bool(oc, PC_MACHINE_I8042, + pc_machine_get_i8042, pc_machine_set_i8042); + object_class_property_add_bool(oc, "default-bus-bypass-iommu", pc_machine_get_default_bus_bypass_iommu, pc_machine_set_default_bus_bypass_iommu); diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 8d33cf689d..b72c03d0a6 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -757,6 +757,7 @@ static void pc_i440fx_1_7_machine_options(MachineClass *m) m->hw_version = "1.7.0"; m->default_machine_opts = NULL; m->option_rom_has_mr = true; + m->deprecation_reason = "old and unattended - use a newer version instead"; compat_props_add(m->compat_props, pc_compat_1_7, pc_compat_1_7_len); pcmc->smbios_defaults = false; pcmc->gigabyte_align = false; diff --git a/hw/i386/x86.c b/hw/i386/x86.c index b84840a1bb..4cf107baea 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -83,24 +83,11 @@ inline void init_topo_info(X86CPUTopoInfo *topo_info, uint32_t x86_cpu_apic_id_from_index(X86MachineState *x86ms, unsigned int cpu_index) { - X86MachineClass *x86mc = X86_MACHINE_GET_CLASS(x86ms); X86CPUTopoInfo topo_info; - uint32_t correct_id; - static bool warned; init_topo_info(&topo_info, x86ms); - correct_id = x86_apicid_from_cpu_idx(&topo_info, cpu_index); - if (x86mc->compat_apic_id_mode) { - if (cpu_index != correct_id && !warned && !qtest_enabled()) { - error_report("APIC IDs set in compatibility mode, " - "CPU topology won't match the configuration"); - warned = true; - } - return cpu_index; - } else { - return correct_id; - } + return x86_apicid_from_cpu_idx(&topo_info, cpu_index); } @@ -1330,7 +1317,6 @@ static void x86_machine_class_init(ObjectClass *oc, void *data) mc->cpu_index_to_instance_props = x86_cpu_index_to_props; mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids; - x86mc->compat_apic_id_mode = false; x86mc->save_tsc_khz = true; x86mc->fwcfg_dma_enabled = true; nc->nmi_monitor_handler = x86_nmi; diff --git a/hw/ide/core.c b/hw/ide/core.c index d667d0b55e..3a5afff5d7 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -30,6 +30,7 @@ #include "qemu/main-loop.h" #include "qemu/timer.h" #include "qemu/hw-version.h" +#include "qemu/memalign.h" #include "sysemu/sysemu.h" #include "sysemu/blockdev.h" #include "sysemu/dma.h" diff --git a/hw/intc/arm_gicv3.c b/hw/intc/arm_gicv3.c index 6d3c8ee231..0b8f79a122 100644 --- a/hw/intc/arm_gicv3.c +++ b/hw/intc/arm_gicv3.c @@ -369,11 +369,19 @@ static const MemoryRegionOps gic_ops[] = { .read_with_attrs = gicv3_dist_read, .write_with_attrs = gicv3_dist_write, .endianness = DEVICE_NATIVE_ENDIAN, + .valid.min_access_size = 1, + .valid.max_access_size = 8, + .impl.min_access_size = 1, + .impl.max_access_size = 8, }, { .read_with_attrs = gicv3_redist_read, .write_with_attrs = gicv3_redist_write, .endianness = DEVICE_NATIVE_ENDIAN, + .valid.min_access_size = 1, + .valid.max_access_size = 8, + .impl.min_access_size = 1, + .impl.max_access_size = 8, } }; diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c index d7e03d0cab..1a3d440a54 100644 --- a/hw/intc/arm_gicv3_cpuif.c +++ b/hw/intc/arm_gicv3_cpuif.c @@ -612,7 +612,8 @@ static uint64_t icv_hppir_read(CPUARMState *env, const ARMCPRegInfo *ri) } } - trace_gicv3_icv_hppir_read(grp, gicv3_redist_affid(cs), value); + trace_gicv3_icv_hppir_read(ri->crm == 8 ? 0 : 1, + gicv3_redist_affid(cs), value); return value; } diff --git a/hw/intc/arm_gicv3_dist.c b/hw/intc/arm_gicv3_dist.c index 4164500ea9..28d913b211 100644 --- a/hw/intc/arm_gicv3_dist.c +++ b/hw/intc/arm_gicv3_dist.c @@ -838,7 +838,7 @@ MemTxResult gicv3_dist_read(void *opaque, hwaddr offset, uint64_t *data, if (!r) { qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid guest read at offset " TARGET_FMT_plx - "size %u\n", __func__, offset, size); + " size %u\n", __func__, offset, size); trace_gicv3_dist_badread(offset, size, attrs.secure); /* The spec requires that reserved registers are RAZ/WI; * so use MEMTX_ERROR returns from leaf functions as a way to @@ -879,7 +879,7 @@ MemTxResult gicv3_dist_write(void *opaque, hwaddr offset, uint64_t data, if (!r) { qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid guest write at offset " TARGET_FMT_plx - "size %u\n", __func__, offset, size); + " size %u\n", __func__, offset, size); trace_gicv3_dist_badwrite(offset, data, size, attrs.secure); /* The spec requires that reserved registers are RAZ/WI; * so use MEMTX_ERROR returns from leaf functions as a way to diff --git a/hw/intc/arm_gicv3_its.c b/hw/intc/arm_gicv3_its.c index 4f598d3c14..b96b874afd 100644 --- a/hw/intc/arm_gicv3_its.c +++ b/hw/intc/arm_gicv3_its.c @@ -161,16 +161,22 @@ static MemTxResult get_cte(GICv3ITSState *s, uint16_t icid, CTEntry *cte) if (entry_addr == -1) { /* No L2 table entry, i.e. no valid CTE, or a memory error */ cte->valid = false; - return res; + goto out; } cteval = address_space_ldq_le(as, entry_addr, MEMTXATTRS_UNSPECIFIED, &res); if (res != MEMTX_OK) { - return res; + goto out; } cte->valid = FIELD_EX64(cteval, CTE, VALID); cte->rdbase = FIELD_EX64(cteval, CTE, RDBASE); - return MEMTX_OK; +out: + if (res != MEMTX_OK) { + trace_gicv3_its_cte_read_fault(icid); + } else { + trace_gicv3_its_cte_read(icid, cte->valid, cte->rdbase); + } + return res; } /* @@ -187,6 +193,10 @@ static bool update_ite(GICv3ITSState *s, uint32_t eventid, const DTEntry *dte, uint64_t itel = 0; uint32_t iteh = 0; + trace_gicv3_its_ite_write(dte->ittaddr, eventid, ite->valid, + ite->inttype, ite->intid, ite->icid, + ite->vpeid, ite->doorbell); + if (ite->valid) { itel = FIELD_DP64(itel, ITE_L, VALID, 1); itel = FIELD_DP64(itel, ITE_L, INTTYPE, ite->inttype); @@ -221,11 +231,13 @@ static MemTxResult get_ite(GICv3ITSState *s, uint32_t eventid, itel = address_space_ldq_le(as, iteaddr, MEMTXATTRS_UNSPECIFIED, &res); if (res != MEMTX_OK) { + trace_gicv3_its_ite_read_fault(dte->ittaddr, eventid); return res; } iteh = address_space_ldl_le(as, iteaddr + 8, MEMTXATTRS_UNSPECIFIED, &res); if (res != MEMTX_OK) { + trace_gicv3_its_ite_read_fault(dte->ittaddr, eventid); return res; } @@ -235,6 +247,9 @@ static MemTxResult get_ite(GICv3ITSState *s, uint32_t eventid, ite->icid = FIELD_EX64(itel, ITE_L, ICID); ite->vpeid = FIELD_EX64(itel, ITE_L, VPEID); ite->doorbell = FIELD_EX64(iteh, ITE_H, DOORBELL); + trace_gicv3_its_ite_read(dte->ittaddr, eventid, ite->valid, + ite->inttype, ite->intid, ite->icid, + ite->vpeid, ite->doorbell); return MEMTX_OK; } @@ -254,17 +269,23 @@ static MemTxResult get_dte(GICv3ITSState *s, uint32_t devid, DTEntry *dte) if (entry_addr == -1) { /* No L2 table entry, i.e. no valid DTE, or a memory error */ dte->valid = false; - return res; + goto out; } dteval = address_space_ldq_le(as, entry_addr, MEMTXATTRS_UNSPECIFIED, &res); if (res != MEMTX_OK) { - return res; + goto out; } dte->valid = FIELD_EX64(dteval, DTE, VALID); dte->size = FIELD_EX64(dteval, DTE, SIZE); /* DTE word field stores bits [51:8] of the ITT address */ dte->ittaddr = FIELD_EX64(dteval, DTE, ITTADDR) << ITTADDR_SHIFT; - return MEMTX_OK; +out: + if (res != MEMTX_OK) { + trace_gicv3_its_dte_read_fault(devid); + } else { + trace_gicv3_its_dte_read(devid, dte->valid, dte->size, dte->ittaddr); + } + return res; } /* @@ -366,6 +387,19 @@ static ItsCmdResult process_its_cmd(GICv3ITSState *s, const uint64_t *cmdpkt, devid = (cmdpkt[0] & DEVID_MASK) >> DEVID_SHIFT; eventid = cmdpkt[1] & EVENTID_MASK; + switch (cmd) { + case INTERRUPT: + trace_gicv3_its_cmd_int(devid, eventid); + break; + case CLEAR: + trace_gicv3_its_cmd_clear(devid, eventid); + break; + case DISCARD: + trace_gicv3_its_cmd_discard(devid, eventid); + break; + default: + g_assert_not_reached(); + } return do_process_its_cmd(s, devid, eventid, cmd); } @@ -382,15 +416,16 @@ static ItsCmdResult process_mapti(GICv3ITSState *s, const uint64_t *cmdpkt, devid = (cmdpkt[0] & DEVID_MASK) >> DEVID_SHIFT; eventid = cmdpkt[1] & EVENTID_MASK; + icid = cmdpkt[2] & ICID_MASK; if (ignore_pInt) { pIntid = eventid; + trace_gicv3_its_cmd_mapi(devid, eventid, icid); } else { pIntid = (cmdpkt[1] & pINTID_MASK) >> pINTID_SHIFT; + trace_gicv3_its_cmd_mapti(devid, eventid, icid, pIntid); } - icid = cmdpkt[2] & ICID_MASK; - if (devid >= s->dt.num_entries) { qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid command attributes: devid %d>=%d", @@ -451,6 +486,8 @@ static bool update_cte(GICv3ITSState *s, uint16_t icid, const CTEntry *cte) uint64_t cteval = 0; MemTxResult res = MEMTX_OK; + trace_gicv3_its_cte_write(icid, cte->valid, cte->rdbase); + if (cte->valid) { /* add mapping entry to collection table */ cteval = FIELD_DP64(cteval, CTE, VALID, 1); @@ -484,6 +521,7 @@ static ItsCmdResult process_mapc(GICv3ITSState *s, const uint64_t *cmdpkt) } else { cte.rdbase = 0; } + trace_gicv3_its_cmd_mapc(icid, cte.rdbase, cte.valid); if (icid >= s->ct.num_entries) { qemu_log_mask(LOG_GUEST_ERROR, "ITS MAPC: invalid ICID 0x%d", icid); @@ -509,6 +547,8 @@ static bool update_dte(GICv3ITSState *s, uint32_t devid, const DTEntry *dte) uint64_t dteval = 0; MemTxResult res = MEMTX_OK; + trace_gicv3_its_dte_write(devid, dte->valid, dte->size, dte->ittaddr); + if (dte->valid) { /* add mapping entry to device table */ dteval = FIELD_DP64(dteval, DTE, VALID, 1); @@ -539,6 +579,8 @@ static ItsCmdResult process_mapd(GICv3ITSState *s, const uint64_t *cmdpkt) dte.ittaddr = (cmdpkt[2] & ITTADDR_MASK) >> ITTADDR_SHIFT; dte.valid = cmdpkt[2] & CMD_FIELD_VALID_MASK; + trace_gicv3_its_cmd_mapd(devid, dte.size, dte.ittaddr, dte.valid); + if (devid >= s->dt.num_entries) { qemu_log_mask(LOG_GUEST_ERROR, "ITS MAPD: invalid device ID field 0x%x >= 0x%x\n", @@ -562,6 +604,8 @@ static ItsCmdResult process_movall(GICv3ITSState *s, const uint64_t *cmdpkt) rd1 = FIELD_EX64(cmdpkt[2], MOVALL_2, RDBASE1); rd2 = FIELD_EX64(cmdpkt[3], MOVALL_3, RDBASE2); + trace_gicv3_its_cmd_movall(rd1, rd2); + if (rd1 >= s->gicv3->num_cpu) { qemu_log_mask(LOG_GUEST_ERROR, "%s: RDBASE1 %" PRId64 @@ -601,6 +645,8 @@ static ItsCmdResult process_movi(GICv3ITSState *s, const uint64_t *cmdpkt) eventid = FIELD_EX64(cmdpkt[1], MOVI_1, EVENTID); new_icid = FIELD_EX64(cmdpkt[2], MOVI_2, ICID); + trace_gicv3_its_cmd_movi(devid, eventid, new_icid); + if (devid >= s->dt.num_entries) { qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid command attributes: devid %d>=%d", @@ -779,6 +825,7 @@ static void process_cmdq(GICv3ITSState *s) * is already consistent by the time SYNC command is executed. * Hence no further processing is required for SYNC command. */ + trace_gicv3_its_cmd_sync(); break; case GITS_CMD_MAPD: result = process_mapd(s, cmdpkt); @@ -803,6 +850,7 @@ static void process_cmdq(GICv3ITSState *s) * need to trigger lpi priority re-calculation to be in * sync with LPI config table or pending table changes. */ + trace_gicv3_its_cmd_inv(); for (i = 0; i < s->gicv3->num_cpu; i++) { gicv3_redist_update_lpi(&s->gicv3->cpu[i]); } @@ -814,6 +862,7 @@ static void process_cmdq(GICv3ITSState *s) result = process_movall(s, cmdpkt); break; default: + trace_gicv3_its_cmd_unknown(cmd); break; } if (result == CMD_CONTINUE) { @@ -1264,7 +1313,7 @@ static MemTxResult gicv3_its_read(void *opaque, hwaddr offset, uint64_t *data, if (!result) { qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid guest read at offset " TARGET_FMT_plx - "size %u\n", __func__, offset, size); + " size %u\n", __func__, offset, size); trace_gicv3_its_badread(offset, size); /* * The spec requires that reserved registers are RAZ/WI; @@ -1300,7 +1349,7 @@ static MemTxResult gicv3_its_write(void *opaque, hwaddr offset, uint64_t data, if (!result) { qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid guest write at offset " TARGET_FMT_plx - "size %u\n", __func__, offset, size); + " size %u\n", __func__, offset, size); trace_gicv3_its_badwrite(offset, data, size); /* * The spec requires that reserved registers are RAZ/WI; diff --git a/hw/intc/trace-events b/hw/intc/trace-events index b28cda4e08..53414aa197 100644 --- a/hw/intc/trace-events +++ b/hw/intc/trace-events @@ -176,6 +176,27 @@ gicv3_its_write(uint64_t offset, uint64_t data, unsigned size) "GICv3 ITS write: gicv3_its_badwrite(uint64_t offset, uint64_t data, unsigned size) "GICv3 ITS write: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u: error" gicv3_its_translation_write(uint64_t offset, uint64_t data, unsigned size, uint32_t requester_id) "GICv3 ITS TRANSLATER write: offset 0x%" PRIx64 " data 0x%" PRIx64 " size %u requester_id 0x%x" gicv3_its_process_command(uint32_t rd_offset, uint8_t cmd) "GICv3 ITS: processing command at offset 0x%x: 0x%x" +gicv3_its_cmd_int(uint32_t devid, uint32_t eventid) "GICv3 ITS: command INT DeviceID 0x%x EventID 0x%x" +gicv3_its_cmd_clear(uint32_t devid, uint32_t eventid) "GICv3 ITS: command CLEAR DeviceID 0x%x EventID 0x%x" +gicv3_its_cmd_discard(uint32_t devid, uint32_t eventid) "GICv3 ITS: command DISCARD DeviceID 0x%x EventID 0x%x" +gicv3_its_cmd_sync(void) "GICv3 ITS: command SYNC" +gicv3_its_cmd_mapd(uint32_t devid, uint32_t size, uint64_t ittaddr, int valid) "GICv3 ITS: command MAPD DeviceID 0x%x Size 0x%x ITT_addr 0x%" PRIx64 " V %d" +gicv3_its_cmd_mapc(uint32_t icid, uint64_t rdbase, int valid) "GICv3 ITS: command MAPC ICID 0x%x RDbase 0x%" PRIx64 " V %d" +gicv3_its_cmd_mapi(uint32_t devid, uint32_t eventid, uint32_t icid) "GICv3 ITS: command MAPI DeviceID 0x%x EventID 0x%x ICID 0x%x" +gicv3_its_cmd_mapti(uint32_t devid, uint32_t eventid, uint32_t icid, uint32_t intid) "GICv3 ITS: command MAPTI DeviceID 0x%x EventID 0x%x ICID 0x%x pINTID 0x%x" +gicv3_its_cmd_inv(void) "GICv3 ITS: command INV or INVALL" +gicv3_its_cmd_movall(uint64_t rd1, uint64_t rd2) "GICv3 ITS: command MOVALL RDbase1 0x%" PRIx64 " RDbase2 0x%" PRIx64 +gicv3_its_cmd_movi(uint32_t devid, uint32_t eventid, uint32_t icid) "GICv3 ITS: command MOVI DeviceID 0x%x EventID 0x%x ICID 0x%x" +gicv3_its_cmd_unknown(unsigned cmd) "GICv3 ITS: unknown command 0x%x" +gicv3_its_cte_read(uint32_t icid, int valid, uint32_t rdbase) "GICv3 ITS: Collection Table read for ICID 0x%x: valid %d RDBase 0x%x" +gicv3_its_cte_write(uint32_t icid, int valid, uint32_t rdbase) "GICv3 ITS: Collection Table write for ICID 0x%x: valid %d RDBase 0x%x" +gicv3_its_cte_read_fault(uint32_t icid) "GICv3 ITS: Collection Table read for ICID 0x%x: faulted" +gicv3_its_ite_read(uint64_t ittaddr, uint32_t eventid, int valid, int inttype, uint32_t intid, uint32_t icid, uint32_t vpeid, uint32_t doorbell) "GICv3 ITS: Interrupt Table read for ITTaddr 0x%" PRIx64 " EventID 0x%x: valid %d inttype %d intid 0x%x ICID 0x%x vPEID 0x%x doorbell 0x%x" +gicv3_its_ite_read_fault(uint64_t ittaddr, uint32_t eventid) "GICv3 ITS: Interrupt Table read for ITTaddr 0x%" PRIx64 " EventID 0x%x: faulted" +gicv3_its_ite_write(uint64_t ittaddr, uint32_t eventid, int valid, int inttype, uint32_t intid, uint32_t icid, uint32_t vpeid, uint32_t doorbell) "GICv3 ITS: Interrupt Table write for ITTaddr 0x%" PRIx64 " EventID 0x%x: valid %d inttype %d intid 0x%x ICID 0x%x vPEID 0x%x doorbell 0x%x" +gicv3_its_dte_read(uint32_t devid, int valid, uint32_t size, uint64_t ittaddr) "GICv3 ITS: Device Table read for DeviceID 0x%x: valid %d size 0x%x ITTaddr 0x%" PRIx64 +gicv3_its_dte_write(uint32_t devid, int valid, uint32_t size, uint64_t ittaddr) "GICv3 ITS: Device Table write for DeviceID 0x%x: valid %d size 0x%x ITTaddr 0x%" PRIx64 +gicv3_its_dte_read_fault(uint32_t devid) "GICv3 ITS: Device Table read for DeviceID 0x%x: faulted" # armv7m_nvic.c nvic_recompute_state(int vectpending, int vectpending_prio, int exception_prio) "NVIC state recomputed: vectpending %d vectpending_prio %d exception_prio %d" diff --git a/hw/misc/pvpanic-isa.c b/hw/misc/pvpanic-isa.c index a39fcdd1fc..b84d4d458d 100644 --- a/hw/misc/pvpanic-isa.c +++ b/hw/misc/pvpanic-isa.c @@ -21,6 +21,7 @@ #include "hw/misc/pvpanic.h" #include "qom/object.h" #include "hw/isa/isa.h" +#include "standard-headers/linux/pvpanic.h" OBJECT_DECLARE_SIMPLE_TYPE(PVPanicISAState, PVPANIC_ISA_DEVICE) @@ -64,7 +65,8 @@ static void pvpanic_isa_realizefn(DeviceState *dev, Error **errp) static Property pvpanic_isa_properties[] = { DEFINE_PROP_UINT16(PVPANIC_IOPORT_PROP, PVPanicISAState, ioport, 0x505), - DEFINE_PROP_UINT8("events", PVPanicISAState, pvpanic.events, PVPANIC_PANICKED | PVPANIC_CRASHLOADED), + DEFINE_PROP_UINT8("events", PVPanicISAState, pvpanic.events, + PVPANIC_PANICKED | PVPANIC_CRASH_LOADED), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/misc/pvpanic-pci.c b/hw/misc/pvpanic-pci.c index 62e1be68c1..99cf7e2041 100644 --- a/hw/misc/pvpanic-pci.c +++ b/hw/misc/pvpanic-pci.c @@ -21,6 +21,7 @@ #include "hw/misc/pvpanic.h" #include "qom/object.h" #include "hw/pci/pci.h" +#include "standard-headers/linux/pvpanic.h" OBJECT_DECLARE_SIMPLE_TYPE(PVPanicPCIState, PVPANIC_PCI_DEVICE) @@ -53,7 +54,8 @@ static void pvpanic_pci_realizefn(PCIDevice *dev, Error **errp) } static Property pvpanic_pci_properties[] = { - DEFINE_PROP_UINT8("events", PVPanicPCIState, pvpanic.events, PVPANIC_PANICKED | PVPANIC_CRASHLOADED), + DEFINE_PROP_UINT8("events", PVPanicPCIState, pvpanic.events, + PVPANIC_PANICKED | PVPANIC_CRASH_LOADED), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/misc/pvpanic.c b/hw/misc/pvpanic.c index e2cb4a5d28..1540e9091a 100644 --- a/hw/misc/pvpanic.c +++ b/hw/misc/pvpanic.c @@ -21,12 +21,13 @@ #include "hw/qdev-properties.h" #include "hw/misc/pvpanic.h" #include "qom/object.h" +#include "standard-headers/linux/pvpanic.h" static void handle_event(int event) { static bool logged; - if (event & ~(PVPANIC_PANICKED | PVPANIC_CRASHLOADED) && !logged) { + if (event & ~(PVPANIC_PANICKED | PVPANIC_CRASH_LOADED) && !logged) { qemu_log_mask(LOG_GUEST_ERROR, "pvpanic: unknown event %#x.\n", event); logged = true; } @@ -36,7 +37,7 @@ static void handle_event(int event) return; } - if (event & PVPANIC_CRASHLOADED) { + if (event & PVPANIC_CRASH_LOADED) { qemu_system_guest_crashloaded(NULL); return; } diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index cf8ab0f8af..b02a0632df 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -628,17 +628,20 @@ static int virtio_net_max_tx_queue_size(VirtIONet *n) NetClientState *peer = n->nic_conf.peers.ncs[0]; /* - * Backends other than vhost-user don't support max queue size. + * Backends other than vhost-user or vhost-vdpa don't support max queue + * size. */ if (!peer) { return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE; } - if (peer->info->type != NET_CLIENT_DRIVER_VHOST_USER) { + switch(peer->info->type) { + case NET_CLIENT_DRIVER_VHOST_USER: + case NET_CLIENT_DRIVER_VHOST_VDPA: + return VIRTQUEUE_MAX_SIZE; + default: return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE; - } - - return VIRTQUEUE_MAX_SIZE; + }; } static int peer_attach(VirtIONet *n, int index) diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index 10e6e7c2ab..de932286b5 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -193,6 +193,12 @@ static int pxb_map_irq_fn(PCIDevice *pci_dev, int pin) PCIDevice *pxb = pci_get_bus(pci_dev)->parent_dev; /* + * First carry out normal swizzle to handle + * multple root ports on a pxb instance. + */ + pin = pci_swizzle_map_irq_fn(pci_dev, pin); + + /* * The bios does not index the pxb slot number when * it computes the IRQ because it resides on bus 0 * and not on the current bus. diff --git a/hw/pci-bridge/xio3130_downstream.c b/hw/pci-bridge/xio3130_downstream.c index 04aae72cd6..05e2b06c0c 100644 --- a/hw/pci-bridge/xio3130_downstream.c +++ b/hw/pci-bridge/xio3130_downstream.c @@ -28,6 +28,7 @@ #include "migration/vmstate.h" #include "qapi/error.h" #include "qemu/module.h" +#include "hw/pci-bridge/xio3130_downstream.h" #define PCI_DEVICE_ID_TI_XIO3130D 0x8233 /* downstream port */ #define XIO3130_REVISION 0x1 @@ -84,7 +85,7 @@ static void xio3130_downstream_realize(PCIDevice *d, Error **errp) XIO3130_SSVID_SVID, XIO3130_SSVID_SSID, errp); if (rc < 0) { - goto err_bridge; + goto err_msi; } rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_DOWNSTREAM, @@ -173,7 +174,7 @@ static void xio3130_downstream_class_init(ObjectClass *klass, void *data) } static const TypeInfo xio3130_downstream_info = { - .name = "xio3130-downstream", + .name = TYPE_XIO3130_DOWNSTREAM, .parent = TYPE_PCIE_SLOT, .class_init = xio3130_downstream_class_init, .interfaces = (InterfaceInfo[]) { diff --git a/hw/pci-bridge/xio3130_upstream.c b/hw/pci-bridge/xio3130_upstream.c index 5cd3af4fbc..5ff46ef050 100644 --- a/hw/pci-bridge/xio3130_upstream.c +++ b/hw/pci-bridge/xio3130_upstream.c @@ -75,7 +75,7 @@ static void xio3130_upstream_realize(PCIDevice *d, Error **errp) XIO3130_SSVID_SVID, XIO3130_SSVID_SSID, errp); if (rc < 0) { - goto err_bridge; + goto err_msi; } rc = pcie_cap_init(d, XIO3130_EXP_OFFSET, PCI_EXP_TYPE_UPSTREAM, diff --git a/hw/pci/meson.build b/hw/pci/meson.build index 5c4bbac817..bcc9c75919 100644 --- a/hw/pci/meson.build +++ b/hw/pci/meson.build @@ -5,6 +5,7 @@ pci_ss.add(files( 'pci.c', 'pci_bridge.c', 'pci_host.c', + 'pcie_sriov.c', 'shpc.c', 'slotid_cap.c' )) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 5d30f9ca60..5cb1232e27 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg) { uint8_t type; + /* PCIe virtual functions do not have their own BARs */ + assert(!pci_is_vf(d)); + if (reg != PCI_ROM_SLOT) return PCI_BASE_ADDRESS_0 + reg * 4; @@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev) } } -static void pci_do_device_reset(PCIDevice *dev) +static void pci_reset_regions(PCIDevice *dev) { int r; + if (pci_is_vf(dev)) { + return; + } + + for (r = 0; r < PCI_NUM_REGIONS; ++r) { + PCIIORegion *region = &dev->io_regions[r]; + if (!region->size) { + continue; + } + if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) && + region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { + pci_set_quad(dev->config + pci_bar(dev, r), region->type); + } else { + pci_set_long(dev->config + pci_bar(dev, r), region->type); + } + } +} + +static void pci_do_device_reset(PCIDevice *dev) +{ pci_device_deassert_intx(dev); assert(dev->irq_state == 0); @@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev) pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) | pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE)); dev->config[PCI_CACHE_LINE_SIZE] = 0x0; - for (r = 0; r < PCI_NUM_REGIONS; ++r) { - PCIIORegion *region = &dev->io_regions[r]; - if (!region->size) { - continue; - } - - if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) && - region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { - pci_set_quad(dev->config + pci_bar(dev, r), region->type); - } else { - pci_set_long(dev->config + pci_bar(dev, r), region->type); - } - } + pci_reset_regions(dev); pci_update_mappings(dev); msi_reset(dev); @@ -885,6 +896,16 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp) } /* + * With SR/IOV and ARI, a device at function 0 need not be a multifunction + * device, as it may just be a VF that ended up with function 0 in + * the legacy PCI interpretation. Avoid failing in such cases: + */ + if (pci_is_vf(dev) && + dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + return; + } + + /* * multifunction bit is interpreted in two ways as follows. * - all functions must set the bit to 1. * Example: Intel X53 @@ -1078,11 +1099,12 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, return NULL; } else if (!pci_bus_devfn_available(bus, devfn)) { error_setg(errp, "PCI: slot %d function %d not available for %s," - " in use by %s", + " in use by %s,id=%s", PCI_SLOT(devfn), PCI_FUNC(devfn), name, - bus->devices[devfn]->name); + bus->devices[devfn]->name, bus->devices[devfn]->qdev.id); return NULL; } else if (dev->hotplugged && + !pci_is_vf(pci_dev) && pci_get_function_0(pci_dev)) { error_setg(errp, "PCI: slot %d function 0 already occupied by %s," " new func %s cannot be exposed to guest.", @@ -1191,6 +1213,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, pcibus_t size = memory_region_size(memory); uint8_t hdr_type; + assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */ assert(region_num >= 0); assert(region_num < PCI_NUM_REGIONS); assert(is_power_of_2(size)); @@ -1294,11 +1317,45 @@ pcibus_t pci_get_bar_addr(PCIDevice *pci_dev, int region_num) return pci_dev->io_regions[region_num].addr; } -static pcibus_t pci_bar_address(PCIDevice *d, - int reg, uint8_t type, pcibus_t size) +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg, + uint8_t type, pcibus_t size) +{ + pcibus_t new_addr; + if (!pci_is_vf(d)) { + int bar = pci_bar(d, reg); + if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { + new_addr = pci_get_quad(d->config + bar); + } else { + new_addr = pci_get_long(d->config + bar); + } + } else { + PCIDevice *pf = d->exp.sriov_vf.pf; + uint16_t sriov_cap = pf->exp.sriov_cap; + int bar = sriov_cap + PCI_SRIOV_BAR + reg * 4; + uint16_t vf_offset = + pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_OFFSET); + uint16_t vf_stride = + pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_STRIDE); + uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / vf_stride; + + if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { + new_addr = pci_get_quad(pf->config + bar); + } else { + new_addr = pci_get_long(pf->config + bar); + } + new_addr += vf_num * size; + } + /* The ROM slot has a specific enable bit, keep it intact */ + if (reg != PCI_ROM_SLOT) { + new_addr &= ~(size - 1); + } + return new_addr; +} + +pcibus_t pci_bar_address(PCIDevice *d, + int reg, uint8_t type, pcibus_t size) { pcibus_t new_addr, last_addr; - int bar = pci_bar(d, reg); uint16_t cmd = pci_get_word(d->config + PCI_COMMAND); Object *machine = qdev_get_machine(); ObjectClass *oc = object_get_class(machine); @@ -1309,7 +1366,7 @@ static pcibus_t pci_bar_address(PCIDevice *d, if (!(cmd & PCI_COMMAND_IO)) { return PCI_BAR_UNMAPPED; } - new_addr = pci_get_long(d->config + bar) & ~(size - 1); + new_addr = pci_config_get_bar_addr(d, reg, type, size); last_addr = new_addr + size - 1; /* Check if 32 bit BAR wraps around explicitly. * TODO: make priorities correct and remove this work around. @@ -1324,11 +1381,7 @@ static pcibus_t pci_bar_address(PCIDevice *d, if (!(cmd & PCI_COMMAND_MEMORY)) { return PCI_BAR_UNMAPPED; } - if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { - new_addr = pci_get_quad(d->config + bar); - } else { - new_addr = pci_get_long(d->config + bar); - } + new_addr = pci_config_get_bar_addr(d, reg, type, size); /* the ROM slot has a specific enable bit */ if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) { return PCI_BAR_UNMAPPED; @@ -1473,6 +1526,7 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int msi_write_config(d, addr, val_in, l); msix_write_config(d, addr, val_in, l); + pcie_sriov_config_write(d, addr, val_in, l); } /***********************************************************/ diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index d7d73a31e4..67a5d67372 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -366,6 +366,17 @@ static void hotplug_event_clear(PCIDevice *dev) } } +void pcie_cap_slot_enable_power(PCIDevice *dev) +{ + uint8_t *exp_cap = dev->config + dev->exp.exp_cap; + uint32_t sltcap = pci_get_long(exp_cap + PCI_EXP_SLTCAP); + + if (sltcap & PCI_EXP_SLTCAP_PCP) { + pci_set_word_by_mask(exp_cap + PCI_EXP_SLTCTL, + PCI_EXP_SLTCTL_PCC, PCI_EXP_SLTCTL_PWR_ON); + } +} + static void pcie_set_power_device(PCIBus *bus, PCIDevice *dev, void *opaque) { bool *power = opaque; @@ -446,6 +457,11 @@ void pcie_cap_slot_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, PCIDevice *pci_dev = PCI_DEVICE(dev); uint32_t lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP); + if (pci_is_vf(pci_dev)) { + /* Virtual function cannot be physically disconnected */ + return; + } + /* Don't send event when device is enabled during qemu machine creation: * it is present on boot, no hotplug event is necessary. We do send an * event when the device is disabled later. */ diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c new file mode 100644 index 0000000000..87abad6ac8 --- /dev/null +++ b/hw/pci/pcie_sriov.c @@ -0,0 +1,302 @@ +/* + * pcie_sriov.c: + * + * Implementation of SR/IOV emulation support. + * + * Copyright (c) 2015-2017 Knut Omang <knut.omang@oracle.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "hw/pci/pci.h" +#include "hw/pci/pcie.h" +#include "hw/pci/pci_bus.h" +#include "hw/qdev-properties.h" +#include "qemu/error-report.h" +#include "qemu/range.h" +#include "qapi/error.h" +#include "trace.h" + +static PCIDevice *register_vf(PCIDevice *pf, int devfn, + const char *name, uint16_t vf_num); +static void unregister_vfs(PCIDevice *dev); + +void pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, + const char *vfname, uint16_t vf_dev_id, + uint16_t init_vfs, uint16_t total_vfs, + uint16_t vf_offset, uint16_t vf_stride) +{ + uint8_t *cfg = dev->config + offset; + uint8_t *wmask; + + pcie_add_capability(dev, PCI_EXT_CAP_ID_SRIOV, 1, + offset, PCI_EXT_CAP_SRIOV_SIZEOF); + dev->exp.sriov_cap = offset; + dev->exp.sriov_pf.num_vfs = 0; + dev->exp.sriov_pf.vfname = g_strdup(vfname); + dev->exp.sriov_pf.vf = NULL; + + pci_set_word(cfg + PCI_SRIOV_VF_OFFSET, vf_offset); + pci_set_word(cfg + PCI_SRIOV_VF_STRIDE, vf_stride); + + /* + * Mandatory page sizes to support. + * Device implementations can call pcie_sriov_pf_add_sup_pgsize() + * to set more bits: + */ + pci_set_word(cfg + PCI_SRIOV_SUP_PGSIZE, SRIOV_SUP_PGSIZE_MINREQ); + + /* + * Default is to use 4K pages, software can modify it + * to any of the supported bits + */ + pci_set_word(cfg + PCI_SRIOV_SYS_PGSIZE, 0x1); + + /* Set up device ID and initial/total number of VFs available */ + pci_set_word(cfg + PCI_SRIOV_VF_DID, vf_dev_id); + pci_set_word(cfg + PCI_SRIOV_INITIAL_VF, init_vfs); + pci_set_word(cfg + PCI_SRIOV_TOTAL_VF, total_vfs); + pci_set_word(cfg + PCI_SRIOV_NUM_VF, 0); + + /* Write enable control bits */ + wmask = dev->wmask + offset; + pci_set_word(wmask + PCI_SRIOV_CTRL, + PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE | PCI_SRIOV_CTRL_ARI); + pci_set_word(wmask + PCI_SRIOV_NUM_VF, 0xffff); + pci_set_word(wmask + PCI_SRIOV_SYS_PGSIZE, 0x553); + + qdev_prop_set_bit(&dev->qdev, "multifunction", true); +} + +void pcie_sriov_pf_exit(PCIDevice *dev) +{ + unregister_vfs(dev); + g_free((char *)dev->exp.sriov_pf.vfname); + dev->exp.sriov_pf.vfname = NULL; +} + +void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, + uint8_t type, dma_addr_t size) +{ + uint32_t addr; + uint64_t wmask; + uint16_t sriov_cap = dev->exp.sriov_cap; + + assert(sriov_cap > 0); + assert(region_num >= 0); + assert(region_num < PCI_NUM_REGIONS); + assert(region_num != PCI_ROM_SLOT); + + wmask = ~(size - 1); + addr = sriov_cap + PCI_SRIOV_BAR + region_num * 4; + + pci_set_long(dev->config + addr, type); + if (!(type & PCI_BASE_ADDRESS_SPACE_IO) && + type & PCI_BASE_ADDRESS_MEM_TYPE_64) { + pci_set_quad(dev->wmask + addr, wmask); + pci_set_quad(dev->cmask + addr, ~0ULL); + } else { + pci_set_long(dev->wmask + addr, wmask & 0xffffffff); + pci_set_long(dev->cmask + addr, 0xffffffff); + } + dev->exp.sriov_pf.vf_bar_type[region_num] = type; +} + +void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, + MemoryRegion *memory) +{ + PCIIORegion *r; + PCIBus *bus = pci_get_bus(dev); + uint8_t type; + pcibus_t size = memory_region_size(memory); + + assert(pci_is_vf(dev)); /* PFs must use pci_register_bar */ + assert(region_num >= 0); + assert(region_num < PCI_NUM_REGIONS); + type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num]; + + if (!is_power_of_2(size)) { + error_report("%s: PCI region size must be a power" + " of two - type=0x%x, size=0x%"FMT_PCIBUS, + __func__, type, size); + exit(1); + } + + r = &dev->io_regions[region_num]; + r->memory = memory; + r->address_space = + type & PCI_BASE_ADDRESS_SPACE_IO + ? bus->address_space_io + : bus->address_space_mem; + r->size = size; + r->type = type; + + r->addr = pci_bar_address(dev, region_num, r->type, r->size); + if (r->addr != PCI_BAR_UNMAPPED) { + memory_region_add_subregion_overlap(r->address_space, + r->addr, r->memory, 1); + } +} + +static PCIDevice *register_vf(PCIDevice *pf, int devfn, const char *name, + uint16_t vf_num) +{ + PCIDevice *dev = pci_new(devfn, name); + dev->exp.sriov_vf.pf = pf; + dev->exp.sriov_vf.vf_number = vf_num; + PCIBus *bus = pci_get_bus(pf); + Error *local_err = NULL; + + qdev_realize(&dev->qdev, &bus->qbus, &local_err); + if (local_err) { + error_report_err(local_err); + return NULL; + } + + /* set vid/did according to sr/iov spec - they are not used */ + pci_config_set_vendor_id(dev->config, 0xffff); + pci_config_set_device_id(dev->config, 0xffff); + + return dev; +} + +static void register_vfs(PCIDevice *dev) +{ + uint16_t num_vfs; + uint16_t i; + uint16_t sriov_cap = dev->exp.sriov_cap; + uint16_t vf_offset = + pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_OFFSET); + uint16_t vf_stride = + pci_get_word(dev->config + sriov_cap + PCI_SRIOV_VF_STRIDE); + int32_t devfn = dev->devfn + vf_offset; + + assert(sriov_cap > 0); + num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + + dev->exp.sriov_pf.vf = g_malloc(sizeof(PCIDevice *) * num_vfs); + assert(dev->exp.sriov_pf.vf); + + trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn), num_vfs); + for (i = 0; i < num_vfs; i++) { + dev->exp.sriov_pf.vf[i] = register_vf(dev, devfn, + dev->exp.sriov_pf.vfname, i); + if (!dev->exp.sriov_pf.vf[i]) { + num_vfs = i; + break; + } + devfn += vf_stride; + } + dev->exp.sriov_pf.num_vfs = num_vfs; +} + +static void unregister_vfs(PCIDevice *dev) +{ + Error *local_err = NULL; + uint16_t num_vfs = dev->exp.sriov_pf.num_vfs; + uint16_t i; + + trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn), num_vfs); + for (i = 0; i < num_vfs; i++) { + PCIDevice *vf = dev->exp.sriov_pf.vf[i]; + object_property_set_bool(OBJECT(vf), "realized", false, &local_err); + if (local_err) { + fprintf(stderr, "Failed to unplug: %s\n", + error_get_pretty(local_err)); + error_free(local_err); + } + object_unparent(OBJECT(vf)); + } + g_free(dev->exp.sriov_pf.vf); + dev->exp.sriov_pf.vf = NULL; + dev->exp.sriov_pf.num_vfs = 0; + pci_set_word(dev->config + dev->exp.sriov_cap + PCI_SRIOV_NUM_VF, 0); +} + +void pcie_sriov_config_write(PCIDevice *dev, uint32_t address, + uint32_t val, int len) +{ + uint32_t off; + uint16_t sriov_cap = dev->exp.sriov_cap; + + if (!sriov_cap || address < sriov_cap) { + return; + } + off = address - sriov_cap; + if (off >= PCI_EXT_CAP_SRIOV_SIZEOF) { + return; + } + + trace_sriov_config_write(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn), off, val, len); + + if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) { + if (dev->exp.sriov_pf.num_vfs) { + if (!(val & PCI_SRIOV_CTRL_VFE)) { + unregister_vfs(dev); + } + } else { + if (val & PCI_SRIOV_CTRL_VFE) { + register_vfs(dev); + } + } + } +} + + +/* Reset SR/IOV VF Enable bit to trigger an unregister of all VFs */ +void pcie_sriov_pf_disable_vfs(PCIDevice *dev) +{ + uint16_t sriov_cap = dev->exp.sriov_cap; + if (sriov_cap) { + uint32_t val = pci_get_byte(dev->config + sriov_cap + PCI_SRIOV_CTRL); + if (val & PCI_SRIOV_CTRL_VFE) { + val &= ~PCI_SRIOV_CTRL_VFE; + pcie_sriov_config_write(dev, sriov_cap + PCI_SRIOV_CTRL, val, 1); + } + } +} + +/* Add optional supported page sizes to the mask of supported page sizes */ +void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize) +{ + uint8_t *cfg = dev->config + dev->exp.sriov_cap; + uint8_t *wmask = dev->wmask + dev->exp.sriov_cap; + + uint16_t sup_pgsize = pci_get_word(cfg + PCI_SRIOV_SUP_PGSIZE); + + sup_pgsize |= opt_sup_pgsize; + + /* + * Make sure the new bits are set, and that system page size + * also can be set to any of the new values according to spec: + */ + pci_set_word(cfg + PCI_SRIOV_SUP_PGSIZE, sup_pgsize); + pci_set_word(wmask + PCI_SRIOV_SYS_PGSIZE, sup_pgsize); +} + + +uint16_t pcie_sriov_vf_number(PCIDevice *dev) +{ + assert(pci_is_vf(dev)); + return dev->exp.sriov_vf.vf_number; +} + +PCIDevice *pcie_sriov_get_pf(PCIDevice *dev) +{ + return dev->exp.sriov_vf.pf; +} + +PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n) +{ + assert(!pci_is_vf(dev)); + if (n < dev->exp.sriov_pf.num_vfs) { + return dev->exp.sriov_pf.vf[n]; + } + return NULL; +} diff --git a/hw/pci/trace-events b/hw/pci/trace-events index 7570752c40..aaf46bc92d 100644 --- a/hw/pci/trace-events +++ b/hw/pci/trace-events @@ -10,3 +10,8 @@ pci_cfg_write(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, unsig # msix.c msix_write_config(char *name, bool enabled, bool masked) "dev %s enabled %d masked %d" + +# hw/pci/pcie_sriov.c +sriov_register_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: creating %d vf devs" +sriov_unregister_vfs(const char *name, int slot, int function, int num_vfs) "%s %02x:%x: Unregistering %d vf devs" +sriov_config_write(const char *name, int slot, int fun, uint32_t offset, uint32_t val, uint32_t len) "%s %02x:%x: sriov offset 0x%x val 0x%x len %d" diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 4cc204f90d..953fc65fa8 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -27,6 +27,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "qemu/datadir.h" +#include "qemu/memalign.h" #include "qapi/error.h" #include "qapi/qapi-events-machine.h" #include "qapi/qapi-events-qdev.h" diff --git a/hw/ppc/spapr_softmmu.c b/hw/ppc/spapr_softmmu.c index 4ee03c83e4..5170a33369 100644 --- a/hw/ppc/spapr_softmmu.c +++ b/hw/ppc/spapr_softmmu.c @@ -1,5 +1,6 @@ #include "qemu/osdep.h" #include "qemu/cutils.h" +#include "qemu/memalign.h" #include "cpu.h" #include "helper_regs.h" #include "hw/ppc/spapr.h" diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 3666b8d946..072686ed58 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -26,6 +26,7 @@ #include "qemu/main-loop.h" #include "qemu/module.h" #include "qemu/hw-version.h" +#include "qemu/memalign.h" #include "hw/scsi/scsi.h" #include "migration/qemu-file-types.h" #include "migration/vmstate.h" diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 6013df1698..60349ee402 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -104,9 +104,11 @@ static struct { const char *sock_pfx, *manufacturer, *version, *serial, *asset, *part; uint64_t max_speed; uint64_t current_speed; + uint64_t processor_id; } type4 = { .max_speed = DEFAULT_CPU_SPEED, - .current_speed = DEFAULT_CPU_SPEED + .current_speed = DEFAULT_CPU_SPEED, + .processor_id = 0, }; static struct { @@ -327,6 +329,10 @@ static const QemuOptDesc qemu_smbios_type4_opts[] = { .name = "part", .type = QEMU_OPT_STRING, .help = "part number", + }, { + .name = "processor-id", + .type = QEMU_OPT_NUMBER, + .help = "processor id", }, { /* end of list */ } }; @@ -549,9 +555,23 @@ bool smbios_skip_table(uint8_t type, bool required_table) return true; } +#define T0_BASE 0x000 +#define T1_BASE 0x100 +#define T2_BASE 0x200 +#define T3_BASE 0x300 +#define T4_BASE 0x400 +#define T11_BASE 0xe00 + +#define T16_BASE 0x1000 +#define T17_BASE 0x1100 +#define T19_BASE 0x1300 +#define T32_BASE 0x2000 +#define T41_BASE 0x2900 +#define T127_BASE 0x7F00 + static void smbios_build_type_0_table(void) { - SMBIOS_BUILD_TABLE_PRE(0, 0x000, false); /* optional, leave up to BIOS */ + SMBIOS_BUILD_TABLE_PRE(0, T0_BASE, false); /* optional, leave up to BIOS */ SMBIOS_TABLE_SET_STR(0, vendor_str, type0.vendor); SMBIOS_TABLE_SET_STR(0, bios_version_str, type0.version); @@ -599,7 +619,7 @@ static void smbios_encode_uuid(struct smbios_uuid *uuid, QemuUUID *in) static void smbios_build_type_1_table(void) { - SMBIOS_BUILD_TABLE_PRE(1, 0x100, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(1, T1_BASE, true); /* required */ SMBIOS_TABLE_SET_STR(1, manufacturer_str, type1.manufacturer); SMBIOS_TABLE_SET_STR(1, product_name_str, type1.product); @@ -619,7 +639,7 @@ static void smbios_build_type_1_table(void) static void smbios_build_type_2_table(void) { - SMBIOS_BUILD_TABLE_PRE(2, 0x200, false); /* optional */ + SMBIOS_BUILD_TABLE_PRE(2, T2_BASE, false); /* optional */ SMBIOS_TABLE_SET_STR(2, manufacturer_str, type2.manufacturer); SMBIOS_TABLE_SET_STR(2, product_str, type2.product); @@ -637,7 +657,7 @@ static void smbios_build_type_2_table(void) static void smbios_build_type_3_table(void) { - SMBIOS_BUILD_TABLE_PRE(3, 0x300, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(3, T3_BASE, true); /* required */ SMBIOS_TABLE_SET_STR(3, manufacturer_str, type3.manufacturer); t->type = 0x01; /* Other */ @@ -662,15 +682,20 @@ static void smbios_build_type_4_table(MachineState *ms, unsigned instance) { char sock_str[128]; - SMBIOS_BUILD_TABLE_PRE(4, 0x400 + instance, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(4, T4_BASE + instance, true); /* required */ snprintf(sock_str, sizeof(sock_str), "%s%2x", type4.sock_pfx, instance); SMBIOS_TABLE_SET_STR(4, socket_designation_str, sock_str); t->processor_type = 0x03; /* CPU */ t->processor_family = 0x01; /* Other */ SMBIOS_TABLE_SET_STR(4, processor_manufacturer_str, type4.manufacturer); - t->processor_id[0] = cpu_to_le32(smbios_cpuid_version); - t->processor_id[1] = cpu_to_le32(smbios_cpuid_features); + if (type4.processor_id == 0) { + t->processor_id[0] = cpu_to_le32(smbios_cpuid_version); + t->processor_id[1] = cpu_to_le32(smbios_cpuid_features); + } else { + t->processor_id[0] = cpu_to_le32((uint32_t)type4.processor_id); + t->processor_id[1] = cpu_to_le32(type4.processor_id >> 32); + } SMBIOS_TABLE_SET_STR(4, processor_version_str, type4.version); t->voltage = 0; t->external_clock = cpu_to_le16(0); /* Unknown */ @@ -702,7 +727,7 @@ static void smbios_build_type_11_table(void) return; } - SMBIOS_BUILD_TABLE_PRE(11, 0xe00, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(11, T11_BASE, true); /* required */ snprintf(count_str, sizeof(count_str), "%zu", type11.nvalues); t->count = type11.nvalues; @@ -722,7 +747,7 @@ static void smbios_build_type_16_table(unsigned dimm_cnt) { uint64_t size_kb; - SMBIOS_BUILD_TABLE_PRE(16, 0x1000, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(16, T16_BASE, true); /* required */ t->location = 0x01; /* Other */ t->use = 0x03; /* System memory */ @@ -749,7 +774,7 @@ static void smbios_build_type_17_table(unsigned instance, uint64_t size) char loc_str[128]; uint64_t size_mb; - SMBIOS_BUILD_TABLE_PRE(17, 0x1100 + instance, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(17, T17_BASE + instance, true); /* required */ t->physical_memory_array_handle = cpu_to_le16(0x1000); /* Type 16 above */ t->memory_error_information_handle = cpu_to_le16(0xFFFE); /* Not provided */ @@ -785,12 +810,13 @@ static void smbios_build_type_17_table(unsigned instance, uint64_t size) SMBIOS_BUILD_TABLE_POST; } -static void smbios_build_type_19_table(unsigned instance, +static void smbios_build_type_19_table(unsigned instance, unsigned offset, uint64_t start, uint64_t size) { uint64_t end, start_kb, end_kb; - SMBIOS_BUILD_TABLE_PRE(19, 0x1300 + instance, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(19, T19_BASE + offset + instance, + true); /* required */ end = start + size - 1; assert(end > start); @@ -814,7 +840,7 @@ static void smbios_build_type_19_table(unsigned instance, static void smbios_build_type_32_table(void) { - SMBIOS_BUILD_TABLE_PRE(32, 0x2000, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(32, T32_BASE, true); /* required */ memset(t->reserved, 0, 6); t->boot_status = 0; /* No errors detected */ @@ -828,7 +854,7 @@ static void smbios_build_type_41_table(Error **errp) struct type41_instance *t41; QTAILQ_FOREACH(t41, &type41, next) { - SMBIOS_BUILD_TABLE_PRE(41, 0x2900 + instance, true); + SMBIOS_BUILD_TABLE_PRE(41, T41_BASE + instance, true); SMBIOS_TABLE_SET_STR(41, reference_designation_str, t41->designation); t->device_type = t41->kind; @@ -871,7 +897,7 @@ static void smbios_build_type_41_table(Error **errp) static void smbios_build_type_127_table(void) { - SMBIOS_BUILD_TABLE_PRE(127, 0x7F00, true); /* required */ + SMBIOS_BUILD_TABLE_PRE(127, T127_BASE, true); /* required */ SMBIOS_BUILD_TABLE_POST; } @@ -982,7 +1008,7 @@ void smbios_get_tables(MachineState *ms, uint8_t **anchor, size_t *anchor_len, Error **errp) { - unsigned i, dimm_cnt; + unsigned i, dimm_cnt, offset; if (smbios_legacy) { *tables = *anchor = NULL; @@ -1012,6 +1038,16 @@ void smbios_get_tables(MachineState *ms, dimm_cnt = QEMU_ALIGN_UP(current_machine->ram_size, MAX_DIMM_SZ) / MAX_DIMM_SZ; + /* + * The offset determines if we need to keep additional space betweeen + * table 17 and table 19 header handle numbers so that they do + * not overlap. For example, for a VM with larger than 8 TB guest + * memory and DIMM like chunks of 16 GiB, the default space between + * the two tables (T19_BASE - T17_BASE = 512) is not enough. + */ + offset = (dimm_cnt > (T19_BASE - T17_BASE)) ? \ + dimm_cnt - (T19_BASE - T17_BASE) : 0; + smbios_build_type_16_table(dimm_cnt); for (i = 0; i < dimm_cnt; i++) { @@ -1019,10 +1055,16 @@ void smbios_get_tables(MachineState *ms, } for (i = 0; i < mem_array_size; i++) { - smbios_build_type_19_table(i, mem_array[i].address, + smbios_build_type_19_table(i, offset, mem_array[i].address, mem_array[i].length); } + /* + * make sure 16 bit handle numbers in the headers of tables 19 + * and 32 do not overlap. + */ + assert((mem_array_size + offset) < (T32_BASE - T19_BASE)); + smbios_build_type_32_table(); smbios_build_type_38_table(); smbios_build_type_41_table(errp); @@ -1292,6 +1334,8 @@ void smbios_entry_add(QemuOpts *opts, Error **errp) save_opt(&type4.serial, opts, "serial"); save_opt(&type4.asset, opts, "asset"); save_opt(&type4.part, opts, "part"); + /* If the value is 0, it will take the value from the CPU model. */ + type4.processor_id = qemu_opt_get_number(opts, "processor-id", 0); type4.max_speed = qemu_opt_get_number(opts, "max-speed", DEFAULT_CPU_SPEED); type4.current_speed = qemu_opt_get_number(opts, "current-speed", diff --git a/hw/tpm/tpm_ppi.c b/hw/tpm/tpm_ppi.c index 6dbb9f41e4..c89ac53e65 100644 --- a/hw/tpm/tpm_ppi.c +++ b/hw/tpm/tpm_ppi.c @@ -12,7 +12,7 @@ */ #include "qemu/osdep.h" - +#include "qemu/memalign.h" #include "qapi/error.h" #include "sysemu/memory_mapping.h" #include "migration/vmstate.h" diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index f7ad6be5fb..a5102eac9e 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -89,9 +89,11 @@ virtio_mmio_setting_irq(int level) "virtio_mmio setting IRQ %d" # virtio-iommu.c virtio_iommu_device_reset(void) "reset!" +virtio_iommu_system_reset(void) "system reset!" virtio_iommu_get_features(uint64_t features) "device supports features=0x%"PRIx64 virtio_iommu_device_status(uint8_t status) "driver status = %d" -virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x" +virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, uint32_t domain_start, uint32_t domain_end, uint32_t probe_size, uint8_t bypass) "page_size_mask=0x%"PRIx64" input range start=0x%"PRIx64" input range end=0x%"PRIx64" domain range start=%d domain range end=%d probe_size=0x%x bypass=0x%x" +virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x" virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d" diff --git a/hw/virtio/vhost-user-i2c.c b/hw/virtio/vhost-user-i2c.c index d172632bb0..42c7f6d9e5 100644 --- a/hw/virtio/vhost-user-i2c.c +++ b/hw/virtio/vhost-user-i2c.c @@ -19,6 +19,11 @@ #define VIRTIO_ID_I2C_ADAPTER 34 #endif +static const int feature_bits[] = { + VIRTIO_I2C_F_ZERO_LENGTH_REQUEST, + VHOST_INVALID_FEATURE_BIT +}; + static void vu_i2c_start(VirtIODevice *vdev) { BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); @@ -113,8 +118,10 @@ static void vu_i2c_set_status(VirtIODevice *vdev, uint8_t status) static uint64_t vu_i2c_get_features(VirtIODevice *vdev, uint64_t requested_features, Error **errp) { - /* No feature bits used yet */ - return requested_features; + VHostUserI2C *i2c = VHOST_USER_I2C(vdev); + + virtio_add_feature(&requested_features, VIRTIO_I2C_F_ZERO_LENGTH_REQUEST); + return vhost_get_features(&i2c->vhost_dev, feature_bits, requested_features); } static void vu_i2c_handle_output(VirtIODevice *vdev, VirtQueue *vq) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 662853513e..6abbc9da32 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -25,6 +25,7 @@ #include "migration/migration.h" #include "migration/postcopy-ram.h" #include "trace.h" +#include "exec/ramblock.h" #include <sys/ioctl.h> #include <sys/socket.h> @@ -1162,37 +1163,32 @@ static int vhost_user_set_vring_num(struct vhost_dev *dev, return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring); } -static void vhost_user_host_notifier_restore(struct vhost_dev *dev, - int queue_idx) +static void vhost_user_host_notifier_free(VhostUserHostNotifier *n) { - struct vhost_user *u = dev->opaque; - VhostUserHostNotifier *n = &u->user->notifier[queue_idx]; - VirtIODevice *vdev = dev->vdev; - - if (n->addr && !n->set) { - virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true); - n->set = true; - } + assert(n && n->unmap_addr); + munmap(n->unmap_addr, qemu_real_host_page_size); + n->unmap_addr = NULL; } -static void vhost_user_host_notifier_remove(struct vhost_dev *dev, - int queue_idx) +static void vhost_user_host_notifier_remove(VhostUserState *user, + VirtIODevice *vdev, int queue_idx) { - struct vhost_user *u = dev->opaque; - VhostUserHostNotifier *n = &u->user->notifier[queue_idx]; - VirtIODevice *vdev = dev->vdev; + VhostUserHostNotifier *n = &user->notifier[queue_idx]; - if (n->addr && n->set) { - virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false); - n->set = false; + if (n->addr) { + if (vdev) { + virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false); + } + assert(!n->unmap_addr); + n->unmap_addr = n->addr; + n->addr = NULL; + call_rcu(n, vhost_user_host_notifier_free, rcu); } } static int vhost_user_set_vring_base(struct vhost_dev *dev, struct vhost_vring_state *ring) { - vhost_user_host_notifier_restore(dev, ring->index); - return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring); } @@ -1235,8 +1231,9 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev, .payload.state = *ring, .hdr.size = sizeof(msg.payload.state), }; + struct vhost_user *u = dev->opaque; - vhost_user_host_notifier_remove(dev, ring->index); + vhost_user_host_notifier_remove(u->user, dev->vdev, ring->index); ret = vhost_user_write(dev, &msg, NULL, 0); if (ret < 0) { @@ -1522,12 +1519,7 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, n = &user->notifier[queue_idx]; - if (n->addr) { - virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, false); - object_unparent(OBJECT(&n->mr)); - munmap(n->addr, page_size); - n->addr = NULL; - } + vhost_user_host_notifier_remove(user, vdev, queue_idx); if (area->u64 & VHOST_USER_VRING_NOFD_MASK) { return 0; @@ -1546,9 +1538,12 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]", user, queue_idx); - if (!n->mr.ram) /* Don't init again after suspend. */ + if (!n->mr.ram) { /* Don't init again after suspend. */ memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name, page_size, addr); + } else { + n->mr.ram_block->host = addr; + } g_free(name); if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) { @@ -1558,7 +1553,6 @@ static int vhost_user_slave_handle_vring_host_notifier(struct vhost_dev *dev, } n->addr = addr; - n->set = true; return 0; } @@ -2522,17 +2516,16 @@ bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp) void vhost_user_cleanup(VhostUserState *user) { int i; + VhostUserHostNotifier *n; if (!user->chr) { return; } memory_region_transaction_begin(); for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { - if (user->notifier[i].addr) { - object_unparent(OBJECT(&user->notifier[i].mr)); - munmap(user->notifier[i].addr, qemu_real_host_page_size); - user->notifier[i].addr = NULL; - } + n = &user->notifier[i]; + vhost_user_host_notifier_remove(user, NULL, i); + object_unparent(OBJECT(&n->mr)); } memory_region_transaction_commit(); user->chr = NULL; diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 04ea43704f..6c67d5f034 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -395,15 +395,6 @@ static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev, } } -static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) -{ - int i; - - for (i = 0; i < n; i++) { - vhost_vdpa_host_notifier_uninit(dev, i); - } -} - static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index) { size_t page_size = qemu_real_host_page_size; @@ -431,6 +422,7 @@ static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index) g_free(name); if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) { + object_unparent(OBJECT(&n->mr)); munmap(addr, page_size); goto err; } @@ -442,6 +434,15 @@ err: return -1; } +static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) +{ + int i; + + for (i = dev->vq_index; i < dev->vq_index + n; i++) { + vhost_vdpa_host_notifier_uninit(dev, i); + } +} + static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) { int i; @@ -455,7 +456,7 @@ static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) return; err: - vhost_vdpa_host_notifiers_uninit(dev, i); + vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index); return; } diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c index 3f3771274e..ed706681ac 100644 --- a/hw/virtio/vhost-vsock-common.c +++ b/hw/virtio/vhost-vsock-common.c @@ -153,19 +153,23 @@ static void vhost_vsock_common_send_transport_reset(VHostVSockCommon *vvc) if (elem->out_num) { error_report("invalid vhost-vsock event virtqueue element with " "out buffers"); - goto out; + goto err; } if (iov_from_buf(elem->in_sg, elem->in_num, 0, &event, sizeof(event)) != sizeof(event)) { error_report("vhost-vsock event virtqueue element is too short"); - goto out; + goto err; } virtqueue_push(vq, elem, sizeof(event)); virtio_notify(VIRTIO_DEVICE(vvc), vq); -out: + g_free(elem); + return; + +err: + virtqueue_detach_element(vq, elem, 0); g_free(elem); } diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 7b03efccec..b643f42ea4 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1287,7 +1287,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, return r; } - file.fd = event_notifier_get_fd(&vq->masked_notifier); + file.fd = event_notifier_get_wfd(&vq->masked_notifier); r = dev->vhost_ops->vhost_set_vring_call(dev, &file); if (r) { VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed"); @@ -1542,9 +1542,9 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, if (mask) { assert(vdev->use_guest_notifier_mask); - file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier); + file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier); } else { - file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq)); + file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq)); } file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c index d23db98c56..0f69d1c742 100644 --- a/hw/virtio/virtio-bus.c +++ b/hw/virtio/virtio-bus.c @@ -48,6 +48,7 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) VirtioBusClass *klass = VIRTIO_BUS_GET_CLASS(bus); VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); bool has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); + bool vdev_has_iommu; Error *local_err = NULL; DPRINTF("%s: plug device.\n", qbus->name); @@ -69,11 +70,6 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) return; } - if (has_iommu && !virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM)) { - error_setg(errp, "iommu_platform=true is not supported by the device"); - return; - } - if (klass->device_plugged != NULL) { klass->device_plugged(qbus->parent, &local_err); } @@ -82,9 +78,15 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) return; } + vdev_has_iommu = virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM); if (klass->get_dma_as != NULL && has_iommu) { virtio_add_feature(&vdev->host_features, VIRTIO_F_IOMMU_PLATFORM); vdev->dma_as = klass->get_dma_as(qbus->parent); + if (!vdev_has_iommu && vdev->dma_as != &address_space_memory) { + error_setg(errp, + "iommu_platform=true is not supported by the device"); + return; + } } else { vdev->dma_as = &address_space_memory; } diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index aa9c16a17b..239fe97b12 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -24,6 +24,7 @@ #include "hw/qdev-properties.h" #include "hw/virtio/virtio.h" #include "sysemu/kvm.h" +#include "sysemu/reset.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "trace.h" @@ -42,6 +43,7 @@ typedef struct VirtIOIOMMUDomain { uint32_t id; + bool bypass; GTree *mappings; QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list; } VirtIOIOMMUDomain; @@ -257,12 +259,16 @@ static void virtio_iommu_put_endpoint(gpointer data) } static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s, - uint32_t domain_id) + uint32_t domain_id, + bool bypass) { VirtIOIOMMUDomain *domain; domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id)); if (domain) { + if (domain->bypass != bypass) { + return NULL; + } return domain; } domain = g_malloc0(sizeof(*domain)); @@ -270,6 +276,7 @@ static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s, domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp, NULL, (GDestroyNotify)g_free, (GDestroyNotify)g_free); + domain->bypass = bypass; g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain); QLIST_INIT(&domain->endpoint_list); trace_virtio_iommu_get_domain(domain_id); @@ -333,11 +340,16 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, { uint32_t domain_id = le32_to_cpu(req->domain); uint32_t ep_id = le32_to_cpu(req->endpoint); + uint32_t flags = le32_to_cpu(req->flags); VirtIOIOMMUDomain *domain; VirtIOIOMMUEndpoint *ep; trace_virtio_iommu_attach(domain_id, ep_id); + if (flags & ~VIRTIO_IOMMU_ATTACH_F_BYPASS) { + return VIRTIO_IOMMU_S_INVAL; + } + ep = virtio_iommu_get_endpoint(s, ep_id); if (!ep) { return VIRTIO_IOMMU_S_NOENT; @@ -355,7 +367,12 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, } } - domain = virtio_iommu_get_domain(s, domain_id); + domain = virtio_iommu_get_domain(s, domain_id, + flags & VIRTIO_IOMMU_ATTACH_F_BYPASS); + if (!domain) { + /* Incompatible bypass flag */ + return VIRTIO_IOMMU_S_INVAL; + } QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next); ep->domain = domain; @@ -418,6 +435,10 @@ static int virtio_iommu_map(VirtIOIOMMU *s, return VIRTIO_IOMMU_S_NOENT; } + if (domain->bypass) { + return VIRTIO_IOMMU_S_INVAL; + } + interval = g_malloc0(sizeof(*interval)); interval->low = virt_start; @@ -463,6 +484,11 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, if (!domain) { return VIRTIO_IOMMU_S_NOENT; } + + if (domain->bypass) { + return VIRTIO_IOMMU_S_INVAL; + } + interval.low = virt_start; interval.high = virt_end; @@ -728,8 +754,7 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, .perm = IOMMU_NONE, }; - bypass_allowed = virtio_vdev_has_feature(&s->parent_obj, - VIRTIO_IOMMU_F_BYPASS); + bypass_allowed = s->config.bypass; sid = virtio_iommu_get_bdf(sdev); @@ -780,6 +805,9 @@ static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr, entry.perm = flag; } goto unlock; + } else if (ep->domain->bypass) { + entry.perm = flag; + goto unlock; } found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval), @@ -831,13 +859,37 @@ static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data) out_config->domain_range.start = cpu_to_le32(dev_config->domain_range.start); out_config->domain_range.end = cpu_to_le32(dev_config->domain_range.end); out_config->probe_size = cpu_to_le32(dev_config->probe_size); + out_config->bypass = dev_config->bypass; trace_virtio_iommu_get_config(dev_config->page_size_mask, dev_config->input_range.start, dev_config->input_range.end, dev_config->domain_range.start, dev_config->domain_range.end, - dev_config->probe_size); + dev_config->probe_size, + dev_config->bypass); +} + +static void virtio_iommu_set_config(VirtIODevice *vdev, + const uint8_t *config_data) +{ + VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev); + struct virtio_iommu_config *dev_config = &dev->config; + const struct virtio_iommu_config *in_config = (void *)config_data; + + if (in_config->bypass != dev_config->bypass) { + if (!virtio_vdev_has_feature(vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) { + virtio_error(vdev, "cannot set config.bypass"); + return; + } else if (in_config->bypass != 0 && in_config->bypass != 1) { + virtio_error(vdev, "invalid config.bypass value '%u'", + in_config->bypass); + return; + } + dev_config->bypass = in_config->bypass; + } + + trace_virtio_iommu_set_config(in_config->bypass); } static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f, @@ -963,6 +1015,19 @@ static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, return 0; } +static void virtio_iommu_system_reset(void *opaque) +{ + VirtIOIOMMU *s = opaque; + + trace_virtio_iommu_system_reset(); + + /* + * config.bypass is sticky across device reset, but should be restored on + * system reset + */ + s->config.bypass = s->boot_bypass; +} + static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); @@ -988,9 +1053,9 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE); virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE); virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP); - virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS); virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO); virtio_add_feature(&s->features, VIRTIO_IOMMU_F_PROBE); + virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS_CONFIG); qemu_mutex_init(&s->mutex); @@ -1001,6 +1066,8 @@ static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) } else { error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!"); } + + qemu_register_reset(virtio_iommu_system_reset, s); } static void virtio_iommu_device_unrealize(DeviceState *dev) @@ -1008,6 +1075,8 @@ static void virtio_iommu_device_unrealize(DeviceState *dev) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOIOMMU *s = VIRTIO_IOMMU(dev); + qemu_unregister_reset(virtio_iommu_system_reset, s); + g_hash_table_destroy(s->as_by_busptr); if (s->domains) { g_tree_destroy(s->domains); @@ -1098,8 +1167,8 @@ static const VMStateDescription vmstate_endpoint = { static const VMStateDescription vmstate_domain = { .name = "domain", - .version_id = 1, - .minimum_version_id = 1, + .version_id = 2, + .minimum_version_id = 2, .pre_load = domain_preload, .fields = (VMStateField[]) { VMSTATE_UINT32(id, VirtIOIOMMUDomain), @@ -1108,6 +1177,7 @@ static const VMStateDescription vmstate_domain = { VirtIOIOMMUInterval, VirtIOIOMMUMapping), VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1, vmstate_endpoint, VirtIOIOMMUEndpoint, next), + VMSTATE_BOOL_V(bypass, VirtIOIOMMUDomain, 2), VMSTATE_END_OF_LIST() } }; @@ -1141,21 +1211,22 @@ static int iommu_post_load(void *opaque, int version_id) static const VMStateDescription vmstate_virtio_iommu_device = { .name = "virtio-iommu-device", - .minimum_version_id = 1, - .version_id = 1, + .minimum_version_id = 2, + .version_id = 2, .post_load = iommu_post_load, .fields = (VMStateField[]) { - VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 1, + VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 2, &vmstate_domain, VirtIOIOMMUDomain), + VMSTATE_UINT8_V(config.bypass, VirtIOIOMMU, 2), VMSTATE_END_OF_LIST() }, }; static const VMStateDescription vmstate_virtio_iommu = { .name = "virtio-iommu", - .minimum_version_id = 1, + .minimum_version_id = 2, .priority = MIG_PRI_IOMMU, - .version_id = 1, + .version_id = 2, .fields = (VMStateField[]) { VMSTATE_VIRTIO_DEVICE, VMSTATE_END_OF_LIST() @@ -1164,6 +1235,7 @@ static const VMStateDescription vmstate_virtio_iommu = { static Property virtio_iommu_properties[] = { DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus, "PCI", PCIBus *), + DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true), DEFINE_PROP_END_OF_LIST(), }; @@ -1180,6 +1252,7 @@ static void virtio_iommu_class_init(ObjectClass *klass, void *data) vdc->unrealize = virtio_iommu_device_unrealize; vdc->reset = virtio_iommu_device_reset; vdc->get_config = virtio_iommu_get_config; + vdc->set_config = virtio_iommu_set_config; vdc->get_features = virtio_iommu_get_features; vdc->set_status = virtio_iommu_set_status; vdc->vmsd = &vmstate_virtio_iommu_device; |