diff options
-rw-r--r-- | contrib/ivshmem-server/ivshmem-server.c | 56 | ||||
-rw-r--r-- | contrib/ivshmem-server/ivshmem-server.h | 4 | ||||
-rw-r--r-- | contrib/ivshmem-server/main.c | 100 | ||||
-rw-r--r-- | default-configs/pci.mak | 2 | ||||
-rw-r--r-- | docs/specs/ivshmem-spec.txt | 254 | ||||
-rw-r--r-- | docs/specs/ivshmem_device_spec.txt | 161 | ||||
-rw-r--r-- | hw/core/qdev-properties.c | 10 | ||||
-rw-r--r-- | hw/misc/ivshmem.c | 1084 | ||||
-rw-r--r-- | include/hw/qdev-properties.h | 3 | ||||
-rw-r--r-- | qemu-doc.texi | 49 | ||||
-rw-r--r-- | target-ppc/kvm.c | 6 | ||||
-rw-r--r-- | tests/Makefile | 2 | ||||
-rw-r--r-- | tests/ivshmem-test.c | 95 | ||||
-rw-r--r-- | tests/libqos/pci-pc.c | 8 | ||||
-rw-r--r-- | util/event_notifier-posix.c | 6 |
15 files changed, 1021 insertions, 819 deletions
diff --git a/contrib/ivshmem-server/ivshmem-server.c b/contrib/ivshmem-server/ivshmem-server.c index bfd0fad49a..172db78b37 100644 --- a/contrib/ivshmem-server/ivshmem-server.c +++ b/contrib/ivshmem-server/ivshmem-server.c @@ -12,9 +12,6 @@ #include <sys/mman.h> #include <sys/socket.h> #include <sys/un.h> -#ifdef CONFIG_LINUX -#include <sys/vfs.h> -#endif #include "ivshmem-server.h" @@ -257,7 +254,8 @@ ivshmem_server_ftruncate(int fd, unsigned shmsize) /* Init a new ivshmem server */ int ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path, - const char *shm_path, size_t shm_size, unsigned n_vectors, + const char *shm_path, bool use_shm_open, + size_t shm_size, unsigned n_vectors, bool verbose) { int ret; @@ -278,6 +276,7 @@ ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path, return -1; } + server->use_shm_open = use_shm_open; server->shm_size = shm_size; server->n_vectors = n_vectors; @@ -286,31 +285,6 @@ ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path, return 0; } -#ifdef CONFIG_LINUX - -#define HUGETLBFS_MAGIC 0x958458f6 - -static long gethugepagesize(const char *path) -{ - struct statfs fs; - int ret; - - do { - ret = statfs(path, &fs); - } while (ret != 0 && errno == EINTR); - - if (ret != 0) { - return -1; - } - - if (fs.f_type != HUGETLBFS_MAGIC) { - return -1; - } - - return fs.f_bsize; -} -#endif - /* open shm, create and bind to the unix socket */ int ivshmem_server_start(IvshmemServer *server) @@ -319,27 +293,17 @@ ivshmem_server_start(IvshmemServer *server) int shm_fd, sock_fd, ret; /* open shm file */ -#ifdef CONFIG_LINUX - long hpagesize; - - hpagesize = gethugepagesize(server->shm_path); - if (hpagesize < 0 && errno != ENOENT) { - IVSHMEM_SERVER_DEBUG(server, "cannot stat shm file %s: %s\n", - server->shm_path, strerror(errno)); - } - - if (hpagesize > 0) { + if (server->use_shm_open) { + IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n", + server->shm_path); + shm_fd = shm_open(server->shm_path, O_CREAT | O_RDWR, S_IRWXU); + } else { gchar *filename = g_strdup_printf("%s/ivshmem.XXXXXX", server->shm_path); - IVSHMEM_SERVER_DEBUG(server, "Using hugepages: %s\n", server->shm_path); + IVSHMEM_SERVER_DEBUG(server, "Using file-backed shared memory: %s\n", + server->shm_path); shm_fd = mkstemp(filename); unlink(filename); g_free(filename); - } else -#endif - { - IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n", - server->shm_path); - shm_fd = shm_open(server->shm_path, O_CREAT|O_RDWR, S_IRWXU); } if (shm_fd < 0) { diff --git a/contrib/ivshmem-server/ivshmem-server.h b/contrib/ivshmem-server/ivshmem-server.h index e9de8a369d..3851639618 100644 --- a/contrib/ivshmem-server/ivshmem-server.h +++ b/contrib/ivshmem-server/ivshmem-server.h @@ -66,6 +66,7 @@ typedef struct IvshmemServer { char unix_sock_path[PATH_MAX]; /**< path to unix socket */ int sock_fd; /**< unix sock file descriptor */ char shm_path[PATH_MAX]; /**< path to shm */ + bool use_shm_open; size_t shm_size; /**< size of shm */ int shm_fd; /**< shm file descriptor */ unsigned n_vectors; /**< number of vectors */ @@ -89,7 +90,8 @@ typedef struct IvshmemServer { */ int ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path, - const char *shm_path, size_t shm_size, unsigned n_vectors, + const char *shm_path, bool use_shm_open, + size_t shm_size, unsigned n_vectors, bool verbose); /** diff --git a/contrib/ivshmem-server/main.c b/contrib/ivshmem-server/main.c index cca1061f0e..dc64a1832c 100644 --- a/contrib/ivshmem-server/main.c +++ b/contrib/ivshmem-server/main.c @@ -29,35 +29,38 @@ typedef struct IvshmemServerArgs { const char *pid_file; const char *unix_socket_path; const char *shm_path; + bool use_shm_open; uint64_t shm_size; unsigned n_vectors; } IvshmemServerArgs; -/* show ivshmem_server_usage and exit with given error code */ static void -ivshmem_server_usage(const char *name, int code) +ivshmem_server_usage(const char *progname) { - fprintf(stderr, "%s [opts]\n", name); - fprintf(stderr, " -h: show this help\n"); - fprintf(stderr, " -v: verbose mode\n"); - fprintf(stderr, " -F: foreground mode (default is to daemonize)\n"); - fprintf(stderr, " -p <pid_file>: path to the PID file (used in daemon\n" - " mode only).\n" - " Default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH); - fprintf(stderr, " -S <unix_socket_path>: path to the unix socket\n" - " to listen to.\n" - " Default=%s\n", IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH); - fprintf(stderr, " -m <shm_path>: path to the shared memory.\n" - " The path corresponds to a POSIX shm name or a\n" - " hugetlbfs mount point.\n" - " default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH); - fprintf(stderr, " -l <size>: size of shared memory in bytes. The suffix\n" - " K, M and G can be used (ex: 1K means 1024).\n" - " default=%u\n", IVSHMEM_SERVER_DEFAULT_SHM_SIZE); - fprintf(stderr, " -n <n_vects>: number of vectors.\n" - " default=%u\n", IVSHMEM_SERVER_DEFAULT_N_VECTORS); - - exit(code); + printf("Usage: %s [OPTION]...\n" + " -h: show this help\n" + " -v: verbose mode\n" + " -F: foreground mode (default is to daemonize)\n" + " -p <pid-file>: path to the PID file (used in daemon mode only)\n" + " default " IVSHMEM_SERVER_DEFAULT_PID_FILE "\n" + " -S <unix-socket-path>: path to the unix socket to listen to\n" + " default " IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH "\n" + " -M <shm-name>: POSIX shared memory object to use\n" + " default " IVSHMEM_SERVER_DEFAULT_SHM_PATH "\n" + " -m <dir-name>: where to create shared memory\n" + " -l <size>: size of shared memory in bytes\n" + " suffixes K, M and G can be used, e.g. 1K means 1024\n" + " default %u\n" + " -n <nvectors>: number of vectors\n" + " default %u\n", + progname, IVSHMEM_SERVER_DEFAULT_SHM_SIZE, + IVSHMEM_SERVER_DEFAULT_N_VECTORS); +} + +static void +ivshmem_server_help(const char *progname) +{ + fprintf(stderr, "Try '%s -h' for more information.\n", progname); } /* parse the program arguments, exit on error */ @@ -68,20 +71,12 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[]) unsigned long long v; Error *err = NULL; - while ((c = getopt(argc, argv, - "h" /* help */ - "v" /* verbose */ - "F" /* foreground */ - "p:" /* pid_file */ - "S:" /* unix_socket_path */ - "m:" /* shm_path */ - "l:" /* shm_size */ - "n:" /* n_vectors */ - )) != -1) { + while ((c = getopt(argc, argv, "hvFp:S:m:M:l:n:")) != -1) { switch (c) { case 'h': /* help */ - ivshmem_server_usage(argv[0], 0); + ivshmem_server_usage(argv[0]); + exit(0); break; case 'v': /* verbose */ @@ -92,36 +87,41 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[]) args->foreground = 1; break; - case 'p': /* pid_file */ + case 'p': /* pid file */ args->pid_file = optarg; break; - case 'S': /* unix_socket_path */ + case 'S': /* unix socket path */ args->unix_socket_path = optarg; break; - case 'm': /* shm_path */ + case 'M': /* shm name */ + case 'm': /* dir name */ args->shm_path = optarg; + args->use_shm_open = c == 'M'; break; - case 'l': /* shm_size */ + case 'l': /* shm size */ parse_option_size("shm_size", optarg, &args->shm_size, &err); if (err) { error_report_err(err); - ivshmem_server_usage(argv[0], 1); + ivshmem_server_help(argv[0]); + exit(1); } break; - case 'n': /* n_vectors */ + case 'n': /* number of vectors */ if (parse_uint_full(optarg, &v, 0) < 0) { fprintf(stderr, "cannot parse n_vectors\n"); - ivshmem_server_usage(argv[0], 1); + ivshmem_server_help(argv[0]); + exit(1); } args->n_vectors = v; break; default: - ivshmem_server_usage(argv[0], 1); + ivshmem_server_usage(argv[0]); + exit(1); break; } } @@ -129,12 +129,14 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[]) if (args->n_vectors > IVSHMEM_SERVER_MAX_VECTORS) { fprintf(stderr, "too many requested vectors (max is %d)\n", IVSHMEM_SERVER_MAX_VECTORS); - ivshmem_server_usage(argv[0], 1); + ivshmem_server_help(argv[0]); + exit(1); } if (args->verbose == 1 && args->foreground == 0) { fprintf(stderr, "cannot use verbose in daemon mode\n"); - ivshmem_server_usage(argv[0], 1); + ivshmem_server_help(argv[0]); + exit(1); } } @@ -192,11 +194,18 @@ main(int argc, char *argv[]) .pid_file = IVSHMEM_SERVER_DEFAULT_PID_FILE, .unix_socket_path = IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH, .shm_path = IVSHMEM_SERVER_DEFAULT_SHM_PATH, + .use_shm_open = true, .shm_size = IVSHMEM_SERVER_DEFAULT_SHM_SIZE, .n_vectors = IVSHMEM_SERVER_DEFAULT_N_VECTORS, }; int ret = 1; + /* + * Do not remove this notice without adding proper error handling! + * Start with handling ivshmem_server_send_one_msg() failure. + */ + printf("*** Example code, do not use in production ***\n"); + /* parse arguments, will exit on error */ ivshmem_server_parse_args(&args, argc, argv); @@ -219,7 +228,8 @@ main(int argc, char *argv[]) } /* init the ivshms structure */ - if (ivshmem_server_init(&server, args.unix_socket_path, args.shm_path, + if (ivshmem_server_init(&server, args.unix_socket_path, + args.shm_path, args.use_shm_open, args.shm_size, args.n_vectors, args.verbose) < 0) { fprintf(stderr, "cannot init server\n"); goto err; diff --git a/default-configs/pci.mak b/default-configs/pci.mak index 4fa9a28ef6..9c8bc68c4c 100644 --- a/default-configs/pci.mak +++ b/default-configs/pci.mak @@ -36,5 +36,5 @@ CONFIG_SDHCI=y CONFIG_EDU=y CONFIG_VGA=y CONFIG_VGA_PCI=y -CONFIG_IVSHMEM=$(CONFIG_POSIX) +CONFIG_IVSHMEM=$(CONFIG_EVENTFD) CONFIG_ROCKER=y diff --git a/docs/specs/ivshmem-spec.txt b/docs/specs/ivshmem-spec.txt new file mode 100644 index 0000000000..a1f5499796 --- /dev/null +++ b/docs/specs/ivshmem-spec.txt @@ -0,0 +1,254 @@ += Device Specification for Inter-VM shared memory device = + +The Inter-VM shared memory device (ivshmem) is designed to share a +memory region between multiple QEMU processes running different guests +and the host. In order for all guests to be able to pick up the +shared memory area, it is modeled by QEMU as a PCI device exposing +said memory to the guest as a PCI BAR. + +The device can use a shared memory object on the host directly, or it +can obtain one from an ivshmem server. + +In the latter case, the device can additionally interrupt its peers, and +get interrupted by its peers. + + +== Configuring the ivshmem PCI device == + +There are two basic configurations: + +- Just shared memory: -device ivshmem-plain,memdev=HMB,... + + This uses host memory backend HMB. It should have option "share" + set. + +- Shared memory plus interrupts: -device ivshmem,chardev=CHR,vectors=N,... + + An ivshmem server must already be running on the host. The device + connects to the server's UNIX domain socket via character device + CHR. + + Each peer gets assigned a unique ID by the server. IDs must be + between 0 and 65535. + + Interrupts are message-signaled (MSI-X). vectors=N configures the + number of vectors to use. + +For more details on ivshmem device properties, see The QEMU Emulator +User Documentation (qemu-doc.*). + + +== The ivshmem PCI device's guest interface == + +The device has vendor ID 1af4, device ID 1110, revision 1. Before +QEMU 2.6.0, it had revision 0. + +=== PCI BARs === + +The ivshmem PCI device has two or three BARs: + +- BAR0 holds device registers (256 Byte MMIO) +- BAR1 holds MSI-X table and PBA (only ivshmem-doorbell) +- BAR2 maps the shared memory object + +There are two ways to use this device: + +- If you only need the shared memory part, BAR2 suffices. This way, + you have access to the shared memory in the guest and can use it as + you see fit. Memnic, for example, uses ivshmem this way from guest + user space (see http://dpdk.org/browse/memnic). + +- If you additionally need the capability for peers to interrupt each + other, you need BAR0 and BAR1. You will most likely want to write a + kernel driver to handle interrupts. Requires the device to be + configured for interrupts, obviously. + +Before QEMU 2.6.0, BAR2 can initially be invalid if the device is +configured for interrupts. It becomes safely accessible only after +the ivshmem server provided the shared memory. These devices have PCI +revision 0 rather than 1. Guest software should wait for the +IVPosition register (described below) to become non-negative before +accessing BAR2. + +Revision 0 of the device is not capable to tell guest software whether +it is configured for interrupts. + +=== PCI device registers === + +BAR 0 contains the following registers: + + Offset Size Access On reset Function + 0 4 read/write 0 Interrupt Mask + bit 0: peer interrupt (rev 0) + reserved (rev 1) + bit 1..31: reserved + 4 4 read/write 0 Interrupt Status + bit 0: peer interrupt (rev 0) + reserved (rev 1) + bit 1..31: reserved + 8 4 read-only 0 or ID IVPosition + 12 4 write-only N/A Doorbell + bit 0..15: vector + bit 16..31: peer ID + 16 240 none N/A reserved + +Software should only access the registers as specified in column +"Access". Reserved bits should be ignored on read, and preserved on +write. + +In revision 0 of the device, Interrupt Status and Mask Register +together control the legacy INTx interrupt when the device has no +MSI-X capability: INTx is asserted when the bit-wise AND of Status and +Mask is non-zero and the device has no MSI-X capability. Interrupt +Status Register bit 0 becomes 1 when an interrupt request from a peer +is received. Reading the register clears it. + +IVPosition Register: if the device is not configured for interrupts, +this is zero. Else, it is the device's ID (between 0 and 65535). + +Before QEMU 2.6.0, the register may read -1 for a short while after +reset. These devices have PCI revision 0 rather than 1. + +There is no good way for software to find out whether the device is +configured for interrupts. A positive IVPosition means interrupts, +but zero could be either. + +Doorbell Register: writing this register requests to interrupt a peer. +The written value's high 16 bits are the ID of the peer to interrupt, +and its low 16 bits select an interrupt vector. + +If the device is not configured for interrupts, the write is ignored. + +If the interrupt hasn't completed setup, the write is ignored. The +device is not capable to tell guest software whether setup is +complete. Interrupts can regress to this state on migration. + +If the peer with the requested ID isn't connected, or it has fewer +interrupt vectors connected, the write is ignored. The device is not +capable to tell guest software what peers are connected, or how many +interrupt vectors are connected. + +The peer's interrupt for this vector then becomes pending. There is +no way for software to clear the pending bit, and a polling mode of +operation is therefore impossible. + +If the peer is a revision 0 device without MSI-X capability, its +Interrupt Status register is set to 1. This asserts INTx unless +masked by the Interrupt Mask register. The device is not capable to +communicate the interrupt vector to guest software then. + +With multiple MSI-X vectors, different vectors can be used to indicate +different events have occurred. The semantics of interrupt vectors +are left to the application. + + +== Interrupt infrastructure == + +When configured for interrupts, the peers share eventfd objects in +addition to shared memory. The shared resources are managed by an +ivshmem server. + +=== The ivshmem server === + +The server listens on a UNIX domain socket. + +For each new client that connects to the server, the server +- picks an ID, +- creates eventfd file descriptors for the interrupt vectors, +- sends the ID and the file descriptor for the shared memory to the + new client, +- sends connect notifications for the new client to the other clients + (these contain file descriptors for sending interrupts), +- sends connect notifications for the other clients to the new client, + and +- sends interrupt setup messages to the new client (these contain file + descriptors for receiving interrupts). + +The first client to connect to the server receives ID zero. + +When a client disconnects from the server, the server sends disconnect +notifications to the other clients. + +The next section describes the protocol in detail. + +If the server terminates without sending disconnect notifications for +its connected clients, the clients can elect to continue. They can +communicate with each other normally, but won't receive disconnect +notification on disconnect, and no new clients can connect. There is +no way for the clients to connect to a restarted server. The device +is not capable to tell guest software whether the server is still up. + +Example server code is in contrib/ivshmem-server/. Not to be used in +production. It assumes all clients use the same number of interrupt +vectors. + +A standalone client is in contrib/ivshmem-client/. It can be useful +for debugging. + +=== The ivshmem Client-Server Protocol === + +An ivshmem device configured for interrupts connects to an ivshmem +server. This section details the protocol between the two. + +The connection is one-way: the server sends messages to the client. +Each message consists of a single 8 byte little-endian signed number, +and may be accompanied by a file descriptor via SCM_RIGHTS. Both +client and server close the connection on error. + +Note: QEMU currently doesn't close the connection right on error, but +only when the character device is destroyed. + +On connect, the server sends the following messages in order: + +1. The protocol version number, currently zero. The client should + close the connection on receipt of versions it can't handle. + +2. The client's ID. This is unique among all clients of this server. + IDs must be between 0 and 65535, because the Doorbell register + provides only 16 bits for them. + +3. The number -1, accompanied by the file descriptor for the shared + memory. + +4. Connect notifications for existing other clients, if any. This is + a peer ID (number between 0 and 65535 other than the client's ID), + repeated N times. Each repetition is accompanied by one file + descriptor. These are for interrupting the peer with that ID using + vector 0,..,N-1, in order. If the client is configured for fewer + vectors, it closes the extra file descriptors. If it is configured + for more, the extra vectors remain unconnected. + +5. Interrupt setup. This is the client's own ID, repeated N times. + Each repetition is accompanied by one file descriptor. These are + for receiving interrupts from peers using vector 0,..,N-1, in + order. If the client is configured for fewer vectors, it closes + the extra file descriptors. If it is configured for more, the + extra vectors remain unconnected. + +From then on, the server sends these kinds of messages: + +6. Connection / disconnection notification. This is a peer ID. + + - If the number comes with a file descriptor, it's a connection + notification, exactly like in step 4. + + - Else, it's a disconnection notification for the peer with that ID. + +Known bugs: + +* The protocol changed incompatibly in QEMU 2.5. Before, messages + were native endian long, and there was no version number. + +* The protocol is poorly designed. + +=== The ivshmem Client-Client Protocol === + +An ivshmem device configured for interrupts receives eventfd file +descriptors for interrupting peers and getting interrupted by peers +from the server, as explained in the previous section. + +To interrupt a peer, the device writes the 8-byte integer 1 in native +byte order to the respective file descriptor. + +To receive an interrupt, the device reads and discards as many 8-byte +integers as it can. diff --git a/docs/specs/ivshmem_device_spec.txt b/docs/specs/ivshmem_device_spec.txt deleted file mode 100644 index d318d65c32..0000000000 --- a/docs/specs/ivshmem_device_spec.txt +++ /dev/null @@ -1,161 +0,0 @@ - -Device Specification for Inter-VM shared memory device ------------------------------------------------------- - -The Inter-VM shared memory device is designed to share a memory region (created -on the host via the POSIX shared memory API) between multiple QEMU processes -running different guests. In order for all guests to be able to pick up the -shared memory area, it is modeled by QEMU as a PCI device exposing said memory -to the guest as a PCI BAR. -The memory region does not belong to any guest, but is a POSIX memory object on -the host. The host can access this shared memory if needed. - -The device also provides an optional communication mechanism between guests -sharing the same memory object. More details about that in the section 'Guest to -guest communication' section. - - -The Inter-VM PCI device ------------------------ - -From the VM point of view, the ivshmem PCI device supports three BARs. - -- BAR0 is a 1 Kbyte MMIO region to support registers and interrupts when MSI is - not used. -- BAR1 is used for MSI-X when it is enabled in the device. -- BAR2 is used to access the shared memory object. - -It is your choice how to use the device but you must choose between two -behaviors : - -- basically, if you only need the shared memory part, you will map BAR2. - This way, you have access to the shared memory in guest and can use it as you - see fit (memnic, for example, uses it in userland - http://dpdk.org/browse/memnic). - -- BAR0 and BAR1 are used to implement an optional communication mechanism - through interrupts in the guests. If you need an event mechanism between the - guests accessing the shared memory, you will most likely want to write a - kernel driver that will handle interrupts. See details in the section 'Guest - to guest communication' section. - -The behavior is chosen when starting your QEMU processes: -- no communication mechanism needed, the first QEMU to start creates the shared - memory on the host, subsequent QEMU processes will use it. - -- communication mechanism needed, an ivshmem server must be started before any - QEMU processes, then each QEMU process connects to the server unix socket. - -For more details on the QEMU ivshmem parameters, see qemu-doc documentation. - - -Guest to guest communication ----------------------------- - -This section details the communication mechanism between the guests accessing -the ivhsmem shared memory. - -*ivshmem server* - -This server code is available in qemu.git/contrib/ivshmem-server. - -The server must be started on the host before any guest. -It creates a shared memory object then waits for clients to connect on a unix -socket. All the messages are little-endian int64_t integer. - -For each client (QEMU process) that connects to the server: -- the server sends a protocol version, if client does not support it, the client - closes the communication, -- the server assigns an ID for this client and sends this ID to him as the first - message, -- the server sends a fd to the shared memory object to this client, -- the server creates a new set of host eventfds associated to the new client and - sends this set to all already connected clients, -- finally, the server sends all the eventfds sets for all clients to the new - client. - -The server signals all clients when one of them disconnects. - -The client IDs are limited to 16 bits because of the current implementation (see -Doorbell register in 'PCI device registers' subsection). Hence only 65536 -clients are supported. - -All the file descriptors (fd to the shared memory, eventfds for each client) -are passed to clients using SCM_RIGHTS over the server unix socket. - -Apart from the current ivshmem implementation in QEMU, an ivshmem client has -been provided in qemu.git/contrib/ivshmem-client for debug. - -*QEMU as an ivshmem client* - -At initialisation, when creating the ivshmem device, QEMU first receives a -protocol version and closes communication with server if it does not match. -Then, QEMU gets its ID from the server then makes it available through BAR0 -IVPosition register for the VM to use (see 'PCI device registers' subsection). -QEMU then uses the fd to the shared memory to map it to BAR2. -eventfds for all other clients received from the server are stored to implement -BAR0 Doorbell register (see 'PCI device registers' subsection). -Finally, eventfds assigned to this QEMU process are used to send interrupts in -this VM. - -*PCI device registers* - -From the VM point of view, the ivshmem PCI device supports 4 registers of -32-bits each. - -enum ivshmem_registers { - IntrMask = 0, - IntrStatus = 4, - IVPosition = 8, - Doorbell = 12 -}; - -The first two registers are the interrupt mask and status registers. Mask and -status are only used with pin-based interrupts. They are unused with MSI -interrupts. - -Status Register: The status register is set to 1 when an interrupt occurs. - -Mask Register: The mask register is bitwise ANDed with the interrupt status -and the result will raise an interrupt if it is non-zero. However, since 1 is -the only value the status will be set to, it is only the first bit of the mask -that has any effect. Therefore interrupts can be masked by setting the first -bit to 0 and unmasked by setting the first bit to 1. - -IVPosition Register: The IVPosition register is read-only and reports the -guest's ID number. The guest IDs are non-negative integers. When using the -server, since the server is a separate process, the VM ID will only be set when -the device is ready (shared memory is received from the server and accessible -via the device). If the device is not ready, the IVPosition will return -1. -Applications should ensure that they have a valid VM ID before accessing the -shared memory. - -Doorbell Register: To interrupt another guest, a guest must write to the -Doorbell register. The doorbell register is 32-bits, logically divided into -two 16-bit fields. The high 16-bits are the guest ID to interrupt and the low -16-bits are the interrupt vector to trigger. The semantics of the value -written to the doorbell depends on whether the device is using MSI or a regular -pin-based interrupt. In short, MSI uses vectors while regular interrupts set -the status register. - -Regular Interrupts - -If regular interrupts are used (due to either a guest not supporting MSI or the -user specifying not to use them on startup) then the value written to the lower -16-bits of the Doorbell register results is arbitrary and will trigger an -interrupt in the destination guest. - -Message Signalled Interrupts - -An ivshmem device may support multiple MSI vectors. If so, the lower 16-bits -written to the Doorbell register must be between 0 and the maximum number of -vectors the guest supports. The lower 16 bits written to the doorbell is the -MSI vector that will be raised in the destination guest. The number of MSI -vectors is configurable but it is set when the VM is started. - -The important thing to remember with MSI is that it is only a signal, no status -is set (since MSI interrupts are not shared). All information other than the -interrupt itself should be communicated via the shared memory region. Devices -supporting multiple MSI vectors can use different vectors to indicate different -events have occurred. The semantics of interrupt vectors are left to the -user's discretion. diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c index bc89800246..d2f5a08af6 100644 --- a/hw/core/qdev-properties.c +++ b/hw/core/qdev-properties.c @@ -516,6 +516,16 @@ PropertyInfo qdev_prop_macaddr = { .set = set_mac, }; +/* --- on/off/auto --- */ + +PropertyInfo qdev_prop_on_off_auto = { + .name = "OnOffAuto", + .description = "on/off/auto", + .enum_table = OnOffAuto_lookup, + .get = get_enum, + .set = set_enum, +}; + /* --- lost tick policy --- */ QEMU_BUILD_BUG_ON(sizeof(LostTickPolicy) != sizeof(int)); diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c index 1838bc8506..132387f04b 100644 --- a/hw/misc/ivshmem.c +++ b/hw/misc/ivshmem.c @@ -26,9 +26,10 @@ #include "migration/migration.h" #include "qemu/error-report.h" #include "qemu/event_notifier.h" -#include "qemu/fifo8.h" +#include "qom/object_interfaces.h" #include "sysemu/char.h" #include "sysemu/hostmem.h" +#include "sysemu/qtest.h" #include "qapi/visitor.h" #include "exec/ram_addr.h" @@ -39,22 +40,31 @@ #define PCI_VENDOR_ID_IVSHMEM PCI_VENDOR_ID_REDHAT_QUMRANET #define PCI_DEVICE_ID_IVSHMEM 0x1110 -#define IVSHMEM_MAX_PEERS G_MAXUINT16 +#define IVSHMEM_MAX_PEERS UINT16_MAX #define IVSHMEM_IOEVENTFD 0 #define IVSHMEM_MSI 1 -#define IVSHMEM_PEER 0 -#define IVSHMEM_MASTER 1 - #define IVSHMEM_REG_BAR_SIZE 0x100 -//#define DEBUG_IVSHMEM -#ifdef DEBUG_IVSHMEM -#define IVSHMEM_DPRINTF(fmt, ...) \ - do {printf("IVSHMEM: " fmt, ## __VA_ARGS__); } while (0) -#else -#define IVSHMEM_DPRINTF(fmt, ...) -#endif +#define IVSHMEM_DEBUG 0 +#define IVSHMEM_DPRINTF(fmt, ...) \ + do { \ + if (IVSHMEM_DEBUG) { \ + printf("IVSHMEM: " fmt, ## __VA_ARGS__); \ + } \ + } while (0) + +#define TYPE_IVSHMEM_COMMON "ivshmem-common" +#define IVSHMEM_COMMON(obj) \ + OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_COMMON) + +#define TYPE_IVSHMEM_PLAIN "ivshmem-plain" +#define IVSHMEM_PLAIN(obj) \ + OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_PLAIN) + +#define TYPE_IVSHMEM_DOORBELL "ivshmem-doorbell" +#define IVSHMEM_DOORBELL(obj) \ + OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_DOORBELL) #define TYPE_IVSHMEM "ivshmem" #define IVSHMEM(obj) \ @@ -75,38 +85,40 @@ typedef struct IVShmemState { PCIDevice parent_obj; /*< public >*/ - HostMemoryBackend *hostmem; + uint32_t features; + + /* exactly one of these two may be set */ + HostMemoryBackend *hostmem; /* with interrupts */ + CharDriverState *server_chr; /* without interrupts */ + + /* registers */ uint32_t intrmask; uint32_t intrstatus; + int vm_id; - CharDriverState **eventfd_chr; - CharDriverState *server_chr; - Fifo8 incoming_fifo; - MemoryRegion ivshmem_mmio; - - /* We might need to register the BAR before we actually have the memory. - * So prepare a container MemoryRegion for the BAR immediately and - * add a subregion when we have the memory. - */ - MemoryRegion bar; - MemoryRegion ivshmem; - uint64_t ivshmem_size; /* size of shared memory region */ - uint32_t ivshmem_64bit; + /* BARs */ + MemoryRegion ivshmem_mmio; /* BAR 0 (registers) */ + MemoryRegion *ivshmem_bar2; /* BAR 2 (shared memory) */ + MemoryRegion server_bar2; /* used with server_chr */ + /* interrupt support */ Peer *peers; - int nb_peers; /* how many peers we have space for */ - - int vm_id; + int nb_peers; /* space in @peers[] */ uint32_t vectors; - uint32_t features; MSIVector *msi_vectors; + uint64_t msg_buf; /* buffer for receiving server messages */ + int msg_buffered_bytes; /* #bytes in @msg_buf */ + /* migration stuff */ + OnOffAuto master; Error *migration_blocker; - char * shmobj; - char * sizearg; - char * role; - int role_val; /* scalar to avoid multiple string comparisons */ + /* legacy cruft */ + char *role; + char *shmobj; + char *sizearg; + size_t legacy_size; + uint32_t not_legacy_32bit; } IVShmemState; /* registers for the Inter-VM shared memory device */ @@ -122,12 +134,34 @@ static inline uint32_t ivshmem_has_feature(IVShmemState *ivs, return (ivs->features & (1 << feature)); } -/* accessing registers - based on rtl8139 */ +static inline bool ivshmem_is_master(IVShmemState *s) +{ + assert(s->master != ON_OFF_AUTO_AUTO); + return s->master == ON_OFF_AUTO_ON; +} + static void ivshmem_update_irq(IVShmemState *s) { PCIDevice *d = PCI_DEVICE(s); - int isr; - isr = (s->intrstatus & s->intrmask) & 0xffffffff; + uint32_t isr = s->intrstatus & s->intrmask; + + /* + * Do nothing unless the device actually uses INTx. Here's how + * the device variants signal interrupts, what they put in PCI + * config space: + * Device variant Interrupt Interrupt Pin MSI-X cap. + * ivshmem-plain none 0 no + * ivshmem-doorbell MSI-X 1 yes(1) + * ivshmem,msi=off INTx 1 no + * ivshmem,msi=on MSI-X 1(2) yes(1) + * (1) if guest enabled MSI-X + * (2) the device lies + * Leads to the condition for doing nothing: + */ + if (ivshmem_has_feature(s, IVSHMEM_MSI) + || !d->config[PCI_INTERRUPT_PIN]) { + return; + } /* don't print ISR resets */ if (isr) { @@ -135,7 +169,7 @@ static void ivshmem_update_irq(IVShmemState *s) isr ? 1 : 0, s->intrstatus, s->intrmask); } - pci_set_irq(d, (isr != 0)); + pci_set_irq(d, isr != 0); } static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val) @@ -143,7 +177,6 @@ static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val) IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val); s->intrmask = val; - ivshmem_update_irq(s); } @@ -152,7 +185,6 @@ static uint32_t ivshmem_IntrMask_read(IVShmemState *s) uint32_t ret = s->intrmask; IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret); - return ret; } @@ -161,7 +193,6 @@ static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val) IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val); s->intrstatus = val; - ivshmem_update_irq(s); } @@ -171,9 +202,7 @@ static uint32_t ivshmem_IntrStatus_read(IVShmemState *s) /* reading ISR clears all interrupts */ s->intrstatus = 0; - ivshmem_update_irq(s); - return ret; } @@ -237,12 +266,7 @@ static uint64_t ivshmem_io_read(void *opaque, hwaddr addr, break; case IVPOSITION: - /* return my VM ID if the memory is mapped */ - if (memory_region_is_mapped(&s->ivshmem)) { - ret = s->vm_id; - } else { - ret = -1; - } + ret = s->vm_id; break; default: @@ -263,21 +287,11 @@ static const MemoryRegionOps ivshmem_mmio_ops = { }, }; -static int ivshmem_can_receive(void * opaque) -{ - return sizeof(int64_t); -} - -static void ivshmem_event(void *opaque, int event) -{ - IVSHMEM_DPRINTF("ivshmem_event %d\n", event); -} - static void ivshmem_vector_notify(void *opaque) { MSIVector *entry = opaque; PCIDevice *pdev = entry->pdev; - IVShmemState *s = IVSHMEM(pdev); + IVShmemState *s = IVSHMEM_COMMON(pdev); int vector = entry - s->msi_vectors; EventNotifier *n = &s->peers[s->vm_id].eventfds[vector]; @@ -287,7 +301,9 @@ static void ivshmem_vector_notify(void *opaque) IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector); if (ivshmem_has_feature(s, IVSHMEM_MSI)) { - msix_notify(pdev, vector); + if (msix_enabled(pdev)) { + msix_notify(pdev, vector); + } } else { ivshmem_IntrStatus_write(s, 1); } @@ -296,7 +312,7 @@ static void ivshmem_vector_notify(void *opaque) static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector, MSIMessage msg) { - IVShmemState *s = IVSHMEM(dev); + IVShmemState *s = IVSHMEM_COMMON(dev); EventNotifier *n = &s->peers[s->vm_id].eventfds[vector]; MSIVector *v = &s->msi_vectors[vector]; int ret; @@ -313,7 +329,7 @@ static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector, static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector) { - IVShmemState *s = IVSHMEM(dev); + IVShmemState *s = IVSHMEM_COMMON(dev); EventNotifier *n = &s->peers[s->vm_id].eventfds[vector]; int ret; @@ -330,7 +346,7 @@ static void ivshmem_vector_poll(PCIDevice *dev, unsigned int vector_start, unsigned int vector_end) { - IVShmemState *s = IVSHMEM(dev); + IVShmemState *s = IVSHMEM_COMMON(dev); unsigned int vector; IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end); @@ -355,61 +371,13 @@ static void watch_vector_notifier(IVShmemState *s, EventNotifier *n, { int eventfd = event_notifier_get_fd(n); - /* if MSI is supported we need multiple interrupts */ + assert(!s->msi_vectors[vector].pdev); s->msi_vectors[vector].pdev = PCI_DEVICE(s); qemu_set_fd_handler(eventfd, ivshmem_vector_notify, NULL, &s->msi_vectors[vector]); } -static int check_shm_size(IVShmemState *s, int fd, Error **errp) -{ - /* check that the guest isn't going to try and map more memory than the - * the object has allocated return -1 to indicate error */ - - struct stat buf; - - if (fstat(fd, &buf) < 0) { - error_setg(errp, "exiting: fstat on fd %d failed: %s", - fd, strerror(errno)); - return -1; - } - - if (s->ivshmem_size > buf.st_size) { - error_setg(errp, "Requested memory size greater" - " than shared object size (%" PRIu64 " > %" PRIu64")", - s->ivshmem_size, (uint64_t)buf.st_size); - return -1; - } else { - return 0; - } -} - -/* create the shared memory BAR when we are not using the server, so we can - * create the BAR and map the memory immediately */ -static int create_shared_memory_BAR(IVShmemState *s, int fd, uint8_t attr, - Error **errp) -{ - void * ptr; - - ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); - if (ptr == MAP_FAILED) { - error_setg_errno(errp, errno, "Failed to mmap shared memory"); - return -1; - } - - memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s), "ivshmem.bar2", - s->ivshmem_size, ptr); - qemu_set_ram_fd(memory_region_get_ram_addr(&s->ivshmem), fd); - vmstate_register_ram(&s->ivshmem, DEVICE(s)); - memory_region_add_subregion(&s->bar, 0, &s->ivshmem); - - /* region for shared memory */ - pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar); - - return 0; -} - static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i) { memory_region_add_eventfd(&s->ivshmem_mmio, @@ -434,21 +402,17 @@ static void close_peer_eventfds(IVShmemState *s, int posn) { int i, n; - if (!ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) { - return; - } - if (posn < 0 || posn >= s->nb_peers) { - error_report("invalid peer %d", posn); - return; - } - + assert(posn >= 0 && posn < s->nb_peers); n = s->peers[posn].nb_eventfds; - memory_region_transaction_begin(); - for (i = 0; i < n; i++) { - ivshmem_del_eventfd(s, posn, i); + if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) { + memory_region_transaction_begin(); + for (i = 0; i < n; i++) { + ivshmem_del_eventfd(s, posn, i); + } + memory_region_transaction_commit(); } - memory_region_transaction_commit(); + for (i = 0; i < n; i++) { event_notifier_cleanup(&s->peers[posn].eventfds[i]); } @@ -457,286 +421,320 @@ static void close_peer_eventfds(IVShmemState *s, int posn) s->peers[posn].nb_eventfds = 0; } -/* this function increase the dynamic storage need to store data about other - * peers */ -static int resize_peers(IVShmemState *s, int new_min_size) -{ - - int j, old_size; - - /* limit number of max peers */ - if (new_min_size <= 0 || new_min_size > IVSHMEM_MAX_PEERS) { - return -1; - } - if (new_min_size <= s->nb_peers) { - return 0; - } - - old_size = s->nb_peers; - s->nb_peers = new_min_size; - - IVSHMEM_DPRINTF("bumping storage to %d peers\n", s->nb_peers); - - s->peers = g_realloc(s->peers, s->nb_peers * sizeof(Peer)); - - for (j = old_size; j < s->nb_peers; j++) { - s->peers[j].eventfds = g_new0(EventNotifier, s->vectors); - s->peers[j].nb_eventfds = 0; - } - - return 0; -} - -static bool fifo_update_and_get(IVShmemState *s, const uint8_t *buf, int size, - void *data, size_t len) +static void resize_peers(IVShmemState *s, int nb_peers) { - const uint8_t *p; - uint32_t num; - - assert(len <= sizeof(int64_t)); /* limitation of the fifo */ - if (fifo8_is_empty(&s->incoming_fifo) && size == len) { - memcpy(data, buf, size); - return true; - } - - IVSHMEM_DPRINTF("short read of %d bytes\n", size); - - num = MIN(size, sizeof(int64_t) - fifo8_num_used(&s->incoming_fifo)); - fifo8_push_all(&s->incoming_fifo, buf, num); - - if (fifo8_num_used(&s->incoming_fifo) < len) { - assert(num == 0); - return false; - } - - size -= num; - buf += num; - p = fifo8_pop_buf(&s->incoming_fifo, len, &num); - assert(num == len); - - memcpy(data, p, len); + int old_nb_peers = s->nb_peers; + int i; - if (size > 0) { - fifo8_push_all(&s->incoming_fifo, buf, size); - } + assert(nb_peers > old_nb_peers); + IVSHMEM_DPRINTF("bumping storage to %d peers\n", nb_peers); - return true; -} + s->peers = g_realloc(s->peers, nb_peers * sizeof(Peer)); + s->nb_peers = nb_peers; -static bool fifo_update_and_get_i64(IVShmemState *s, - const uint8_t *buf, int size, int64_t *i64) -{ - if (fifo_update_and_get(s, buf, size, i64, sizeof(*i64))) { - *i64 = GINT64_FROM_LE(*i64); - return true; + for (i = old_nb_peers; i < nb_peers; i++) { + s->peers[i].eventfds = g_new0(EventNotifier, s->vectors); + s->peers[i].nb_eventfds = 0; } - - return false; } -static int ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector) +static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector, + Error **errp) { PCIDevice *pdev = PCI_DEVICE(s); MSIMessage msg = msix_get_message(pdev, vector); int ret; IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector); - - if (s->msi_vectors[vector].pdev != NULL) { - return 0; - } + assert(!s->msi_vectors[vector].pdev); ret = kvm_irqchip_add_msi_route(kvm_state, msg, pdev); if (ret < 0) { - error_report("ivshmem: kvm_irqchip_add_msi_route failed"); - return -1; + error_setg(errp, "kvm_irqchip_add_msi_route failed"); + return; } s->msi_vectors[vector].virq = ret; s->msi_vectors[vector].pdev = pdev; - - return 0; } -static void setup_interrupt(IVShmemState *s, int vector) +static void setup_interrupt(IVShmemState *s, int vector, Error **errp) { EventNotifier *n = &s->peers[s->vm_id].eventfds[vector]; bool with_irqfd = kvm_msi_via_irqfd_enabled() && ivshmem_has_feature(s, IVSHMEM_MSI); PCIDevice *pdev = PCI_DEVICE(s); + Error *err = NULL; IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector); if (!with_irqfd) { - IVSHMEM_DPRINTF("with eventfd"); + IVSHMEM_DPRINTF("with eventfd\n"); watch_vector_notifier(s, n, vector); } else if (msix_enabled(pdev)) { - IVSHMEM_DPRINTF("with irqfd"); - if (ivshmem_add_kvm_msi_virq(s, vector) < 0) { + IVSHMEM_DPRINTF("with irqfd\n"); + ivshmem_add_kvm_msi_virq(s, vector, &err); + if (err) { + error_propagate(errp, err); return; } if (!msix_is_masked(pdev, vector)) { kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, s->msi_vectors[vector].virq); + /* TODO handle error */ } } else { /* it will be delayed until msix is enabled, in write_config */ - IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled"); + IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled\n"); } } -static void ivshmem_read(void *opaque, const uint8_t *buf, int size) +static void process_msg_shmem(IVShmemState *s, int fd, Error **errp) { - IVShmemState *s = opaque; - int incoming_fd; - int new_eventfd; - int64_t incoming_posn; - Error *err = NULL; - Peer *peer; + struct stat buf; + size_t size; + void *ptr; - if (!fifo_update_and_get_i64(s, buf, size, &incoming_posn)) { + if (s->ivshmem_bar2) { + error_setg(errp, "server sent unexpected shared memory message"); + close(fd); return; } - if (incoming_posn < -1) { - IVSHMEM_DPRINTF("invalid incoming_posn %" PRId64 "\n", incoming_posn); + if (fstat(fd, &buf) < 0) { + error_setg_errno(errp, errno, + "can't determine size of shared memory sent by server"); + close(fd); return; } - /* pick off s->server_chr->msgfd and store it, posn should accompany msg */ - incoming_fd = qemu_chr_fe_get_msgfd(s->server_chr); - IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", - incoming_posn, incoming_fd); - - /* make sure we have enough space for this peer */ - if (incoming_posn >= s->nb_peers) { - if (resize_peers(s, incoming_posn + 1) < 0) { - error_report("failed to resize peers array"); - if (incoming_fd != -1) { - close(incoming_fd); - } + size = buf.st_size; + + /* Legacy cruft */ + if (s->legacy_size != SIZE_MAX) { + if (size < s->legacy_size) { + error_setg(errp, "server sent only %zd bytes of shared memory", + (size_t)buf.st_size); + close(fd); return; } + size = s->legacy_size; } - peer = &s->peers[incoming_posn]; + /* mmap the region and map into the BAR2 */ + ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) { + error_setg_errno(errp, errno, "Failed to mmap shared memory"); + close(fd); + return; + } + memory_region_init_ram_ptr(&s->server_bar2, OBJECT(s), + "ivshmem.bar2", size, ptr); + qemu_set_ram_fd(memory_region_get_ram_addr(&s->server_bar2), fd); + s->ivshmem_bar2 = &s->server_bar2; +} - if (incoming_fd == -1) { - /* if posn is positive and unseen before then this is our posn*/ - if (incoming_posn >= 0 && s->vm_id == -1) { - /* receive our posn */ - s->vm_id = incoming_posn; - } else { - /* otherwise an fd == -1 means an existing peer has gone away */ - IVSHMEM_DPRINTF("posn %" PRId64 " has gone away\n", incoming_posn); - close_peer_eventfds(s, incoming_posn); - } +static void process_msg_disconnect(IVShmemState *s, uint16_t posn, + Error **errp) +{ + IVSHMEM_DPRINTF("posn %d has gone away\n", posn); + if (posn >= s->nb_peers || posn == s->vm_id) { + error_setg(errp, "invalid peer %d", posn); return; } + close_peer_eventfds(s, posn); +} - /* if the position is -1, then it's shared memory region fd */ - if (incoming_posn == -1) { - void * map_ptr; +static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd, + Error **errp) +{ + Peer *peer = &s->peers[posn]; + int vector; - if (memory_region_is_mapped(&s->ivshmem)) { - error_report("shm already initialized"); - close(incoming_fd); - return; - } + /* + * The N-th connect message for this peer comes with the file + * descriptor for vector N-1. Count messages to find the vector. + */ + if (peer->nb_eventfds >= s->vectors) { + error_setg(errp, "Too many eventfd received, device has %d vectors", + s->vectors); + close(fd); + return; + } + vector = peer->nb_eventfds++; - if (check_shm_size(s, incoming_fd, &err) == -1) { - error_report_err(err); - close(incoming_fd); - return; - } + IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd); + event_notifier_init_fd(&peer->eventfds[vector], fd); + fcntl_setfl(fd, O_NONBLOCK); /* msix/irqfd poll non block */ - /* mmap the region and map into the BAR2 */ - map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, - incoming_fd, 0); - if (map_ptr == MAP_FAILED) { - error_report("Failed to mmap shared memory %s", strerror(errno)); - close(incoming_fd); - return; - } - memory_region_init_ram_ptr(&s->ivshmem, OBJECT(s), - "ivshmem.bar2", s->ivshmem_size, map_ptr); - qemu_set_ram_fd(memory_region_get_ram_addr(&s->ivshmem), - incoming_fd); - vmstate_register_ram(&s->ivshmem, DEVICE(s)); + if (posn == s->vm_id) { + setup_interrupt(s, vector, errp); + /* TODO do we need to handle the error? */ + } - IVSHMEM_DPRINTF("guest h/w addr = %p, size = %" PRIu64 "\n", - map_ptr, s->ivshmem_size); + if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) { + ivshmem_add_eventfd(s, posn, vector); + } +} - memory_region_add_subregion(&s->bar, 0, &s->ivshmem); +static void process_msg(IVShmemState *s, int64_t msg, int fd, Error **errp) +{ + IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd); + if (msg < -1 || msg > IVSHMEM_MAX_PEERS) { + error_setg(errp, "server sent invalid message %" PRId64, msg); + close(fd); return; } - /* each peer has an associated array of eventfds, and we keep - * track of how many eventfds received so far */ - /* get a new eventfd: */ - if (peer->nb_eventfds >= s->vectors) { - error_report("Too many eventfd received, device has %d vectors", - s->vectors); - close(incoming_fd); + if (msg == -1) { + process_msg_shmem(s, fd, errp); return; } - new_eventfd = peer->nb_eventfds++; + if (msg >= s->nb_peers) { + resize_peers(s, msg + 1); + } - /* this is an eventfd for a particular peer VM */ - IVSHMEM_DPRINTF("eventfds[%" PRId64 "][%d] = %d\n", incoming_posn, - new_eventfd, incoming_fd); - event_notifier_init_fd(&peer->eventfds[new_eventfd], incoming_fd); - fcntl_setfl(incoming_fd, O_NONBLOCK); /* msix/irqfd poll non block */ + if (fd >= 0) { + process_msg_connect(s, msg, fd, errp); + } else { + process_msg_disconnect(s, msg, errp); + } +} + +static int ivshmem_can_receive(void *opaque) +{ + IVShmemState *s = opaque; + + assert(s->msg_buffered_bytes < sizeof(s->msg_buf)); + return sizeof(s->msg_buf) - s->msg_buffered_bytes; +} + +static void ivshmem_read(void *opaque, const uint8_t *buf, int size) +{ + IVShmemState *s = opaque; + Error *err = NULL; + int fd; + int64_t msg; - if (incoming_posn == s->vm_id) { - setup_interrupt(s, new_eventfd); + assert(size >= 0 && s->msg_buffered_bytes + size <= sizeof(s->msg_buf)); + memcpy((unsigned char *)&s->msg_buf + s->msg_buffered_bytes, buf, size); + s->msg_buffered_bytes += size; + if (s->msg_buffered_bytes < sizeof(s->msg_buf)) { + return; } + msg = le64_to_cpu(s->msg_buf); + s->msg_buffered_bytes = 0; - if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) { - ivshmem_add_eventfd(s, incoming_posn, new_eventfd); + fd = qemu_chr_fe_get_msgfd(s->server_chr); + IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd); + + process_msg(s, msg, fd, &err); + if (err) { + error_report_err(err); } } -static void ivshmem_check_version(void *opaque, const uint8_t * buf, int size) +static int64_t ivshmem_recv_msg(IVShmemState *s, int *pfd, Error **errp) { - IVShmemState *s = opaque; - int tmp; - int64_t version; + int64_t msg; + int n, ret; + + n = 0; + do { + ret = qemu_chr_fe_read_all(s->server_chr, (uint8_t *)&msg + n, + sizeof(msg) - n); + if (ret < 0 && ret != -EINTR) { + error_setg_errno(errp, -ret, "read from server failed"); + return INT64_MIN; + } + n += ret; + } while (n < sizeof(msg)); - if (!fifo_update_and_get_i64(s, buf, size, &version)) { + *pfd = qemu_chr_fe_get_msgfd(s->server_chr); + return msg; +} + +static void ivshmem_recv_setup(IVShmemState *s, Error **errp) +{ + Error *err = NULL; + int64_t msg; + int fd; + + msg = ivshmem_recv_msg(s, &fd, &err); + if (err) { + error_propagate(errp, err); + return; + } + if (msg != IVSHMEM_PROTOCOL_VERSION) { + error_setg(errp, "server sent version %" PRId64 ", expecting %d", + msg, IVSHMEM_PROTOCOL_VERSION); + return; + } + if (fd != -1) { + error_setg(errp, "server sent invalid version message"); return; } - tmp = qemu_chr_fe_get_msgfd(s->server_chr); - if (tmp != -1 || version != IVSHMEM_PROTOCOL_VERSION) { - fprintf(stderr, "incompatible version, you are connecting to a ivshmem-" - "server using a different protocol please check your setup\n"); - qemu_chr_delete(s->server_chr); - s->server_chr = NULL; + /* + * ivshmem-server sends the remaining initial messages in a fixed + * order, but the device has always accepted them in any order. + * Stay as compatible as practical, just in case people use + * servers that behave differently. + */ + + /* + * ivshmem_device_spec.txt has always required the ID message + * right here, and ivshmem-server has always complied. However, + * older versions of the device accepted it out of order, but + * broke when an interrupt setup message arrived before it. + */ + msg = ivshmem_recv_msg(s, &fd, &err); + if (err) { + error_propagate(errp, err); return; } + if (fd != -1 || msg < 0 || msg > IVSHMEM_MAX_PEERS) { + error_setg(errp, "server sent invalid ID message"); + return; + } + s->vm_id = msg; + + /* + * Receive more messages until we got shared memory. + */ + do { + msg = ivshmem_recv_msg(s, &fd, &err); + if (err) { + error_propagate(errp, err); + return; + } + process_msg(s, msg, fd, &err); + if (err) { + error_propagate(errp, err); + return; + } + } while (msg != -1); - IVSHMEM_DPRINTF("version check ok, switch to real chardev handler\n"); - qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read, - ivshmem_event, s); + /* + * This function must either map the shared memory or fail. The + * loop above ensures that: it terminates normally only after it + * successfully processed the server's shared memory message. + * Assert that actually mapped the shared memory: + */ + assert(s->ivshmem_bar2); } /* Select the MSI-X vectors used by device. * ivshmem maps events to vectors statically, so * we just enable all vectors on init and after reset. */ -static void ivshmem_use_msix(IVShmemState * s) +static void ivshmem_msix_vector_use(IVShmemState *s) { PCIDevice *d = PCI_DEVICE(s); int i; - IVSHMEM_DPRINTF("%s, msix present: %d\n", __func__, msix_present(d)); - if (!msix_present(d)) { - return; - } - for (i = 0; i < s->vectors; i++) { msix_vector_use(d, i); } @@ -744,11 +742,13 @@ static void ivshmem_use_msix(IVShmemState * s) static void ivshmem_reset(DeviceState *d) { - IVShmemState *s = IVSHMEM(d); + IVShmemState *s = IVSHMEM_COMMON(d); s->intrstatus = 0; s->intrmask = 0; - ivshmem_use_msix(s); + if (ivshmem_has_feature(s, IVSHMEM_MSI)) { + ivshmem_msix_vector_use(s); + } } static int ivshmem_setup_interrupts(IVShmemState *s) @@ -762,7 +762,7 @@ static int ivshmem_setup_interrupts(IVShmemState *s) } IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors); - ivshmem_use_msix(s); + ivshmem_msix_vector_use(s); } return 0; @@ -774,7 +774,13 @@ static void ivshmem_enable_irqfd(IVShmemState *s) int i; for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) { - ivshmem_add_kvm_msi_virq(s, i); + Error *err = NULL; + + ivshmem_add_kvm_msi_virq(s, i, &err); + if (err) { + error_report_err(err); + /* TODO do we need to handle the error? */ + } } if (msix_set_vector_notifiers(pdev, @@ -814,13 +820,13 @@ static void ivshmem_disable_irqfd(IVShmemState *s) static void ivshmem_write_config(PCIDevice *pdev, uint32_t address, uint32_t val, int len) { - IVShmemState *s = IVSHMEM(pdev); + IVShmemState *s = IVSHMEM_COMMON(pdev); int is_enabled, was_enabled = msix_enabled(pdev); pci_default_write_config(pdev, address, val, len); is_enabled = msix_enabled(pdev); - if (kvm_msi_via_irqfd_enabled() && s->vm_id != -1) { + if (kvm_msi_via_irqfd_enabled()) { if (!was_enabled && is_enabled) { ivshmem_enable_irqfd(s); } else if (was_enabled && !is_enabled) { @@ -829,42 +835,14 @@ static void ivshmem_write_config(PCIDevice *pdev, uint32_t address, } } -static void pci_ivshmem_realize(PCIDevice *dev, Error **errp) +static void ivshmem_common_realize(PCIDevice *dev, Error **errp) { - IVShmemState *s = IVSHMEM(dev); + IVShmemState *s = IVSHMEM_COMMON(dev); + Error *err = NULL; uint8_t *pci_conf; uint8_t attr = PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_PREFETCH; - if (!!s->server_chr + !!s->shmobj + !!s->hostmem != 1) { - error_setg(errp, - "You must specify either 'shm', 'chardev' or 'x-memdev'"); - return; - } - - if (s->hostmem) { - MemoryRegion *mr; - - if (s->sizearg) { - g_warning("size argument ignored with hostmem"); - } - - mr = host_memory_backend_get_memory(s->hostmem, errp); - s->ivshmem_size = memory_region_size(mr); - } else if (s->sizearg == NULL) { - s->ivshmem_size = 4 << 20; /* 4 MB default */ - } else { - char *end; - int64_t size = qemu_strtosz(s->sizearg, &end); - if (size < 0 || *end != '\0' || !is_power_of_2(size)) { - error_setg(errp, "Invalid size %s", s->sizearg); - return; - } - s->ivshmem_size = size; - } - - fifo8_create(&s->incoming_fifo, sizeof(int64_t)); - /* IRQFD requires MSI */ if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) && !ivshmem_has_feature(s, IVSHMEM_MSI)) { @@ -872,31 +850,9 @@ static void pci_ivshmem_realize(PCIDevice *dev, Error **errp) return; } - /* check that role is reasonable */ - if (s->role) { - if (strncmp(s->role, "peer", 5) == 0) { - s->role_val = IVSHMEM_PEER; - } else if (strncmp(s->role, "master", 7) == 0) { - s->role_val = IVSHMEM_MASTER; - } else { - error_setg(errp, "'role' must be 'peer' or 'master'"); - return; - } - } else { - s->role_val = IVSHMEM_MASTER; /* default */ - } - - if (s->role_val == IVSHMEM_PEER) { - error_setg(&s->migration_blocker, - "Migration is disabled when using feature 'peer mode' in device 'ivshmem'"); - migrate_add_blocker(s->migration_blocker); - } - pci_conf = dev->config; pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY; - pci_config_set_interrupt_pin(pci_conf, 1); - memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s, "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE); @@ -904,116 +860,87 @@ static void pci_ivshmem_realize(PCIDevice *dev, Error **errp) pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &s->ivshmem_mmio); - memory_region_init(&s->bar, OBJECT(s), "ivshmem-bar2-container", s->ivshmem_size); - if (s->ivshmem_64bit) { + if (!s->not_legacy_32bit) { attr |= PCI_BASE_ADDRESS_MEM_TYPE_64; } if (s->hostmem != NULL) { - MemoryRegion *mr; - IVSHMEM_DPRINTF("using hostmem\n"); - mr = host_memory_backend_get_memory(MEMORY_BACKEND(s->hostmem), errp); - vmstate_register_ram(mr, DEVICE(s)); - memory_region_add_subregion(&s->bar, 0, mr); - pci_register_bar(PCI_DEVICE(s), 2, attr, &s->bar); - } else if (s->server_chr != NULL) { - /* FIXME do not rely on what chr drivers put into filename */ - if (strncmp(s->server_chr->filename, "unix:", 5)) { - error_setg(errp, "chardev is not a unix client socket"); - return; - } - - /* if we get a UNIX socket as the parameter we will talk - * to the ivshmem server to receive the memory region */ - + s->ivshmem_bar2 = host_memory_backend_get_memory(s->hostmem, + &error_abort); + } else { IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n", s->server_chr->filename); - if (ivshmem_setup_interrupts(s) < 0) { - error_setg(errp, "failed to initialize interrupts"); - return; - } - /* we allocate enough space for 16 peers and grow as needed */ resize_peers(s, 16); - s->vm_id = -1; - pci_register_bar(dev, 2, attr, &s->bar); + /* + * Receive setup messages from server synchronously. + * Older versions did it asynchronously, but that creates a + * number of entertaining race conditions. + */ + ivshmem_recv_setup(s, &err); + if (err) { + error_propagate(errp, err); + return; + } - s->eventfd_chr = g_malloc0(s->vectors * sizeof(CharDriverState *)); + if (s->master == ON_OFF_AUTO_ON && s->vm_id != 0) { + error_setg(errp, + "master must connect to the server before any peers"); + return; + } qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, - ivshmem_check_version, ivshmem_event, s); - } else { - /* just map the file immediately, we're not using a server */ - int fd; - - IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj); - - /* try opening with O_EXCL and if it succeeds zero the memory - * by truncating to 0 */ - if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL, - S_IRWXU|S_IRWXG|S_IRWXO)) > 0) { - /* truncate file to length PCI device's memory */ - if (ftruncate(fd, s->ivshmem_size) != 0) { - error_report("could not truncate shared file"); - } + ivshmem_read, NULL, s); - } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR, - S_IRWXU|S_IRWXG|S_IRWXO)) < 0) { - error_setg(errp, "could not open shared file"); + if (ivshmem_setup_interrupts(s) < 0) { + error_setg(errp, "failed to initialize interrupts"); return; } + } - if (check_shm_size(s, fd, errp) == -1) { - return; - } + vmstate_register_ram(s->ivshmem_bar2, DEVICE(s)); + pci_register_bar(PCI_DEVICE(s), 2, attr, s->ivshmem_bar2); - create_shared_memory_BAR(s, fd, attr, errp); + if (s->master == ON_OFF_AUTO_AUTO) { + s->master = s->vm_id == 0 ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; + } + + if (!ivshmem_is_master(s)) { + error_setg(&s->migration_blocker, + "Migration is disabled when using feature 'peer mode' in device 'ivshmem'"); + migrate_add_blocker(s->migration_blocker); } } -static void pci_ivshmem_exit(PCIDevice *dev) +static void ivshmem_exit(PCIDevice *dev) { - IVShmemState *s = IVSHMEM(dev); + IVShmemState *s = IVSHMEM_COMMON(dev); int i; - fifo8_destroy(&s->incoming_fifo); - if (s->migration_blocker) { migrate_del_blocker(s->migration_blocker); error_free(s->migration_blocker); } - if (memory_region_is_mapped(&s->ivshmem)) { + if (memory_region_is_mapped(s->ivshmem_bar2)) { if (!s->hostmem) { - void *addr = memory_region_get_ram_ptr(&s->ivshmem); + void *addr = memory_region_get_ram_ptr(s->ivshmem_bar2); int fd; - if (munmap(addr, s->ivshmem_size) == -1) { + if (munmap(addr, memory_region_size(s->ivshmem_bar2) == -1)) { error_report("Failed to munmap shared memory %s", strerror(errno)); } - fd = qemu_get_ram_fd(memory_region_get_ram_addr(&s->ivshmem)); - if (fd != -1) { - close(fd); - } + fd = qemu_get_ram_fd(memory_region_get_ram_addr(s->ivshmem_bar2)); + close(fd); } - vmstate_unregister_ram(&s->ivshmem, DEVICE(dev)); - memory_region_del_subregion(&s->bar, &s->ivshmem); - } - - if (s->eventfd_chr) { - for (i = 0; i < s->vectors; i++) { - if (s->eventfd_chr[i]) { - qemu_chr_free(s->eventfd_chr[i]); - } - } - g_free(s->eventfd_chr); + vmstate_unregister_ram(s->ivshmem_bar2, DEVICE(dev)); } if (s->peers) { @@ -1030,23 +957,11 @@ static void pci_ivshmem_exit(PCIDevice *dev) g_free(s->msi_vectors); } -static bool test_msix(void *opaque, int version_id) -{ - IVShmemState *s = opaque; - - return ivshmem_has_feature(s, IVSHMEM_MSI); -} - -static bool test_no_msix(void *opaque, int version_id) -{ - return !test_msix(opaque, version_id); -} - static int ivshmem_pre_load(void *opaque) { IVShmemState *s = opaque; - if (s->role_val == IVSHMEM_PEER) { + if (!ivshmem_is_master(s)) { error_report("'peer' devices are not migratable"); return -EINVAL; } @@ -1059,12 +974,145 @@ static int ivshmem_post_load(void *opaque, int version_id) IVShmemState *s = opaque; if (ivshmem_has_feature(s, IVSHMEM_MSI)) { - ivshmem_use_msix(s); + ivshmem_msix_vector_use(s); } - return 0; } +static void ivshmem_common_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + + k->realize = ivshmem_common_realize; + k->exit = ivshmem_exit; + k->config_write = ivshmem_write_config; + k->vendor_id = PCI_VENDOR_ID_IVSHMEM; + k->device_id = PCI_DEVICE_ID_IVSHMEM; + k->class_id = PCI_CLASS_MEMORY_RAM; + k->revision = 1; + dc->reset = ivshmem_reset; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + dc->desc = "Inter-VM shared memory"; +} + +static const TypeInfo ivshmem_common_info = { + .name = TYPE_IVSHMEM_COMMON, + .parent = TYPE_PCI_DEVICE, + .instance_size = sizeof(IVShmemState), + .abstract = true, + .class_init = ivshmem_common_class_init, +}; + +static void ivshmem_check_memdev_is_busy(Object *obj, const char *name, + Object *val, Error **errp) +{ + MemoryRegion *mr; + + mr = host_memory_backend_get_memory(MEMORY_BACKEND(val), &error_abort); + if (memory_region_is_mapped(mr)) { + char *path = object_get_canonical_path_component(val); + error_setg(errp, "can't use already busy memdev: %s", path); + g_free(path); + } else { + qdev_prop_allow_set_link_before_realize(obj, name, val, errp); + } +} + +static const VMStateDescription ivshmem_plain_vmsd = { + .name = TYPE_IVSHMEM_PLAIN, + .version_id = 0, + .minimum_version_id = 0, + .pre_load = ivshmem_pre_load, + .post_load = ivshmem_post_load, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, IVShmemState), + VMSTATE_UINT32(intrstatus, IVShmemState), + VMSTATE_UINT32(intrmask, IVShmemState), + VMSTATE_END_OF_LIST() + }, +}; + +static Property ivshmem_plain_properties[] = { + DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF), + DEFINE_PROP_END_OF_LIST(), +}; + +static void ivshmem_plain_init(Object *obj) +{ + IVShmemState *s = IVSHMEM_PLAIN(obj); + + object_property_add_link(obj, "memdev", TYPE_MEMORY_BACKEND, + (Object **)&s->hostmem, + ivshmem_check_memdev_is_busy, + OBJ_PROP_LINK_UNREF_ON_RELEASE, + &error_abort); +} + +static void ivshmem_plain_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->props = ivshmem_plain_properties; + dc->vmsd = &ivshmem_plain_vmsd; +} + +static const TypeInfo ivshmem_plain_info = { + .name = TYPE_IVSHMEM_PLAIN, + .parent = TYPE_IVSHMEM_COMMON, + .instance_size = sizeof(IVShmemState), + .instance_init = ivshmem_plain_init, + .class_init = ivshmem_plain_class_init, +}; + +static const VMStateDescription ivshmem_doorbell_vmsd = { + .name = TYPE_IVSHMEM_DOORBELL, + .version_id = 0, + .minimum_version_id = 0, + .pre_load = ivshmem_pre_load, + .post_load = ivshmem_post_load, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(parent_obj, IVShmemState), + VMSTATE_MSIX(parent_obj, IVShmemState), + VMSTATE_UINT32(intrstatus, IVShmemState), + VMSTATE_UINT32(intrmask, IVShmemState), + VMSTATE_END_OF_LIST() + }, +}; + +static Property ivshmem_doorbell_properties[] = { + DEFINE_PROP_CHR("chardev", IVShmemState, server_chr), + DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1), + DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, + true), + DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF), + DEFINE_PROP_END_OF_LIST(), +}; + +static void ivshmem_doorbell_init(Object *obj) +{ + IVShmemState *s = IVSHMEM_DOORBELL(obj); + + s->features |= (1 << IVSHMEM_MSI); + s->legacy_size = SIZE_MAX; /* whatever the server sends */ +} + +static void ivshmem_doorbell_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->props = ivshmem_doorbell_properties; + dc->vmsd = &ivshmem_doorbell_vmsd; +} + +static const TypeInfo ivshmem_doorbell_info = { + .name = TYPE_IVSHMEM_DOORBELL, + .parent = TYPE_IVSHMEM_COMMON, + .instance_size = sizeof(IVShmemState), + .instance_init = ivshmem_doorbell_init, + .class_init = ivshmem_doorbell_class_init, +}; + static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id) { IVShmemState *s = opaque; @@ -1077,9 +1125,9 @@ static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id) return -EINVAL; } - if (s->role_val == IVSHMEM_PEER) { - error_report("'peer' devices are not migratable"); - return -EINVAL; + ret = ivshmem_pre_load(s); + if (ret) { + return ret; } ret = pci_device_load(pdev, f); @@ -1089,7 +1137,7 @@ static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id) if (ivshmem_has_feature(s, IVSHMEM_MSI)) { msix_load(pdev, f); - ivshmem_use_msix(s); + ivshmem_msix_vector_use(s); } else { s->intrstatus = qemu_get_be32(f); s->intrmask = qemu_get_be32(f); @@ -1098,6 +1146,18 @@ static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id) return 0; } +static bool test_msix(void *opaque, int version_id) +{ + IVShmemState *s = opaque; + + return ivshmem_has_feature(s, IVSHMEM_MSI); +} + +static bool test_no_msix(void *opaque, int version_id) +{ + return !test_msix(opaque, version_id); +} + static const VMStateDescription ivshmem_vmsd = { .name = "ivshmem", .version_id = 1, @@ -1121,68 +1181,110 @@ static Property ivshmem_properties[] = { DEFINE_PROP_CHR("chardev", IVShmemState, server_chr), DEFINE_PROP_STRING("size", IVShmemState, sizearg), DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1), - DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, false), + DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, + false), DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true), DEFINE_PROP_STRING("shm", IVShmemState, shmobj), DEFINE_PROP_STRING("role", IVShmemState, role), - DEFINE_PROP_UINT32("use64", IVShmemState, ivshmem_64bit, 1), + DEFINE_PROP_UINT32("use64", IVShmemState, not_legacy_32bit, 1), DEFINE_PROP_END_OF_LIST(), }; -static void ivshmem_class_init(ObjectClass *klass, void *data) +static void desugar_shm(IVShmemState *s) { - DeviceClass *dc = DEVICE_CLASS(klass); - PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); - - k->realize = pci_ivshmem_realize; - k->exit = pci_ivshmem_exit; - k->config_write = ivshmem_write_config; - k->vendor_id = PCI_VENDOR_ID_IVSHMEM; - k->device_id = PCI_DEVICE_ID_IVSHMEM; - k->class_id = PCI_CLASS_MEMORY_RAM; - dc->reset = ivshmem_reset; - dc->props = ivshmem_properties; - dc->vmsd = &ivshmem_vmsd; - set_bit(DEVICE_CATEGORY_MISC, dc->categories); - dc->desc = "Inter-VM shared memory"; + Object *obj; + char *path; + + obj = object_new("memory-backend-file"); + path = g_strdup_printf("/dev/shm/%s", s->shmobj); + object_property_set_str(obj, path, "mem-path", &error_abort); + g_free(path); + object_property_set_int(obj, s->legacy_size, "size", &error_abort); + object_property_set_bool(obj, true, "share", &error_abort); + object_property_add_child(OBJECT(s), "internal-shm-backend", obj, + &error_abort); + user_creatable_complete(obj, &error_abort); + s->hostmem = MEMORY_BACKEND(obj); } -static void ivshmem_check_memdev_is_busy(Object *obj, const char *name, - Object *val, Error **errp) +static void ivshmem_realize(PCIDevice *dev, Error **errp) { - MemoryRegion *mr; + IVShmemState *s = IVSHMEM_COMMON(dev); - mr = host_memory_backend_get_memory(MEMORY_BACKEND(val), errp); - if (memory_region_is_mapped(mr)) { - char *path = object_get_canonical_path_component(val); - error_setg(errp, "can't use already busy memdev: %s", path); - g_free(path); + if (!qtest_enabled()) { + error_report("ivshmem is deprecated, please use ivshmem-plain" + " or ivshmem-doorbell instead"); + } + + if (!!s->server_chr + !!s->shmobj != 1) { + error_setg(errp, "You must specify either 'shm' or 'chardev'"); + return; + } + + if (s->sizearg == NULL) { + s->legacy_size = 4 << 20; /* 4 MB default */ } else { - qdev_prop_allow_set_link_before_realize(obj, name, val, errp); + char *end; + int64_t size = qemu_strtosz(s->sizearg, &end); + if (size < 0 || (size_t)size != size || *end != '\0' + || !is_power_of_2(size)) { + error_setg(errp, "Invalid size %s", s->sizearg); + return; + } + s->legacy_size = size; } + + /* check that role is reasonable */ + if (s->role) { + if (strncmp(s->role, "peer", 5) == 0) { + s->master = ON_OFF_AUTO_OFF; + } else if (strncmp(s->role, "master", 7) == 0) { + s->master = ON_OFF_AUTO_ON; + } else { + error_setg(errp, "'role' must be 'peer' or 'master'"); + return; + } + } else { + s->master = ON_OFF_AUTO_AUTO; + } + + if (s->shmobj) { + desugar_shm(s); + } + + /* + * Note: we don't use INTx with IVSHMEM_MSI at all, so this is a + * bald-faced lie then. But it's a backwards compatible lie. + */ + pci_config_set_interrupt_pin(dev->config, 1); + + ivshmem_common_realize(dev, errp); } -static void ivshmem_init(Object *obj) +static void ivshmem_class_init(ObjectClass *klass, void *data) { - IVShmemState *s = IVSHMEM(obj); + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); - object_property_add_link(obj, "x-memdev", TYPE_MEMORY_BACKEND, - (Object **)&s->hostmem, - ivshmem_check_memdev_is_busy, - OBJ_PROP_LINK_UNREF_ON_RELEASE, - &error_abort); + k->realize = ivshmem_realize; + k->revision = 0; + dc->desc = "Inter-VM shared memory (legacy)"; + dc->props = ivshmem_properties; + dc->vmsd = &ivshmem_vmsd; } static const TypeInfo ivshmem_info = { .name = TYPE_IVSHMEM, - .parent = TYPE_PCI_DEVICE, + .parent = TYPE_IVSHMEM_COMMON, .instance_size = sizeof(IVShmemState), - .instance_init = ivshmem_init, .class_init = ivshmem_class_init, }; static void ivshmem_register_types(void) { + type_register_static(&ivshmem_common_info); + type_register_static(&ivshmem_plain_info); + type_register_static(&ivshmem_doorbell_info); type_register_static(&ivshmem_info); } diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h index 03a1b91f31..0586cacceb 100644 --- a/include/hw/qdev-properties.h +++ b/include/hw/qdev-properties.h @@ -18,6 +18,7 @@ extern PropertyInfo qdev_prop_string; extern PropertyInfo qdev_prop_chr; extern PropertyInfo qdev_prop_ptr; extern PropertyInfo qdev_prop_macaddr; +extern PropertyInfo qdev_prop_on_off_auto; extern PropertyInfo qdev_prop_losttickpolicy; extern PropertyInfo qdev_prop_bios_chs_trans; extern PropertyInfo qdev_prop_fdc_drive_type; @@ -155,6 +156,8 @@ extern PropertyInfo qdev_prop_arraylen; DEFINE_PROP(_n, _s, _f, qdev_prop_drive, BlockBackend *) #define DEFINE_PROP_MACADDR(_n, _s, _f) \ DEFINE_PROP(_n, _s, _f, qdev_prop_macaddr, MACAddr) +#define DEFINE_PROP_ON_OFF_AUTO(_n, _s, _f, _d) \ + DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_on_off_auto, OnOffAuto) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) diff --git a/qemu-doc.texi b/qemu-doc.texi index bc9dd13cc9..79141d3582 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -1262,13 +1262,18 @@ basic example. @subsection Inter-VM Shared Memory device -With KVM enabled on a Linux host, a shared memory device is available. Guests -map a POSIX shared memory region into the guest as a PCI device that enables -zero-copy communication to the application level of the guests. The basic -syntax is: +On Linux hosts, a shared memory device is available. The basic syntax +is: @example -qemu-system-i386 -device ivshmem,size=@var{size},shm=@var{shm-name} +qemu-system-x86_64 -device ivshmem-plain,memdev=@var{hostmem} +@end example + +where @var{hostmem} names a host memory backend. For a POSIX shared +memory backend, use something like + +@example +-object memory-backend-file,size=1M,share,mem-path=/dev/shm/ivshmem,id=@var{hostmem} @end example If desired, interrupts can be sent between guest VMs accessing the same shared @@ -1282,28 +1287,24 @@ memory server is: ivshmem-server -p @var{pidfile} -S @var{path} -m @var{shm-name} -l @var{shm-size} -n @var{vectors} # Then start your qemu instances with matching arguments -qemu-system-i386 -device ivshmem,size=@var{shm-size},vectors=@var{vectors},chardev=@var{id} - [,msi=on][,ioeventfd=on][,role=peer|master] +qemu-system-x86_64 -device ivshmem-doorbell,vectors=@var{vectors},chardev=@var{id} -chardev socket,path=@var{path},id=@var{id} @end example When using the server, the guest will be assigned a VM ID (>=0) that allows guests using the same server to communicate via interrupts. Guests can read their -VM ID from a device register (see example code). Since receiving the shared -memory region from the server is asynchronous, there is a (small) chance the -guest may boot before the shared memory is attached. To allow an application -to ensure shared memory is attached, the VM ID register will return -1 (an -invalid VM ID) until the memory is attached. Once the shared memory is -attached, the VM ID will return the guest's valid VM ID. With these semantics, -the guest application can check to ensure the shared memory is attached to the -guest before proceeding. - -The @option{role} argument can be set to either master or peer and will affect -how the shared memory is migrated. With @option{role=master}, the guest will -copy the shared memory on migration to the destination host. With -@option{role=peer}, the guest will not be able to migrate with the device attached. -With the @option{peer} case, the device should be detached and then reattached -after migration using the PCI hotplug support. +VM ID from a device register (see ivshmem-spec.txt). + +@subsubsection Migration with ivshmem + +With device property @option{master=on}, the guest will copy the shared +memory on migration to the destination host. With @option{master=off}, +the guest will not be able to migrate with the device attached. In the +latter case, the device should be detached and then reattached after +migration using the PCI hotplug support. + +At most one of the devices sharing the same memory can be master. The +master must complete migration before you plug back the other devices. @subsubsection ivshmem and hugepages @@ -1311,8 +1312,8 @@ Instead of specifying the <shm size> using POSIX shm, you may specify a memory backend that has hugepage support: @example -qemu-system-i386 -object memory-backend-file,size=1G,mem-path=/mnt/hugepages/my-shmem-file,id=mb1 - -device ivshmem,x-memdev=mb1 +qemu-system-x86_64 -object memory-backend-file,size=1G,mem-path=/dev/hugepages/my-shmem-file,share,id=mb1 + -device ivshmem-plain,memdev=mb1 @end example ivshmem-server also supports hugepages mount points with the diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 776336b8b4..2fc993143e 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -333,6 +333,12 @@ static long gethugepagesize(const char *mem_path) return fs.f_bsize; } +/* + * FIXME TOCTTOU: this iterates over memory backends' mem-path, which + * may or may not name the same files / on the same filesystem now as + * when we actually open and map them. Iterate over the file + * descriptors instead, and use qemu_fd_getpagesize(). + */ static int find_max_supported_pagesize(Object *obj, void *opaque) { char *mem_path; diff --git a/tests/Makefile b/tests/Makefile index 60371ca008..d1ff18200f 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -166,7 +166,7 @@ gcov-files-pci-y += hw/display/virtio-gpu-pci.c gcov-files-pci-$(CONFIG_VIRTIO_VGA) += hw/display/virtio-vga.c check-qtest-pci-y += tests/intel-hda-test$(EXESUF) gcov-files-pci-y += hw/audio/intel-hda.c hw/audio/hda-codec.c -check-qtest-pci-$(CONFIG_POSIX) += tests/ivshmem-test$(EXESUF) +check-qtest-pci-$(CONFIG_EVENTFD) += tests/ivshmem-test$(EXESUF) gcov-files-pci-y += hw/misc/ivshmem.c check-qtest-i386-y = tests/endianness-test$(EXESUF) diff --git a/tests/ivshmem-test.c b/tests/ivshmem-test.c index e184c67a1d..c027ff1e09 100644 --- a/tests/ivshmem-test.c +++ b/tests/ivshmem-test.c @@ -110,25 +110,26 @@ static void setup_vm_cmd(IVState *s, const char *cmd, bool msix) s->pcibus = qpci_init_pc(); s->dev = get_device(s->pcibus); - /* FIXME: other bar order fails, mappings changes */ - s->mem_base = qpci_iomap(s->dev, 2, &barsize); - g_assert_nonnull(s->mem_base); - g_assert_cmpuint(barsize, ==, TMPSHMSIZE); + s->reg_base = qpci_iomap(s->dev, 0, &barsize); + g_assert_nonnull(s->reg_base); + g_assert_cmpuint(barsize, ==, 256); if (msix) { qpci_msix_enable(s->dev); } - s->reg_base = qpci_iomap(s->dev, 0, &barsize); - g_assert_nonnull(s->reg_base); - g_assert_cmpuint(barsize, ==, 256); + s->mem_base = qpci_iomap(s->dev, 2, &barsize); + g_assert_nonnull(s->mem_base); + g_assert_cmpuint(barsize, ==, TMPSHMSIZE); qpci_device_enable(s->dev); } static void setup_vm(IVState *s) { - char *cmd = g_strdup_printf("-device ivshmem,shm=%s,size=1M", tmpshm); + char *cmd = g_strdup_printf("-object memory-backend-file" + ",id=mb1,size=1M,share,mem-path=/dev/shm%s" + " -device ivshmem-plain,memdev=mb1", tmpshm); setup_vm_cmd(s, cmd, false); @@ -144,32 +145,41 @@ static void test_ivshmem_single(void) setup_vm(&state); s = &state; - /* valid io */ - out_reg(s, INTRMASK, 0); - in_reg(s, INTRSTATUS); - in_reg(s, IVPOSITION); + /* initial state of readable registers */ + g_assert_cmpuint(in_reg(s, INTRMASK), ==, 0); + g_assert_cmpuint(in_reg(s, INTRSTATUS), ==, 0); + g_assert_cmpuint(in_reg(s, IVPOSITION), ==, 0); + /* trigger interrupt via registers */ out_reg(s, INTRMASK, 0xffffffff); g_assert_cmpuint(in_reg(s, INTRMASK), ==, 0xffffffff); out_reg(s, INTRSTATUS, 1); - /* XXX: intercept IRQ, not seen in resp */ + /* check interrupt status */ g_assert_cmpuint(in_reg(s, INTRSTATUS), ==, 1); + /* reading clears */ + g_assert_cmpuint(in_reg(s, INTRSTATUS), ==, 0); + /* TODO intercept actual interrupt (needs qtest work) */ - /* invalid io */ + /* invalid register access */ out_reg(s, IVPOSITION, 1); + in_reg(s, DOORBELL); + + /* ring the (non-functional) doorbell */ out_reg(s, DOORBELL, 8 << 16); + /* write shared memory */ for (i = 0; i < G_N_ELEMENTS(data); i++) { data[i] = i; } qtest_memwrite(s->qtest, (uintptr_t)s->mem_base, data, sizeof(data)); + /* verify write */ for (i = 0; i < G_N_ELEMENTS(data); i++) { g_assert_cmpuint(((uint32_t *)tmpshmem)[i], ==, i); } + /* read it back and verify read */ memset(data, 0, sizeof(data)); - qtest_memread(s->qtest, (uintptr_t)s->mem_base, data, sizeof(data)); for (i = 0; i < G_N_ELEMENTS(data); i++) { g_assert_cmpuint(data[i], ==, i); @@ -276,8 +286,10 @@ static void *server_thread(void *data) static void setup_vm_with_server(IVState *s, int nvectors, bool msi) { char *cmd = g_strdup_printf("-chardev socket,id=chr0,path=%s,nowait " - "-device ivshmem,size=1M,chardev=chr0,vectors=%d,msi=%s", - tmpserver, nvectors, msi ? "true" : "false"); + "-device ivshmem%s,chardev=chr0,vectors=%d", + tmpserver, + msi ? "-doorbell" : ",size=1M,msi=off", + nvectors); setup_vm_cmd(s, cmd, msi); @@ -293,8 +305,7 @@ static void test_ivshmem_server(bool msi) int nvectors = 2; guint64 end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND; - memset(tmpshmem, 0x42, TMPSHMSIZE); - ret = ivshmem_server_init(&server, tmpserver, tmpshm, + ret = ivshmem_server_init(&server, tmpserver, tmpshm, true, TMPSHMSIZE, nvectors, g_test_verbose()); g_assert_cmpint(ret, ==, 0); @@ -302,49 +313,39 @@ static void test_ivshmem_server(bool msi) ret = ivshmem_server_start(&server); g_assert_cmpint(ret, ==, 0); - setup_vm_with_server(&state1, nvectors, msi); - s1 = &state1; - setup_vm_with_server(&state2, nvectors, msi); - s2 = &state2; - - g_assert_cmpuint(in_reg(s1, IVPOSITION), ==, 0xffffffff); - g_assert_cmpuint(in_reg(s2, IVPOSITION), ==, 0xffffffff); - - g_assert_cmpuint(qtest_readb(s1->qtest, (uintptr_t)s1->mem_base), ==, 0x00); - thread.server = &server; ret = pipe(thread.pipe); g_assert_cmpint(ret, ==, 0); thread.thread = g_thread_new("ivshmem-server", server_thread, &thread); g_assert(thread.thread != NULL); - /* waiting until mapping is done */ - while (g_get_monotonic_time() < end_time) { - g_usleep(1000); - - if (qtest_readb(s1->qtest, (uintptr_t)s1->mem_base) == 0x42 && - qtest_readb(s2->qtest, (uintptr_t)s2->mem_base) == 0x42) { - break; - } - } + setup_vm_with_server(&state1, nvectors, msi); + s1 = &state1; + setup_vm_with_server(&state2, nvectors, msi); + s2 = &state2; /* check got different VM ids */ vm1 = in_reg(s1, IVPOSITION); vm2 = in_reg(s2, IVPOSITION); - g_assert_cmpuint(vm1, !=, vm2); + g_assert_cmpint(vm1, >=, 0); + g_assert_cmpint(vm2, >=, 0); + g_assert_cmpint(vm1, !=, vm2); + /* check number of MSI-X vectors */ global_qtest = s1->qtest; if (msi) { ret = qpci_msix_table_size(s1->dev); g_assert_cmpuint(ret, ==, nvectors); } - /* ping vm2 -> vm1 */ + /* TODO test behavior before MSI-X is enabled */ + + /* ping vm2 -> vm1 on vector 0 */ if (msi) { ret = qpci_msix_pending(s1->dev, 0); g_assert_cmpuint(ret, ==, 0); } else { - out_reg(s1, INTRSTATUS, 0); + g_assert_cmpuint(in_reg(s1, INTRSTATUS), ==, 0); } out_reg(s2, DOORBELL, vm1 << 16); do { @@ -353,18 +354,18 @@ static void test_ivshmem_server(bool msi) } while (ret == 0 && g_get_monotonic_time() < end_time); g_assert_cmpuint(ret, !=, 0); - /* ping vm1 -> vm2 */ + /* ping vm1 -> vm2 on vector 1 */ global_qtest = s2->qtest; if (msi) { - ret = qpci_msix_pending(s2->dev, 0); + ret = qpci_msix_pending(s2->dev, 1); g_assert_cmpuint(ret, ==, 0); } else { - out_reg(s2, INTRSTATUS, 0); + g_assert_cmpuint(in_reg(s2, INTRSTATUS), ==, 0); } - out_reg(s1, DOORBELL, vm2 << 16); + out_reg(s1, DOORBELL, vm2 << 16 | 1); do { g_usleep(10000); - ret = msi ? qpci_msix_pending(s2->dev, 0) : in_reg(s2, INTRSTATUS); + ret = msi ? qpci_msix_pending(s2->dev, 1) : in_reg(s2, INTRSTATUS); } while (ret == 0 && g_get_monotonic_time() < end_time); g_assert_cmpuint(ret, !=, 0); @@ -415,7 +416,7 @@ static void test_ivshmem_memdev(void) /* just for the sake of checking memory-backend property */ setup_vm_cmd(&state, "-object memory-backend-ram,size=1M,id=mb1" - " -device ivshmem,x-memdev=mb1", false); + " -device ivshmem-plain,memdev=mb1", false); cleanup_vm(&state); } diff --git a/tests/libqos/pci-pc.c b/tests/libqos/pci-pc.c index 08167c09fe..77f15e5a0e 100644 --- a/tests/libqos/pci-pc.c +++ b/tests/libqos/pci-pc.c @@ -184,7 +184,9 @@ static void *qpci_pc_iomap(QPCIBus *bus, QPCIDevice *dev, int barno, uint64_t *s if (io_type == PCI_BASE_ADDRESS_SPACE_IO) { uint16_t loc; - g_assert((s->pci_iohole_alloc + size) <= s->pci_iohole_size); + g_assert(QEMU_ALIGN_UP(s->pci_iohole_alloc, size) + size + <= s->pci_iohole_size); + s->pci_iohole_alloc = QEMU_ALIGN_UP(s->pci_iohole_alloc, size); loc = s->pci_iohole_start + s->pci_iohole_alloc; s->pci_iohole_alloc += size; @@ -194,7 +196,9 @@ static void *qpci_pc_iomap(QPCIBus *bus, QPCIDevice *dev, int barno, uint64_t *s } else { uint64_t loc; - g_assert((s->pci_hole_alloc + size) <= s->pci_hole_size); + g_assert(QEMU_ALIGN_UP(s->pci_hole_alloc, size) + size + <= s->pci_hole_size); + s->pci_hole_alloc = QEMU_ALIGN_UP(s->pci_hole_alloc, size); loc = s->pci_hole_start + s->pci_hole_alloc; s->pci_hole_alloc += size; diff --git a/util/event_notifier-posix.c b/util/event_notifier-posix.c index 2e30e74bd6..c9657a61ae 100644 --- a/util/event_notifier-posix.c +++ b/util/event_notifier-posix.c @@ -20,11 +20,17 @@ #include <sys/eventfd.h> #endif +#ifdef CONFIG_EVENTFD +/* + * Initialize @e with existing file descriptor @fd. + * @fd must be a genuine eventfd object, emulation with pipe won't do. + */ void event_notifier_init_fd(EventNotifier *e, int fd) { e->rfd = fd; e->wfd = fd; } +#endif int event_notifier_init(EventNotifier *e, int active) { |