diff options
74 files changed, 3741 insertions, 782 deletions
diff --git a/.gitignore b/.gitignore index c658613560..2286d0a109 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ /qapi-generated /qapi-types.[ch] /qapi-visit.[ch] +/qapi-event.[ch] /qmp-commands.h /qmp-marshal.c /qemu-doc.html @@ -248,7 +248,7 @@ $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py) qapi-modules = $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/qapi/common.json \ $(SRC_PATH)/qapi/block.json $(SRC_PATH)/qapi/block-core.json \ - $(SRC_PATH)/qapi-event.json + $(SRC_PATH)/qapi/event.json qapi-types.c qapi-types.h :\ $(qapi-modules) $(SRC_PATH)/scripts/qapi-types.py $(qapi-py) diff --git a/coroutine-win32.c b/coroutine-win32.c index edc1f72c18..17ace37dee 100644 --- a/coroutine-win32.c +++ b/coroutine-win32.c @@ -36,8 +36,17 @@ typedef struct static __thread CoroutineWin32 leader; static __thread Coroutine *current; -CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, - CoroutineAction action) +/* This function is marked noinline to prevent GCC from inlining it + * into coroutine_trampoline(). If we allow it to do that then it + * hoists the code to get the address of the TLS variable "current" + * out of the while() loop. This is an invalid transformation because + * the SwitchToFiber() call may be called when running thread A but + * return in thread B, and so we might be in a different thread + * context each time round the loop. + */ +CoroutineAction __attribute__((noinline)) +qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) { CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_); CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_); diff --git a/docs/qapi-code-gen.txt b/docs/qapi-code-gen.txt index 3a0c99e1da..a6197a9133 100644 --- a/docs/qapi-code-gen.txt +++ b/docs/qapi-code-gen.txt @@ -218,10 +218,10 @@ An example command is: === Events === Events are defined with the keyword 'event'. When 'data' is also specified, -additional info will be carried on. Finally there will be C API generated -in qapi-event.h; when called by QEMU code, a message with timestamp will -be emitted on the wire. If timestamp is -1, it means failure to retrieve host -time. +additional info will be included in the event. Finally there will be C API +generated in qapi-event.h; when called by QEMU code, a message with timestamp +will be emitted on the wire. If timestamp is -1, it means failure to retrieve +host time. An example event is: diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt new file mode 100644 index 0000000000..44be891261 --- /dev/null +++ b/docs/qmp/qmp-events.txt @@ -0,0 +1,559 @@ + QEMU Machine Protocol Events + ============================ + +ACPI_DEVICE_OST +--------------- + +Emitted when guest executes ACPI _OST method. + + - data: ACPIOSTInfo type as described in qapi-schema.json + +{ "event": "ACPI_DEVICE_OST", + "data": { "device": "d1", "slot": "0", "slot-type": "DIMM", "source": 1, "status": 0 } } + +BALLOON_CHANGE +-------------- + +Emitted when the guest changes the actual BALLOON level. This +value is equivalent to the 'actual' field return by the +'query-balloon' command + +Data: + +- "actual": actual level of the guest memory balloon in bytes (json-number) + +Example: + +{ "event": "BALLOON_CHANGE", + "data": { "actual": 944766976 }, + "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } + +BLOCK_IMAGE_CORRUPTED +--------------------- + +Emitted when a disk image is being marked corrupt. + +Data: + +- "device": Device name (json-string) +- "msg": Informative message (e.g., reason for the corruption) (json-string) +- "offset": If the corruption resulted from an image access, this is the access + offset into the image (json-int) +- "size": If the corruption resulted from an image access, this is the access + size (json-int) + +Example: + +{ "event": "BLOCK_IMAGE_CORRUPTED", + "data": { "device": "ide0-hd0", + "msg": "Prevented active L1 table overwrite", "offset": 196608, + "size": 65536 }, + "timestamp": { "seconds": 1378126126, "microseconds": 966463 } } + +BLOCK_IO_ERROR +-------------- + +Emitted when a disk I/O error occurs. + +Data: + +- "device": device name (json-string) +- "operation": I/O operation (json-string, "read" or "write") +- "action": action that has been taken, it's one of the following (json-string): + "ignore": error has been ignored + "report": error has been reported to the device + "stop": the VM is going to stop because of the error + +Example: + +{ "event": "BLOCK_IO_ERROR", + "data": { "device": "ide0-hd1", + "operation": "write", + "action": "stop" }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +Note: If action is "stop", a STOP event will eventually follow the +BLOCK_IO_ERROR event. + +BLOCK_JOB_CANCELLED +------------------- + +Emitted when a block job has been cancelled. + +Data: + +- "type": Job type (json-string; "stream" for image streaming + "commit" for block commit) +- "device": Device name (json-string) +- "len": Maximum progress value (json-int) +- "offset": Current progress value (json-int) + On success this is equal to len. + On failure this is less than len. +- "speed": Rate limit, bytes per second (json-int) + +Example: + +{ "event": "BLOCK_JOB_CANCELLED", + "data": { "type": "stream", "device": "virtio-disk0", + "len": 10737418240, "offset": 134217728, + "speed": 0 }, + "timestamp": { "seconds": 1267061043, "microseconds": 959568 } } + +BLOCK_JOB_COMPLETED +------------------- + +Emitted when a block job has completed. + +Data: + +- "type": Job type (json-string; "stream" for image streaming + "commit" for block commit) +- "device": Device name (json-string) +- "len": Maximum progress value (json-int) +- "offset": Current progress value (json-int) + On success this is equal to len. + On failure this is less than len. +- "speed": Rate limit, bytes per second (json-int) +- "error": Error message (json-string, optional) + Only present on failure. This field contains a human-readable + error message. There are no semantics other than that streaming + has failed and clients should not try to interpret the error + string. + +Example: + +{ "event": "BLOCK_JOB_COMPLETED", + "data": { "type": "stream", "device": "virtio-disk0", + "len": 10737418240, "offset": 10737418240, + "speed": 0 }, + "timestamp": { "seconds": 1267061043, "microseconds": 959568 } } + +BLOCK_JOB_ERROR +--------------- + +Emitted when a block job encounters an error. + +Data: + +- "device": device name (json-string) +- "operation": I/O operation (json-string, "read" or "write") +- "action": action that has been taken, it's one of the following (json-string): + "ignore": error has been ignored, the job may fail later + "report": error will be reported and the job canceled + "stop": error caused job to be paused + +Example: + +{ "event": "BLOCK_JOB_ERROR", + "data": { "device": "ide0-hd1", + "operation": "write", + "action": "stop" }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +BLOCK_JOB_READY +--------------- + +Emitted when a block job is ready to complete. + +Data: + +- "type": Job type (json-string; "stream" for image streaming + "commit" for block commit) +- "device": Device name (json-string) +- "len": Maximum progress value (json-int) +- "offset": Current progress value (json-int) + On success this is equal to len. + On failure this is less than len. +- "speed": Rate limit, bytes per second (json-int) + +Example: + +{ "event": "BLOCK_JOB_READY", + "data": { "device": "drive0", "type": "mirror", "speed": 0, + "len": 2097152, "offset": 2097152 } + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +Note: The "ready to complete" status is always reset by a BLOCK_JOB_ERROR +event. + +DEVICE_DELETED +-------------- + +Emitted whenever the device removal completion is acknowledged +by the guest. +At this point, it's safe to reuse the specified device ID. +Device removal can be initiated by the guest or by HMP/QMP commands. + +Data: + +- "device": device name (json-string, optional) +- "path": device path (json-string) + +{ "event": "DEVICE_DELETED", + "data": { "device": "virtio-net-pci-0", + "path": "/machine/peripheral/virtio-net-pci-0" }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +DEVICE_TRAY_MOVED +----------------- + +It's emitted whenever the tray of a removable device is moved by the guest +or by HMP/QMP commands. + +Data: + +- "device": device name (json-string) +- "tray-open": true if the tray has been opened or false if it has been closed + (json-bool) + +{ "event": "DEVICE_TRAY_MOVED", + "data": { "device": "ide1-cd0", + "tray-open": true + }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +GUEST_PANICKED +-------------- + +Emitted when guest OS panic is detected. + +Data: + +- "action": Action that has been taken (json-string, currently always "pause"). + +Example: + +{ "event": "GUEST_PANICKED", + "data": { "action": "pause" } } + +NIC_RX_FILTER_CHANGED +--------------------- + +The event is emitted once until the query command is executed, +the first event will always be emitted. + +Data: + +- "name": net client name (json-string) +- "path": device path (json-string) + +{ "event": "NIC_RX_FILTER_CHANGED", + "data": { "name": "vnet0", + "path": "/machine/peripheral/vnet0/virtio-backend" }, + "timestamp": { "seconds": 1368697518, "microseconds": 326866 } } +} + +QUORUM_FAILURE +-------------- + +Emitted by the Quorum block driver if it fails to establish a quorum. + +Data: + +- "reference": device name if defined else node name. +- "sector-num": Number of the first sector of the failed read operation. +- "sector-count": Failed read operation sector count. + +Example: + +{ "event": "QUORUM_FAILURE", + "data": { "reference": "usr1", "sector-num": 345435, "sector-count": 5 }, + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } + +QUORUM_REPORT_BAD +----------------- + +Emitted to report a corruption of a Quorum file. + +Data: + +- "error": Error message (json-string, optional) + Only present on failure. This field contains a human-readable + error message. There are no semantics other than that the + block layer reported an error and clients should not try to + interpret the error string. +- "node-name": The graph node name of the block driver state. +- "sector-num": Number of the first sector of the failed read operation. +- "sector-count": Failed read operation sector count. + +Example: + +{ "event": "QUORUM_REPORT_BAD", + "data": { "node-name": "1.raw", "sector-num": 345435, "sector-count": 5 }, + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } + +RESET +----- + +Emitted when the Virtual Machine is reseted. + +Data: None. + +Example: + +{ "event": "RESET", + "timestamp": { "seconds": 1267041653, "microseconds": 9518 } } + +RESUME +------ + +Emitted when the Virtual Machine resumes execution. + +Data: None. + +Example: + +{ "event": "RESUME", + "timestamp": { "seconds": 1271770767, "microseconds": 582542 } } + +RTC_CHANGE +---------- + +Emitted when the guest changes the RTC time. + +Data: + +- "offset": Offset between base RTC clock (as specified by -rtc base), and +new RTC clock value (json-number) + +Example: + +{ "event": "RTC_CHANGE", + "data": { "offset": 78 }, + "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } + +SHUTDOWN +-------- + +Emitted when the Virtual Machine is powered down. + +Data: None. + +Example: + +{ "event": "SHUTDOWN", + "timestamp": { "seconds": 1267040730, "microseconds": 682951 } } + +Note: If the command-line option "-no-shutdown" has been specified, a STOP +event will eventually follow the SHUTDOWN event. + +SPICE_CONNECTED, SPICE_DISCONNECTED +----------------------------------- + +Emitted when a SPICE client connects or disconnects. + +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") +- "client": Client information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + +Example: + +{ "timestamp": {"seconds": 1290688046, "microseconds": 388707}, + "event": "SPICE_CONNECTED", + "data": { + "server": { "port": "5920", "family": "ipv4", "host": "127.0.0.1"}, + "client": {"port": "52873", "family": "ipv4", "host": "127.0.0.1"} +}} + +SPICE_INITIALIZED +----------------- + +Emitted after initial handshake and authentication takes place (if any) +and the SPICE channel is up'n'running + +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "auth": authentication method (json-string, optional) +- "client": Client information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "connection-id": spice connection id. All channels with the same id + belong to the same spice session (json-int) + - "channel-type": channel type. "1" is the main control channel, filter for + this one if you want track spice sessions only (json-int) + - "channel-id": channel id. Usually "0", might be different needed when + multiple channels of the same type exist, such as multiple + display channels in a multihead setup (json-int) + - "tls": whevener the channel is encrypted (json-bool) + +Example: + +{ "timestamp": {"seconds": 1290688046, "microseconds": 417172}, + "event": "SPICE_INITIALIZED", + "data": {"server": {"auth": "spice", "port": "5921", + "family": "ipv4", "host": "127.0.0.1"}, + "client": {"port": "49004", "family": "ipv4", "channel-type": 3, + "connection-id": 1804289383, "host": "127.0.0.1", + "channel-id": 0, "tls": true} +}} + +STOP +---- + +Emitted when the Virtual Machine is stopped. + +Data: None. + +Example: + +{ "event": "STOP", + "timestamp": { "seconds": 1267041730, "microseconds": 281295 } } + +SUSPEND +------- + +Emitted when guest enters S3 state. + +Data: None. + +Example: + +{ "event": "SUSPEND", + "timestamp": { "seconds": 1344456160, "microseconds": 309119 } } + +SUSPEND_DISK +------------ + +Emitted when the guest makes a request to enter S4 state. + +Data: None. + +Example: + +{ "event": "SUSPEND_DISK", + "timestamp": { "seconds": 1344456160, "microseconds": 309119 } } + +Note: QEMU shuts down when entering S4 state. + +VNC_CONNECTED +------------- + +Emitted when a VNC client establishes a connection. + +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "auth": authentication method (json-string, optional) +- "client": Client information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + +Example: + +{ "event": "VNC_CONNECTED", + "data": { + "server": { "auth": "sasl", "family": "ipv4", + "service": "5901", "host": "0.0.0.0" }, + "client": { "family": "ipv4", "service": "58425", + "host": "127.0.0.1" } }, + "timestamp": { "seconds": 1262976601, "microseconds": 975795 } } + + +Note: This event is emitted before any authentication takes place, thus +the authentication ID is not provided. + +VNC_DISCONNECTED +---------------- + +Emitted when the connection is closed. + +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "auth": authentication method (json-string, optional) +- "client": Client information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "x509_dname": TLS dname (json-string, optional) + - "sasl_username": SASL username (json-string, optional) + +Example: + +{ "event": "VNC_DISCONNECTED", + "data": { + "server": { "auth": "sasl", "family": "ipv4", + "service": "5901", "host": "0.0.0.0" }, + "client": { "family": "ipv4", "service": "58425", + "host": "127.0.0.1", "sasl_username": "luiz" } }, + "timestamp": { "seconds": 1262976601, "microseconds": 975795 } } + +VNC_INITIALIZED +--------------- + +Emitted after authentication takes place (if any) and the VNC session is +made active. + +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "auth": authentication method (json-string, optional) +- "client": Client information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "x509_dname": TLS dname (json-string, optional) + - "sasl_username": SASL username (json-string, optional) + +Example: + +{ "event": "VNC_INITIALIZED", + "data": { + "server": { "auth": "sasl", "family": "ipv4", + "service": "5901", "host": "0.0.0.0"}, + "client": { "family": "ipv4", "service": "46089", + "host": "127.0.0.1", "sasl_username": "luiz" } }, + "timestamp": { "seconds": 1263475302, "microseconds": 150772 } } + +WAKEUP +------ + +Emitted when the guest has woken up from S3 and is running. + +Data: None. + +Example: + +{ "event": "WAKEUP", + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } + +WATCHDOG +-------- + +Emitted when the watchdog device's timer is expired. + +Data: + +- "action": Action that has been taken, it's one of the following (json-string): + "reset", "shutdown", "poweroff", "pause", "debug", or "none" + +Example: + +{ "event": "WATCHDOG", + "data": { "action": "reset" }, + "timestamp": { "seconds": 1267061043, "microseconds": 959568 } } + +Note: If action is "reset", "shutdown", or "pause" the WATCHDOG event is +followed respectively by the RESET, SHUTDOWN, or STOP events. diff --git a/hw/char/virtio-console.c b/hw/char/virtio-console.c index 6c8be0fe26..54eb15f3af 100644 --- a/hw/char/virtio-console.c +++ b/hw/char/virtio-console.c @@ -14,6 +14,7 @@ #include "qemu/error-report.h" #include "trace.h" #include "hw/virtio/virtio-serial.h" +#include "qapi-event.h" #define TYPE_VIRTIO_CONSOLE_SERIAL_PORT "virtserialport" #define VIRTIO_CONSOLE(obj) \ @@ -81,11 +82,16 @@ static ssize_t flush_buf(VirtIOSerialPort *port, static void set_guest_connected(VirtIOSerialPort *port, int guest_connected) { VirtConsole *vcon = VIRTIO_CONSOLE(port); + DeviceState *dev = DEVICE(port); - if (!vcon->chr) { - return; + if (vcon->chr) { + qemu_chr_fe_set_open(vcon->chr, guest_connected); + } + + if (dev->id) { + qapi_event_send_vserport_change(dev->id, guest_connected, + &error_abort); } - qemu_chr_fe_set_open(vcon->chr, guest_connected); } /* Readiness of the guest to accept data on a port */ diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index de433b2e38..52c2f8afa5 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -180,7 +180,6 @@ PropertyInfo qdev_prop_chr = { static int parse_netdev(DeviceState *dev, const char *str, void **ptr) { NICPeers *peers_ptr = (NICPeers *)ptr; - NICConf *conf = container_of(peers_ptr, NICConf, peers); NetClientState **ncs = peers_ptr->ncs; NetClientState *peers[MAX_QUEUE_NUM]; int queues, i = 0; @@ -219,7 +218,7 @@ static int parse_netdev(DeviceState *dev, const char *str, void **ptr) ncs[i]->queue_index = i; } - conf->queues = queues; + peers_ptr->queues = queues; return 0; diff --git a/hw/intc/xics.c b/hw/intc/xics.c index 76dd6f5708..0fd2a84c7b 100644 --- a/hw/intc/xics.c +++ b/hw/intc/xics.c @@ -437,7 +437,7 @@ static void ics_set_irq(void *opaque, int srcno, int val) { ICSState *ics = (ICSState *)opaque; - if (ics->islsi[srcno]) { + if (ics->irqs[srcno].flags & XICS_FLAGS_IRQ_LSI) { set_irq_lsi(ics, srcno, val); } else { set_irq_msi(ics, srcno, val); @@ -474,7 +474,7 @@ static void ics_write_xive(ICSState *ics, int nr, int server, trace_xics_ics_write_xive(nr, srcno, server, priority); - if (ics->islsi[srcno]) { + if (ics->irqs[srcno].flags & XICS_FLAGS_IRQ_LSI) { write_xive_lsi(ics, srcno); } else { write_xive_msi(ics, srcno); @@ -496,7 +496,7 @@ static void ics_resend(ICSState *ics) for (i = 0; i < ics->nr_irqs; i++) { /* FIXME: filter by server#? */ - if (ics->islsi[i]) { + if (ics->irqs[i].flags & XICS_FLAGS_IRQ_LSI) { resend_lsi(ics, i); } else { resend_msi(ics, i); @@ -511,7 +511,7 @@ static void ics_eoi(ICSState *ics, int nr) trace_xics_ics_eoi(nr); - if (ics->islsi[srcno]) { + if (ics->irqs[srcno].flags & XICS_FLAGS_IRQ_LSI) { irq->status &= ~XICS_STATUS_SENT; } } @@ -520,11 +520,18 @@ static void ics_reset(DeviceState *dev) { ICSState *ics = ICS(dev); int i; + uint8_t flags[ics->nr_irqs]; + + for (i = 0; i < ics->nr_irqs; i++) { + flags[i] = ics->irqs[i].flags; + } memset(ics->irqs, 0, sizeof(ICSIRQState) * ics->nr_irqs); + for (i = 0; i < ics->nr_irqs; i++) { ics->irqs[i].priority = 0xff; ics->irqs[i].saved_priority = 0xff; + ics->irqs[i].flags = flags[i]; } } @@ -563,13 +570,14 @@ static int ics_dispatch_post_load(void *opaque, int version_id) static const VMStateDescription vmstate_ics_irq = { .name = "ics/irq", - .version_id = 1, + .version_id = 2, .minimum_version_id = 1, .fields = (VMStateField[]) { VMSTATE_UINT32(server, ICSIRQState), VMSTATE_UINT8(priority, ICSIRQState), VMSTATE_UINT8(saved_priority, ICSIRQState), VMSTATE_UINT8(status, ICSIRQState), + VMSTATE_UINT8(flags, ICSIRQState), VMSTATE_END_OF_LIST() }, }; @@ -606,7 +614,6 @@ static void ics_realize(DeviceState *dev, Error **errp) return; } ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState)); - ics->islsi = g_malloc0(ics->nr_irqs * sizeof(bool)); ics->qirqs = qemu_allocate_irqs(ics_set_irq, ics, ics->nr_irqs); } @@ -633,21 +640,166 @@ static const TypeInfo ics_info = { /* * Exported functions */ +static int xics_find_source(XICSState *icp, int irq) +{ + int sources = 1; + int src; + + /* FIXME: implement multiple sources */ + for (src = 0; src < sources; ++src) { + ICSState *ics = &icp->ics[src]; + if (ics_valid_irq(ics, irq)) { + return src; + } + } + + return -1; +} qemu_irq xics_get_qirq(XICSState *icp, int irq) { - if (!ics_valid_irq(icp->ics, irq)) { - return NULL; + int src = xics_find_source(icp, irq); + + if (src >= 0) { + ICSState *ics = &icp->ics[src]; + return ics->qirqs[irq - ics->offset]; } - return icp->ics->qirqs[irq - icp->ics->offset]; + return NULL; +} + +static void ics_set_irq_type(ICSState *ics, int srcno, bool lsi) +{ + assert(!(ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MASK)); + + ics->irqs[srcno].flags |= + lsi ? XICS_FLAGS_IRQ_LSI : XICS_FLAGS_IRQ_MSI; } void xics_set_irq_type(XICSState *icp, int irq, bool lsi) { - assert(ics_valid_irq(icp->ics, irq)); + int src = xics_find_source(icp, irq); + ICSState *ics; - icp->ics->islsi[irq - icp->ics->offset] = lsi; + assert(src >= 0); + + ics = &icp->ics[src]; + ics_set_irq_type(ics, irq - ics->offset, lsi); +} + +#define ICS_IRQ_FREE(ics, srcno) \ + (!((ics)->irqs[(srcno)].flags & (XICS_FLAGS_IRQ_MASK))) + +static int ics_find_free_block(ICSState *ics, int num, int alignnum) +{ + int first, i; + + for (first = 0; first < ics->nr_irqs; first += alignnum) { + if (num > (ics->nr_irqs - first)) { + return -1; + } + for (i = first; i < first + num; ++i) { + if (!ICS_IRQ_FREE(ics, i)) { + break; + } + } + if (i == (first + num)) { + return first; + } + } + + return -1; +} + +int xics_alloc(XICSState *icp, int src, int irq_hint, bool lsi) +{ + ICSState *ics = &icp->ics[src]; + int irq; + + if (irq_hint) { + assert(src == xics_find_source(icp, irq_hint)); + if (!ICS_IRQ_FREE(ics, irq_hint - ics->offset)) { + trace_xics_alloc_failed_hint(src, irq_hint); + return -1; + } + irq = irq_hint; + } else { + irq = ics_find_free_block(ics, 1, 1); + if (irq < 0) { + trace_xics_alloc_failed_no_left(src); + return -1; + } + irq += ics->offset; + } + + ics_set_irq_type(ics, irq - ics->offset, lsi); + trace_xics_alloc(src, irq); + + return irq; +} + +/* + * Allocate block of consequtive IRQs, returns a number of the first. + * If align==true, aligns the first IRQ number to num. + */ +int xics_alloc_block(XICSState *icp, int src, int num, bool lsi, bool align) +{ + int i, first = -1; + ICSState *ics = &icp->ics[src]; + + assert(src == 0); + /* + * MSIMesage::data is used for storing VIRQ so + * it has to be aligned to num to support multiple + * MSI vectors. MSI-X is not affected by this. + * The hint is used for the first IRQ, the rest should + * be allocated continuously. + */ + if (align) { + assert((num == 1) || (num == 2) || (num == 4) || + (num == 8) || (num == 16) || (num == 32)); + first = ics_find_free_block(ics, num, num); + } else { + first = ics_find_free_block(ics, num, 1); + } + + if (first >= 0) { + for (i = first; i < first + num; ++i) { + ics_set_irq_type(ics, i, lsi); + } + } + first += ics->offset; + + trace_xics_alloc_block(src, first, num, lsi, align); + + return first; +} + +static void ics_free(ICSState *ics, int srcno, int num) +{ + int i; + + for (i = srcno; i < srcno + num; ++i) { + if (ICS_IRQ_FREE(ics, i)) { + trace_xics_ics_free_warn(ics - ics->icp->ics, i + ics->offset); + } + memset(&ics->irqs[i], 0, sizeof(ICSIRQState)); + } +} + +void xics_free(XICSState *icp, int irq, int num) +{ + int src = xics_find_source(icp, irq); + + if (src >= 0) { + ICSState *ics = &icp->ics[src]; + + /* FIXME: implement multiple sources */ + assert(src == 0); + + trace_xics_ics_free(ics - icp->ics, irq, num); + ics_free(ics, irq - ics->offset, num); + } } /* @@ -866,10 +1018,10 @@ static void xics_realize(DeviceState *dev, Error **errp) } /* Registration of global state belongs into realize */ - spapr_rtas_register("ibm,set-xive", rtas_set_xive); - spapr_rtas_register("ibm,get-xive", rtas_get_xive); - spapr_rtas_register("ibm,int-off", rtas_int_off); - spapr_rtas_register("ibm,int-on", rtas_int_on); + spapr_rtas_register(RTAS_IBM_SET_XIVE, "ibm,set-xive", rtas_set_xive); + spapr_rtas_register(RTAS_IBM_GET_XIVE, "ibm,get-xive", rtas_get_xive); + spapr_rtas_register(RTAS_IBM_INT_OFF, "ibm,int-off", rtas_int_off); + spapr_rtas_register(RTAS_IBM_INT_ON, "ibm,int-on", rtas_int_on); spapr_register_hypercall(H_CPPR, h_cppr); spapr_register_hypercall(H_IPI, h_ipi); diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c index 09476ae34d..20b19e9d4f 100644 --- a/hw/intc/xics_kvm.c +++ b/hw/intc/xics_kvm.c @@ -38,10 +38,6 @@ typedef struct KVMXICSState { XICSState parent_obj; - uint32_t set_xive_token; - uint32_t get_xive_token; - uint32_t int_off_token; - uint32_t int_on_token; int kernel_xics_fd; } KVMXICSState; @@ -224,7 +220,7 @@ static int ics_set_kvm_state(ICSState *ics, int version_id) state |= KVM_XICS_MASKED; } - if (ics->islsi[i]) { + if (ics->irqs[i].flags & XICS_FLAGS_IRQ_LSI) { state |= KVM_XICS_LEVEL_SENSITIVE; if (irq->status & XICS_STATUS_ASSERTED) { state |= KVM_XICS_PENDING; @@ -253,7 +249,7 @@ static void ics_kvm_set_irq(void *opaque, int srcno, int val) int rc; args.irq = srcno + ics->offset; - if (!ics->islsi[srcno]) { + if (ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MSI) { if (!val) { return; } @@ -271,11 +267,18 @@ static void ics_kvm_reset(DeviceState *dev) { ICSState *ics = ICS(dev); int i; + uint8_t flags[ics->nr_irqs]; + + for (i = 0; i < ics->nr_irqs; i++) { + flags[i] = ics->irqs[i].flags; + } memset(ics->irqs, 0, sizeof(ICSIRQState) * ics->nr_irqs); + for (i = 0; i < ics->nr_irqs; i++) { ics->irqs[i].priority = 0xff; ics->irqs[i].saved_priority = 0xff; + ics->irqs[i].flags = flags[i]; } ics_set_kvm_state(ics, 1); @@ -290,7 +293,6 @@ static void ics_kvm_realize(DeviceState *dev, Error **errp) return; } ics->irqs = g_malloc0(ics->nr_irqs * sizeof(ICSIRQState)); - ics->islsi = g_malloc0(ics->nr_irqs * sizeof(bool)); ics->qirqs = qemu_allocate_irqs(ics_kvm_set_irq, ics, ics->nr_irqs); } @@ -392,32 +394,30 @@ static void xics_kvm_realize(DeviceState *dev, Error **errp) goto fail; } - icpkvm->set_xive_token = spapr_rtas_register("ibm,set-xive", rtas_dummy); - icpkvm->get_xive_token = spapr_rtas_register("ibm,get-xive", rtas_dummy); - icpkvm->int_off_token = spapr_rtas_register("ibm,int-off", rtas_dummy); - icpkvm->int_on_token = spapr_rtas_register("ibm,int-on", rtas_dummy); + spapr_rtas_register(RTAS_IBM_SET_XIVE, "ibm,set-xive", rtas_dummy); + spapr_rtas_register(RTAS_IBM_GET_XIVE, "ibm,get-xive", rtas_dummy); + spapr_rtas_register(RTAS_IBM_INT_OFF, "ibm,int-off", rtas_dummy); + spapr_rtas_register(RTAS_IBM_INT_ON, "ibm,int-on", rtas_dummy); - rc = kvmppc_define_rtas_kernel_token(icpkvm->set_xive_token, - "ibm,set-xive"); + rc = kvmppc_define_rtas_kernel_token(RTAS_IBM_SET_XIVE, "ibm,set-xive"); if (rc < 0) { error_setg(errp, "kvmppc_define_rtas_kernel_token: ibm,set-xive"); goto fail; } - rc = kvmppc_define_rtas_kernel_token(icpkvm->get_xive_token, - "ibm,get-xive"); + rc = kvmppc_define_rtas_kernel_token(RTAS_IBM_GET_XIVE, "ibm,get-xive"); if (rc < 0) { error_setg(errp, "kvmppc_define_rtas_kernel_token: ibm,get-xive"); goto fail; } - rc = kvmppc_define_rtas_kernel_token(icpkvm->int_on_token, "ibm,int-on"); + rc = kvmppc_define_rtas_kernel_token(RTAS_IBM_INT_ON, "ibm,int-on"); if (rc < 0) { error_setg(errp, "kvmppc_define_rtas_kernel_token: ibm,int-on"); goto fail; } - rc = kvmppc_define_rtas_kernel_token(icpkvm->int_off_token, "ibm,int-off"); + rc = kvmppc_define_rtas_kernel_token(RTAS_IBM_INT_OFF, "ibm,int-off"); if (rc < 0) { error_setg(errp, "kvmppc_define_rtas_kernel_token: ibm,int-off"); goto fail; diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 7437c2e3c3..7b279c4f05 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -39,6 +39,7 @@ #include "qemu/range.h" #include "sysemu/kvm.h" #include "sysemu/sysemu.h" +#include "hw/misc/vfio.h" /* #define DEBUG_VFIO */ #ifdef DEBUG_VFIO @@ -3649,6 +3650,39 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) container->iommu_data.type1.initialized = true; + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU)) { + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); + if (ret) { + error_report("vfio: failed to set group container: %m"); + ret = -errno; + goto free_container_exit; + } + + ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU); + if (ret) { + error_report("vfio: failed to set iommu for container: %m"); + ret = -errno; + goto free_container_exit; + } + + /* + * The host kernel code implementing VFIO_IOMMU_DISABLE is called + * when container fd is closed so we do not call it explicitly + * in this file. + */ + ret = ioctl(fd, VFIO_IOMMU_ENABLE); + if (ret) { + error_report("vfio: failed to enable container: %m"); + ret = -errno; + goto free_container_exit; + } + + container->iommu_data.type1.listener = vfio_memory_listener; + container->iommu_data.release = vfio_listener_release; + + memory_listener_register(&container->iommu_data.type1.listener, + container->space->as); + } else { error_report("vfio: No available IOMMU models"); ret = -EINVAL; @@ -4318,3 +4352,47 @@ static void register_vfio_pci_dev_type(void) } type_init(register_vfio_pci_dev_type) + +static int vfio_container_do_ioctl(AddressSpace *as, int32_t groupid, + int req, void *param) +{ + VFIOGroup *group; + VFIOContainer *container; + int ret = -1; + + group = vfio_get_group(groupid, as); + if (!group) { + error_report("vfio: group %d not registered", groupid); + return ret; + } + + container = group->container; + if (group->container) { + ret = ioctl(container->fd, req, param); + if (ret < 0) { + error_report("vfio: failed to ioctl container: ret=%d, %s", + ret, strerror(errno)); + } + } + + vfio_put_group(group); + + return ret; +} + +int vfio_container_ioctl(AddressSpace *as, int32_t groupid, + int req, void *param) +{ + /* We allow only certain ioctls to the container */ + switch (req) { + case VFIO_CHECK_EXTENSION: + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: + break; + default: + /* Return an error on unknown requests */ + error_report("vfio: unsupported ioctl %X", req); + return -1; + } + + return vfio_container_do_ioctl(as, groupid, req, param); +} diff --git a/hw/net/eepro100.c b/hw/net/eepro100.c index aaa3ff2360..3263e3fe90 100644 --- a/hw/net/eepro100.c +++ b/hw/net/eepro100.c @@ -1217,7 +1217,6 @@ static void eepro100_write_mdi(EEPRO100State *s) break; case 1: /* Status Register */ missing("not writable"); - data = s->mdimem[reg]; break; case 2: /* PHY Identification Register (Word 1) */ case 3: /* PHY Identification Register (Word 2) */ @@ -1230,7 +1229,8 @@ static void eepro100_write_mdi(EEPRO100State *s) default: missing("not implemented"); } - s->mdimem[reg] = data; + s->mdimem[reg] &= eepro100_mdi_mask[reg]; + s->mdimem[reg] |= data & ~eepro100_mdi_mask[reg]; } else if (opcode == 2) { /* MDI read */ switch (reg) { diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 00b5e07ddd..e51d753cee 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1542,7 +1542,7 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size); - n->max_queues = MAX(n->nic_conf.queues, 1); + n->max_queues = MAX(n->nic_conf.peers.queues, 1); n->vqs = g_malloc0(sizeof(VirtIONetQueue) * n->max_queues); n->vqs[0].rx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_rx); n->curr_queues = 1; diff --git a/hw/nvram/spapr_nvram.c b/hw/nvram/spapr_nvram.c index af49c4667d..6a72ef4814 100644 --- a/hw/nvram/spapr_nvram.c +++ b/hw/nvram/spapr_nvram.c @@ -153,8 +153,8 @@ static int spapr_nvram_init(VIOsPAPRDevice *dev) return -1; } - spapr_rtas_register("nvram-fetch", rtas_nvram_fetch); - spapr_rtas_register("nvram-store", rtas_nvram_store); + spapr_rtas_register(RTAS_NVRAM_FETCH, "nvram-fetch", rtas_nvram_fetch); + spapr_rtas_register(RTAS_NVRAM_STORE, "nvram-store", rtas_nvram_store); return 0; } diff --git a/hw/pci-host/uninorth.c b/hw/pci-host/uninorth.c index e72fe2a70b..21f805f314 100644 --- a/hw/pci-host/uninorth.c +++ b/hw/pci-host/uninorth.c @@ -230,7 +230,7 @@ PCIBus *pci_pmac_init(qemu_irq *pic, d = UNI_NORTH_PCI_HOST_BRIDGE(dev); memory_region_init(&d->pci_mmio, OBJECT(d), "pci-mmio", 0x100000000ULL); memory_region_init_alias(&d->pci_hole, OBJECT(d), "pci-hole", &d->pci_mmio, - 0x80000000ULL, 0x70000000ULL); + 0x80000000ULL, 0x10000000ULL); memory_region_add_subregion(address_space_mem, 0x80000000ULL, &d->pci_hole); diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index ea747f0a20..edd44d03e7 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -4,6 +4,9 @@ obj-y += ppc.o ppc_booke.o obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o obj-$(CONFIG_PSERIES) += spapr_pci.o +ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) +obj-y += spapr_pci_vfio.o +endif # PowerPC 4xx boards obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o obj-y += ppc4xx_pci.o diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index a973c18d88..bb2e75fe1b 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -316,10 +316,15 @@ static int ppce500_load_device_tree(MachineState *machine, * device it finds in the dt as serial output device. And we generate * devices in reverse order to the dt. */ - dt_serial_create(fdt, MPC8544_SERIAL1_REGS_OFFSET, - soc, mpic, "serial1", 1, false); - dt_serial_create(fdt, MPC8544_SERIAL0_REGS_OFFSET, - soc, mpic, "serial0", 0, true); + if (serial_hds[1]) { + dt_serial_create(fdt, MPC8544_SERIAL1_REGS_OFFSET, + soc, mpic, "serial1", 1, false); + } + + if (serial_hds[0]) { + dt_serial_create(fdt, MPC8544_SERIAL0_REGS_OFFSET, + soc, mpic, "serial0", 0, true); + } snprintf(gutil, sizeof(gutil), "%s/global-utilities@%llx", soc, MPC8544_UTIL_OFFSET); diff --git a/hw/ppc/mac_newworld.c b/hw/ppc/mac_newworld.c index e493dc1b4b..89d3cadf19 100644 --- a/hw/ppc/mac_newworld.c +++ b/hw/ppc/mac_newworld.c @@ -373,18 +373,11 @@ static void ppc_core99_init(MachineState *machine) machine_arch = ARCH_MAC99; } /* init basic PC hardware */ - pci_vga_init(pci_bus); - escc_mem = escc_init(0, pic[0x25], pic[0x24], serial_hds[0], serial_hds[1], ESCC_CLOCK, 4); memory_region_init_alias(escc_bar, NULL, "escc-bar", escc_mem, 0, memory_region_size(escc_mem)); - for(i = 0; i < nb_nics; i++) - pci_nic_init_nofail(&nd_table[i], pci_bus, "ne2k_pci", NULL); - - ide_drive_get(hd, MAX_IDE_BUS); - macio = pci_create(pci_bus, -1, TYPE_NEWWORLD_MACIO); dev = DEVICE(macio); qdev_connect_gpio_out(dev, 0, pic[0x19]); /* CUDA */ @@ -395,6 +388,8 @@ static void ppc_core99_init(MachineState *machine) macio_init(macio, pic_mem, escc_bar); /* We only emulate 2 out of 3 IDE controllers for now */ + ide_drive_get(hd, MAX_IDE_BUS); + macio_ide = MACIO_IDE(object_resolve_path_component(OBJECT(macio), "ide[0]")); macio_ide_init_drives(macio_ide, hd); @@ -420,8 +415,15 @@ static void ppc_core99_init(MachineState *machine) } } - if (graphic_depth != 15 && graphic_depth != 32 && graphic_depth != 8) + pci_vga_init(pci_bus); + + if (graphic_depth != 15 && graphic_depth != 32 && graphic_depth != 8) { graphic_depth = 15; + } + + for (i = 0; i < nb_nics; i++) { + pci_nic_init_nofail(&nd_table[i], pci_bus, "ne2k_pci", NULL); + } /* The NewWorld NVRAM is not located in the MacIO device */ dev = qdev_create(NULL, TYPE_MACIO_NVRAM); diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 82f183f173..a8ba916970 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -85,16 +85,16 @@ #define HTAB_SIZE(spapr) (1ULL << ((spapr)->htab_shift)) +typedef struct sPAPRMachineState sPAPRMachineState; -typedef struct SPAPRMachine SPAPRMachine; #define TYPE_SPAPR_MACHINE "spapr-machine" #define SPAPR_MACHINE(obj) \ - OBJECT_CHECK(SPAPRMachine, (obj), TYPE_SPAPR_MACHINE) + OBJECT_CHECK(sPAPRMachineState, (obj), TYPE_SPAPR_MACHINE) /** - * SPAPRMachine: + * sPAPRMachineState: */ -struct SPAPRMachine { +struct sPAPRMachineState { /*< private >*/ MachineState parent_obj; @@ -102,76 +102,8 @@ struct SPAPRMachine { char *kvm_type; }; - sPAPREnvironment *spapr; -int spapr_allocate_irq(int hint, bool lsi) -{ - int irq; - - if (hint) { - irq = hint; - if (hint >= spapr->next_irq) { - spapr->next_irq = hint + 1; - } - /* FIXME: we should probably check for collisions somehow */ - } else { - irq = spapr->next_irq++; - } - - /* Configure irq type */ - if (!xics_get_qirq(spapr->icp, irq)) { - return 0; - } - - xics_set_irq_type(spapr->icp, irq, lsi); - - return irq; -} - -/* - * Allocate block of consequtive IRQs, returns a number of the first. - * If msi==true, aligns the first IRQ number to num. - */ -int spapr_allocate_irq_block(int num, bool lsi, bool msi) -{ - int first = -1; - int i, hint = 0; - - /* - * MSIMesage::data is used for storing VIRQ so - * it has to be aligned to num to support multiple - * MSI vectors. MSI-X is not affected by this. - * The hint is used for the first IRQ, the rest should - * be allocated continuously. - */ - if (msi) { - assert((num == 1) || (num == 2) || (num == 4) || - (num == 8) || (num == 16) || (num == 32)); - hint = (spapr->next_irq + num - 1) & ~(num - 1); - } - - for (i = 0; i < num; ++i) { - int irq; - - irq = spapr_allocate_irq(hint, lsi); - if (!irq) { - return -1; - } - - if (0 == i) { - first = irq; - hint = 0; - } - - /* If the above doesn't create a consecutive block then that's - * an internal bug */ - assert(irq == (first + i)); - } - - return first; -} - static XICSState *try_create_xics(const char *type, int nr_servers, int nr_irqs) { @@ -442,6 +374,9 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base, if (boot_device) { _FDT((fdt_property_string(fdt, "qemu,boot-device", boot_device))); } + if (boot_menu) { + _FDT((fdt_property_cell(fdt, "qemu,boot-menu", boot_menu))); + } _FDT((fdt_property_cell(fdt, "qemu,graphic-width", graphic_width))); _FDT((fdt_property_cell(fdt, "qemu,graphic-height", graphic_height))); _FDT((fdt_property_cell(fdt, "qemu,graphic-depth", graphic_depth))); @@ -956,7 +891,7 @@ static const VMStateDescription vmstate_spapr = { .version_id = 2, .minimum_version_id = 1, .fields = (VMStateField[]) { - VMSTATE_UINT32(next_irq, sPAPREnvironment), + VMSTATE_UNUSED(4), /* used to be @next_irq */ /* RTC offset */ VMSTATE_UINT64(rtc_offset, sPAPREnvironment), @@ -1360,7 +1295,6 @@ static void ppc_spapr_init(MachineState *machine) /* Set up Interrupt Controller before we create the VCPUs */ spapr->icp = xics_system_init(smp_cpus * kvmppc_smt_threads() / smp_threads, XICS_IRQS); - spapr->next_irq = XICS_IRQ_BASE; /* init CPUs */ if (cpu_model == NULL) { @@ -1619,14 +1553,14 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, static char *spapr_get_kvm_type(Object *obj, Error **errp) { - SPAPRMachine *sm = SPAPR_MACHINE(obj); + sPAPRMachineState *sm = SPAPR_MACHINE(obj); return g_strdup(sm->kvm_type); } static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp) { - SPAPRMachine *sm = SPAPR_MACHINE(obj); + sPAPRMachineState *sm = SPAPR_MACHINE(obj); g_free(sm->kvm_type); sm->kvm_type = g_strdup(value); @@ -1660,7 +1594,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) static const TypeInfo spapr_machine_info = { .name = TYPE_SPAPR_MACHINE, .parent = TYPE_MACHINE, - .instance_size = sizeof(SPAPRMachine), + .instance_size = sizeof(sPAPRMachineState), .instance_init = spapr_machine_initfn, .class_init = spapr_machine_class_init, .interfaces = (InterfaceInfo[]) { @@ -1669,9 +1603,25 @@ static const TypeInfo spapr_machine_info = { }, }; +static void spapr_machine_2_1_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + + mc->name = "pseries-2.1"; + mc->desc = "pSeries Logical Partition (PAPR compliant) v2.1"; + mc->is_default = 0; +} + +static const TypeInfo spapr_machine_2_1_info = { + .name = TYPE_SPAPR_MACHINE "2.1", + .parent = TYPE_SPAPR_MACHINE, + .class_init = spapr_machine_2_1_class_init, +}; + static void spapr_machine_register_types(void) { type_register_static(&spapr_machine_info); + type_register_static(&spapr_machine_2_1_info); } type_init(spapr_machine_register_types) diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c index 16fa49e886..1b6157dec4 100644 --- a/hw/ppc/spapr_events.c +++ b/hw/ppc/spapr_events.c @@ -314,8 +314,9 @@ static void check_exception(PowerPCCPU *cpu, sPAPREnvironment *spapr, void spapr_events_init(sPAPREnvironment *spapr) { - spapr->epow_irq = spapr_allocate_msi(0); + spapr->epow_irq = xics_alloc(spapr->icp, 0, 0, false); spapr->epow_notifier.notify = spapr_powerdown_req; qemu_register_powerdown_notifier(&spapr->epow_notifier); - spapr_rtas_register("check-exception", check_exception); + spapr_rtas_register(RTAS_CHECK_EXCEPTION, "check-exception", + check_exception); } diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 9e49ec4a5c..698ae60953 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -118,7 +118,8 @@ static int spapr_tce_table_realize(DeviceState *dev) tcet->table = kvmppc_create_spapr_tce(tcet->liobn, tcet->nb_table << tcet->page_shift, - &tcet->fd); + &tcet->fd, + tcet->vfio_accel); } if (!tcet->table) { @@ -142,7 +143,8 @@ static int spapr_tce_table_realize(DeviceState *dev) sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn, uint64_t bus_offset, uint32_t page_shift, - uint32_t nb_table) + uint32_t nb_table, + bool vfio_accel) { sPAPRTCETable *tcet; @@ -161,6 +163,7 @@ sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn, tcet->bus_offset = bus_offset; tcet->page_shift = page_shift; tcet->nb_table = nb_table; + tcet->vfio_accel = vfio_accel; object_property_add_child(OBJECT(owner), "tce-table", OBJECT(tcet), NULL); diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 988f8cb858..9ed39a93b7 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -220,36 +220,12 @@ static void rtas_write_pci_config(PowerPCCPU *cpu, sPAPREnvironment *spapr, } /* - * Find an entry with config_addr or returns the empty one if not found AND - * alloc_new is set. - * At the moment the msi_table entries are never released so there is - * no point to look till the end of the list if we need to find the free entry. - */ -static int spapr_msicfg_find(sPAPRPHBState *phb, uint32_t config_addr, - bool alloc_new) -{ - int i; - - for (i = 0; i < SPAPR_MSIX_MAX_DEVS; ++i) { - if (!phb->msi_table[i].nvec) { - break; - } - if (phb->msi_table[i].config_addr == config_addr) { - return i; - } - } - if ((i < SPAPR_MSIX_MAX_DEVS) && alloc_new) { - trace_spapr_pci_msi("Allocating new MSI config", i, config_addr); - return i; - } - - return -1; -} - -/* * Set MSI/MSIX message data. * This is required for msi_notify()/msix_notify() which * will write at the addresses via spapr_msi_write(). + * + * If hwaddr == 0, all entries will have .data == first_irq i.e. + * table will be reset. */ static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr, bool msix, unsigned first_irq, unsigned req_num) @@ -263,9 +239,12 @@ static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr, bool msix, return; } - for (i = 0; i < req_num; ++i, ++msg.data) { + for (i = 0; i < req_num; ++i) { msix_set_message(pdev, i, msg); trace_spapr_pci_msi_setup(pdev->name, i, msg.address); + if (addr) { + ++msg.data; + } } } @@ -280,9 +259,12 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr, unsigned int req_num = rtas_ld(args, 4); /* 0 == remove all */ unsigned int seq_num = rtas_ld(args, 5); unsigned int ret_intr_type; - int ndev, irq, max_irqs = 0; + unsigned int irq, max_irqs = 0, num = 0; sPAPRPHBState *phb = NULL; PCIDevice *pdev = NULL; + bool msix = false; + spapr_pci_msi *msi; + int *config_addr_key; switch (func) { case RTAS_CHANGE_MSI_FN: @@ -310,13 +292,18 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr, /* Releasing MSIs */ if (!req_num) { - ndev = spapr_msicfg_find(phb, config_addr, false); - if (ndev < 0) { - trace_spapr_pci_msi("MSI has not been enabled", -1, config_addr); + msi = (spapr_pci_msi *) g_hash_table_lookup(phb->msi, &config_addr); + if (!msi) { + trace_spapr_pci_msi("Releasing wrong config", config_addr); rtas_st(rets, 0, RTAS_OUT_HW_ERROR); return; } - trace_spapr_pci_msi("Released MSIs", ndev, config_addr); + + xics_free(spapr->icp, msi->first_irq, msi->num); + spapr_msi_setmsg(pdev, 0, msix, 0, num); + g_hash_table_remove(phb->msi, &config_addr); + + trace_spapr_pci_msi("Released MSIs", config_addr); rtas_st(rets, 0, RTAS_OUT_SUCCESS); rtas_st(rets, 1, 0); return; @@ -324,15 +311,6 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr, /* Enabling MSI */ - /* Find a device number in the map to add or reuse the existing one */ - ndev = spapr_msicfg_find(phb, config_addr, true); - if (ndev >= SPAPR_MSIX_MAX_DEVS || ndev < 0) { - error_report("No free entry for a new MSI device"); - rtas_st(rets, 0, RTAS_OUT_HW_ERROR); - return; - } - trace_spapr_pci_msi("Configuring MSI", ndev, config_addr); - /* Check if the device supports as many IRQs as requested */ if (ret_intr_type == RTAS_TYPE_MSI) { max_irqs = msi_nr_vectors_allocated(pdev); @@ -340,48 +318,47 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr, max_irqs = pdev->msix_entries_nr; } if (!max_irqs) { - error_report("Requested interrupt type %d is not enabled for device#%d", - ret_intr_type, ndev); + error_report("Requested interrupt type %d is not enabled for device %x", + ret_intr_type, config_addr); rtas_st(rets, 0, -1); /* Hardware error */ return; } /* Correct the number if the guest asked for too many */ if (req_num > max_irqs) { + trace_spapr_pci_msi_retry(config_addr, req_num, max_irqs); req_num = max_irqs; + irq = 0; /* to avoid misleading trace */ + goto out; } - /* Check if there is an old config and MSI number has not changed */ - if (phb->msi_table[ndev].nvec && (req_num != phb->msi_table[ndev].nvec)) { - /* Unexpected behaviour */ - error_report("Cannot reuse MSI config for device#%d", ndev); + /* Allocate MSIs */ + irq = xics_alloc_block(spapr->icp, 0, req_num, false, + ret_intr_type == RTAS_TYPE_MSI); + if (!irq) { + error_report("Cannot allocate MSIs for device %x", config_addr); rtas_st(rets, 0, RTAS_OUT_HW_ERROR); return; } - /* There is no cached config, allocate MSIs */ - if (!phb->msi_table[ndev].nvec) { - irq = spapr_allocate_irq_block(req_num, false, - ret_intr_type == RTAS_TYPE_MSI); - if (irq < 0) { - error_report("Cannot allocate MSIs for device#%d", ndev); - rtas_st(rets, 0, RTAS_OUT_HW_ERROR); - return; - } - phb->msi_table[ndev].irq = irq; - phb->msi_table[ndev].nvec = req_num; - phb->msi_table[ndev].config_addr = config_addr; - } - /* Setup MSI/MSIX vectors in the device (via cfgspace or MSIX BAR) */ spapr_msi_setmsg(pdev, spapr->msi_win_addr, ret_intr_type == RTAS_TYPE_MSIX, - phb->msi_table[ndev].irq, req_num); + irq, req_num); + /* Add MSI device to cache */ + msi = g_new(spapr_pci_msi, 1); + msi->first_irq = irq; + msi->num = req_num; + config_addr_key = g_new(int, 1); + *config_addr_key = config_addr; + g_hash_table_insert(phb->msi, config_addr_key, msi); + +out: rtas_st(rets, 0, RTAS_OUT_SUCCESS); rtas_st(rets, 1, req_num); rtas_st(rets, 2, ++seq_num); rtas_st(rets, 3, ret_intr_type); - trace_spapr_pci_rtas_ibm_change_msi(func, req_num); + trace_spapr_pci_rtas_ibm_change_msi(config_addr, func, req_num, irq); } static void rtas_ibm_query_interrupt_source_number(PowerPCCPU *cpu, @@ -395,25 +372,28 @@ static void rtas_ibm_query_interrupt_source_number(PowerPCCPU *cpu, uint32_t config_addr = rtas_ld(args, 0); uint64_t buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); unsigned int intr_src_num = -1, ioa_intr_num = rtas_ld(args, 3); - int ndev; sPAPRPHBState *phb = NULL; + PCIDevice *pdev = NULL; + spapr_pci_msi *msi; - /* Fins sPAPRPHBState */ + /* Find sPAPRPHBState */ phb = find_phb(spapr, buid); - if (!phb) { + if (phb) { + pdev = find_dev(spapr, buid, config_addr); + } + if (!phb || !pdev) { rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); return; } /* Find device descriptor and start IRQ */ - ndev = spapr_msicfg_find(phb, config_addr, false); - if (ndev < 0) { - trace_spapr_pci_msi("MSI has not been enabled", -1, config_addr); + msi = (spapr_pci_msi *) g_hash_table_lookup(phb->msi, &config_addr); + if (!msi || !msi->first_irq || !msi->num || (ioa_intr_num >= msi->num)) { + trace_spapr_pci_msi("Failed to return vector", config_addr); rtas_st(rets, 0, RTAS_OUT_HW_ERROR); return; } - - intr_src_num = phb->msi_table[ndev].irq + ioa_intr_num; + intr_src_num = msi->first_irq + ioa_intr_num; trace_spapr_pci_rtas_ibm_query_interrupt_source_number(ioa_intr_num, intr_src_num); @@ -634,7 +614,7 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) for (i = 0; i < PCI_NUM_PINS; i++) { uint32_t irq; - irq = spapr_allocate_lsi(0); + irq = xics_alloc_block(spapr->icp, 0, 1, true, false); if (!irq) { error_setg(errp, "spapr_allocate_lsi failed"); return; @@ -649,6 +629,8 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp) } info->finish_realize(sphb, errp); + + sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, g_free); } static void spapr_phb_finish_realize(sPAPRPHBState *sphb, Error **errp) @@ -658,7 +640,7 @@ static void spapr_phb_finish_realize(sPAPRPHBState *sphb, Error **errp) tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn, 0, SPAPR_TCE_PAGE_SHIFT, - 0x40000000 >> SPAPR_TCE_PAGE_SHIFT); + 0x40000000 >> SPAPR_TCE_PAGE_SHIFT, false); if (!tcet) { error_setg(errp, "Unable to create TCE table for %s", sphb->dtbusname); @@ -712,22 +694,69 @@ static const VMStateDescription vmstate_spapr_pci_lsi = { }; static const VMStateDescription vmstate_spapr_pci_msi = { - .name = "spapr_pci/lsi", + .name = "spapr_pci/msi", .version_id = 1, .minimum_version_id = 1, - .fields = (VMStateField[]) { - VMSTATE_UINT32(config_addr, struct spapr_pci_msi), - VMSTATE_UINT32(irq, struct spapr_pci_msi), - VMSTATE_UINT32(nvec, struct spapr_pci_msi), - + .fields = (VMStateField []) { + VMSTATE_UINT32(key, spapr_pci_msi_mig), + VMSTATE_UINT32(value.first_irq, spapr_pci_msi_mig), + VMSTATE_UINT32(value.num, spapr_pci_msi_mig), VMSTATE_END_OF_LIST() }, }; +static void spapr_pci_pre_save(void *opaque) +{ + sPAPRPHBState *sphb = opaque; + GHashTableIter iter; + gpointer key, value; + int i; + + if (sphb->msi_devs) { + g_free(sphb->msi_devs); + sphb->msi_devs = NULL; + } + sphb->msi_devs_num = g_hash_table_size(sphb->msi); + if (!sphb->msi_devs_num) { + return; + } + sphb->msi_devs = g_malloc(sphb->msi_devs_num * sizeof(spapr_pci_msi_mig)); + + g_hash_table_iter_init(&iter, sphb->msi); + for (i = 0; g_hash_table_iter_next(&iter, &key, &value); ++i) { + sphb->msi_devs[i].key = *(uint32_t *) key; + sphb->msi_devs[i].value = *(spapr_pci_msi *) value; + } +} + +static int spapr_pci_post_load(void *opaque, int version_id) +{ + sPAPRPHBState *sphb = opaque; + gpointer key, value; + int i; + + for (i = 0; i < sphb->msi_devs_num; ++i) { + key = g_memdup(&sphb->msi_devs[i].key, + sizeof(sphb->msi_devs[i].key)); + value = g_memdup(&sphb->msi_devs[i].value, + sizeof(sphb->msi_devs[i].value)); + g_hash_table_insert(sphb->msi, key, value); + } + if (sphb->msi_devs) { + g_free(sphb->msi_devs); + sphb->msi_devs = NULL; + } + sphb->msi_devs_num = 0; + + return 0; +} + static const VMStateDescription vmstate_spapr_pci = { .name = "spapr_pci", - .version_id = 1, - .minimum_version_id = 1, + .version_id = 2, + .minimum_version_id = 2, + .pre_save = spapr_pci_pre_save, + .post_load = spapr_pci_post_load, .fields = (VMStateField[]) { VMSTATE_UINT64_EQUAL(buid, sPAPRPHBState), VMSTATE_UINT32_EQUAL(dma_liobn, sPAPRPHBState), @@ -737,9 +766,9 @@ static const VMStateDescription vmstate_spapr_pci = { VMSTATE_UINT64_EQUAL(io_win_size, sPAPRPHBState), VMSTATE_STRUCT_ARRAY(lsi_table, sPAPRPHBState, PCI_NUM_PINS, 0, vmstate_spapr_pci_lsi, struct spapr_pci_lsi), - VMSTATE_STRUCT_ARRAY(msi_table, sPAPRPHBState, SPAPR_MSIX_MAX_DEVS, 0, - vmstate_spapr_pci_msi, struct spapr_pci_msi), - + VMSTATE_INT32(msi_devs_num, sPAPRPHBState), + VMSTATE_STRUCT_VARRAY_ALLOC(msi_devs, sPAPRPHBState, msi_devs_num, 0, + vmstate_spapr_pci_msi, spapr_pci_msi_mig), VMSTATE_END_OF_LIST() }, }; @@ -909,14 +938,20 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, void spapr_pci_rtas_init(void) { - spapr_rtas_register("read-pci-config", rtas_read_pci_config); - spapr_rtas_register("write-pci-config", rtas_write_pci_config); - spapr_rtas_register("ibm,read-pci-config", rtas_ibm_read_pci_config); - spapr_rtas_register("ibm,write-pci-config", rtas_ibm_write_pci_config); + spapr_rtas_register(RTAS_READ_PCI_CONFIG, "read-pci-config", + rtas_read_pci_config); + spapr_rtas_register(RTAS_WRITE_PCI_CONFIG, "write-pci-config", + rtas_write_pci_config); + spapr_rtas_register(RTAS_IBM_READ_PCI_CONFIG, "ibm,read-pci-config", + rtas_ibm_read_pci_config); + spapr_rtas_register(RTAS_IBM_WRITE_PCI_CONFIG, "ibm,write-pci-config", + rtas_ibm_write_pci_config); if (msi_supported) { - spapr_rtas_register("ibm,query-interrupt-source-number", + spapr_rtas_register(RTAS_IBM_QUERY_INTERRUPT_SOURCE_NUMBER, + "ibm,query-interrupt-source-number", rtas_ibm_query_interrupt_source_number); - spapr_rtas_register("ibm,change-msi", rtas_ibm_change_msi); + spapr_rtas_register(RTAS_IBM_CHANGE_MSI, "ibm,change-msi", + rtas_ibm_change_msi); } } diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c new file mode 100644 index 0000000000..d3bddf2887 --- /dev/null +++ b/hw/ppc/spapr_pci_vfio.c @@ -0,0 +1,102 @@ +/* + * QEMU sPAPR PCI host for VFIO + * + * Copyright (c) 2011-2014 Alexey Kardashevskiy, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "hw/ppc/spapr.h" +#include "hw/pci-host/spapr.h" +#include "linux/vfio.h" +#include "hw/misc/vfio.h" + +static Property spapr_phb_vfio_properties[] = { + DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1), + DEFINE_PROP_END_OF_LIST(), +}; + +static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp) +{ + sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); + struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) }; + int ret; + sPAPRTCETable *tcet; + uint32_t liobn = svphb->phb.dma_liobn; + + if (svphb->iommugroupid == -1) { + error_setg(errp, "Wrong IOMMU group ID %d", svphb->iommugroupid); + return; + } + + ret = vfio_container_ioctl(&svphb->phb.iommu_as, svphb->iommugroupid, + VFIO_CHECK_EXTENSION, + (void *) VFIO_SPAPR_TCE_IOMMU); + if (ret != 1) { + error_setg_errno(errp, -ret, + "spapr-vfio: SPAPR extension is not supported"); + return; + } + + ret = vfio_container_ioctl(&svphb->phb.iommu_as, svphb->iommugroupid, + VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + error_setg_errno(errp, -ret, + "spapr-vfio: get info from container failed"); + return; + } + + tcet = spapr_tce_new_table(DEVICE(sphb), liobn, info.dma32_window_start, + SPAPR_TCE_PAGE_SHIFT, + info.dma32_window_size >> SPAPR_TCE_PAGE_SHIFT, + true); + if (!tcet) { + error_setg(errp, "spapr-vfio: failed to create VFIO TCE table"); + return; + } + + /* Register default 32bit DMA window */ + memory_region_add_subregion(&sphb->iommu_root, tcet->bus_offset, + spapr_tce_get_iommu(tcet)); +} + +static void spapr_phb_vfio_reset(DeviceState *qdev) +{ + /* Do nothing */ +} + +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + sPAPRPHBClass *spc = SPAPR_PCI_HOST_BRIDGE_CLASS(klass); + + dc->props = spapr_phb_vfio_properties; + dc->reset = spapr_phb_vfio_reset; + spc->finish_realize = spapr_phb_vfio_finish_realize; +} + +static const TypeInfo spapr_phb_vfio_info = { + .name = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE, + .parent = TYPE_SPAPR_PCI_HOST_BRIDGE, + .instance_size = sizeof(sPAPRPHBVFIOState), + .class_init = spapr_phb_vfio_class_init, + .class_size = sizeof(sPAPRPHBClass), +}; + +static void spapr_pci_vfio_register_types(void) +{ + type_register_static(&spapr_phb_vfio_info); +} + +type_init(spapr_pci_vfio_register_types) diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c index 8d08539baa..9ba1ba69f9 100644 --- a/hw/ppc/spapr_rtas.c +++ b/hw/ppc/spapr_rtas.c @@ -36,9 +36,6 @@ #include <libfdt.h> -#define TOKEN_BASE 0x2000 -#define TOKEN_MAX 0x100 - static void rtas_display_character(PowerPCCPU *cpu, sPAPREnvironment *spapr, uint32_t token, uint32_t nargs, target_ulong args, @@ -225,8 +222,6 @@ static void rtas_stop_self(PowerPCCPU *cpu, sPAPREnvironment *spapr, env->msr = 0; } -#define DIAGNOSTICS_RUN_MODE 42 - static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu, sPAPREnvironment *spapr, uint32_t token, uint32_t nargs, @@ -236,16 +231,28 @@ static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu, target_ulong parameter = rtas_ld(args, 0); target_ulong buffer = rtas_ld(args, 1); target_ulong length = rtas_ld(args, 2); - target_ulong ret = RTAS_OUT_NOT_SUPPORTED; + target_ulong ret = RTAS_OUT_SUCCESS; switch (parameter) { - case DIAGNOSTICS_RUN_MODE: - if (length == 1) { - rtas_st(buffer, 0, 0); - ret = RTAS_OUT_SUCCESS; - } + case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: { + char *param_val = g_strdup_printf("MaxEntCap=%d,MaxPlatProcs=%d", + max_cpus, smp_cpus); + rtas_st_buffer(buffer, length, (uint8_t *)param_val, strlen(param_val)); + g_free(param_val); break; } + case RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE: { + uint8_t param_val = DIAGNOSTICS_RUN_MODE_DISABLED; + + rtas_st_buffer(buffer, length, ¶m_val, sizeof(param_val)); + break; + } + case RTAS_SYSPARM_UUID: + rtas_st_buffer(buffer, length, qemu_uuid, (qemu_uuid_set ? 16 : 0)); + break; + default: + ret = RTAS_OUT_NOT_SUPPORTED; + } rtas_st(rets, 0, ret); } @@ -260,7 +267,9 @@ static void rtas_ibm_set_system_parameter(PowerPCCPU *cpu, target_ulong ret = RTAS_OUT_NOT_SUPPORTED; switch (parameter) { - case DIAGNOSTICS_RUN_MODE: + case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: + case RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE: + case RTAS_SYSPARM_UUID: ret = RTAS_OUT_NOT_AUTHORIZED; break; } @@ -271,17 +280,14 @@ static void rtas_ibm_set_system_parameter(PowerPCCPU *cpu, static struct rtas_call { const char *name; spapr_rtas_fn fn; -} rtas_table[TOKEN_MAX]; - -static struct rtas_call *rtas_next = rtas_table; +} rtas_table[RTAS_TOKEN_MAX - RTAS_TOKEN_BASE]; target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPREnvironment *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) { - if ((token >= TOKEN_BASE) - && ((token - TOKEN_BASE) < TOKEN_MAX)) { - struct rtas_call *call = rtas_table + (token - TOKEN_BASE); + if ((token >= RTAS_TOKEN_BASE) && (token < RTAS_TOKEN_MAX)) { + struct rtas_call *call = rtas_table + (token - RTAS_TOKEN_BASE); if (call->fn) { call->fn(cpu, spapr, token, nargs, args, nret, rets); @@ -303,23 +309,22 @@ target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_PARAMETER; } -int spapr_rtas_register(const char *name, spapr_rtas_fn fn) +void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn) { - int i; - - for (i = 0; i < (rtas_next - rtas_table); i++) { - if (strcmp(name, rtas_table[i].name) == 0) { - fprintf(stderr, "RTAS call \"%s\" registered twice\n", name); - exit(1); - } + if (!((token >= RTAS_TOKEN_BASE) && (token < RTAS_TOKEN_MAX))) { + fprintf(stderr, "RTAS invalid token 0x%x\n", token); + exit(1); } - assert(rtas_next < (rtas_table + TOKEN_MAX)); - - rtas_next->name = name; - rtas_next->fn = fn; + token -= RTAS_TOKEN_BASE; + if (rtas_table[token].name) { + fprintf(stderr, "RTAS call \"%s\" is registered already as 0x%x\n", + rtas_table[token].name, token); + exit(1); + } - return (rtas_next++ - rtas_table) + TOKEN_BASE; + rtas_table[token].name = name; + rtas_table[token].fn = fn; } int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr, @@ -359,7 +364,7 @@ int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr, return ret; } - for (i = 0; i < TOKEN_MAX; i++) { + for (i = 0; i < RTAS_TOKEN_MAX - RTAS_TOKEN_BASE; i++) { struct rtas_call *call = &rtas_table[i]; if (!call->name) { @@ -367,7 +372,7 @@ int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr, } ret = qemu_fdt_setprop_cell(fdt, "/rtas", call->name, - i + TOKEN_BASE); + i + RTAS_TOKEN_BASE); if (ret < 0) { fprintf(stderr, "Couldn't add rtas token for %s: %s\n", call->name, fdt_strerror(ret)); @@ -380,18 +385,24 @@ int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr, static void core_rtas_register_types(void) { - spapr_rtas_register("display-character", rtas_display_character); - spapr_rtas_register("get-time-of-day", rtas_get_time_of_day); - spapr_rtas_register("set-time-of-day", rtas_set_time_of_day); - spapr_rtas_register("power-off", rtas_power_off); - spapr_rtas_register("system-reboot", rtas_system_reboot); - spapr_rtas_register("query-cpu-stopped-state", + spapr_rtas_register(RTAS_DISPLAY_CHARACTER, "display-character", + rtas_display_character); + spapr_rtas_register(RTAS_GET_TIME_OF_DAY, "get-time-of-day", + rtas_get_time_of_day); + spapr_rtas_register(RTAS_SET_TIME_OF_DAY, "set-time-of-day", + rtas_set_time_of_day); + spapr_rtas_register(RTAS_POWER_OFF, "power-off", rtas_power_off); + spapr_rtas_register(RTAS_SYSTEM_REBOOT, "system-reboot", + rtas_system_reboot); + spapr_rtas_register(RTAS_QUERY_CPU_STOPPED_STATE, "query-cpu-stopped-state", rtas_query_cpu_stopped_state); - spapr_rtas_register("start-cpu", rtas_start_cpu); - spapr_rtas_register("stop-self", rtas_stop_self); - spapr_rtas_register("ibm,get-system-parameter", + spapr_rtas_register(RTAS_START_CPU, "start-cpu", rtas_start_cpu); + spapr_rtas_register(RTAS_STOP_SELF, "stop-self", rtas_stop_self); + spapr_rtas_register(RTAS_IBM_GET_SYSTEM_PARAMETER, + "ibm,get-system-parameter", rtas_ibm_get_system_parameter); - spapr_rtas_register("ibm,set-system-parameter", + spapr_rtas_register(RTAS_IBM_SET_SYSTEM_PARAMETER, + "ibm,set-system-parameter", rtas_ibm_set_system_parameter); } diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c index 04e16ae04d..dc9e46a7b1 100644 --- a/hw/ppc/spapr_vio.c +++ b/hw/ppc/spapr_vio.c @@ -449,7 +449,7 @@ static int spapr_vio_busdev_init(DeviceState *qdev) dev->qdev.id = id; } - dev->irq = spapr_allocate_msi(dev->irq); + dev->irq = xics_alloc(spapr->icp, 0, dev->irq, false); if (!dev->irq) { return -1; } @@ -460,7 +460,7 @@ static int spapr_vio_busdev_init(DeviceState *qdev) 0, SPAPR_TCE_PAGE_SHIFT, pc->rtce_window_size >> - SPAPR_TCE_PAGE_SHIFT); + SPAPR_TCE_PAGE_SHIFT, false); address_space_init(&dev->as, spapr_tce_get_iommu(dev->tcet), qdev->id); } @@ -517,8 +517,9 @@ VIOsPAPRBus *spapr_vio_bus_init(void) spapr_register_hypercall(H_ENABLE_CRQ, h_enable_crq); /* RTAS calls */ - spapr_rtas_register("ibm,set-tce-bypass", rtas_set_tce_bypass); - spapr_rtas_register("quiesce", rtas_quiesce); + spapr_rtas_register(RTAS_IBM_SET_TCE_BYPASS, "ibm,set-tce-bypass", + rtas_set_tce_bypass); + spapr_rtas_register(RTAS_QUIESCE, "quiesce", rtas_quiesce); return bus; } diff --git a/hw/watchdog/watchdog.c b/hw/watchdog/watchdog.c index 4aebd34924..9f607d42bb 100644 --- a/hw/watchdog/watchdog.c +++ b/hw/watchdog/watchdog.c @@ -106,7 +106,7 @@ int select_watchdog_action(const char *p) */ void watchdog_perform_action(void) { - switch(watchdog_action) { + switch (watchdog_action) { case WDT_RESET: /* same as 'system_reset' in monitor */ qapi_event_send_watchdog(WATCHDOG_EXPIRATION_ACTION_RESET, &error_abort); qemu_system_reset_request(); diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 67ca076380..f3cf63f9f5 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -225,7 +225,7 @@ void block_job_pause(BlockJob *job); void block_job_resume(BlockJob *job); /** - * block_job_event_cancle: + * block_job_event_cancelled: * @job: The job whose information is requested. * * Send a BLOCK_JOB_CANCELLED event for the specified job. diff --git a/include/hw/misc/vfio.h b/include/hw/misc/vfio.h new file mode 100644 index 0000000000..0b26cd8e11 --- /dev/null +++ b/include/hw/misc/vfio.h @@ -0,0 +1,9 @@ +#ifndef VFIO_API_H +#define VFIO_API_H + +#include "qemu/typedefs.h" + +extern int vfio_container_ioctl(AddressSpace *as, int32_t groupid, + int req, void *param); + +#endif diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 0934518bbd..32f0aa7d10 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -27,13 +27,15 @@ #include "hw/pci/pci_host.h" #include "hw/ppc/xics.h" -#define SPAPR_MSIX_MAX_DEVS 32 - #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge" +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge" #define SPAPR_PCI_HOST_BRIDGE(obj) \ OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE) +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \ + OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE) + #define SPAPR_PCI_HOST_BRIDGE_CLASS(klass) \ OBJECT_CLASS_CHECK(sPAPRPHBClass, (klass), TYPE_SPAPR_PCI_HOST_BRIDGE) #define SPAPR_PCI_HOST_BRIDGE_GET_CLASS(obj) \ @@ -41,6 +43,7 @@ typedef struct sPAPRPHBClass sPAPRPHBClass; typedef struct sPAPRPHBState sPAPRPHBState; +typedef struct sPAPRPHBVFIOState sPAPRPHBVFIOState; struct sPAPRPHBClass { PCIHostBridgeClass parent_class; @@ -48,6 +51,16 @@ struct sPAPRPHBClass { void (*finish_realize)(sPAPRPHBState *sphb, Error **errp); }; +typedef struct spapr_pci_msi { + uint32_t first_irq; + uint32_t num; +} spapr_pci_msi; + +typedef struct spapr_pci_msi_mig { + uint32_t key; + spapr_pci_msi value; +} spapr_pci_msi_mig; + struct sPAPRPHBState { PCIHostState parent_obj; @@ -67,15 +80,20 @@ struct sPAPRPHBState { uint32_t irq; } lsi_table[PCI_NUM_PINS]; - struct spapr_pci_msi { - uint32_t config_addr; - uint32_t irq; - uint32_t nvec; - } msi_table[SPAPR_MSIX_MAX_DEVS]; + GHashTable *msi; + /* Temporary cache for migration purposes */ + int32_t msi_devs_num; + spapr_pci_msi_mig *msi_devs; QLIST_ENTRY(sPAPRPHBState) list; }; +struct sPAPRPHBVFIOState { + sPAPRPHBState phb; + + int32_t iommugroupid; +}; + #define SPAPR_PCI_BASE_BUID 0x800000020000000ULL #define SPAPR_PCI_WINDOW_BASE 0x10000000000ULL diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 08c301f38d..bbba51a703 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -27,7 +27,6 @@ typedef struct sPAPREnvironment { long rtas_size; void *fdt_skel; target_ulong entry_point; - uint32_t next_irq; uint64_t rtc_offset; struct PPCTimebase tb; bool has_graphics; @@ -339,16 +338,6 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode, int spapr_allocate_irq(int hint, bool lsi); int spapr_allocate_irq_block(int num, bool lsi, bool msi); -static inline int spapr_allocate_msi(int hint) -{ - return spapr_allocate_irq(hint, false); -} - -static inline int spapr_allocate_lsi(int hint) -{ - return spapr_allocate_irq(hint, true); -} - /* RTAS return codes */ #define RTAS_OUT_SUCCESS 0 #define RTAS_OUT_NO_ERRORS_FOUND 1 @@ -358,6 +347,58 @@ static inline int spapr_allocate_lsi(int hint) #define RTAS_OUT_NOT_SUPPORTED -3 #define RTAS_OUT_NOT_AUTHORIZED -9002 +/* RTAS tokens */ +#define RTAS_TOKEN_BASE 0x2000 + +#define RTAS_DISPLAY_CHARACTER (RTAS_TOKEN_BASE + 0x00) +#define RTAS_GET_TIME_OF_DAY (RTAS_TOKEN_BASE + 0x01) +#define RTAS_SET_TIME_OF_DAY (RTAS_TOKEN_BASE + 0x02) +#define RTAS_POWER_OFF (RTAS_TOKEN_BASE + 0x03) +#define RTAS_SYSTEM_REBOOT (RTAS_TOKEN_BASE + 0x04) +#define RTAS_QUERY_CPU_STOPPED_STATE (RTAS_TOKEN_BASE + 0x05) +#define RTAS_START_CPU (RTAS_TOKEN_BASE + 0x06) +#define RTAS_STOP_SELF (RTAS_TOKEN_BASE + 0x07) +#define RTAS_IBM_GET_SYSTEM_PARAMETER (RTAS_TOKEN_BASE + 0x08) +#define RTAS_IBM_SET_SYSTEM_PARAMETER (RTAS_TOKEN_BASE + 0x09) +#define RTAS_IBM_SET_XIVE (RTAS_TOKEN_BASE + 0x0A) +#define RTAS_IBM_GET_XIVE (RTAS_TOKEN_BASE + 0x0B) +#define RTAS_IBM_INT_OFF (RTAS_TOKEN_BASE + 0x0C) +#define RTAS_IBM_INT_ON (RTAS_TOKEN_BASE + 0x0D) +#define RTAS_CHECK_EXCEPTION (RTAS_TOKEN_BASE + 0x0E) +#define RTAS_EVENT_SCAN (RTAS_TOKEN_BASE + 0x0F) +#define RTAS_IBM_SET_TCE_BYPASS (RTAS_TOKEN_BASE + 0x10) +#define RTAS_QUIESCE (RTAS_TOKEN_BASE + 0x11) +#define RTAS_NVRAM_FETCH (RTAS_TOKEN_BASE + 0x12) +#define RTAS_NVRAM_STORE (RTAS_TOKEN_BASE + 0x13) +#define RTAS_READ_PCI_CONFIG (RTAS_TOKEN_BASE + 0x14) +#define RTAS_WRITE_PCI_CONFIG (RTAS_TOKEN_BASE + 0x15) +#define RTAS_IBM_READ_PCI_CONFIG (RTAS_TOKEN_BASE + 0x16) +#define RTAS_IBM_WRITE_PCI_CONFIG (RTAS_TOKEN_BASE + 0x17) +#define RTAS_IBM_QUERY_INTERRUPT_SOURCE_NUMBER (RTAS_TOKEN_BASE + 0x18) +#define RTAS_IBM_CHANGE_MSI (RTAS_TOKEN_BASE + 0x19) +#define RTAS_SET_INDICATOR (RTAS_TOKEN_BASE + 0x1A) +#define RTAS_SET_POWER_LEVEL (RTAS_TOKEN_BASE + 0x1B) +#define RTAS_GET_POWER_LEVEL (RTAS_TOKEN_BASE + 0x1C) +#define RTAS_GET_SENSOR_STATE (RTAS_TOKEN_BASE + 0x1D) +#define RTAS_IBM_CONFIGURE_CONNECTOR (RTAS_TOKEN_BASE + 0x1E) +#define RTAS_IBM_OS_TERM (RTAS_TOKEN_BASE + 0x1F) +#define RTAS_IBM_EXTENDED_OS_TERM (RTAS_TOKEN_BASE + 0x20) + +#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x21) + +/* RTAS ibm,get-system-parameter token values */ +#define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20 +#define RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE 42 +#define RTAS_SYSPARM_UUID 48 + +/* Possible values for the platform-processor-diagnostics-run-mode parameter + * of the RTAS ibm,get-system-parameter call. + */ +#define DIAGNOSTICS_RUN_MODE_DISABLED 0 +#define DIAGNOSTICS_RUN_MODE_STAGGERED 1 +#define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2 +#define DIAGNOSTICS_RUN_MODE_PERIODIC 3 + static inline uint64_t ppc64_phys_to_real(uint64_t addr) { return addr & ~0xF000000000000000ULL; @@ -373,11 +414,24 @@ static inline void rtas_st(target_ulong phys, int n, uint32_t val) stl_be_phys(&address_space_memory, ppc64_phys_to_real(phys + 4*n), val); } + +static inline void rtas_st_buffer(target_ulong phys, target_ulong phys_len, + uint8_t *buffer, uint16_t buffer_len) +{ + if (phys_len < 2) { + return; + } + stw_be_phys(&address_space_memory, + ppc64_phys_to_real(phys), buffer_len); + cpu_physical_memory_write(ppc64_phys_to_real(phys + 2), + buffer, MIN(buffer_len, phys_len - 2)); +} + typedef void (*spapr_rtas_fn)(PowerPCCPU *cpu, sPAPREnvironment *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets); -int spapr_rtas_register(const char *name, spapr_rtas_fn fn); +void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn); target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPREnvironment *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets); @@ -407,6 +461,7 @@ struct sPAPRTCETable { uint32_t page_shift; uint64_t *table; bool bypass; + bool vfio_accel; int fd; MemoryRegion iommu; QLIST_ENTRY(sPAPRTCETable) list; @@ -418,7 +473,8 @@ int spapr_h_cas_compose_response(target_ulong addr, target_ulong size); sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn, uint64_t bus_offset, uint32_t page_shift, - uint32_t nb_table); + uint32_t nb_table, + bool vfio_accel); MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet); void spapr_tce_set_bypass(sPAPRTCETable *tcet, bool bypass); int spapr_dma_dt(void *fdt, int node_off, const char *propname, diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h index 85e4c8a3dd..a214dd7f28 100644 --- a/include/hw/ppc/xics.h +++ b/include/hw/ppc/xics.h @@ -136,7 +136,6 @@ struct ICSState { uint32_t nr_irqs; uint32_t offset; qemu_irq *qirqs; - bool *islsi; ICSIRQState *irqs; XICSState *icp; }; @@ -150,12 +149,20 @@ struct ICSIRQState { #define XICS_STATUS_REJECTED 0x4 #define XICS_STATUS_MASKED_PENDING 0x8 uint8_t status; +/* (flags & XICS_FLAGS_IRQ_MASK) == 0 means the interrupt is not allocated */ +#define XICS_FLAGS_IRQ_LSI 0x1 +#define XICS_FLAGS_IRQ_MSI 0x2 +#define XICS_FLAGS_IRQ_MASK 0x3 + uint8_t flags; }; #define XICS_IRQS 1024 qemu_irq xics_get_qirq(XICSState *icp, int irq); void xics_set_irq_type(XICSState *icp, int irq, bool lsi); +int xics_alloc(XICSState *icp, int src, int irq_hint, bool lsi); +int xics_alloc_block(XICSState *icp, int src, int num, bool lsi, bool align); +void xics_free(XICSState *icp, int irq, int num); void xics_cpu_setup(XICSState *icp, PowerPCCPU *cpu); diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 71a8a95641..9a001bd28f 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -101,6 +101,7 @@ enum VMStateFlags { VMS_VARRAY_UINT8 = 0x400, /* Array with size in uint8_t field*/ VMS_VARRAY_UINT32 = 0x800, /* Array with size in uint32_t field*/ VMS_MUST_EXIST = 0x1000, /* Field must exist in input */ + VMS_ALLOC = 0x2000, /* Alloc a buffer on the destination */ }; typedef struct { @@ -429,6 +430,16 @@ extern const VMStateInfo vmstate_info_bitmap; .offset = offsetof(_state, _field), \ } +#define VMSTATE_STRUCT_VARRAY_ALLOC(_field, _state, _field_num, _version, _vmsd, _type) {\ + .name = (stringify(_field)), \ + .version_id = (_version), \ + .vmsd = &(_vmsd), \ + .num_offset = vmstate_offset_value(_state, _field_num, int32_t), \ + .size = sizeof(_type), \ + .flags = VMS_STRUCT|VMS_VARRAY_INT32|VMS_ALLOC|VMS_POINTER, \ + .offset = vmstate_offset_pointer(_state, _field, _type), \ +} + #define VMSTATE_STATIC_BUFFER(_field, _state, _version, _test, _start, _size) { \ .name = (stringify(_field)), \ .version_id = (_version), \ diff --git a/include/net/net.h b/include/net/net.h index 8b189da5ee..ed594f9bdb 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -24,13 +24,13 @@ struct MACAddr { typedef struct NICPeers { NetClientState *ncs[MAX_QUEUE_NUM]; + int32_t queues; } NICPeers; typedef struct NICConf { MACAddr macaddr; NICPeers peers; int32_t bootindex; - int32_t queues; } NICConf; #define DEFINE_NIC_PROPERTIES(_state, _conf) \ diff --git a/linux-user/elfload.c b/linux-user/elfload.c index 1248eda272..60777fecf6 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -736,6 +736,14 @@ enum { QEMU_PPC_FEATURE_TRUE_LE = 0x00000002, QEMU_PPC_FEATURE_PPC_LE = 0x00000001, + + /* Feature definitions in AT_HWCAP2. */ + QEMU_PPC_FEATURE2_ARCH_2_07 = 0x80000000, /* ISA 2.07 */ + QEMU_PPC_FEATURE2_HAS_HTM = 0x40000000, /* Hardware Transactional Memory */ + QEMU_PPC_FEATURE2_HAS_DSCR = 0x20000000, /* Data Stream Control Register */ + QEMU_PPC_FEATURE2_HAS_EBB = 0x10000000, /* Event Base Branching */ + QEMU_PPC_FEATURE2_HAS_ISEL = 0x08000000, /* Integer Select */ + QEMU_PPC_FEATURE2_HAS_TAR = 0x04000000, /* Target Address Register */ }; #define ELF_HWCAP get_elf_hwcap() @@ -749,6 +757,8 @@ static uint32_t get_elf_hwcap(void) Altivec/FP/SPE support. Anything else is just a bonus. */ #define GET_FEATURE(flag, feature) \ do { if (cpu->env.insns_flags & flag) { features |= feature; } } while (0) +#define GET_FEATURE2(flag, feature) \ + do { if (cpu->env.insns_flags2 & flag) { features |= feature; } } while (0) GET_FEATURE(PPC_64B, QEMU_PPC_FEATURE_64); GET_FEATURE(PPC_FLOAT, QEMU_PPC_FEATURE_HAS_FPU); GET_FEATURE(PPC_ALTIVEC, QEMU_PPC_FEATURE_HAS_ALTIVEC); @@ -757,7 +767,36 @@ static uint32_t get_elf_hwcap(void) GET_FEATURE(PPC_SPE_DOUBLE, QEMU_PPC_FEATURE_HAS_EFP_DOUBLE); GET_FEATURE(PPC_BOOKE, QEMU_PPC_FEATURE_BOOKE); GET_FEATURE(PPC_405_MAC, QEMU_PPC_FEATURE_HAS_4xxMAC); + GET_FEATURE2(PPC2_DFP, QEMU_PPC_FEATURE_HAS_DFP); + GET_FEATURE2(PPC2_VSX, QEMU_PPC_FEATURE_HAS_VSX); + GET_FEATURE2((PPC2_PERM_ISA206 | PPC2_DIVE_ISA206 | PPC2_ATOMIC_ISA206 | + PPC2_FP_CVT_ISA206 | PPC2_FP_TST_ISA206), + QEMU_PPC_FEATURE_ARCH_2_06); +#undef GET_FEATURE +#undef GET_FEATURE2 + + return features; +} + +#define ELF_HWCAP2 get_elf_hwcap2() + +static uint32_t get_elf_hwcap2(void) +{ + PowerPCCPU *cpu = POWERPC_CPU(thread_cpu); + uint32_t features = 0; + +#define GET_FEATURE(flag, feature) \ + do { if (cpu->env.insns_flags & flag) { features |= feature; } } while (0) +#define GET_FEATURE2(flag, feature) \ + do { if (cpu->env.insns_flags2 & flag) { features |= feature; } } while (0) + + GET_FEATURE(PPC_ISEL, QEMU_PPC_FEATURE2_HAS_ISEL); + GET_FEATURE2(PPC2_BCTAR_ISA207, QEMU_PPC_FEATURE2_HAS_TAR); + GET_FEATURE2((PPC2_BCTAR_ISA207 | PPC2_LSQ_ISA207 | PPC2_ALTIVEC_207 | + PPC2_ISA207S), QEMU_PPC_FEATURE2_ARCH_2_07); + #undef GET_FEATURE +#undef GET_FEATURE2 return features; } @@ -774,8 +813,9 @@ static uint32_t get_elf_hwcap(void) #define DLINFO_ARCH_ITEMS 5 #define ARCH_DLINFO \ do { \ - NEW_AUX_ENT(AT_DCACHEBSIZE, 0x20); \ - NEW_AUX_ENT(AT_ICACHEBSIZE, 0x20); \ + PowerPCCPU *cpu = POWERPC_CPU(thread_cpu); \ + NEW_AUX_ENT(AT_DCACHEBSIZE, cpu->env.dcache_line_size); \ + NEW_AUX_ENT(AT_ICACHEBSIZE, cpu->env.icache_line_size); \ NEW_AUX_ENT(AT_UCACHEBSIZE, 0); \ /* \ * Now handle glibc compatibility. \ @@ -570,6 +570,7 @@ monitor_qapi_event_throttle(QAPIEvent event, int64_t rate) trace_monitor_protocol_event_throttle(event, rate); evstate->event = event; + assert(rate * SCALE_MS <= INT64_MAX); evstate->rate = rate * SCALE_MS; evstate->last = 0; evstate->data = NULL; @@ -585,9 +586,9 @@ static void monitor_qapi_event_init(void) monitor_qapi_event_throttle(QAPI_EVENT_RTC_CHANGE, 1000); monitor_qapi_event_throttle(QAPI_EVENT_WATCHDOG, 1000); monitor_qapi_event_throttle(QAPI_EVENT_BALLOON_CHANGE, 1000); - /* limit the rate of quorum events to avoid hammering the management */ monitor_qapi_event_throttle(QAPI_EVENT_QUORUM_REPORT_BAD, 1000); monitor_qapi_event_throttle(QAPI_EVENT_QUORUM_FAILURE, 1000); + monitor_qapi_event_throttle(QAPI_EVENT_VSERPORT_CHANGE, 1000); qmp_event_set_func_emit(monitor_qapi_event_queue); } diff --git a/net/Makefile.objs b/net/Makefile.objs index 301f6b6b51..a06ba59dad 100644 --- a/net/Makefile.objs +++ b/net/Makefile.objs @@ -2,6 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o common-obj-y += socket.o common-obj-y += dump.o common-obj-y += eth.o +common-obj-$(CONFIG_LINUX) += l2tpv3.o common-obj-$(CONFIG_POSIX) += tap.o vhost-user.o common-obj-$(CONFIG_LINUX) += tap-linux.o common-obj-$(CONFIG_WIN32) += tap-win32.o diff --git a/net/clients.h b/net/clients.h index 7f3d4ae9f3..2e8fedad8d 100644 --- a/net/clients.h +++ b/net/clients.h @@ -47,6 +47,8 @@ int net_init_tap(const NetClientOptions *opts, const char *name, int net_init_bridge(const NetClientOptions *opts, const char *name, NetClientState *peer); +int net_init_l2tpv3(const NetClientOptions *opts, const char *name, + NetClientState *peer); #ifdef CONFIG_VDE int net_init_vde(const NetClientOptions *opts, const char *name, NetClientState *peer); diff --git a/net/l2tpv3.c b/net/l2tpv3.c new file mode 100644 index 0000000000..528d95b641 --- /dev/null +++ b/net/l2tpv3.c @@ -0,0 +1,757 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2012-2014 Cisco Systems + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <linux/ip.h> +#include <netdb.h> +#include "config-host.h" +#include "net/net.h" +#include "clients.h" +#include "monitor/monitor.h" +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/sockets.h" +#include "qemu/iov.h" +#include "qemu/main-loop.h" + + +/* The buffer size needs to be investigated for optimum numbers and + * optimum means of paging in on different systems. This size is + * chosen to be sufficient to accommodate one packet with some headers + */ + +#define BUFFER_ALIGN sysconf(_SC_PAGESIZE) +#define BUFFER_SIZE 2048 +#define IOVSIZE 2 +#define MAX_L2TPV3_MSGCNT 64 +#define MAX_L2TPV3_IOVCNT (MAX_L2TPV3_MSGCNT * IOVSIZE) + +/* Header set to 0x30000 signifies a data packet */ + +#define L2TPV3_DATA_PACKET 0x30000 + +/* IANA-assigned IP protocol ID for L2TPv3 */ + +#ifndef IPPROTO_L2TP +#define IPPROTO_L2TP 0x73 +#endif + +typedef struct NetL2TPV3State { + NetClientState nc; + int fd; + + /* + * these are used for xmit - that happens packet a time + * and for first sign of life packet (easier to parse that once) + */ + + uint8_t *header_buf; + struct iovec *vec; + + /* + * these are used for receive - try to "eat" up to 32 packets at a time + */ + + struct mmsghdr *msgvec; + + /* + * peer address + */ + + struct sockaddr_storage *dgram_dst; + uint32_t dst_size; + + /* + * L2TPv3 parameters + */ + + uint64_t rx_cookie; + uint64_t tx_cookie; + uint32_t rx_session; + uint32_t tx_session; + uint32_t header_size; + uint32_t counter; + + /* + * DOS avoidance in error handling + */ + + bool header_mismatch; + + /* + * Ring buffer handling + */ + + int queue_head; + int queue_tail; + int queue_depth; + + /* + * Precomputed offsets + */ + + uint32_t offset; + uint32_t cookie_offset; + uint32_t counter_offset; + uint32_t session_offset; + + /* Poll Control */ + + bool read_poll; + bool write_poll; + + /* Flags */ + + bool ipv6; + bool udp; + bool has_counter; + bool pin_counter; + bool cookie; + bool cookie_is_64; + +} NetL2TPV3State; + +static int l2tpv3_can_send(void *opaque); +static void net_l2tpv3_send(void *opaque); +static void l2tpv3_writable(void *opaque); + +static void l2tpv3_update_fd_handler(NetL2TPV3State *s) +{ + qemu_set_fd_handler2(s->fd, + s->read_poll ? l2tpv3_can_send : NULL, + s->read_poll ? net_l2tpv3_send : NULL, + s->write_poll ? l2tpv3_writable : NULL, + s); +} + +static void l2tpv3_read_poll(NetL2TPV3State *s, bool enable) +{ + if (s->read_poll != enable) { + s->read_poll = enable; + l2tpv3_update_fd_handler(s); + } +} + +static void l2tpv3_write_poll(NetL2TPV3State *s, bool enable) +{ + if (s->write_poll != enable) { + s->write_poll = enable; + l2tpv3_update_fd_handler(s); + } +} + +static void l2tpv3_writable(void *opaque) +{ + NetL2TPV3State *s = opaque; + l2tpv3_write_poll(s, false); + qemu_flush_queued_packets(&s->nc); +} + +static int l2tpv3_can_send(void *opaque) +{ + NetL2TPV3State *s = opaque; + + return qemu_can_send_packet(&s->nc); +} + +static void l2tpv3_send_completed(NetClientState *nc, ssize_t len) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + l2tpv3_read_poll(s, true); +} + +static void l2tpv3_poll(NetClientState *nc, bool enable) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + l2tpv3_write_poll(s, enable); + l2tpv3_read_poll(s, enable); +} + +static void l2tpv3_form_header(NetL2TPV3State *s) +{ + uint32_t *counter; + + if (s->udp) { + stl_be_p((uint32_t *) s->header_buf, L2TPV3_DATA_PACKET); + } + stl_be_p( + (uint32_t *) (s->header_buf + s->session_offset), + s->tx_session + ); + if (s->cookie) { + if (s->cookie_is_64) { + stq_be_p( + (uint64_t *)(s->header_buf + s->cookie_offset), + s->tx_cookie + ); + } else { + stl_be_p( + (uint32_t *) (s->header_buf + s->cookie_offset), + s->tx_cookie + ); + } + } + if (s->has_counter) { + counter = (uint32_t *)(s->header_buf + s->counter_offset); + if (s->pin_counter) { + *counter = 0; + } else { + stl_be_p(counter, ++s->counter); + } + } +} + +static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc, + const struct iovec *iov, + int iovcnt) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + + struct msghdr message; + int ret; + + if (iovcnt > MAX_L2TPV3_IOVCNT - 1) { + error_report( + "iovec too long %d > %d, change l2tpv3.h", + iovcnt, MAX_L2TPV3_IOVCNT + ); + return -1; + } + l2tpv3_form_header(s); + memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec)); + s->vec->iov_base = s->header_buf; + s->vec->iov_len = s->offset; + message.msg_name = s->dgram_dst; + message.msg_namelen = s->dst_size; + message.msg_iov = s->vec; + message.msg_iovlen = iovcnt + 1; + message.msg_control = NULL; + message.msg_controllen = 0; + message.msg_flags = 0; + do { + ret = sendmsg(s->fd, &message, 0); + } while ((ret == -1) && (errno == EINTR)); + if (ret > 0) { + ret -= s->offset; + } else if (ret == 0) { + /* belt and braces - should not occur on DGRAM + * we should get an error and never a 0 send + */ + ret = iov_size(iov, iovcnt); + } else { + /* signal upper layer that socket buffer is full */ + ret = -errno; + if (ret == -EAGAIN || ret == -ENOBUFS) { + l2tpv3_write_poll(s, true); + ret = 0; + } + } + return ret; +} + +static ssize_t net_l2tpv3_receive_dgram(NetClientState *nc, + const uint8_t *buf, + size_t size) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + + struct iovec *vec; + struct msghdr message; + ssize_t ret = 0; + + l2tpv3_form_header(s); + vec = s->vec; + vec->iov_base = s->header_buf; + vec->iov_len = s->offset; + vec++; + vec->iov_base = (void *) buf; + vec->iov_len = size; + message.msg_name = s->dgram_dst; + message.msg_namelen = s->dst_size; + message.msg_iov = s->vec; + message.msg_iovlen = 2; + message.msg_control = NULL; + message.msg_controllen = 0; + message.msg_flags = 0; + do { + ret = sendmsg(s->fd, &message, 0); + } while ((ret == -1) && (errno == EINTR)); + if (ret > 0) { + ret -= s->offset; + } else if (ret == 0) { + /* belt and braces - should not occur on DGRAM + * we should get an error and never a 0 send + */ + ret = size; + } else { + ret = -errno; + if (ret == -EAGAIN || ret == -ENOBUFS) { + /* signal upper layer that socket buffer is full */ + l2tpv3_write_poll(s, true); + ret = 0; + } + } + return ret; +} + +static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf) +{ + + uint32_t *session; + uint64_t cookie; + + if ((!s->udp) && (!s->ipv6)) { + buf += sizeof(struct iphdr) /* fix for ipv4 raw */; + } + + /* we do not do a strict check for "data" packets as per + * the RFC spec because the pure IP spec does not have + * that anyway. + */ + + if (s->cookie) { + if (s->cookie_is_64) { + cookie = ldq_be_p(buf + s->cookie_offset); + } else { + cookie = ldl_be_p(buf + s->cookie_offset); + } + if (cookie != s->rx_cookie) { + if (!s->header_mismatch) { + error_report("unknown cookie id"); + } + return -1; + } + } + session = (uint32_t *) (buf + s->session_offset); + if (ldl_be_p(session) != s->rx_session) { + if (!s->header_mismatch) { + error_report("session mismatch"); + } + return -1; + } + return 0; +} + +static void net_l2tpv3_process_queue(NetL2TPV3State *s) +{ + int size = 0; + struct iovec *vec; + bool bad_read; + int data_size; + struct mmsghdr *msgvec; + + /* go into ring mode only if there is a "pending" tail */ + if (s->queue_depth > 0) { + do { + msgvec = s->msgvec + s->queue_tail; + if (msgvec->msg_len > 0) { + data_size = msgvec->msg_len - s->header_size; + vec = msgvec->msg_hdr.msg_iov; + if ((data_size > 0) && + (l2tpv3_verify_header(s, vec->iov_base) == 0)) { + vec++; + /* Use the legacy delivery for now, we will + * switch to using our own ring as a queueing mechanism + * at a later date + */ + size = qemu_send_packet_async( + &s->nc, + vec->iov_base, + data_size, + l2tpv3_send_completed + ); + if (size == 0) { + l2tpv3_read_poll(s, false); + } + bad_read = false; + } else { + bad_read = true; + if (!s->header_mismatch) { + /* report error only once */ + error_report("l2tpv3 header verification failed"); + s->header_mismatch = true; + } + } + } else { + bad_read = true; + } + s->queue_tail = (s->queue_tail + 1) % MAX_L2TPV3_MSGCNT; + s->queue_depth--; + } while ( + (s->queue_depth > 0) && + qemu_can_send_packet(&s->nc) && + ((size > 0) || bad_read) + ); + } +} + +static void net_l2tpv3_send(void *opaque) +{ + NetL2TPV3State *s = opaque; + int target_count, count; + struct mmsghdr *msgvec; + + /* go into ring mode only if there is a "pending" tail */ + + if (s->queue_depth) { + + /* The ring buffer we use has variable intake + * count of how much we can read varies - adjust accordingly + */ + + target_count = MAX_L2TPV3_MSGCNT - s->queue_depth; + + /* Ensure we do not overrun the ring when we have + * a lot of enqueued packets + */ + + if (s->queue_head + target_count > MAX_L2TPV3_MSGCNT) { + target_count = MAX_L2TPV3_MSGCNT - s->queue_head; + } + } else { + + /* we do not have any pending packets - we can use + * the whole message vector linearly instead of using + * it as a ring + */ + + s->queue_head = 0; + s->queue_tail = 0; + target_count = MAX_L2TPV3_MSGCNT; + } + + msgvec = s->msgvec + s->queue_head; + if (target_count > 0) { + do { + count = recvmmsg( + s->fd, + msgvec, + target_count, MSG_DONTWAIT, NULL); + } while ((count == -1) && (errno == EINTR)); + if (count < 0) { + /* Recv error - we still need to flush packets here, + * (re)set queue head to current position + */ + count = 0; + } + s->queue_head = (s->queue_head + count) % MAX_L2TPV3_MSGCNT; + s->queue_depth += count; + } + net_l2tpv3_process_queue(s); +} + +static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount) +{ + int i, j; + struct iovec *iov; + struct mmsghdr *cleanup = msgvec; + if (cleanup) { + for (i = 0; i < count; i++) { + if (cleanup->msg_hdr.msg_iov) { + iov = cleanup->msg_hdr.msg_iov; + for (j = 0; j < iovcount; j++) { + g_free(iov->iov_base); + iov++; + } + g_free(cleanup->msg_hdr.msg_iov); + } + cleanup++; + } + g_free(msgvec); + } +} + +static struct mmsghdr *build_l2tpv3_vector(NetL2TPV3State *s, int count) +{ + int i; + struct iovec *iov; + struct mmsghdr *msgvec, *result; + + msgvec = g_malloc(sizeof(struct mmsghdr) * count); + result = msgvec; + for (i = 0; i < count ; i++) { + msgvec->msg_hdr.msg_name = NULL; + msgvec->msg_hdr.msg_namelen = 0; + iov = g_malloc(sizeof(struct iovec) * IOVSIZE); + msgvec->msg_hdr.msg_iov = iov; + iov->iov_base = g_malloc(s->header_size); + iov->iov_len = s->header_size; + iov++ ; + iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE); + iov->iov_len = BUFFER_SIZE; + msgvec->msg_hdr.msg_iovlen = 2; + msgvec->msg_hdr.msg_control = NULL; + msgvec->msg_hdr.msg_controllen = 0; + msgvec->msg_hdr.msg_flags = 0; + msgvec++; + } + return result; +} + +static void net_l2tpv3_cleanup(NetClientState *nc) +{ + NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc); + qemu_purge_queued_packets(nc); + l2tpv3_read_poll(s, false); + l2tpv3_write_poll(s, false); + if (s->fd > 0) { + close(s->fd); + } + destroy_vector(s->msgvec, MAX_L2TPV3_MSGCNT, IOVSIZE); + g_free(s->vec); + g_free(s->header_buf); + g_free(s->dgram_dst); +} + +static NetClientInfo net_l2tpv3_info = { + .type = NET_CLIENT_OPTIONS_KIND_L2TPV3, + .size = sizeof(NetL2TPV3State), + .receive = net_l2tpv3_receive_dgram, + .receive_iov = net_l2tpv3_receive_dgram_iov, + .poll = l2tpv3_poll, + .cleanup = net_l2tpv3_cleanup, +}; + +int net_init_l2tpv3(const NetClientOptions *opts, + const char *name, + NetClientState *peer) +{ + + + const NetdevL2TPv3Options *l2tpv3; + NetL2TPV3State *s; + NetClientState *nc; + int fd = -1, gairet; + struct addrinfo hints; + struct addrinfo *result = NULL; + char *srcport, *dstport; + + nc = qemu_new_net_client(&net_l2tpv3_info, peer, "l2tpv3", name); + + s = DO_UPCAST(NetL2TPV3State, nc, nc); + + s->queue_head = 0; + s->queue_tail = 0; + s->header_mismatch = false; + + assert(opts->kind == NET_CLIENT_OPTIONS_KIND_L2TPV3); + l2tpv3 = opts->l2tpv3; + + if (l2tpv3->has_ipv6 && l2tpv3->ipv6) { + s->ipv6 = l2tpv3->ipv6; + } else { + s->ipv6 = false; + } + + if ((l2tpv3->has_offset) && (l2tpv3->offset > 256)) { + error_report("l2tpv3_open : offset must be less than 256 bytes"); + goto outerr; + } + + if (l2tpv3->has_rxcookie || l2tpv3->has_txcookie) { + if (l2tpv3->has_rxcookie && l2tpv3->has_txcookie) { + s->cookie = true; + } else { + goto outerr; + } + } else { + s->cookie = false; + } + + if (l2tpv3->has_cookie64 || l2tpv3->cookie64) { + s->cookie_is_64 = true; + } else { + s->cookie_is_64 = false; + } + + if (l2tpv3->has_udp && l2tpv3->udp) { + s->udp = true; + if (!(l2tpv3->has_srcport && l2tpv3->has_dstport)) { + error_report("l2tpv3_open : need both src and dst port for udp"); + goto outerr; + } else { + srcport = l2tpv3->srcport; + dstport = l2tpv3->dstport; + } + } else { + s->udp = false; + srcport = NULL; + dstport = NULL; + } + + + s->offset = 4; + s->session_offset = 0; + s->cookie_offset = 4; + s->counter_offset = 4; + + s->tx_session = l2tpv3->txsession; + if (l2tpv3->has_rxsession) { + s->rx_session = l2tpv3->rxsession; + } else { + s->rx_session = s->tx_session; + } + + if (s->cookie) { + s->rx_cookie = l2tpv3->rxcookie; + s->tx_cookie = l2tpv3->txcookie; + if (s->cookie_is_64 == true) { + /* 64 bit cookie */ + s->offset += 8; + s->counter_offset += 8; + } else { + /* 32 bit cookie */ + s->offset += 4; + s->counter_offset += 4; + } + } + + memset(&hints, 0, sizeof(hints)); + + if (s->ipv6) { + hints.ai_family = AF_INET6; + } else { + hints.ai_family = AF_INET; + } + if (s->udp) { + hints.ai_socktype = SOCK_DGRAM; + hints.ai_protocol = 0; + s->offset += 4; + s->counter_offset += 4; + s->session_offset += 4; + s->cookie_offset += 4; + } else { + hints.ai_socktype = SOCK_RAW; + hints.ai_protocol = IPPROTO_L2TP; + } + + gairet = getaddrinfo(l2tpv3->src, srcport, &hints, &result); + + if ((gairet != 0) || (result == NULL)) { + error_report( + "l2tpv3_open : could not resolve src, errno = %s", + gai_strerror(gairet) + ); + goto outerr; + } + fd = socket(result->ai_family, result->ai_socktype, result->ai_protocol); + if (fd == -1) { + fd = -errno; + error_report("l2tpv3_open : socket creation failed, errno = %d", -fd); + freeaddrinfo(result); + goto outerr; + } + if (bind(fd, (struct sockaddr *) result->ai_addr, result->ai_addrlen)) { + error_report("l2tpv3_open : could not bind socket err=%i", errno); + goto outerr; + } + if (result) { + freeaddrinfo(result); + } + + memset(&hints, 0, sizeof(hints)); + + if (s->ipv6) { + hints.ai_family = AF_INET6; + } else { + hints.ai_family = AF_INET; + } + if (s->udp) { + hints.ai_socktype = SOCK_DGRAM; + hints.ai_protocol = 0; + } else { + hints.ai_socktype = SOCK_RAW; + hints.ai_protocol = IPPROTO_L2TP; + } + + result = NULL; + gairet = getaddrinfo(l2tpv3->dst, dstport, &hints, &result); + if ((gairet != 0) || (result == NULL)) { + error_report( + "l2tpv3_open : could not resolve dst, error = %s", + gai_strerror(gairet) + ); + goto outerr; + } + + s->dgram_dst = g_malloc(sizeof(struct sockaddr_storage)); + memset(s->dgram_dst, '\0' , sizeof(struct sockaddr_storage)); + memcpy(s->dgram_dst, result->ai_addr, result->ai_addrlen); + s->dst_size = result->ai_addrlen; + + if (result) { + freeaddrinfo(result); + } + + if (l2tpv3->has_counter && l2tpv3->counter) { + s->has_counter = true; + s->offset += 4; + } else { + s->has_counter = false; + } + + if (l2tpv3->has_pincounter && l2tpv3->pincounter) { + s->has_counter = true; /* pin counter implies that there is counter */ + s->pin_counter = true; + } else { + s->pin_counter = false; + } + + if (l2tpv3->has_offset) { + /* extra offset */ + s->offset += l2tpv3->offset; + } + + if ((s->ipv6) || (s->udp)) { + s->header_size = s->offset; + } else { + s->header_size = s->offset + sizeof(struct iphdr); + } + + s->msgvec = build_l2tpv3_vector(s, MAX_L2TPV3_MSGCNT); + s->vec = g_malloc(sizeof(struct iovec) * MAX_L2TPV3_IOVCNT); + s->header_buf = g_malloc(s->header_size); + + qemu_set_nonblock(fd); + + s->fd = fd; + s->counter = 0; + + l2tpv3_read_poll(s, true); + + snprintf(s->nc.info_str, sizeof(s->nc.info_str), + "l2tpv3: connected"); + return 0; +outerr: + qemu_del_net_client(nc); + if (fd > 0) { + close(fd); + } + if (result) { + freeaddrinfo(result); + } + return -1; +} + @@ -250,7 +250,7 @@ NICState *qemu_new_nic(NetClientInfo *info, { NetClientState **peers = conf->peers.ncs; NICState *nic; - int i, queues = MAX(1, conf->queues); + int i, queues = MAX(1, conf->peers.queues); assert(info->type == NET_CLIENT_OPTIONS_KIND_NIC); assert(info->size >= sizeof(NICState)); @@ -363,7 +363,7 @@ void qemu_del_net_client(NetClientState *nc) void qemu_del_nic(NICState *nic) { - int i, queues = MAX(nic->conf->queues, 1); + int i, queues = MAX(nic->conf->peers.queues, 1); /* If this is a peer NIC and peer has already been deleted, free it now. */ if (nic->peer_deleted) { @@ -806,6 +806,9 @@ static int (* const net_client_init_fun[NET_CLIENT_OPTIONS_KIND_MAX])( #ifdef CONFIG_VHOST_NET_USED [NET_CLIENT_OPTIONS_KIND_VHOST_USER] = net_init_vhost_user, #endif +#ifdef CONFIG_LINUX + [NET_CLIENT_OPTIONS_KIND_L2TPV3] = net_init_l2tpv3, +#endif }; @@ -842,6 +845,9 @@ static int net_client_init1(const void *object, int is_netdev, Error **errp) #ifdef CONFIG_VHOST_NET_USED case NET_CLIENT_OPTIONS_KIND_VHOST_USER: #endif +#ifdef CONFIG_LINUX + case NET_CLIENT_OPTIONS_KIND_L2TPV3: +#endif break; default: diff --git a/pc-bios/s390-ccw.img b/pc-bios/s390-ccw.img Binary files differindex 1c7f7640fc..603e19e003 100644 --- a/pc-bios/s390-ccw.img +++ b/pc-bios/s390-ccw.img diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c index 5ee3fcb38f..fa54abb4d3 100644 --- a/pc-bios/s390-ccw/bootmap.c +++ b/pc-bios/s390-ccw/bootmap.c @@ -9,8 +9,12 @@ */ #include "s390-ccw.h" +#include "bootmap.h" +#include "virtio.h" -// #define DEBUG_FALLBACK +#ifdef DEBUG +/* #define DEBUG_FALLBACK */ +#endif #ifdef DEBUG_FALLBACK #define dputs(txt) \ @@ -20,43 +24,8 @@ do { } while (0) #endif -struct scsi_blockptr { - uint64_t blockno; - uint16_t size; - uint16_t blockct; - uint8_t reserved[4]; -} __attribute__ ((packed)); - -struct component_entry { - struct scsi_blockptr data; - uint8_t pad[7]; - uint8_t component_type; - uint64_t load_address; -} __attribute((packed)); - -struct component_header { - uint8_t magic[4]; - uint8_t type; - uint8_t reserved[27]; -} __attribute((packed)); - -struct mbr { - uint8_t magic[4]; - uint32_t version_id; - uint8_t reserved[8]; - struct scsi_blockptr blockptr; -} __attribute__ ((packed)); - -#define ZIPL_MAGIC "zIPL" - -#define ZIPL_COMP_HEADER_IPL 0x00 -#define ZIPL_COMP_HEADER_DUMP 0x01 - -#define ZIPL_COMP_ENTRY_LOAD 0x02 -#define ZIPL_COMP_ENTRY_EXEC 0x01 - /* Scratch space */ -static uint8_t sec[SECTOR_SIZE] __attribute__((__aligned__(SECTOR_SIZE))); +static uint8_t sec[MAX_SECTOR_SIZE*4] __attribute__((__aligned__(PAGE_SIZE))); typedef struct ResetInfo { uint32_t ipl_mask; @@ -104,43 +73,243 @@ static void jump_to_IPL_code(uint64_t address) virtio_panic("\n! IPL returns !\n"); } -/* Check for ZIPL magic. Returns 0 if not matched. */ -static int zipl_magic(uint8_t *ptr) +/*********************************************************************** + * IPL an ECKD DASD (CDL or LDL/CMS format) + */ + +static unsigned char _bprs[8*1024]; /* guessed "max" ECKD sector size */ +const int max_bprs_entries = sizeof(_bprs) / sizeof(ExtEckdBlockPtr); + +static inline void verify_boot_info(BootInfo *bip) +{ + IPL_assert(magic_match(bip->magic, ZIPL_MAGIC), "No zIPL magic"); + IPL_assert(bip->version == BOOT_INFO_VERSION, "Wrong zIPL version"); + IPL_assert(bip->bp_type == BOOT_INFO_BP_TYPE_IPL, "DASD is not for IPL"); + IPL_assert(bip->dev_type == BOOT_INFO_DEV_TYPE_ECKD, "DASD is not ECKD"); + IPL_assert(bip->flags == BOOT_INFO_FLAGS_ARCH, "Not for this arch"); + IPL_assert(block_size_ok(bip->bp.ipl.bm_ptr.eckd.bptr.size), + "Bad block size in zIPL section of the 1st record."); +} + +static bool eckd_valid_address(BootMapPointer *p) { - uint32_t *p = (void*)ptr; - uint32_t *z = (void*)ZIPL_MAGIC; + const uint64_t cylinder = p->eckd.cylinder + + ((p->eckd.head & 0xfff0) << 12); + const uint64_t head = p->eckd.head & 0x000f; + + if (head >= virtio_get_heads() + || p->eckd.sector > virtio_get_sectors() + || p->eckd.sector <= 0) { + return false; + } - if (*p != *z) { - debug_print_int("invalid magic", *p); - virtio_panic("invalid magic"); + if (!virtio_guessed_disk_nature() && cylinder >= virtio_get_cylinders()) { + return false; } - return 1; + return true; } -#define FREE_SPACE_FILLER '\xAA' +static block_number_t eckd_block_num(BootMapPointer *p) +{ + const uint64_t sectors = virtio_get_sectors(); + const uint64_t heads = virtio_get_heads(); + const uint64_t cylinder = p->eckd.cylinder + + ((p->eckd.head & 0xfff0) << 12); + const uint64_t head = p->eckd.head & 0x000f; + const block_number_t block = sectors * heads * cylinder + + sectors * head + + p->eckd.sector + - 1; /* block nr starts with zero */ + return block; +} -static inline bool unused_space(const void *p, unsigned int size) +static block_number_t load_eckd_segments(block_number_t blk, uint64_t *address) { - int i; - const unsigned char *m = p; + block_number_t block_nr; + int j, rc; + BootMapPointer *bprs = (void *)_bprs; + bool more_data; + + memset(_bprs, FREE_SPACE_FILLER, sizeof(_bprs)); + read_block(blk, bprs, "BPRS read failed"); + + do { + more_data = false; + for (j = 0;; j++) { + block_nr = eckd_block_num((void *)&(bprs[j].xeckd)); + if (is_null_block_number(block_nr)) { /* end of chunk */ + break; + } + + /* we need the updated blockno for the next indirect entry + * in the chain, but don't want to advance address + */ + if (j == (max_bprs_entries - 1)) { + break; + } + + IPL_assert(block_size_ok(bprs[j].xeckd.bptr.size), + "bad chunk block size"); + IPL_assert(eckd_valid_address(&bprs[j]), "bad chunk ECKD addr"); + + if ((bprs[j].xeckd.bptr.count == 0) && unused_space(&(bprs[j+1]), + sizeof(EckdBlockPtr))) { + /* This is a "continue" pointer. + * This ptr should be the last one in the current + * script section. + * I.e. the next ptr must point to the unused memory area + */ + memset(_bprs, FREE_SPACE_FILLER, sizeof(_bprs)); + read_block(block_nr, bprs, "BPRS continuation read failed"); + more_data = true; + break; + } - for (i = 0; i < size; i++) { - if (m[i] != FREE_SPACE_FILLER) { - return false; + /* Load (count+1) blocks of code at (block_nr) + * to memory (address). + */ + rc = virtio_read_many(block_nr, (void *)(*address), + bprs[j].xeckd.bptr.count+1); + IPL_assert(rc == 0, "code chunk read failed"); + + *address += (bprs[j].xeckd.bptr.count+1) * virtio_get_block_size(); } + } while (more_data); + return block_nr; +} + +static void run_eckd_boot_script(block_number_t mbr_block_nr) +{ + int i; + block_number_t block_nr; + uint64_t address; + ScsiMbr *scsi_mbr = (void *)sec; + BootMapScript *bms = (void *)sec; + + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(mbr_block_nr, sec, "Cannot read MBR"); + + block_nr = eckd_block_num((void *)&(scsi_mbr->blockptr)); + + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(block_nr, sec, "Cannot read Boot Map Script"); + + for (i = 0; bms->entry[i].type == BOOT_SCRIPT_LOAD; i++) { + address = bms->entry[i].address.load_address; + block_nr = eckd_block_num(&(bms->entry[i].blkptr)); + + do { + block_nr = load_eckd_segments(block_nr, &address); + } while (block_nr != -1); + } + + IPL_assert(bms->entry[i].type == BOOT_SCRIPT_EXEC, + "Unknown script entry type"); + jump_to_IPL_code(bms->entry[i].address.load_address); /* no return */ +} + +static void ipl_eckd_cdl(void) +{ + XEckdMbr *mbr; + Ipl2 *ipl2 = (void *)sec; + IplVolumeLabel *vlbl = (void *)sec; + block_number_t block_nr; + + /* we have just read the block #0 and recognized it as "IPL1" */ + sclp_print("CDL\n"); + + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(1, ipl2, "Cannot read IPL2 record at block 1"); + IPL_assert(magic_match(ipl2, IPL2_MAGIC), "No IPL2 record"); + + mbr = &ipl2->u.x.mbr; + IPL_assert(magic_match(mbr, ZIPL_MAGIC), "No zIPL section in IPL2 record."); + IPL_assert(block_size_ok(mbr->blockptr.xeckd.bptr.size), + "Bad block size in zIPL section of IPL2 record."); + IPL_assert(mbr->dev_type == DEV_TYPE_ECKD, + "Non-ECKD device type in zIPL section of IPL2 record."); + + /* save pointer to Boot Script */ + block_nr = eckd_block_num((void *)&(mbr->blockptr)); + + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(2, vlbl, "Cannot read Volume Label at block 2"); + IPL_assert(magic_match(vlbl->key, VOL1_MAGIC), + "Invalid magic of volume label block"); + IPL_assert(magic_match(vlbl->f.key, VOL1_MAGIC), + "Invalid magic of volser block"); + print_volser(vlbl->f.volser); + + run_eckd_boot_script(block_nr); + /* no return */ +} + +static void ipl_eckd_ldl(ECKD_IPL_mode_t mode) +{ + LDL_VTOC *vlbl = (void *)sec; /* already read, 3rd block */ + char msg[4] = { '?', '.', '\n', '\0' }; + block_number_t block_nr; + BootInfo *bip; + + sclp_print((mode == ECKD_CMS) ? "CMS" : "LDL"); + sclp_print(" version "); + switch (vlbl->LDL_version) { + case LDL1_VERSION: + msg[0] = '1'; + break; + case LDL2_VERSION: + msg[0] = '2'; + break; + default: + msg[0] = vlbl->LDL_version; + msg[0] &= 0x0f; /* convert EBCDIC */ + msg[0] |= 0x30; /* to ASCII (digit) */ + msg[1] = '?'; + break; + } + sclp_print(msg); + print_volser(vlbl->volser); + + /* DO NOT read BootMap pointer (only one, xECKD) at block #2 */ + + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(0, sec, "Cannot read block 0"); + bip = (void *)(sec + 0x70); /* "boot info" is "eckd mbr" for LDL */ + verify_boot_info(bip); + + block_nr = eckd_block_num((void *)&(bip->bp.ipl.bm_ptr.eckd.bptr)); + run_eckd_boot_script(block_nr); + /* no return */ +} + +static void ipl_eckd(ECKD_IPL_mode_t mode) +{ + switch (mode) { + case ECKD_CDL: + ipl_eckd_cdl(); /* no return */ + case ECKD_CMS: + case ECKD_LDL: + ipl_eckd_ldl(mode); /* no return */ + default: + virtio_panic("\n! Unknown ECKD IPL mode !\n"); } - return true; } -static int zipl_load_segment(struct component_entry *entry) +/*********************************************************************** + * IPL a SCSI disk + */ + +static void zipl_load_segment(ComponentEntry *entry) { - const int max_entries = (SECTOR_SIZE / sizeof(struct scsi_blockptr)); - struct scsi_blockptr *bprs = (void*)sec; + const int max_entries = (MAX_SECTOR_SIZE / sizeof(ScsiBlockPtr)); + ScsiBlockPtr *bprs = (void *)sec; const int bprs_size = sizeof(sec); - uint64_t blockno; - long address; + block_number_t blockno; + uint64_t address; int i; + char err_msg[] = "zIPL failed to read BPRS at 0xZZZZZZZZZZZZZZZZ"; + char *blk_no = &err_msg[30]; /* where to print blockno in (those ZZs) */ blockno = entry->data.blockno; address = entry->load_address; @@ -150,25 +319,25 @@ static int zipl_load_segment(struct component_entry *entry) do { memset(bprs, FREE_SPACE_FILLER, bprs_size); - if (virtio_read(blockno, (uint8_t *)bprs)) { - debug_print_int("failed reading bprs at", blockno); - goto fail; - } + fill_hex_val(blk_no, &blockno, sizeof(blockno)); + read_block(blockno, bprs, err_msg); for (i = 0;; i++) { - u64 *cur_desc = (void*)&bprs[i]; + uint64_t *cur_desc = (void *)&bprs[i]; blockno = bprs[i].blockno; - if (!blockno) + if (!blockno) { break; + } /* we need the updated blockno for the next indirect entry in the chain, but don't want to advance address */ - if (i == (max_entries - 1)) + if (i == (max_entries - 1)) { break; + } if (bprs[i].blockct == 0 && unused_space(&bprs[i + 1], - sizeof(struct scsi_blockptr))) { + sizeof(ScsiBlockPtr))) { /* This is a "continue" pointer. * This ptr is the last one in the current script section. * I.e. the next ptr must point to the unused memory area. @@ -178,102 +347,66 @@ static int zipl_load_segment(struct component_entry *entry) break; } address = virtio_load_direct(cur_desc[0], cur_desc[1], 0, - (void*)address); - if (address == -1) - goto fail; + (void *)address); + IPL_assert(address != -1, "zIPL load segment failed"); } } while (blockno); - - return 0; - -fail: - sclp_print("failed loading segment\n"); - return -1; } /* Run a zipl program */ -static int zipl_run(struct scsi_blockptr *pte) +static void zipl_run(ScsiBlockPtr *pte) { - struct component_header *header; - struct component_entry *entry; - uint8_t tmp_sec[SECTOR_SIZE]; + ComponentHeader *header; + ComponentEntry *entry; + uint8_t tmp_sec[MAX_SECTOR_SIZE]; - virtio_read(pte->blockno, tmp_sec); - header = (struct component_header *)tmp_sec; + read_block(pte->blockno, tmp_sec, "Cannot read header"); + header = (ComponentHeader *)tmp_sec; - if (!zipl_magic(tmp_sec)) { - goto fail; - } - - if (header->type != ZIPL_COMP_HEADER_IPL) { - goto fail; - } + IPL_assert(magic_match(tmp_sec, ZIPL_MAGIC), "No zIPL magic"); + IPL_assert(header->type == ZIPL_COMP_HEADER_IPL, "Bad header type"); dputs("start loading images\n"); /* Load image(s) into RAM */ - entry = (struct component_entry *)(&header[1]); + entry = (ComponentEntry *)(&header[1]); while (entry->component_type == ZIPL_COMP_ENTRY_LOAD) { - if (zipl_load_segment(entry) < 0) { - goto fail; - } + zipl_load_segment(entry); entry++; - if ((uint8_t*)(&entry[1]) > (tmp_sec + SECTOR_SIZE)) { - goto fail; - } + IPL_assert((uint8_t *)(&entry[1]) <= (tmp_sec + MAX_SECTOR_SIZE), + "Wrong entry value"); } - if (entry->component_type != ZIPL_COMP_ENTRY_EXEC) { - goto fail; - } + IPL_assert(entry->component_type == ZIPL_COMP_ENTRY_EXEC, "No EXEC entry"); /* should not return */ jump_to_IPL_code(entry->load_address); - - return 0; - -fail: - sclp_print("failed running zipl\n"); - return -1; } -int zipl_load(void) +static void ipl_scsi(void) { - struct mbr *mbr = (void*)sec; + ScsiMbr *mbr = (void *)sec; uint8_t *ns, *ns_end; int program_table_entries = 0; - int pte_len = sizeof(struct scsi_blockptr); - struct scsi_blockptr *prog_table_entry; - const char *error = ""; - - /* Grab the MBR */ - virtio_read(0, (void*)mbr); - - dputs("checking magic\n"); + const int pte_len = sizeof(ScsiBlockPtr); + ScsiBlockPtr *prog_table_entry; - if (!zipl_magic(mbr->magic)) { - error = "zipl_magic 1"; - goto fail; - } + /* The 0-th block (MBR) was already read into sec[] */ + sclp_print("Using SCSI scheme.\n"); debug_print_int("program table", mbr->blockptr.blockno); /* Parse the program table */ - if (virtio_read(mbr->blockptr.blockno, sec)) { - error = "virtio_read"; - goto fail; - } + read_block(mbr->blockptr.blockno, sec, + "Error reading Program Table"); - if (!zipl_magic(sec)) { - error = "zipl_magic 2"; - goto fail; - } + IPL_assert(magic_match(sec, ZIPL_MAGIC), "No zIPL magic"); - ns_end = sec + SECTOR_SIZE; + ns_end = sec + virtio_get_block_size(); for (ns = (sec + pte_len); (ns + pte_len) < ns_end; ns++) { - prog_table_entry = (struct scsi_blockptr *)ns; + prog_table_entry = (ScsiBlockPtr *)ns; if (!prog_table_entry->blockno) { break; } @@ -283,19 +416,55 @@ int zipl_load(void) debug_print_int("program table entries", program_table_entries); - if (!program_table_entries) { - goto fail; - } + IPL_assert(program_table_entries != 0, "Empty Program Table"); /* Run the default entry */ - prog_table_entry = (struct scsi_blockptr *)(sec + pte_len); + prog_table_entry = (ScsiBlockPtr *)(sec + pte_len); - return zipl_run(prog_table_entry); + zipl_run(prog_table_entry); /* no return */ +} -fail: - sclp_print("failed loading zipl: "); - sclp_print(error); - sclp_print("\n"); - return -1; +/*********************************************************************** + * IPL starts here + */ + +void zipl_load(void) +{ + ScsiMbr *mbr = (void *)sec; + LDL_VTOC *vlbl = (void *)sec; + + /* Grab the MBR */ + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(0, mbr, "Cannot read block 0"); + + dputs("checking magic\n"); + + if (magic_match(mbr->magic, ZIPL_MAGIC)) { + ipl_scsi(); /* no return */ + } + + /* We have failed to follow the SCSI scheme, so */ + sclp_print("Using ECKD scheme.\n"); + if (virtio_guessed_disk_nature()) { + sclp_print("Using guessed DASD geometry.\n"); + virtio_assume_eckd(); + } + + if (magic_match(mbr->magic, IPL1_MAGIC)) { + ipl_eckd(ECKD_CDL); /* no return */ + } + + /* LDL/CMS? */ + memset(sec, FREE_SPACE_FILLER, sizeof(sec)); + read_block(2, vlbl, "Cannot read block 2"); + + if (magic_match(vlbl->magic, CMS1_MAGIC)) { + ipl_eckd(ECKD_CMS); /* no return */ + } + if (magic_match(vlbl->magic, LNX1_MAGIC)) { + ipl_eckd(ECKD_LDL); /* no return */ + } + + virtio_panic("\n* invalid MBR magic *\n"); } diff --git a/pc-bios/s390-ccw/bootmap.h b/pc-bios/s390-ccw/bootmap.h new file mode 100644 index 0000000000..30ef22fe61 --- /dev/null +++ b/pc-bios/s390-ccw/bootmap.h @@ -0,0 +1,344 @@ +/* + * QEMU S390 bootmap interpreter -- declarations + * + * Copyright 2014 IBM Corp. + * Author(s): Eugene (jno) Dvurechenski <jno@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ +#ifndef _PC_BIOS_S390_CCW_BOOTMAP_H +#define _PC_BIOS_S390_CCW_BOOTMAP_H + +#include "s390-ccw.h" +#include "virtio.h" + +typedef uint64_t block_number_t; +#define NULL_BLOCK_NR 0xffffffffffffffff + +#define FREE_SPACE_FILLER '\xAA' + +typedef struct ScsiBlockPtr { + uint64_t blockno; + uint16_t size; + uint16_t blockct; + uint8_t reserved[4]; +} __attribute__ ((packed)) ScsiBlockPtr; + +typedef struct FbaBlockPtr { + uint32_t blockno; + uint16_t size; + uint16_t blockct; +} __attribute__ ((packed)) FbaBlockPtr; + +typedef struct EckdBlockPtr { + uint16_t cylinder; /* cylinder/head/sector is an address of the block */ + uint16_t head; + uint8_t sector; + uint16_t size; + uint8_t count; /* (size_in_blocks-1); + * it's 0 for TablePtr, ScriptPtr, and SectionPtr */ +} __attribute__ ((packed)) EckdBlockPtr; + +typedef struct ExtEckdBlockPtr { + EckdBlockPtr bptr; + uint8_t reserved[8]; +} __attribute__ ((packed)) ExtEckdBlockPtr; + +typedef union BootMapPointer { + ScsiBlockPtr scsi; + FbaBlockPtr fba; + EckdBlockPtr eckd; + ExtEckdBlockPtr xeckd; +} __attribute__ ((packed)) BootMapPointer; + +typedef struct ComponentEntry { + ScsiBlockPtr data; + uint8_t pad[7]; + uint8_t component_type; + uint64_t load_address; +} __attribute((packed)) ComponentEntry; + +typedef struct ComponentHeader { + uint8_t magic[4]; /* == "zIPL" */ + uint8_t type; /* == ZIPL_COMP_HEADER_* */ + uint8_t reserved[27]; +} __attribute((packed)) ComponentHeader; + +typedef struct ScsiMbr { + uint8_t magic[4]; + uint32_t version_id; + uint8_t reserved[8]; + ScsiBlockPtr blockptr; +} __attribute__ ((packed)) ScsiMbr; + +#define ZIPL_MAGIC "zIPL" +#define IPL1_MAGIC "\xc9\xd7\xd3\xf1" /* == "IPL1" in EBCDIC */ +#define IPL2_MAGIC "\xc9\xd7\xd3\xf2" /* == "IPL2" in EBCDIC */ +#define VOL1_MAGIC "\xe5\xd6\xd3\xf1" /* == "VOL1" in EBCDIC */ +#define LNX1_MAGIC "\xd3\xd5\xe7\xf1" /* == "LNX1" in EBCDIC */ +#define CMS1_MAGIC "\xc3\xd4\xe2\xf1" /* == "CMS1" in EBCDIC */ + +#define LDL1_VERSION '\x40' /* == ' ' in EBCDIC */ +#define LDL2_VERSION '\xf2' /* == '2' in EBCDIC */ + +#define ZIPL_COMP_HEADER_IPL 0x00 +#define ZIPL_COMP_HEADER_DUMP 0x01 + +#define ZIPL_COMP_ENTRY_LOAD 0x02 +#define ZIPL_COMP_ENTRY_EXEC 0x01 + +typedef struct XEckdMbr { + uint8_t magic[4]; /* == "xIPL" */ + uint8_t version; + uint8_t bp_type; + uint8_t dev_type; /* == DEV_TYPE_* */ +#define DEV_TYPE_ECKD 0x00 +#define DEV_TYPE_FBA 0x01 + uint8_t flags; + BootMapPointer blockptr; + uint8_t reserved[8]; +} __attribute__ ((packed)) XEckdMbr; /* see also BootInfo */ + +typedef struct BootMapScriptEntry { + BootMapPointer blkptr; + uint8_t pad[7]; + uint8_t type; /* == BOOT_SCRIPT_* */ +#define BOOT_SCRIPT_EXEC 0x01 +#define BOOT_SCRIPT_LOAD 0x02 + union { + uint64_t load_address; + uint64_t load_psw; + } address; +} __attribute__ ((packed)) BootMapScriptEntry; + +typedef struct BootMapScriptHeader { + uint32_t magic; + uint8_t type; +#define BOOT_SCRIPT_HDR_IPL 0x00 + uint8_t reserved[27]; +} __attribute__ ((packed)) BootMapScriptHeader; + +typedef struct BootMapScript { + BootMapScriptHeader header; + BootMapScriptEntry entry[0]; +} __attribute__ ((packed)) BootMapScript; + +/* + * These aren't real VTOCs, but referred to this way in some docs. + * They are "volume labels" actually. + * + * Some structures looks similar to described above, but left + * separate as there is no indication that they are the same. + * So, the value definitions are left separate too. + */ +typedef struct LDL_VTOC { /* @ rec.3 cyl.0 trk.0 for ECKD */ + char magic[4]; /* "LNX1", EBCDIC */ + char volser[6]; /* volser, EBCDIC */ + uint8_t reserved[69]; /* reserved, 0x40 */ + uint8_t LDL_version; /* 0x40 or 0xF2 */ + uint64_t formatted_blocks; /* if LDL_version >= 0xF2 */ +} __attribute__ ((packed)) LDL_VTOC; + +typedef struct format_date { + uint8_t YY; + uint8_t MM; + uint8_t DD; + uint8_t hh; + uint8_t mm; + uint8_t ss; +} __attribute__ ((packed)) format_date_t; + +typedef struct CMS_VTOC { /* @ rec.3 cyl.0 trk.0 for ECKD */ + /* @ blk.1 (zero based) for FBA */ + char magic[4]; /* 'CMS1', EBCDIC */ + char volser[6]; /* volser, EBCDIC */ + uint16_t version; /* = 0 */ + uint32_t block_size; /* = 512, 1024, 2048, or 4096 */ + uint32_t disk_origin; /* = 4 or 5 */ + uint32_t blocks; /* Number of usable cyls/blocks */ + uint32_t formatted; /* Max number of fmtd cyls/blks */ + uint32_t CMS_blocks; /* disk size in CMS blocks */ + uint32_t CMS_used; /* Number of CMS blocks in use */ + uint32_t FST_size; /* = 64, bytes */ + uint32_t FST_per_CMS_blk; /* */ + format_date_t format_date; /* YYMMDDhhmmss as 6 bytes */ + uint8_t reserved1[2]; /* = 0 */ + uint32_t offset; /* disk offset when reserved */ + uint32_t next_hole; /* block nr */ + uint32_t HBLK_hole_offset; /* >> HBLK data of next hole */ + uint32_t alloc_map_usr_off; /* >> user part of Alloc map */ + uint8_t reserved2[4]; /* = 0 */ + char shared_seg_name[8]; /* */ +} __attribute__ ((packed)) CMS_VTOC; + +/* from zipl/include/boot.h */ +typedef struct BootInfoBpIpl { + union { + ExtEckdBlockPtr eckd; + ScsiBlockPtr linr; + } bm_ptr; + uint8_t unused[16]; +} __attribute__ ((packed)) BootInfoBpIpl; + +typedef struct EckdDumpParam { + uint32_t start_blk; + uint32_t end_blk; + uint16_t blocksize; + uint8_t num_heads; + uint8_t bpt; + char reserved[4]; +} __attribute((packed, may_alias)) EckdDumpParam; + +typedef struct FbaDumpParam { + uint64_t start_blk; + uint64_t blockct; +} __attribute((packed)) FbaDumpParam; + +typedef struct BootInfoBpDump { + union { + EckdDumpParam eckd; + FbaDumpParam fba; + } param; + uint8_t unused[16]; +} __attribute__ ((packed)) BootInfoBpDump; + +typedef struct BootInfo { /* @ 0x70, record #0 */ + unsigned char magic[4]; /* = 'zIPL', ASCII */ + uint8_t version; /* = 1 */ +#define BOOT_INFO_VERSION 1 + uint8_t bp_type; /* = 0 */ +#define BOOT_INFO_BP_TYPE_IPL 0x00 +#define BOOT_INFO_BP_TYPE_DUMP 0x01 + uint8_t dev_type; /* = 0 */ +#define BOOT_INFO_DEV_TYPE_ECKD 0x00 +#define BOOT_INFO_DEV_TYPE_FBA 0x01 + uint8_t flags; /* = 1 */ +#ifdef __s390x__ +#define BOOT_INFO_FLAGS_ARCH 0x01 +#else +#define BOOT_INFO_FLAGS_ARCH 0x00 +#endif + union { + BootInfoBpDump dump; + BootInfoBpIpl ipl; + } bp; +} __attribute__ ((packed)) BootInfo; /* see also XEckdMbr */ + +typedef struct Ipl1 { + unsigned char key[4]; /* == "IPL1" */ + unsigned char data[24]; +} __attribute__((packed)) Ipl1; + +typedef struct Ipl2 { + unsigned char key[4]; /* == "IPL2" */ + union { + unsigned char data[144]; + struct { + unsigned char reserved1[92-4]; + XEckdMbr mbr; + unsigned char reserved2[144-(92-4)-sizeof(XEckdMbr)]; + } x; + } u; +} __attribute__((packed)) Ipl2; + +typedef struct IplVolumeLabel { + unsigned char key[4]; /* == "VOL1" */ + union { + unsigned char data[80]; + struct { + unsigned char key[4]; /* == "VOL1" */ + unsigned char volser[6]; + unsigned char reserved[6]; + } f; + }; +} __attribute__((packed)) IplVolumeLabel; + +typedef enum { + ECKD_NO_IPL, + ECKD_CDL, + ECKD_CMS, + ECKD_LDL, +} ECKD_IPL_mode_t; + +/* utility code below */ + +static inline void IPL_assert(bool term, const char *message) +{ + if (!term) { + sclp_print("\n! "); + sclp_print(message); + virtio_panic(" !\n"); /* no return */ + } +} + +static const unsigned char ebc2asc[256] = + /* 0123456789abcdef0123456789abcdef */ + "................................" /* 1F */ + "................................" /* 3F */ + " ...........<(+|&.........!$*);." /* 5F first.chr.here.is.real.space */ + "-/.........,%_>?.........`:#@'=\""/* 7F */ + ".abcdefghi.......jklmnopqr......" /* 9F */ + "..stuvwxyz......................" /* BF */ + ".ABCDEFGHI.......JKLMNOPQR......" /* DF */ + "..STUVWXYZ......0123456789......";/* FF */ + +static inline void ebcdic_to_ascii(const char *src, + char *dst, + unsigned int size) +{ + unsigned int i; + for (i = 0; i < size; i++) { + unsigned c = src[i]; + dst[i] = ebc2asc[c]; + } +} + +static inline void print_volser(const void *volser) +{ + char ascii[8]; + + ebcdic_to_ascii((char *)volser, ascii, 6); + ascii[6] = '\0'; + sclp_print("VOLSER=["); + sclp_print(ascii); + sclp_print("]\n"); +} + +static inline bool unused_space(const void *p, size_t size) +{ + size_t i; + const unsigned char *m = p; + + for (i = 0; i < size; i++) { + if (m[i] != FREE_SPACE_FILLER) { + return false; + } + } + return true; +} + +static inline bool is_null_block_number(block_number_t x) +{ + return x == NULL_BLOCK_NR; +} + +static inline void read_block(block_number_t blockno, + void *buffer, + const char *errmsg) +{ + IPL_assert(virtio_read(blockno, buffer) == 0, errmsg); +} + +static inline bool block_size_ok(uint32_t block_size) +{ + return block_size == virtio_get_block_size(); +} + +static inline bool magic_match(const void *data, const void *magic) +{ + return *((uint32_t *)data) == *((uint32_t *)magic); +} + +#endif /* _PC_BIOS_S390_CCW_BOOTMAP_H */ diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c index 5c33766533..dbfb40e685 100644 --- a/pc-bios/s390-ccw/main.c +++ b/pc-bios/s390-ccw/main.c @@ -9,6 +9,7 @@ */ #include "s390-ccw.h" +#include "virtio.h" char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE))); uint64_t boot_value; @@ -64,6 +65,10 @@ static void virtio_setup(uint64_t dev_info) } virtio_setup_block(blk_schid); + + if (!virtio_ipl_disk_is_valid()) { + virtio_panic("No valid hard disk detected.\n"); + } } int main(void) @@ -72,8 +77,8 @@ int main(void) debug_print_int("boot reg[7] ", boot_value); virtio_setup(boot_value); - if (zipl_load() < 0) - sclp_print("Failed to load OS from hard disk\n"); - disabled_wait(); - while (1) { } + zipl_load(); /* no return */ + + virtio_panic("Failed to load OS from hard disk\n"); + return 0; /* make compiler happy */ } diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h index 5e871ac84c..959aed0d0b 100644 --- a/pc-bios/s390-ccw/s390-ccw.h +++ b/pc-bios/s390-ccw/s390-ccw.h @@ -34,10 +34,10 @@ typedef unsigned long long __u64; #define PAGE_SIZE 4096 #ifndef EIO -#define EIO 1 +#define EIO 1 #endif #ifndef EBUSY -#define EBUSY 2 +#define EBUSY 2 #endif #ifndef NULL #define NULL 0 @@ -57,14 +57,14 @@ void sclp_setup(void); /* virtio.c */ unsigned long virtio_load_direct(ulong rec_list1, ulong rec_list2, - ulong subchan_id, void *load_addr); + ulong subchan_id, void *load_addr); bool virtio_is_blk(struct subchannel_id schid); void virtio_setup_block(struct subchannel_id schid); int virtio_read(ulong sector, void *load_addr); int enable_mss_facility(void); /* bootmap.c */ -int zipl_load(void); +void zipl_load(void); static inline void *memset(void *s, int c, size_t n) { @@ -86,15 +86,21 @@ static inline void fill_hex(char *out, unsigned char val) out[1] = hex[val & 0xf]; } -static inline void print_int(const char *desc, u64 addr) +static inline void fill_hex_val(char *out, void *ptr, unsigned size) { - unsigned char *addr_c = (unsigned char*)&addr; - char out[] = ": 0xffffffffffffffff\n"; + unsigned char *value = ptr; unsigned int i; - for (i = 0; i < sizeof(addr); i++) { - fill_hex(&out[4 + (i*2)], addr_c[i]); + for (i = 0; i < size; i++) { + fill_hex(&out[i*2], value[i]); } +} + +static inline void print_int(const char *desc, u64 addr) +{ + char out[] = ": 0xffffffffffffffff\n"; + + fill_hex_val(&out[4], &addr, sizeof(addr)); sclp_print(desc); sclp_print(out); @@ -118,18 +124,18 @@ static inline void debug_print_addr(const char *desc, void *p) * Hypercall functions * ***********************************************/ -#define KVM_S390_VIRTIO_NOTIFY 0 -#define KVM_S390_VIRTIO_RESET 1 -#define KVM_S390_VIRTIO_SET_STATUS 2 +#define KVM_S390_VIRTIO_NOTIFY 0 +#define KVM_S390_VIRTIO_RESET 1 +#define KVM_S390_VIRTIO_SET_STATUS 2 #define KVM_S390_VIRTIO_CCW_NOTIFY 3 static inline void yield(void) { - asm volatile ("diag 0,0,0x44" - : : - : "memory", "cc"); + asm volatile ("diag 0,0,0x44" + : : + : "memory", "cc"); } -#define SECTOR_SIZE 512 +#define MAX_SECTOR_SIZE 4096 #endif /* S390_CCW_H */ diff --git a/pc-bios/s390-ccw/sclp-ascii.c b/pc-bios/s390-ccw/sclp-ascii.c index 1c93937104..761fb44ff5 100644 --- a/pc-bios/s390-ccw/sclp-ascii.c +++ b/pc-bios/s390-ccw/sclp-ascii.c @@ -33,7 +33,7 @@ static int sclp_service_call(unsigned int command, void *sccb) static void sclp_set_write_mask(void) { - WriteEventMask *sccb = (void*)_sccb; + WriteEventMask *sccb = (void *)_sccb; sccb->h.length = sizeof(WriteEventMask); sccb->mask_length = sizeof(unsigned int); @@ -68,7 +68,7 @@ static void _memcpy(char *dest, const char *src, int len) void sclp_print(const char *str) { int len = _strlen(str); - WriteEventData *sccb = (void*)_sccb; + WriteEventData *sccb = (void *)_sccb; sccb->h.length = sizeof(WriteEventData) + len; sccb->h.function_code = SCLP_FC_NORMAL_WRITE; diff --git a/pc-bios/s390-ccw/virtio.c b/pc-bios/s390-ccw/virtio.c index bbb3c4f36d..31b23b086c 100644 --- a/pc-bios/s390-ccw/virtio.c +++ b/pc-bios/s390-ccw/virtio.c @@ -18,22 +18,22 @@ static char chsc_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); static long kvm_hypercall(unsigned long nr, unsigned long param1, unsigned long param2) { - register ulong r_nr asm("1") = nr; - register ulong r_param1 asm("2") = param1; - register ulong r_param2 asm("3") = param2; - register long retval asm("2"); + register ulong r_nr asm("1") = nr; + register ulong r_param1 asm("2") = param1; + register ulong r_param2 asm("3") = param2; + register long retval asm("2"); - asm volatile ("diag 2,4,0x500" - : "=d" (retval) - : "d" (r_nr), "0" (r_param1), "r"(r_param2) - : "memory", "cc"); + asm volatile ("diag 2,4,0x500" + : "=d" (retval) + : "d" (r_nr), "0" (r_param1), "r"(r_param2) + : "memory", "cc"); - return retval; + return retval; } static void virtio_notify(struct subchannel_id schid) { - kvm_hypercall(KVM_S390_VIRTIO_CCW_NOTIFY, *(u32*)&schid, 0); + kvm_hypercall(KVM_S390_VIRTIO_CCW_NOTIFY, *(u32 *)&schid, 0); } /*********************************************** @@ -202,7 +202,7 @@ static int vring_wait_reply(struct vring *vr, int timeout) * Virtio block * ***********************************************/ -static int virtio_read_many(ulong sector, void *load_addr, int sec_num) +int virtio_read_many(ulong sector, void *load_addr, int sec_num) { struct virtio_blk_outhdr out_hdr; u8 status; @@ -211,12 +211,12 @@ static int virtio_read_many(ulong sector, void *load_addr, int sec_num) /* Tell the host we want to read */ out_hdr.type = VIRTIO_BLK_T_IN; out_hdr.ioprio = 99; - out_hdr.sector = sector; + out_hdr.sector = virtio_sector_adjust(sector); vring_send_buf(&block, &out_hdr, sizeof(out_hdr), VRING_DESC_F_NEXT); /* This is where we want to receive data */ - vring_send_buf(&block, load_addr, SECTOR_SIZE * sec_num, + vring_send_buf(&block, load_addr, virtio_get_block_size() * sec_num, VRING_DESC_F_WRITE | VRING_HIDDEN_IS_CHAIN | VRING_DESC_F_NEXT); @@ -236,7 +236,7 @@ static int virtio_read_many(ulong sector, void *load_addr, int sec_num) } unsigned long virtio_load_direct(ulong rec_list1, ulong rec_list2, - ulong subchan_id, void *load_addr) + ulong subchan_id, void *load_addr) { u8 status; int sec = rec_list1; @@ -244,16 +244,16 @@ unsigned long virtio_load_direct(ulong rec_list1, ulong rec_list2, int sec_len = rec_list2 >> 48; ulong addr = (ulong)load_addr; - if (sec_len != SECTOR_SIZE) { + if (sec_len != virtio_get_block_size()) { return -1; } sclp_print("."); - status = virtio_read_many(sec, (void*)addr, sec_num); + status = virtio_read_many(sec, (void *)addr, sec_num); if (status) { virtio_panic("I/O Error"); } - addr += sec_num * SECTOR_SIZE; + addr += sec_num * virtio_get_block_size(); return addr; } @@ -263,18 +263,98 @@ int virtio_read(ulong sector, void *load_addr) return virtio_read_many(sector, load_addr, 1); } +static VirtioBlkConfig blk_cfg = {}; +static bool guessed_disk_nature; + +bool virtio_guessed_disk_nature(void) +{ + return guessed_disk_nature; +} + +void virtio_assume_scsi(void) +{ + guessed_disk_nature = true; + blk_cfg.blk_size = 512; +} + +void virtio_assume_eckd(void) +{ + guessed_disk_nature = true; + blk_cfg.blk_size = 4096; + + /* this must be here to calculate code segment position */ + blk_cfg.geometry.heads = 15; + blk_cfg.geometry.sectors = 12; +} + +bool virtio_disk_is_scsi(void) +{ + if (guessed_disk_nature) { + return (blk_cfg.blk_size == 512); + } + return (blk_cfg.geometry.heads == 255) + && (blk_cfg.geometry.sectors == 63) + && (blk_cfg.blk_size == 512); +} + +bool virtio_disk_is_eckd(void) +{ + if (guessed_disk_nature) { + return (blk_cfg.blk_size == 4096); + } + return (blk_cfg.geometry.heads == 15) + && (blk_cfg.geometry.sectors == 12) + && (blk_cfg.blk_size == 4096); +} + +bool virtio_ipl_disk_is_valid(void) +{ + return blk_cfg.blk_size && (virtio_disk_is_scsi() || virtio_disk_is_eckd()); +} + +int virtio_get_block_size(void) +{ + return blk_cfg.blk_size; +} + +uint16_t virtio_get_cylinders(void) +{ + return blk_cfg.geometry.cylinders; +} + +uint8_t virtio_get_heads(void) +{ + return blk_cfg.geometry.heads; +} + +uint8_t virtio_get_sectors(void) +{ + return blk_cfg.geometry.sectors; +} + void virtio_setup_block(struct subchannel_id schid) { struct vq_info_block info; struct vq_config_block config = {}; + blk_cfg.blk_size = 0; /* mark "illegal" - setup started... */ + virtio_reset(schid); + /* + * Skipping CCW_CMD_READ_FEAT. We're not doing anything fancy, and + * we'll just stop dead anyway if anything does not work like we + * expect it. + */ + config.index = 0; if (run_ccw(schid, CCW_CMD_READ_VQ_CONF, &config, sizeof(config))) { + virtio_panic("Could not get block device VQ configuration\n"); + } + if (run_ccw(schid, CCW_CMD_READ_CONF, &blk_cfg, sizeof(blk_cfg))) { virtio_panic("Could not get block device configuration\n"); } - vring_init(&block, config.num, (void*)(100 * 1024 * 1024), + vring_init(&block, config.num, (void *)(100 * 1024 * 1024), KVM_S390_VIRTIO_RING_ALIGN); info.queue = (100ULL * 1024ULL* 1024ULL); @@ -286,6 +366,12 @@ void virtio_setup_block(struct subchannel_id schid) if (!run_ccw(schid, CCW_CMD_SET_VQ, &info, sizeof(info))) { virtio_set_status(schid, VIRTIO_CONFIG_S_DRIVER_OK); } + + if (!virtio_ipl_disk_is_valid()) { + /* make sure all getters but blocksize return 0 for invalid IPL disk */ + memset(&blk_cfg, 0, sizeof(blk_cfg)); + virtio_assume_scsi(); + } } bool virtio_is_blk(struct subchannel_id schid) diff --git a/pc-bios/s390-ccw/virtio.h b/pc-bios/s390-ccw/virtio.h index 772a63f152..f1fb1b08fa 100644 --- a/pc-bios/s390-ccw/virtio.h +++ b/pc-bios/s390-ccw/virtio.h @@ -66,7 +66,7 @@ struct virtio_dev { char *config; }; -#define KVM_S390_VIRTIO_RING_ALIGN 4096 +#define KVM_S390_VIRTIO_RING_ALIGN 4096 #define VRING_USED_F_NO_NOTIFY 1 @@ -161,4 +161,52 @@ struct virtio_blk_outhdr { u64 sector; }; +typedef struct VirtioBlkConfig { + u64 capacity; /* in 512-byte sectors */ + u32 size_max; /* max segment size (if VIRTIO_BLK_F_SIZE_MAX) */ + u32 seg_max; /* max number of segments (if VIRTIO_BLK_F_SEG_MAX) */ + + struct virtio_blk_geometry { + u16 cylinders; + u8 heads; + u8 sectors; + } geometry; /* (if VIRTIO_BLK_F_GEOMETRY) */ + + u32 blk_size; /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */ + + /* the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY */ + u8 physical_block_exp; /* exponent for physical block per logical block */ + u8 alignment_offset; /* alignment offset in logical blocks */ + u16 min_io_size; /* min I/O size without performance penalty + in logical blocks */ + u32 opt_io_size; /* optimal sustained I/O size in logical blocks */ + + u8 wce; /* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */ +} __attribute__((packed)) VirtioBlkConfig; + +bool virtio_guessed_disk_nature(void); +void virtio_assume_scsi(void); +void virtio_assume_eckd(void); + +extern bool virtio_disk_is_scsi(void); +extern bool virtio_disk_is_eckd(void); +extern bool virtio_ipl_disk_is_valid(void); +extern int virtio_get_block_size(void); +extern uint16_t virtio_get_cylinders(void); +extern uint8_t virtio_get_heads(void); +extern uint8_t virtio_get_sectors(void); +extern int virtio_read_many(ulong sector, void *load_addr, int sec_num); + +#define VIRTIO_SECTOR_SIZE 512 + +static inline ulong virtio_eckd_sector_adjust(ulong sector) +{ + return sector * (virtio_get_block_size() / VIRTIO_SECTOR_SIZE); +} + +static inline ulong virtio_sector_adjust(ulong sector) +{ + return virtio_disk_is_eckd() ? virtio_eckd_sector_adjust(sector) : sector; +} + #endif /* VIRTIO_H */ diff --git a/qapi-schema.json b/qapi-schema.json index e7727a1153..a83befc05b 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -8,6 +8,9 @@ # QAPI block definitions { 'include': 'qapi/block.json' } +# QAPI event definitions +{ 'include': 'qapi/event.json' } + ## # LostTickPolicy: # @@ -211,12 +214,18 @@ # # @filename: the filename of the character device # +# @frontend-open: shows whether the frontend device attached to this backend +# (eg. with the chardev=... option) is in open or closed state +# (since 2.1) +# # Notes: @filename is encoded using the QEMU command line character device # encoding. See the QEMU man page for details. # # Since: 0.14.0 ## -{ 'type': 'ChardevInfo', 'data': {'label': 'str', 'filename': 'str'} } +{ 'type': 'ChardevInfo', 'data': {'label': 'str', + 'filename': 'str', + 'frontend-open': 'bool'} } ## # @query-chardev: @@ -654,8 +663,9 @@ # # @host: IP address # -# @service: The service name of vnc port. This may depend on the host system's -# service database so symbolic names should not be relied on. +# @service: The service name of the vnc port. This may depend on the host +# system's service database so symbolic names should not be relied +# on. # # @family: address family # @@ -694,7 +704,7 @@ ## { 'type': 'VncClientInfo', 'base': 'VncBasicInfo', - 'data': { '*x509_dname' : 'str', '*sasl_username': 'str' } } + 'data': { '*x509_dname': 'str', '*sasl_username': 'str' } } ## # @VncInfo: @@ -2040,6 +2050,62 @@ '*udp': 'str' } } ## +# @NetdevL2TPv3Options +# +# Connect the VLAN to Ethernet over L2TPv3 Static tunnel +# +# @src: source address +# +# @dst: destination address +# +# @srcport: #optional source port - mandatory for udp, optional for ip +# +# @dstport: #optional destination port - mandatory for udp, optional for ip +# +# @ipv6: #optional - force the use of ipv6 +# +# @udp: #optional - use the udp version of l2tpv3 encapsulation +# +# @cookie64: #optional - use 64 bit coookies +# +# @counter: #optional have sequence counter +# +# @pincounter: #optional pin sequence counter to zero - +# workaround for buggy implementations or +# networks with packet reorder +# +# @txcookie: #optional 32 or 64 bit transmit cookie +# +# @rxcookie: #optional 32 or 64 bit receive cookie +# +# @txsession: 32 bit transmit session +# +# @rxsession: #optional 32 bit receive session - if not specified +# set to the same value as transmit +# +# @offset: #optional additional offset - allows the insertion of +# additional application-specific data before the packet payload +# +# Since 2.1 +## +{ 'type': 'NetdevL2TPv3Options', + 'data': { + 'src': 'str', + 'dst': 'str', + '*srcport': 'str', + '*dstport': 'str', + '*ipv6': 'bool', + '*udp': 'bool', + '*cookie64': 'bool', + '*counter': 'bool', + '*pincounter': 'bool', + '*txcookie': 'uint64', + '*rxcookie': 'uint64', + 'txsession': 'uint32', + '*rxsession': 'uint32', + '*offset': 'uint32' } } + +## # @NetdevVdeOptions # # Connect the VLAN to a vde switch running on the host. @@ -2150,6 +2216,9 @@ # A discriminated record of network device traits. # # Since 1.2 +# +# 'l2tpv3' - since 2.1 +# ## { 'union': 'NetClientOptions', 'data': { @@ -2157,6 +2226,7 @@ 'nic': 'NetLegacyNicOptions', 'user': 'NetdevUserOptions', 'tap': 'NetdevTapOptions', + 'l2tpv3': 'NetdevL2TPv3Options', 'socket': 'NetdevSocketOptions', 'vde': 'NetdevVdeOptions', 'dump': 'NetdevDumpOptions', @@ -3398,5 +3468,3 @@ ## { 'enum': 'GuestPanicAction', 'data': [ 'pause' ] } - -{ 'include': 'qapi-event.json' } diff --git a/qapi/block-core.json b/qapi/block-core.json index 406ce03951..faf394cc76 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1447,7 +1447,8 @@ # @device: device name # # @msg: informative message for human consumption, such as the kind of -# corruption being detected +# corruption being detected. It should not be parsed by machine as it is +# not guaranteed to be stable # # @offset: #optional, if the corruption resulted from an image access, this is # the access offset into the image diff --git a/qapi-event.json b/qapi/event.json index e7a47f9927..ff97aeb377 100644 --- a/qapi-event.json +++ b/qapi/event.json @@ -1,8 +1,8 @@ ## # @SHUTDOWN # -# Emitted when the virtual machine has shutdown, possibly indicating that QEMU -# is about about to exit. +# Emitted when the virtual machine has shut down, indicating that qemu is +# about to exit. # # Note: If the command-line option "-no-shutdown" has been specified, qemu will # not exit, and a STOP event will eventually follow the SHUTDOWN event @@ -316,3 +316,17 @@ { 'event': 'QUORUM_REPORT_BAD', 'data': { '*error': 'str', 'node-name': 'str', 'sector-num': 'int', 'sector-count': 'int' } } + +## +# @VSERPORT_CHANGE +# +# Emitted when the guest opens or closes a virtio-serial port. +# +# @id: device identifier of the virtio-serial port +# +# @open: true if the guest has opened the virtio-serial port +# +# Since: 2.1 +## +{ 'event': 'VSERPORT_CHANGE', + 'data': { 'id': 'str', 'open': 'bool' } } diff --git a/qemu-bridge-helper.c b/qemu-bridge-helper.c index 6a0974eb48..36eb3bcfd6 100644 --- a/qemu-bridge-helper.c +++ b/qemu-bridge-helper.c @@ -229,7 +229,7 @@ int main(int argc, char **argv) unsigned long ifargs[4]; #endif int ifindex; - int fd, ctlfd, unixfd = -1; + int fd = -1, ctlfd = -1, unixfd = -1; int use_vnet = 0; int mtu; const char *bridge = NULL; @@ -436,7 +436,12 @@ int main(int argc, char **argv) /* profit! */ cleanup: - + if (fd >= 0) { + close(fd); + } + if (ctlfd >= 0) { + close(ctlfd); + } while ((acl_rule = QSIMPLEQ_FIRST(&acl_list)) != NULL) { QSIMPLEQ_REMOVE_HEAD(&acl_list, entry); g_free(acl_rule); diff --git a/qemu-char.c b/qemu-char.c index 2e50a1093e..51917de462 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -94,6 +94,7 @@ static QTAILQ_HEAD(CharDriverStateHead, CharDriverState) chardevs = CharDriverState *qemu_chr_alloc(void) { CharDriverState *chr = g_malloc0(sizeof(CharDriverState)); + qemu_mutex_init(&chr->chr_write_lock); return chr; } @@ -132,7 +133,7 @@ int qemu_chr_fe_write(CharDriverState *s, const uint8_t *buf, int len) int qemu_chr_fe_write_all(CharDriverState *s, const uint8_t *buf, int len) { int offset = 0; - int res; + int res = 0; qemu_mutex_lock(&s->chr_write_lock); while (offset < len) { @@ -3704,6 +3705,7 @@ ChardevInfoList *qmp_query_chardev(Error **errp) info->value = g_malloc0(sizeof(*info->value)); info->value->label = g_strdup(chr->label); info->value->filename = g_strdup(chr->filename); + info->value->frontend_open = chr->fe_open; info->next = chr_list; chr_list = info; diff --git a/qemu-options.hx b/qemu-options.hx index ff76ad4830..9e5468678b 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1433,6 +1433,29 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, " (default=" DEFAULT_BRIDGE_INTERFACE ") using the program 'helper'\n" " (default=" DEFAULT_BRIDGE_HELPER ")\n" #endif +#ifdef __linux__ + "-net l2tpv3[,vlan=n][,name=str],src=srcaddr,dst=dstaddr[,srcport=srcport][,dstport=dstport],txsession=txsession[,rxsession=rxsession][,ipv6=on/off][,udp=on/off][,cookie64=on/off][,counter][,pincounter][,txcookie=txcookie][,rxcookie=rxcookie][,offset=offset]\n" + " connect the VLAN to an Ethernet over L2TPv3 pseudowire\n" + " Linux kernel 3.3+ as well as most routers can talk\n" + " L2TPv3. This transport allows to connect a VM to a VM,\n" + " VM to a router and even VM to Host. It is a nearly-universal\n" + " standard (RFC3391). Note - this implementation uses static\n" + " pre-configured tunnels (same as the Linux kernel).\n" + " use 'src=' to specify source address\n" + " use 'dst=' to specify destination address\n" + " use 'udp=on' to specify udp encapsulation\n" + " use 'dstport=' to specify destination udp port\n" + " use 'dstport=' to specify destination udp port\n" + " use 'ipv6=on' to force v6\n" + " L2TPv3 uses cookies to prevent misconfiguration as\n" + " well as a weak security measure\n" + " use 'rxcookie=0x012345678' to specify a rxcookie\n" + " use 'txcookie=0x012345678' to specify a txcookie\n" + " use 'cookie64=on' to set cookie size to 64 bit, otherwise 32\n" + " use 'counter=off' to force a 'cut-down' L2TPv3 with no counter\n" + " use 'pincounter=on' to work around broken counter handling in peer\n" + " use 'offset=X' to add an extra offset between header and data\n" +#endif "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" " connect the vlan 'n' to another VLAN using a socket connection\n" "-net socket[,vlan=n][,name=str][,fd=h][,mcast=maddr:port[,localaddr=addr]]\n" @@ -1778,6 +1801,65 @@ qemu-system-i386 linux.img \ -net socket,mcast=239.192.168.1:1102,localaddr=1.2.3.4 @end example +@item -netdev l2tpv3,id=@var{id},src=@var{srcaddr},dst=@var{dstaddr}[,srcport=@var{srcport}][,dstport=@var{dstport}],txsession=@var{txsession}[,rxsession=@var{rxsession}][,ipv6][,udp][,cookie64][,counter][,pincounter][,txcookie=@var{txcookie}][,rxcookie=@var{rxcookie}][,offset=@var{offset}] +@item -net l2tpv3[,vlan=@var{n}][,name=@var{name}],src=@var{srcaddr},dst=@var{dstaddr}[,srcport=@var{srcport}][,dstport=@var{dstport}],txsession=@var{txsession}[,rxsession=@var{rxsession}][,ipv6][,udp][,cookie64][,counter][,pincounter][,txcookie=@var{txcookie}][,rxcookie=@var{rxcookie}][,offset=@var{offset}] +Connect VLAN @var{n} to L2TPv3 pseudowire. L2TPv3 (RFC3391) is a popular +protocol to transport Ethernet (and other Layer 2) data frames between +two systems. It is present in routers, firewalls and the Linux kernel +(from version 3.3 onwards). + +This transport allows a VM to communicate to another VM, router or firewall directly. + +@item src=@var{srcaddr} + source address (mandatory) +@item dst=@var{dstaddr} + destination address (mandatory) +@item udp + select udp encapsulation (default is ip). +@item srcport=@var{srcport} + source udp port. +@item dstport=@var{dstport} + destination udp port. +@item ipv6 + force v6, otherwise defaults to v4. +@item rxcookie=@var{rxcookie} +@item txcookie=@var{txcookie} + Cookies are a weak form of security in the l2tpv3 specification. +Their function is mostly to prevent misconfiguration. By default they are 32 +bit. +@item cookie64 + Set cookie size to 64 bit instead of the default 32 +@item counter=off + Force a 'cut-down' L2TPv3 with no counter as in +draft-mkonstan-l2tpext-keyed-ipv6-tunnel-00 +@item pincounter=on + Work around broken counter handling in peer. This may also help on +networks which have packet reorder. +@item offset=@var{offset} + Add an extra offset between header and data + +For example, to attach a VM running on host 4.3.2.1 via L2TPv3 to the bridge br-lan +on the remote Linux host 1.2.3.4: +@example +# Setup tunnel on linux host using raw ip as encapsulation +# on 1.2.3.4 +ip l2tp add tunnel remote 4.3.2.1 local 1.2.3.4 tunnel_id 1 peer_tunnel_id 1 \ + encap udp udp_sport 16384 udp_dport 16384 +ip l2tp add session tunnel_id 1 name vmtunnel0 session_id \ + 0xFFFFFFFF peer_session_id 0xFFFFFFFF +ifconfig vmtunnel0 mtu 1500 +ifconfig vmtunnel0 up +brctl addif br-lan vmtunnel0 + + +# on 4.3.2.1 +# launch QEMU instance - if your network has reorder or is very lossy add ,pincounter + +qemu-system-i386 linux.img -net nic -net l2tpv3,src=4.2.3.1,dst=1.2.3.4,udp,srcport=16384,dstport=16384,rxsession=0xffffffff,txsession=0xffffffff,counter + + +@end example + @item -netdev vde,id=@var{id}[,sock=@var{socketpath}][,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}] @item -net vde[,vlan=@var{n}][,name=@var{name}][,sock=@var{socketpath}] [,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}] Connect VLAN @var{n} to PORT @var{n} of a vde switch running on host and diff --git a/qmp-commands.hx b/qmp-commands.hx index d342b8a067..65218bc147 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -1926,19 +1926,28 @@ Each json-object contain the following: - "label": device's label (json-string) - "filename": device's file (json-string) +- "frontend-open": open/closed state of the frontend device attached to this + backend (json-bool) Example: -> { "execute": "query-chardev" } <- { - "return":[ + "return": [ + { + "label": "charchannel0", + "filename": "unix:/var/lib/libvirt/qemu/seabios.rhel6.agent,server", + "frontend-open": false + }, { - "label":"monitor", - "filename":"stdio" + "label": "charmonitor", + "filename": "unix:/var/lib/libvirt/qemu/seabios.rhel6.monitor,server", + "frontend-open": true }, { - "label":"serial0", - "filename":"vc" + "label": "charserial0", + "filename": "pty:/dev/pts/2", + "frontend-open": true } ] } @@ -232,7 +232,6 @@ typedef struct SaveStateEntry { const VMStateDescription *vmsd; void *opaque; CompatEntry *compat; - int no_migrate; int is_ram; } SaveStateEntry; @@ -430,7 +429,6 @@ int register_savevm_live(DeviceState *dev, se->ops = ops; se->opaque = opaque; se->vmsd = NULL; - se->no_migrate = 0; /* if this is a live_savem then set is_ram */ if (ops->save_live_setup != NULL) { se->is_ram = 1; @@ -521,7 +519,6 @@ int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, se->opaque = opaque; se->vmsd = vmsd; se->alias_id = alias_id; - se->no_migrate = vmsd->unmigratable; if (dev) { char *id = qdev_get_dev_path(dev); @@ -590,7 +587,7 @@ bool qemu_savevm_state_blocked(Error **errp) SaveStateEntry *se; QTAILQ_FOREACH(se, &savevm_handlers, entry) { - if (se->no_migrate) { + if (se->vmsd && se->vmsd->unmigratable) { error_setg(errp, "State blocked by non-migratable device '%s'", se->idstr); return true; diff --git a/scripts/qapi-event.py b/scripts/qapi-event.py index 3a1cd61914..601e3076ab 100644 --- a/scripts/qapi-event.py +++ b/scripts/qapi-event.py @@ -26,9 +26,8 @@ def _generate_event_api_name(event_name, params): api_name += "bool has_%s,\n" % c_var(argname) api_name += "".ljust(l) - if argentry == "str": - api_name += "const " - api_name += "%s %s,\n" % (c_type(argentry), c_var(argname)) + api_name += "%s %s,\n" % (c_type(argentry, is_param=True), + c_var(argname)) api_name += "".ljust(l) api_name += "Error **errp)" diff --git a/scripts/qapi.py b/scripts/qapi.py index 54b97cb48e..f2c6d1f840 100644 --- a/scripts/qapi.py +++ b/scripts/qapi.py @@ -255,7 +255,7 @@ def check_event(expr, expr_info): if structured: raise QAPIExprError(expr_info, "Nested structure define in event is not " - "supported now, event '%s', argname '%s'" + "supported, event '%s', argname '%s'" % (expr['event'], argname)) def check_union(expr, expr_info): diff --git a/target-i386/cpu-qom.h b/target-i386/cpu-qom.h index 0808cfc67d..71a1b97cfc 100644 --- a/target-i386/cpu-qom.h +++ b/target-i386/cpu-qom.h @@ -71,6 +71,9 @@ typedef struct X86CPUClass { /** * X86CPU: * @env: #CPUX86State + * @migratable: If set, only migratable flags will be accepted when "enforce" + * mode is used, and only migratable flags will be included in the "host" + * CPU model. * * An x86 CPU. */ @@ -88,6 +91,7 @@ typedef struct X86CPU { bool check_cpuid; bool enforce_cpuid; bool expose_kvm; + bool migratable; /* if true the CPUID code directly forward host cache leaves to the guest */ bool cache_info_passthrough; @@ -117,7 +121,7 @@ static inline X86CPU *x86_env_get_cpu(CPUX86State *env) #define ENV_OFFSET offsetof(X86CPU, env) #ifndef CONFIG_USER_ONLY -extern const struct VMStateDescription vmstate_x86_cpu; +extern struct VMStateDescription vmstate_x86_cpu; #endif /** diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 8983457e23..45c662dad4 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -263,48 +263,133 @@ static const char *cpuid_7_0_ebx_feature_name[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; +static const char *cpuid_apm_edx_feature_name[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "invtsc", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, +}; + +#define I486_FEATURES (CPUID_FP87 | CPUID_VME | CPUID_PSE) +#define PENTIUM_FEATURES (I486_FEATURES | CPUID_DE | CPUID_TSC | \ + CPUID_MSR | CPUID_MCE | CPUID_CX8 | CPUID_MMX | CPUID_APIC) +#define PENTIUM2_FEATURES (PENTIUM_FEATURES | CPUID_PAE | CPUID_SEP | \ + CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | CPUID_PAT | \ + CPUID_PSE36 | CPUID_FXSR) +#define PENTIUM3_FEATURES (PENTIUM2_FEATURES | CPUID_SSE) +#define PPRO_FEATURES (CPUID_FP87 | CPUID_DE | CPUID_PSE | CPUID_TSC | \ + CPUID_MSR | CPUID_MCE | CPUID_CX8 | CPUID_PGE | CPUID_CMOV | \ + CPUID_PAT | CPUID_FXSR | CPUID_MMX | CPUID_SSE | CPUID_SSE2 | \ + CPUID_PAE | CPUID_SEP | CPUID_APIC) + +#define TCG_FEATURES (CPUID_FP87 | CPUID_PSE | CPUID_TSC | CPUID_MSR | \ + CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | CPUID_SEP | \ + CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | CPUID_PAT | \ + CPUID_PSE36 | CPUID_CLFLUSH | CPUID_ACPI | CPUID_MMX | \ + CPUID_FXSR | CPUID_SSE | CPUID_SSE2 | CPUID_SS) + /* partly implemented: + CPUID_MTRR, CPUID_MCA, CPUID_CLFLUSH (needed for Win64) */ + /* missing: + CPUID_VME, CPUID_DTS, CPUID_SS, CPUID_HT, CPUID_TM, CPUID_PBE */ +#define TCG_EXT_FEATURES (CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | \ + CPUID_EXT_MONITOR | CPUID_EXT_SSSE3 | CPUID_EXT_CX16 | \ + CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \ + CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR) + /* missing: + CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX, + CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA, + CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA, + CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_XSAVE, + CPUID_EXT_OSXSAVE, CPUID_EXT_AVX, CPUID_EXT_F16C, + CPUID_EXT_RDRAND */ + +#ifdef TARGET_X86_64 +#define TCG_EXT2_X86_64_FEATURES (CPUID_EXT2_SYSCALL | CPUID_EXT2_LM) +#else +#define TCG_EXT2_X86_64_FEATURES 0 +#endif + +#define TCG_EXT2_FEATURES ((TCG_FEATURES & CPUID_EXT2_AMD_ALIASES) | \ + CPUID_EXT2_NX | CPUID_EXT2_MMXEXT | CPUID_EXT2_RDTSCP | \ + CPUID_EXT2_3DNOW | CPUID_EXT2_3DNOWEXT | CPUID_EXT2_PDPE1GB | \ + TCG_EXT2_X86_64_FEATURES) +#define TCG_EXT3_FEATURES (CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM | \ + CPUID_EXT3_CR8LEG | CPUID_EXT3_ABM | CPUID_EXT3_SSE4A) +#define TCG_EXT4_FEATURES 0 +#define TCG_SVM_FEATURES 0 +#define TCG_KVM_FEATURES 0 +#define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP | \ + CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX) + /* missing: + CPUID_7_0_EBX_FSGSBASE, CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2, + CPUID_7_0_EBX_ERMS, CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM, + CPUID_7_0_EBX_RDSEED */ +#define TCG_APM_FEATURES 0 + + typedef struct FeatureWordInfo { const char **feat_names; uint32_t cpuid_eax; /* Input EAX for CPUID */ bool cpuid_needs_ecx; /* CPUID instruction uses ECX as input */ uint32_t cpuid_ecx; /* Input ECX value for CPUID */ int cpuid_reg; /* output register (R_* constant) */ + uint32_t tcg_features; /* Feature flags supported by TCG */ + uint32_t unmigratable_flags; /* Feature flags known to be unmigratable */ } FeatureWordInfo; static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { [FEAT_1_EDX] = { .feat_names = feature_name, .cpuid_eax = 1, .cpuid_reg = R_EDX, + .tcg_features = TCG_FEATURES, }, [FEAT_1_ECX] = { .feat_names = ext_feature_name, .cpuid_eax = 1, .cpuid_reg = R_ECX, + .tcg_features = TCG_EXT_FEATURES, }, [FEAT_8000_0001_EDX] = { .feat_names = ext2_feature_name, .cpuid_eax = 0x80000001, .cpuid_reg = R_EDX, + .tcg_features = TCG_EXT2_FEATURES, }, [FEAT_8000_0001_ECX] = { .feat_names = ext3_feature_name, .cpuid_eax = 0x80000001, .cpuid_reg = R_ECX, + .tcg_features = TCG_EXT3_FEATURES, }, [FEAT_C000_0001_EDX] = { .feat_names = ext4_feature_name, .cpuid_eax = 0xC0000001, .cpuid_reg = R_EDX, + .tcg_features = TCG_EXT4_FEATURES, }, [FEAT_KVM] = { .feat_names = kvm_feature_name, .cpuid_eax = KVM_CPUID_FEATURES, .cpuid_reg = R_EAX, + .tcg_features = TCG_KVM_FEATURES, }, [FEAT_SVM] = { .feat_names = svm_feature_name, .cpuid_eax = 0x8000000A, .cpuid_reg = R_EDX, + .tcg_features = TCG_SVM_FEATURES, }, [FEAT_7_0_EBX] = { .feat_names = cpuid_7_0_ebx_feature_name, .cpuid_eax = 7, .cpuid_needs_ecx = true, .cpuid_ecx = 0, .cpuid_reg = R_EBX, + .tcg_features = TCG_7_0_EBX_FEATURES, + }, + [FEAT_8000_0007_EDX] = { + .feat_names = cpuid_apm_edx_feature_name, + .cpuid_eax = 0x80000007, + .cpuid_reg = R_EDX, + .tcg_features = TCG_APM_FEATURES, + .unmigratable_flags = CPUID_APM_INVTSC, }, }; @@ -373,11 +458,42 @@ static uint32_t kvm_default_features[FEATURE_WORDS] = { [FEAT_1_ECX] = CPUID_EXT_X2APIC, }; +/* Features that are not added by default to any CPU model when KVM is enabled. + */ +static uint32_t kvm_default_unset_features[FEATURE_WORDS] = { + [FEAT_1_ECX] = CPUID_EXT_MONITOR, +}; + void x86_cpu_compat_disable_kvm_features(FeatureWord w, uint32_t features) { kvm_default_features[w] &= ~features; } +/* + * Returns the set of feature flags that are supported and migratable by + * QEMU, for a given FeatureWord. + */ +static uint32_t x86_cpu_get_migratable_flags(FeatureWord w) +{ + FeatureWordInfo *wi = &feature_word_info[w]; + uint32_t r = 0; + int i; + + for (i = 0; i < 32; i++) { + uint32_t f = 1U << i; + /* If the feature name is unknown, it is not supported by QEMU yet */ + if (!wi->feat_names[i]) { + continue; + } + /* Skip features known to QEMU, but explicitly marked as unmigratable */ + if (wi->unmigratable_flags & f) { + continue; + } + r |= f; + } + return r; +} + void host_cpuid(uint32_t function, uint32_t count, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { @@ -534,51 +650,6 @@ struct X86CPUDefinition { bool cache_info_passthrough; }; -#define I486_FEATURES (CPUID_FP87 | CPUID_VME | CPUID_PSE) -#define PENTIUM_FEATURES (I486_FEATURES | CPUID_DE | CPUID_TSC | \ - CPUID_MSR | CPUID_MCE | CPUID_CX8 | CPUID_MMX | CPUID_APIC) -#define PENTIUM2_FEATURES (PENTIUM_FEATURES | CPUID_PAE | CPUID_SEP | \ - CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | CPUID_PAT | \ - CPUID_PSE36 | CPUID_FXSR) -#define PENTIUM3_FEATURES (PENTIUM2_FEATURES | CPUID_SSE) -#define PPRO_FEATURES (CPUID_FP87 | CPUID_DE | CPUID_PSE | CPUID_TSC | \ - CPUID_MSR | CPUID_MCE | CPUID_CX8 | CPUID_PGE | CPUID_CMOV | \ - CPUID_PAT | CPUID_FXSR | CPUID_MMX | CPUID_SSE | CPUID_SSE2 | \ - CPUID_PAE | CPUID_SEP | CPUID_APIC) - -#define TCG_FEATURES (CPUID_FP87 | CPUID_PSE | CPUID_TSC | CPUID_MSR | \ - CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | CPUID_SEP | \ - CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | CPUID_PAT | \ - CPUID_PSE36 | CPUID_CLFLUSH | CPUID_ACPI | CPUID_MMX | \ - CPUID_FXSR | CPUID_SSE | CPUID_SSE2 | CPUID_SS) - /* partly implemented: - CPUID_MTRR, CPUID_MCA, CPUID_CLFLUSH (needed for Win64) */ - /* missing: - CPUID_VME, CPUID_DTS, CPUID_SS, CPUID_HT, CPUID_TM, CPUID_PBE */ -#define TCG_EXT_FEATURES (CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | \ - CPUID_EXT_MONITOR | CPUID_EXT_SSSE3 | CPUID_EXT_CX16 | \ - CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \ - CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR) - /* missing: - CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX, - CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA, - CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA, - CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, CPUID_EXT_XSAVE, - CPUID_EXT_OSXSAVE, CPUID_EXT_AVX, CPUID_EXT_F16C, - CPUID_EXT_RDRAND */ -#define TCG_EXT2_FEATURES ((TCG_FEATURES & CPUID_EXT2_AMD_ALIASES) | \ - CPUID_EXT2_NX | CPUID_EXT2_MMXEXT | CPUID_EXT2_RDTSCP | \ - CPUID_EXT2_3DNOW | CPUID_EXT2_3DNOWEXT | CPUID_EXT2_PDPE1GB) -#define TCG_EXT3_FEATURES (CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM | \ - CPUID_EXT3_CR8LEG | CPUID_EXT3_ABM | CPUID_EXT3_SSE4A) -#define TCG_SVM_FEATURES 0 -#define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP \ - CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX) - /* missing: - CPUID_7_0_EBX_FSGSBASE, CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2, - CPUID_7_0_EBX_ERMS, CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM, - CPUID_7_0_EBX_RDSEED */ - static X86CPUDefinition builtin_x86_defs[] = { { .name = "qemu64", @@ -827,10 +898,10 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 3, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_SSSE3 | CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = @@ -849,13 +920,13 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 3, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | - CPUID_EXT_SSE3, + CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_NX | CPUID_EXT2_SYSCALL, .features[FEAT_8000_0001_ECX] = @@ -872,13 +943,13 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 3, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_POPCNT | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | - CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | CPUID_EXT_SSE3, + CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX, .features[FEAT_8000_0001_ECX] = @@ -895,14 +966,14 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 1, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_AES | CPUID_EXT_POPCNT | CPUID_EXT_SSE42 | - CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | - CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, + CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | + CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX, .features[FEAT_8000_0001_ECX] = @@ -919,19 +990,19 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 1, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_AVX | CPUID_EXT_XSAVE | CPUID_EXT_AES | - CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_POPCNT | - CPUID_EXT_X2APIC | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | - CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | CPUID_EXT_PCLMULQDQ | - CPUID_EXT_SSE3, + CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_POPCNT | + CPUID_EXT_X2APIC | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | + CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | CPUID_EXT_PCLMULQDQ | + CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_NX | - CPUID_EXT2_SYSCALL, + CPUID_EXT2_SYSCALL, .features[FEAT_8000_0001_ECX] = CPUID_EXT3_LAHF_LM, .xlevel = 0x8000000A, @@ -946,20 +1017,20 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 1, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_AVX | CPUID_EXT_XSAVE | CPUID_EXT_AES | - CPUID_EXT_POPCNT | CPUID_EXT_X2APIC | CPUID_EXT_SSE42 | - CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | - CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3 | - CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_FMA | CPUID_EXT_MOVBE | - CPUID_EXT_PCID, + CPUID_EXT_POPCNT | CPUID_EXT_X2APIC | CPUID_EXT_SSE42 | + CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | + CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3 | + CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_FMA | CPUID_EXT_MOVBE | + CPUID_EXT_PCID, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_NX | - CPUID_EXT2_SYSCALL, + CPUID_EXT2_SYSCALL, .features[FEAT_8000_0001_ECX] = CPUID_EXT3_LAHF_LM, .features[FEAT_7_0_EBX] = @@ -971,6 +1042,40 @@ static X86CPUDefinition builtin_x86_defs[] = { .model_id = "Intel Core Processor (Haswell)", }, { + .name = "Broadwell", + .level = 0xd, + .vendor = CPUID_VENDOR_INTEL, + .family = 6, + .model = 61, + .stepping = 2, + .features[FEAT_1_EDX] = + CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, + .features[FEAT_1_ECX] = + CPUID_EXT_AVX | CPUID_EXT_XSAVE | CPUID_EXT_AES | + CPUID_EXT_POPCNT | CPUID_EXT_X2APIC | CPUID_EXT_SSE42 | + CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | + CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3 | + CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_FMA | CPUID_EXT_MOVBE | + CPUID_EXT_PCID, + .features[FEAT_8000_0001_EDX] = + CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_NX | + CPUID_EXT2_SYSCALL, + .features[FEAT_8000_0001_ECX] = + CPUID_EXT3_LAHF_LM | CPUID_EXT3_3DNOWPREFETCH, + .features[FEAT_7_0_EBX] = + CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | + CPUID_7_0_EBX_HLE | CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | + CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | + CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | + CPUID_7_0_EBX_SMAP, + .xlevel = 0x8000000A, + .model_id = "Intel Core Processor (Broadwell)", + }, + { .name = "Opteron_G1", .level = 5, .vendor = CPUID_VENDOR_AMD, @@ -979,19 +1084,19 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 1, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_FXSR | CPUID_EXT2_MMX | - CPUID_EXT2_NX | CPUID_EXT2_PSE36 | CPUID_EXT2_PAT | - CPUID_EXT2_CMOV | CPUID_EXT2_MCA | CPUID_EXT2_PGE | - CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | CPUID_EXT2_APIC | - CPUID_EXT2_CX8 | CPUID_EXT2_MCE | CPUID_EXT2_PAE | CPUID_EXT2_MSR | - CPUID_EXT2_TSC | CPUID_EXT2_PSE | CPUID_EXT2_DE | CPUID_EXT2_FPU, + CPUID_EXT2_NX | CPUID_EXT2_PSE36 | CPUID_EXT2_PAT | + CPUID_EXT2_CMOV | CPUID_EXT2_MCA | CPUID_EXT2_PGE | + CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | CPUID_EXT2_APIC | + CPUID_EXT2_CX8 | CPUID_EXT2_MCE | CPUID_EXT2_PAE | CPUID_EXT2_MSR | + CPUID_EXT2_TSC | CPUID_EXT2_PSE | CPUID_EXT2_DE | CPUID_EXT2_FPU, .xlevel = 0x80000008, .model_id = "AMD Opteron 240 (Gen 1 Class Opteron)", }, @@ -1004,20 +1109,20 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 1, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_CX16 | CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_FXSR | - CPUID_EXT2_MMX | CPUID_EXT2_NX | CPUID_EXT2_PSE36 | - CPUID_EXT2_PAT | CPUID_EXT2_CMOV | CPUID_EXT2_MCA | - CPUID_EXT2_PGE | CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | - CPUID_EXT2_APIC | CPUID_EXT2_CX8 | CPUID_EXT2_MCE | - CPUID_EXT2_PAE | CPUID_EXT2_MSR | CPUID_EXT2_TSC | CPUID_EXT2_PSE | - CPUID_EXT2_DE | CPUID_EXT2_FPU, + CPUID_EXT2_MMX | CPUID_EXT2_NX | CPUID_EXT2_PSE36 | + CPUID_EXT2_PAT | CPUID_EXT2_CMOV | CPUID_EXT2_MCA | + CPUID_EXT2_PGE | CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | + CPUID_EXT2_APIC | CPUID_EXT2_CX8 | CPUID_EXT2_MCE | + CPUID_EXT2_PAE | CPUID_EXT2_MSR | CPUID_EXT2_TSC | CPUID_EXT2_PSE | + CPUID_EXT2_DE | CPUID_EXT2_FPU, .features[FEAT_8000_0001_ECX] = CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, .xlevel = 0x80000008, @@ -1032,24 +1137,24 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 1, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_POPCNT | CPUID_EXT_CX16 | CPUID_EXT_MONITOR | - CPUID_EXT_SSE3, + CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | CPUID_EXT2_FXSR | - CPUID_EXT2_MMX | CPUID_EXT2_NX | CPUID_EXT2_PSE36 | - CPUID_EXT2_PAT | CPUID_EXT2_CMOV | CPUID_EXT2_MCA | - CPUID_EXT2_PGE | CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | - CPUID_EXT2_APIC | CPUID_EXT2_CX8 | CPUID_EXT2_MCE | - CPUID_EXT2_PAE | CPUID_EXT2_MSR | CPUID_EXT2_TSC | CPUID_EXT2_PSE | - CPUID_EXT2_DE | CPUID_EXT2_FPU, + CPUID_EXT2_MMX | CPUID_EXT2_NX | CPUID_EXT2_PSE36 | + CPUID_EXT2_PAT | CPUID_EXT2_CMOV | CPUID_EXT2_MCA | + CPUID_EXT2_PGE | CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | + CPUID_EXT2_APIC | CPUID_EXT2_CX8 | CPUID_EXT2_MCE | + CPUID_EXT2_PAE | CPUID_EXT2_MSR | CPUID_EXT2_TSC | CPUID_EXT2_PSE | + CPUID_EXT2_DE | CPUID_EXT2_FPU, .features[FEAT_8000_0001_ECX] = CPUID_EXT3_MISALIGNSSE | CPUID_EXT3_SSE4A | - CPUID_EXT3_ABM | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, + CPUID_EXT3_ABM | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, .xlevel = 0x80000008, .model_id = "AMD Opteron 23xx (Gen 3 Class Opteron)", }, @@ -1062,28 +1167,28 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 2, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_AVX | CPUID_EXT_XSAVE | CPUID_EXT_AES | - CPUID_EXT_POPCNT | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | - CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | CPUID_EXT_PCLMULQDQ | - CPUID_EXT_SSE3, + CPUID_EXT_POPCNT | CPUID_EXT_SSE42 | CPUID_EXT_SSE41 | + CPUID_EXT_CX16 | CPUID_EXT_SSSE3 | CPUID_EXT_PCLMULQDQ | + CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | - CPUID_EXT2_PDPE1GB | CPUID_EXT2_FXSR | CPUID_EXT2_MMX | - CPUID_EXT2_NX | CPUID_EXT2_PSE36 | CPUID_EXT2_PAT | - CPUID_EXT2_CMOV | CPUID_EXT2_MCA | CPUID_EXT2_PGE | - CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | CPUID_EXT2_APIC | - CPUID_EXT2_CX8 | CPUID_EXT2_MCE | CPUID_EXT2_PAE | CPUID_EXT2_MSR | - CPUID_EXT2_TSC | CPUID_EXT2_PSE | CPUID_EXT2_DE | CPUID_EXT2_FPU, + CPUID_EXT2_PDPE1GB | CPUID_EXT2_FXSR | CPUID_EXT2_MMX | + CPUID_EXT2_NX | CPUID_EXT2_PSE36 | CPUID_EXT2_PAT | + CPUID_EXT2_CMOV | CPUID_EXT2_MCA | CPUID_EXT2_PGE | + CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | CPUID_EXT2_APIC | + CPUID_EXT2_CX8 | CPUID_EXT2_MCE | CPUID_EXT2_PAE | CPUID_EXT2_MSR | + CPUID_EXT2_TSC | CPUID_EXT2_PSE | CPUID_EXT2_DE | CPUID_EXT2_FPU, .features[FEAT_8000_0001_ECX] = CPUID_EXT3_FMA4 | CPUID_EXT3_XOP | - CPUID_EXT3_3DNOWPREFETCH | CPUID_EXT3_MISALIGNSSE | - CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | CPUID_EXT3_SVM | - CPUID_EXT3_LAHF_LM, + CPUID_EXT3_3DNOWPREFETCH | CPUID_EXT3_MISALIGNSSE | + CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | CPUID_EXT3_SVM | + CPUID_EXT3_LAHF_LM, .xlevel = 0x8000001A, .model_id = "AMD Opteron 62xx class CPU", }, @@ -1096,28 +1201,28 @@ static X86CPUDefinition builtin_x86_defs[] = { .stepping = 0, .features[FEAT_1_EDX] = CPUID_SSE2 | CPUID_SSE | CPUID_FXSR | CPUID_MMX | - CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | - CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | - CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | - CPUID_DE | CPUID_FP87, + CPUID_CLFLUSH | CPUID_PSE36 | CPUID_PAT | CPUID_CMOV | CPUID_MCA | + CPUID_PGE | CPUID_MTRR | CPUID_SEP | CPUID_APIC | CPUID_CX8 | + CPUID_MCE | CPUID_PAE | CPUID_MSR | CPUID_TSC | CPUID_PSE | + CPUID_DE | CPUID_FP87, .features[FEAT_1_ECX] = CPUID_EXT_F16C | CPUID_EXT_AVX | CPUID_EXT_XSAVE | - CPUID_EXT_AES | CPUID_EXT_POPCNT | CPUID_EXT_SSE42 | - CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_FMA | - CPUID_EXT_SSSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, + CPUID_EXT_AES | CPUID_EXT_POPCNT | CPUID_EXT_SSE42 | + CPUID_EXT_SSE41 | CPUID_EXT_CX16 | CPUID_EXT_FMA | + CPUID_EXT_SSSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSE3, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_RDTSCP | - CPUID_EXT2_PDPE1GB | CPUID_EXT2_FXSR | CPUID_EXT2_MMX | - CPUID_EXT2_NX | CPUID_EXT2_PSE36 | CPUID_EXT2_PAT | - CPUID_EXT2_CMOV | CPUID_EXT2_MCA | CPUID_EXT2_PGE | - CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | CPUID_EXT2_APIC | - CPUID_EXT2_CX8 | CPUID_EXT2_MCE | CPUID_EXT2_PAE | CPUID_EXT2_MSR | - CPUID_EXT2_TSC | CPUID_EXT2_PSE | CPUID_EXT2_DE | CPUID_EXT2_FPU, + CPUID_EXT2_PDPE1GB | CPUID_EXT2_FXSR | CPUID_EXT2_MMX | + CPUID_EXT2_NX | CPUID_EXT2_PSE36 | CPUID_EXT2_PAT | + CPUID_EXT2_CMOV | CPUID_EXT2_MCA | CPUID_EXT2_PGE | + CPUID_EXT2_MTRR | CPUID_EXT2_SYSCALL | CPUID_EXT2_APIC | + CPUID_EXT2_CX8 | CPUID_EXT2_MCE | CPUID_EXT2_PAE | CPUID_EXT2_MSR | + CPUID_EXT2_TSC | CPUID_EXT2_PSE | CPUID_EXT2_DE | CPUID_EXT2_FPU, .features[FEAT_8000_0001_ECX] = CPUID_EXT3_TBM | CPUID_EXT3_FMA4 | CPUID_EXT3_XOP | - CPUID_EXT3_3DNOWPREFETCH | CPUID_EXT3_MISALIGNSSE | - CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | CPUID_EXT3_SVM | - CPUID_EXT3_LAHF_LM, + CPUID_EXT3_3DNOWPREFETCH | CPUID_EXT3_MISALIGNSSE | + CPUID_EXT3_SSE4A | CPUID_EXT3_ABM | CPUID_EXT3_SVM | + CPUID_EXT3_LAHF_LM, .xlevel = 0x8000001A, .model_id = "AMD Opteron 63xx class CPU", }, @@ -1168,12 +1273,18 @@ static int cpu_x86_fill_model_id(char *str) static X86CPUDefinition host_cpudef; +static Property host_x86_cpu_properties[] = { + DEFINE_PROP_BOOL("migratable", X86CPU, migratable, true), + DEFINE_PROP_END_OF_LIST() +}; + /* class_init for the "host" CPU model * * This function may be called before KVM is initialized. */ static void host_x86_cpu_class_init(ObjectClass *oc, void *data) { + DeviceClass *dc = DEVICE_CLASS(oc); X86CPUClass *xcc = X86_CPU_CLASS(oc); uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; @@ -1195,8 +1306,13 @@ static void host_x86_cpu_class_init(ObjectClass *oc, void *data) /* level, xlevel, xlevel2, and the feature words are initialized on * instance_init, because they require KVM to be initialized. */ + + dc->props = host_x86_cpu_properties; } +static uint32_t x86_cpu_get_supported_feature_word(FeatureWord w, + bool migratable_only); + static void host_x86_cpu_initfn(Object *obj) { X86CPU *cpu = X86_CPU(obj); @@ -1211,10 +1327,8 @@ static void host_x86_cpu_initfn(Object *obj) env->cpuid_xlevel2 = kvm_arch_get_supported_cpuid(s, 0xC0000000, 0, R_EAX); for (w = 0; w < FEATURE_WORDS; w++) { - FeatureWordInfo *wi = &feature_word_info[w]; env->features[w] = - kvm_arch_get_supported_cpuid(s, wi->cpuid_eax, wi->cpuid_ecx, - wi->cpuid_reg); + x86_cpu_get_supported_feature_word(w, cpu->migratable); } object_property_set_bool(OBJECT(cpu), true, "pmu", &error_abort); } @@ -1228,53 +1342,23 @@ static const TypeInfo host_x86_cpu_type_info = { #endif -static int unavailable_host_feature(FeatureWordInfo *f, uint32_t mask) +static void report_unavailable_features(FeatureWord w, uint32_t mask) { + FeatureWordInfo *f = &feature_word_info[w]; int i; - for (i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { if (1 << i & mask) { const char *reg = get_register_name_32(f->cpuid_reg); assert(reg); - fprintf(stderr, "warning: host doesn't support requested feature: " + fprintf(stderr, "warning: %s doesn't support requested feature: " "CPUID.%02XH:%s%s%s [bit %d]\n", + kvm_enabled() ? "host" : "TCG", f->cpuid_eax, reg, f->feat_names[i] ? "." : "", f->feat_names[i] ? f->feat_names[i] : "", i); - break; - } - return 0; -} - -/* Check if all requested cpu flags are making their way to the guest - * - * Returns 0 if all flags are supported by the host, non-zero otherwise. - * - * This function may be called only if KVM is enabled. - */ -static int kvm_check_features_against_host(KVMState *s, X86CPU *cpu) -{ - CPUX86State *env = &cpu->env; - int rv = 0; - FeatureWord w; - - assert(kvm_enabled()); - - for (w = 0; w < FEATURE_WORDS; w++) { - FeatureWordInfo *wi = &feature_word_info[w]; - uint32_t guest_feat = env->features[w]; - uint32_t host_feat = kvm_arch_get_supported_cpuid(s, wi->cpuid_eax, - wi->cpuid_ecx, - wi->cpuid_reg); - uint32_t mask; - for (mask = 1; mask; mask <<= 1) { - if (guest_feat & mask && !(host_feat & mask)) { - unavailable_host_feature(wi, mask); - rv = 1; - } } } - return rv; } static void x86_cpuid_version_get_family(Object *obj, Visitor *v, void *opaque, @@ -1663,6 +1747,7 @@ static void x86_cpu_parse_featurestr(CPUState *cs, char *features, { X86CPU *cpu = X86_CPU(cs); char *featurestr; /* Single 'key=value" string being parsed */ + FeatureWord w; /* Features to be added */ FeatureWordArray plus_features = { 0 }; /* Features to be removed */ @@ -1742,22 +1827,11 @@ static void x86_cpu_parse_featurestr(CPUState *cs, char *features, } featurestr = strtok(NULL, ","); } - env->features[FEAT_1_EDX] |= plus_features[FEAT_1_EDX]; - env->features[FEAT_1_ECX] |= plus_features[FEAT_1_ECX]; - env->features[FEAT_8000_0001_EDX] |= plus_features[FEAT_8000_0001_EDX]; - env->features[FEAT_8000_0001_ECX] |= plus_features[FEAT_8000_0001_ECX]; - env->features[FEAT_C000_0001_EDX] |= plus_features[FEAT_C000_0001_EDX]; - env->features[FEAT_KVM] |= plus_features[FEAT_KVM]; - env->features[FEAT_SVM] |= plus_features[FEAT_SVM]; - env->features[FEAT_7_0_EBX] |= plus_features[FEAT_7_0_EBX]; - env->features[FEAT_1_EDX] &= ~minus_features[FEAT_1_EDX]; - env->features[FEAT_1_ECX] &= ~minus_features[FEAT_1_ECX]; - env->features[FEAT_8000_0001_EDX] &= ~minus_features[FEAT_8000_0001_EDX]; - env->features[FEAT_8000_0001_ECX] &= ~minus_features[FEAT_8000_0001_ECX]; - env->features[FEAT_C000_0001_EDX] &= ~minus_features[FEAT_C000_0001_EDX]; - env->features[FEAT_KVM] &= ~minus_features[FEAT_KVM]; - env->features[FEAT_SVM] &= ~minus_features[FEAT_SVM]; - env->features[FEAT_7_0_EBX] &= ~minus_features[FEAT_7_0_EBX]; + + for (w = 0; w < FEATURE_WORDS; w++) { + env->features[w] |= plus_features[w]; + env->features[w] &= ~minus_features[w]; + } } /* generate a composite string into buf of all cpuid names in featureset @@ -1840,21 +1914,53 @@ CpuDefinitionInfoList *arch_query_cpu_definitions(Error **errp) return cpu_list; } -static void filter_features_for_kvm(X86CPU *cpu) +static uint32_t x86_cpu_get_supported_feature_word(FeatureWord w, + bool migratable_only) +{ + FeatureWordInfo *wi = &feature_word_info[w]; + uint32_t r; + + if (kvm_enabled()) { + r = kvm_arch_get_supported_cpuid(kvm_state, wi->cpuid_eax, + wi->cpuid_ecx, + wi->cpuid_reg); + } else if (tcg_enabled()) { + r = wi->tcg_features; + } else { + return ~0; + } + if (migratable_only) { + r &= x86_cpu_get_migratable_flags(w); + } + return r; +} + +/* + * Filters CPU feature words based on host availability of each feature. + * + * Returns: 0 if all flags are supported by the host, non-zero otherwise. + */ +static int x86_cpu_filter_features(X86CPU *cpu) { CPUX86State *env = &cpu->env; - KVMState *s = kvm_state; FeatureWord w; + int rv = 0; for (w = 0; w < FEATURE_WORDS; w++) { - FeatureWordInfo *wi = &feature_word_info[w]; - uint32_t host_feat = kvm_arch_get_supported_cpuid(s, wi->cpuid_eax, - wi->cpuid_ecx, - wi->cpuid_reg); + uint32_t host_feat = + x86_cpu_get_supported_feature_word(w, cpu->migratable); uint32_t requested_features = env->features[w]; env->features[w] &= host_feat; cpu->filtered_features[w] = requested_features & ~env->features[w]; + if (cpu->filtered_features[w]) { + if (cpu->check_cpuid || cpu->enforce_cpuid) { + report_unavailable_features(w, cpu->filtered_features[w]); + } + rv = 1; + } } + + return rv; } /* Load data from X86CPUDefinition @@ -1864,30 +1970,26 @@ static void x86_cpu_load_def(X86CPU *cpu, X86CPUDefinition *def, Error **errp) CPUX86State *env = &cpu->env; const char *vendor; char host_vendor[CPUID_VENDOR_SZ + 1]; + FeatureWord w; object_property_set_int(OBJECT(cpu), def->level, "level", errp); object_property_set_int(OBJECT(cpu), def->family, "family", errp); object_property_set_int(OBJECT(cpu), def->model, "model", errp); object_property_set_int(OBJECT(cpu), def->stepping, "stepping", errp); - env->features[FEAT_1_EDX] = def->features[FEAT_1_EDX]; - env->features[FEAT_1_ECX] = def->features[FEAT_1_ECX]; - env->features[FEAT_8000_0001_EDX] = def->features[FEAT_8000_0001_EDX]; - env->features[FEAT_8000_0001_ECX] = def->features[FEAT_8000_0001_ECX]; object_property_set_int(OBJECT(cpu), def->xlevel, "xlevel", errp); - env->features[FEAT_KVM] = def->features[FEAT_KVM]; - env->features[FEAT_SVM] = def->features[FEAT_SVM]; - env->features[FEAT_C000_0001_EDX] = def->features[FEAT_C000_0001_EDX]; - env->features[FEAT_7_0_EBX] = def->features[FEAT_7_0_EBX]; env->cpuid_xlevel2 = def->xlevel2; cpu->cache_info_passthrough = def->cache_info_passthrough; - object_property_set_str(OBJECT(cpu), def->model_id, "model-id", errp); + for (w = 0; w < FEATURE_WORDS; w++) { + env->features[w] = def->features[w]; + } /* Special cases not set in the X86CPUDefinition structs: */ if (kvm_enabled()) { FeatureWord w; for (w = 0; w < FEATURE_WORDS; w++) { env->features[w] |= kvm_default_features[w]; + env->features[w] &= ~kvm_default_unset_features[w]; } } @@ -2336,6 +2438,12 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, (AMD_ENC_ASSOC(L3_ASSOCIATIVITY) << 12) | \ (L3_LINES_PER_TAG << 8) | (L3_LINE_SIZE); break; + case 0x80000007: + *eax = 0; + *ebx = 0; + *ecx = 0; + *edx = env->features[FEAT_8000_0007_EDX]; + break; case 0x80000008: /* virtual & phys address size in low 2 bytes. */ /* XXX: This value must match the one used in the MMU code. */ @@ -2593,25 +2701,13 @@ static void x86_cpu_realizefn(DeviceState *dev, Error **errp) & CPUID_EXT2_AMD_ALIASES); } - if (!kvm_enabled()) { - env->features[FEAT_1_EDX] &= TCG_FEATURES; - env->features[FEAT_1_ECX] &= TCG_EXT_FEATURES; - env->features[FEAT_8000_0001_EDX] &= (TCG_EXT2_FEATURES -#ifdef TARGET_X86_64 - | CPUID_EXT2_SYSCALL | CPUID_EXT2_LM -#endif - ); - env->features[FEAT_8000_0001_ECX] &= TCG_EXT3_FEATURES; - env->features[FEAT_SVM] &= TCG_SVM_FEATURES; - } else { - KVMState *s = kvm_state; - if ((cpu->check_cpuid || cpu->enforce_cpuid) - && kvm_check_features_against_host(s, cpu) && cpu->enforce_cpuid) { - error_setg(&local_err, - "Host's CPU doesn't support requested features"); - goto out; - } - filter_features_for_kvm(cpu); + + if (x86_cpu_filter_features(cpu) && cpu->enforce_cpuid) { + error_setg(&local_err, + kvm_enabled() ? + "Host doesn't support requested features" : + "TCG doesn't support requested features"); + goto out; } #ifndef CONFIG_USER_ONLY diff --git a/target-i386/cpu.h b/target-i386/cpu.h index b5e1b411fb..e634d83e86 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -402,6 +402,7 @@ typedef enum FeatureWord { FEAT_7_0_EBX, /* CPUID[EAX=7,ECX=0].EBX */ FEAT_8000_0001_EDX, /* CPUID[8000_0001].EDX */ FEAT_8000_0001_ECX, /* CPUID[8000_0001].ECX */ + FEAT_8000_0007_EDX, /* CPUID[8000_0007].EDX */ FEAT_C000_0001_EDX, /* CPUID[C000_0001].EDX */ FEAT_KVM, /* CPUID[4000_0001].EAX (KVM_CPUID_FEATURES) */ FEAT_SVM, /* CPUID[8000_000A].EDX */ @@ -561,6 +562,9 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_7_0_EBX_ADX (1U << 19) #define CPUID_7_0_EBX_SMAP (1U << 20) +/* CPUID[0x80000007].EDX flags: */ +#define CPUID_APM_INVTSC (1U << 8) + #define CPUID_VENDOR_SZ 12 #define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */ diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 4bf0ac9e76..097fe1188d 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -35,6 +35,8 @@ #include "exec/ioport.h" #include <asm/hyperv.h> #include "hw/pci/pci.h" +#include "migration/migration.h" +#include "qapi/qmp/qerror.h" //#define DEBUG_KVM @@ -448,6 +450,8 @@ static bool hyperv_enabled(X86CPU *cpu) cpu->hyperv_relaxed_timing); } +static Error *invtsc_mig_blocker; + #define KVM_MAX_CPUID_ENTRIES 100 int kvm_arch_init_vcpu(CPUState *cs) @@ -705,6 +709,17 @@ int kvm_arch_init_vcpu(CPUState *cs) !!(c->ecx & CPUID_EXT_SMX); } + c = cpuid_find_entry(&cpuid_data.cpuid, 0x80000007, 0); + if (c && (c->edx & 1<<8) && invtsc_mig_blocker == NULL) { + /* for migration */ + error_setg(&invtsc_mig_blocker, + "State blocked by non-migratable CPU device" + " (invtsc flag)"); + migrate_add_blocker(invtsc_mig_blocker); + /* for savevm */ + vmstate_x86_cpu.unmigratable = 1; + } + cpuid_data.cpuid.padding = 0; r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data); if (r) { diff --git a/target-i386/machine.c b/target-i386/machine.c index b8dcd2f943..16d2f6a809 100644 --- a/target-i386/machine.c +++ b/target-i386/machine.c @@ -603,7 +603,7 @@ static const VMStateDescription vmstate_msr_hyperv_time = { } }; -const VMStateDescription vmstate_x86_cpu = { +VMStateDescription vmstate_x86_cpu = { .name = "cpu", .version_id = 12, .minimum_version_id = 3, diff --git a/target-ppc/cpu-models.c b/target-ppc/cpu-models.c index 97a81d8660..9a91af9dbf 100644 --- a/target-ppc/cpu-models.c +++ b/target-ppc/cpu-models.c @@ -1138,6 +1138,8 @@ "POWER7 v2.3") POWERPC_DEF("POWER7+_v2.1", CPU_POWERPC_POWER7P_v21, POWER7P, "POWER7+ v2.1") + POWERPC_DEF("POWER8E_v1.0", CPU_POWERPC_POWER8E_v10, POWER8E, + "POWER8E v1.0") POWERPC_DEF("POWER8_v1.0", CPU_POWERPC_POWER8_v10, POWER8, "POWER8 v1.0") POWERPC_DEF("970", CPU_POWERPC_970, 970, @@ -1386,6 +1388,7 @@ PowerPCCPUAlias ppc_cpu_aliases[] = { { "POWER5gs", "POWER5+" }, { "POWER7", "POWER7_v2.3" }, { "POWER7+", "POWER7+_v2.1" }, + { "POWER8E", "POWER8E_v1.0" }, { "POWER8", "POWER8_v1.0" }, { "970fx", "970fx_v3.1" }, { "970mp", "970mp_v1.1" }, diff --git a/target-ppc/cpu-models.h b/target-ppc/cpu-models.h index db75896012..c39d03a504 100644 --- a/target-ppc/cpu-models.h +++ b/target-ppc/cpu-models.h @@ -559,9 +559,12 @@ enum { CPU_POWERPC_POWER7P_BASE = 0x004A0000, CPU_POWERPC_POWER7P_MASK = 0xFFFF0000, CPU_POWERPC_POWER7P_v21 = 0x004A0201, - CPU_POWERPC_POWER8_BASE = 0x004B0000, + CPU_POWERPC_POWER8E_BASE = 0x004B0000, + CPU_POWERPC_POWER8E_MASK = 0xFFFF0000, + CPU_POWERPC_POWER8E_v10 = 0x004B0100, + CPU_POWERPC_POWER8_BASE = 0x004D0000, CPU_POWERPC_POWER8_MASK = 0xFFFF0000, - CPU_POWERPC_POWER8_v10 = 0x004B0100, + CPU_POWERPC_POWER8_v10 = 0x004D0100, CPU_POWERPC_970 = 0x00390202, CPU_POWERPC_970FX_v10 = 0x00391100, CPU_POWERPC_970FX_v20 = 0x003C0200, diff --git a/target-ppc/cpu-qom.h b/target-ppc/cpu-qom.h index 13c7031265..f1f0a52998 100644 --- a/target-ppc/cpu-qom.h +++ b/target-ppc/cpu-qom.h @@ -119,7 +119,9 @@ void ppc_cpu_dump_statistics(CPUState *cpu, FILE *f, fprintf_function cpu_fprintf, int flags); hwaddr ppc_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr); int ppc_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg); +int ppc_cpu_gdb_read_register_apple(CPUState *cpu, uint8_t *buf, int reg); int ppc_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg); +int ppc_cpu_gdb_write_register_apple(CPUState *cpu, uint8_t *buf, int reg); int ppc64_cpu_write_elf64_qemunote(WriteCoreDumpFunction f, CPUState *cpu, void *opaque); int ppc64_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h index 74407ee209..08ae527cb6 100644 --- a/target-ppc/cpu.h +++ b/target-ppc/cpu.h @@ -2012,7 +2012,7 @@ enum { PPC2_DIVE_ISA206 | PPC2_ATOMIC_ISA206 | \ PPC2_FP_CVT_ISA206 | PPC2_FP_TST_ISA206 | \ PPC2_BCTAR_ISA207 | PPC2_LSQ_ISA207 | \ - PPC2_ALTIVEC_207 | PPC2_ISA207S) + PPC2_ALTIVEC_207 | PPC2_ISA207S | PPC2_DFP) }; /*****************************************************************************/ diff --git a/target-ppc/gdbstub.c b/target-ppc/gdbstub.c index 381a3c7e31..694d303e19 100644 --- a/target-ppc/gdbstub.c +++ b/target-ppc/gdbstub.c @@ -21,6 +21,31 @@ #include "qemu-common.h" #include "exec/gdbstub.h" +static int ppc_gdb_register_len_apple(int n) +{ + switch (n) { + case 0 ... 31: + /* gprs */ + return 8; + case 32 ... 63: + /* fprs */ + return 8; + case 64 ... 95: + return 16; + case 64+32: /* nip */ + case 65+32: /* msr */ + case 67+32: /* lr */ + case 68+32: /* ctr */ + case 69+32: /* xer */ + case 70+32: /* fpscr */ + return 8; + case 66+32: /* cr */ + return 4; + default: + return 0; + } +} + static int ppc_gdb_register_len(int n) { switch (n) { @@ -132,6 +157,65 @@ int ppc_cpu_gdb_read_register(CPUState *cs, uint8_t *mem_buf, int n) return r; } +int ppc_cpu_gdb_read_register_apple(CPUState *cs, uint8_t *mem_buf, int n) +{ + PowerPCCPU *cpu = POWERPC_CPU(cs); + CPUPPCState *env = &cpu->env; + int r = ppc_gdb_register_len_apple(n); + + if (!r) { + return r; + } + + if (n < 32) { + /* gprs */ + gdb_get_reg64(mem_buf, env->gpr[n]); + } else if (n < 64) { + /* fprs */ + stfq_p(mem_buf, env->fpr[n-32]); + } else if (n < 96) { + /* Altivec */ + stq_p(mem_buf, n - 64); + stq_p(mem_buf + 8, 0); + } else { + switch (n) { + case 64 + 32: + gdb_get_reg64(mem_buf, env->nip); + break; + case 65 + 32: + gdb_get_reg64(mem_buf, env->msr); + break; + case 66 + 32: + { + uint32_t cr = 0; + int i; + for (i = 0; i < 8; i++) { + cr |= env->crf[i] << (32 - ((i + 1) * 4)); + } + gdb_get_reg32(mem_buf, cr); + break; + } + case 67 + 32: + gdb_get_reg64(mem_buf, env->lr); + break; + case 68 + 32: + gdb_get_reg64(mem_buf, env->ctr); + break; + case 69 + 32: + gdb_get_reg64(mem_buf, env->xer); + break; + case 70 + 32: + gdb_get_reg64(mem_buf, env->fpscr); + break; + } + } + if (msr_le) { + /* If cpu is in LE mode, convert memory contents to LE. */ + ppc_gdb_swap_register(mem_buf, n, r); + } + return r; +} + int ppc_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) { PowerPCCPU *cpu = POWERPC_CPU(cs); @@ -185,3 +269,56 @@ int ppc_cpu_gdb_write_register(CPUState *cs, uint8_t *mem_buf, int n) } return r; } +int ppc_cpu_gdb_write_register_apple(CPUState *cs, uint8_t *mem_buf, int n) +{ + PowerPCCPU *cpu = POWERPC_CPU(cs); + CPUPPCState *env = &cpu->env; + int r = ppc_gdb_register_len_apple(n); + + if (!r) { + return r; + } + if (msr_le) { + /* If cpu is in LE mode, convert memory contents to LE. */ + ppc_gdb_swap_register(mem_buf, n, r); + } + if (n < 32) { + /* gprs */ + env->gpr[n] = ldq_p(mem_buf); + } else if (n < 64) { + /* fprs */ + env->fpr[n-32] = ldfq_p(mem_buf); + } else { + switch (n) { + case 64 + 32: + env->nip = ldq_p(mem_buf); + break; + case 65 + 32: + ppc_store_msr(env, ldq_p(mem_buf)); + break; + case 66 + 32: + { + uint32_t cr = ldl_p(mem_buf); + int i; + for (i = 0; i < 8; i++) { + env->crf[i] = (cr >> (32 - ((i + 1) * 4))) & 0xF; + } + break; + } + case 67 + 32: + env->lr = ldq_p(mem_buf); + break; + case 68 + 32: + env->ctr = ldq_p(mem_buf); + break; + case 69 + 32: + env->xer = ldq_p(mem_buf); + break; + case 70 + 32: + /* fpscr */ + store_fpscr(env, ldq_p(mem_buf), 0xffffffff); + break; + } + } + return r; +} diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 561f8ccf2f..2d87108d8b 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -63,6 +63,7 @@ static int cap_ppc_smt; static int cap_ppc_rma; static int cap_spapr_tce; static int cap_spapr_multitce; +static int cap_spapr_vfio; static int cap_hior; static int cap_one_reg; static int cap_epr; @@ -101,6 +102,7 @@ int kvm_arch_init(KVMState *s) cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA); cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE); cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE); + cap_spapr_vfio = false; cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG); cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR); cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR); @@ -1660,7 +1662,8 @@ bool kvmppc_spapr_use_multitce(void) return cap_spapr_multitce; } -void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd) +void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd, + bool vfio_accel) { struct kvm_create_spapr_tce args = { .liobn = liobn, @@ -1674,7 +1677,7 @@ void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd) * destroying the table, which the upper layers -will- do */ *pfd = -1; - if (!cap_spapr_tce) { + if (!cap_spapr_tce || (vfio_accel && !cap_spapr_vfio)) { return NULL; } diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h index 412cc7f3c1..1118122d89 100644 --- a/target-ppc/kvm_ppc.h +++ b/target-ppc/kvm_ppc.h @@ -33,7 +33,8 @@ int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu); #ifndef CONFIG_USER_ONLY off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem); bool kvmppc_spapr_use_multitce(void); -void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd); +void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd, + bool vfio_accel); int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size); int kvmppc_reset_htab(int shift_hint); uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift); @@ -144,7 +145,8 @@ static inline bool kvmppc_spapr_use_multitce(void) } static inline void *kvmppc_create_spapr_tce(uint32_t liobn, - uint32_t window_size, int *fd) + uint32_t window_size, int *fd, + bool vfio_accel) { return NULL; } diff --git a/target-ppc/translate.c b/target-ppc/translate.c index 48017219a4..b23933f7bd 100644 --- a/target-ppc/translate.c +++ b/target-ppc/translate.c @@ -426,7 +426,6 @@ static inline uint32_t SPR(uint32_t opcode) return ((sprn >> 5) & 0x1F) | ((sprn & 0x1F) << 5); } /*** Get constants ***/ -EXTRACT_HELPER(IMM, 12, 8); /* 16 bits signed immediate value */ EXTRACT_SHELPER(SIMM, 0, 16); /* 16 bits unsigned immediate value */ @@ -459,8 +458,6 @@ EXTRACT_HELPER(FPFLM, 17, 8); EXTRACT_HELPER(FPW, 16, 1); /*** Jump target decoding ***/ -/* Displacement */ -EXTRACT_SHELPER(d, 0, 16); /* Immediate address */ static inline target_ulong LI(uint32_t opcode) { @@ -2665,11 +2662,6 @@ static inline void gen_qemu_ld8u(DisasContext *ctx, TCGv arg1, TCGv arg2) tcg_gen_qemu_ld8u(arg1, arg2, ctx->mem_idx); } -static inline void gen_qemu_ld8s(DisasContext *ctx, TCGv arg1, TCGv arg2) -{ - tcg_gen_qemu_ld8s(arg1, arg2, ctx->mem_idx); -} - static inline void gen_qemu_ld16u(DisasContext *ctx, TCGv arg1, TCGv arg2) { TCGMemOp op = MO_UW | ctx->default_tcg_memop_mask; @@ -4123,8 +4115,9 @@ static void gen_mcrxr(DisasContext *ctx) tcg_gen_trunc_tl_i32(t0, cpu_so); tcg_gen_trunc_tl_i32(t1, cpu_ov); tcg_gen_trunc_tl_i32(dst, cpu_ca); - tcg_gen_shri_i32(t0, t0, 2); - tcg_gen_shri_i32(t1, t1, 1); + tcg_gen_shli_i32(t0, t0, 3); + tcg_gen_shli_i32(t1, t1, 2); + tcg_gen_shli_i32(dst, dst, 1); tcg_gen_or_i32(dst, dst, t0); tcg_gen_or_i32(dst, dst, t1); tcg_temp_free_i32(t0); diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c index 85581c9537..a3bb336e16 100644 --- a/target-ppc/translate_init.c +++ b/target-ppc/translate_init.c @@ -34,6 +34,7 @@ //#define PPC_DUMP_CPU //#define PPC_DEBUG_SPR //#define PPC_DUMP_SPR_ACCESSES +/* #define USE_APPLE_GDB */ /* For user-mode emulation, we don't emulate any IRQ controller */ #if defined(CONFIG_USER_ONLY) @@ -8188,16 +8189,16 @@ static void init_proc_POWER8(CPUPPCState *env) init_proc_book3s_64(env, BOOK3S_CPU_POWER8); } -POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data) +POWERPC_FAMILY(POWER8E)(ObjectClass *oc, void *data) { DeviceClass *dc = DEVICE_CLASS(oc); PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc); dc->fw_name = "PowerPC,POWER8"; - dc->desc = "POWER8"; + dc->desc = "POWER8E"; dc->props = powerpc_servercpu_properties; - pcc->pvr = CPU_POWERPC_POWER8_BASE; - pcc->pvr_mask = CPU_POWERPC_POWER8_MASK; + pcc->pvr = CPU_POWERPC_POWER8E_BASE; + pcc->pvr_mask = CPU_POWERPC_POWER8E_MASK; pcc->pcr_mask = PCR_COMPAT_2_05 | PCR_COMPAT_2_06; pcc->init_proc = init_proc_POWER8; pcc->check_pow = check_pow_nocheck; @@ -8251,6 +8252,18 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data) pcc->l1_icache_size = 0x8000; pcc->interrupts_big_endian = ppc_cpu_interrupts_big_endian_lpcr; } + +POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc); + + ppc_POWER8E_cpu_family_class_init(oc, data); + + dc->desc = "POWER8"; + pcc->pvr = CPU_POWERPC_POWER8_BASE; + pcc->pvr_mask = CPU_POWERPC_POWER8_MASK; +} #endif /* defined (TARGET_PPC64) */ @@ -9667,6 +9680,13 @@ static void ppc_cpu_class_init(ObjectClass *oc, void *data) #endif cc->gdb_num_core_regs = 71; + +#ifdef USE_APPLE_GDB + cc->gdb_read_register = ppc_cpu_gdb_read_register_apple; + cc->gdb_write_register = ppc_cpu_gdb_write_register_apple; + cc->gdb_num_core_regs = 71 + 32; +#endif + #if defined(TARGET_PPC64) cc->gdb_core_xml_file = "power64-core.xml"; #else diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c index c83fd9fdc0..203027eb3e 100644 --- a/tcg/ppc/tcg-target.c +++ b/tcg/ppc/tcg-target.c @@ -805,7 +805,10 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt, /* For unaligned, or very large offsets, use the indexed form. */ if (offset & align || offset != (int32_t)offset) { - tcg_debug_assert(rs != base && (!is_store || rs != rt)); + if (rs == base) { + rs = TCG_REG_R0; + } + tcg_debug_assert(!is_store || rs != rt); tcg_out_movi(s, TCG_TYPE_PTR, rs, orig); tcg_out32(s, opx | TAB(rt, base, rs)); return; @@ -1716,6 +1719,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64) # define LINK_AREA_SIZE (6 * SZR) # define LR_OFFSET (1 * SZR) # define TCG_TARGET_CALL_STACK_OFFSET (LINK_AREA_SIZE + 8 * SZR) +#elif defined(TCG_TARGET_CALL_DARWIN) +# define LINK_AREA_SIZE (6 * SZR) +# define LR_OFFSET (2 * SZR) #elif TCG_TARGET_REG_BITS == 64 # if defined(_CALL_ELF) && _CALL_ELF == 2 # define LINK_AREA_SIZE (4 * SZR) @@ -1725,9 +1731,6 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is_64) # if defined(_CALL_SYSV) # define LINK_AREA_SIZE (2 * SZR) # define LR_OFFSET (1 * SZR) -# elif defined(TCG_TARGET_CALL_DARWIN) -# define LINK_AREA_SIZE 24 -# define LR_OFFSET 8 # endif #endif #ifndef LR_OFFSET diff --git a/tests/qapi-schema/event-nest-struct.err b/tests/qapi-schema/event-nest-struct.err index e4a0faac9c..91bde1c967 100644 --- a/tests/qapi-schema/event-nest-struct.err +++ b/tests/qapi-schema/event-nest-struct.err @@ -1 +1 @@ -tests/qapi-schema/event-nest-struct.json:1: Nested structure define in event is not supported now, event 'EVENT_A', argname 'a' +tests/qapi-schema/event-nest-struct.json:1: Nested structure define in event is not supported, event 'EVENT_A', argname 'a' diff --git a/trace-events b/trace-events index ba01ad52cf..d071b97dd7 100644 --- a/trace-events +++ b/trace-events @@ -1169,12 +1169,13 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride, qxl_render_update_area_done(void *cookie) "%p" # hw/ppc/spapr_pci.c -spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)" +spapr_pci_msi(const char *msg, uint32_t ca) "%s (cfg=%x)" spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64 -spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u" +spapr_pci_rtas_ibm_change_msi(unsigned cfg, unsigned func, unsigned req, unsigned first) "cfgaddr %x func %u, requested %u, first irq %u" spapr_pci_rtas_ibm_query_interrupt_source_number(unsigned ioa, unsigned intr) "queries for #%u, IRQ%u" spapr_pci_msi_write(uint64_t addr, uint64_t data, uint32_t dt_irq) "@%"PRIx64"<=%"PRIx64" IRQ %u" spapr_pci_lsi_set(const char *busname, int pin, uint32_t irq) "%s PIN%d IRQ %u" +spapr_pci_msi_retry(unsigned config_addr, unsigned req_num, unsigned max_irqs) "Guest device at %x asked %u, have only %u" # hw/intc/xics.c xics_icp_check_ipi(int server, uint8_t mfrr) "CPU %d can take IPI mfrr=%#x" @@ -1188,6 +1189,12 @@ xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi: srcno %d [irq %#x]" xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_xive: irq %#x [src %d] server %#x prio %#x" xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]" xics_ics_eoi(int nr) "ics_eoi: irq %#x" +xics_alloc(int src, int irq) "source#%d, irq %d" +xics_alloc_failed_hint(int src, int irq) "source#%d, irq %d is already in use" +xics_alloc_failed_no_left(int src) "source#%d, no irq left" +xics_alloc_block(int src, int first, int num, bool lsi, int align) "source#%d, first irq %d, %d irqs, lsi=%d, alignnum %d" +xics_ics_free(int src, int irq, int num) "Source#%d, first irq %d, %d irqs" +xics_ics_free_warn(int src, int irq) "Source#%d, irq %d is already free" # hw/ppc/spapr.c spapr_cas_failed(unsigned long n) "DT diff buffer is too small: %ld bytes" @@ -43,11 +43,18 @@ static int vmstate_size(void *opaque, VMStateField *field) return size; } -static void *vmstate_base_addr(void *opaque, VMStateField *field) +static void *vmstate_base_addr(void *opaque, VMStateField *field, bool alloc) { void *base_addr = opaque + field->offset; if (field->flags & VMS_POINTER) { + if (alloc && (field->flags & VMS_ALLOC)) { + int n_elems = vmstate_n_elems(opaque, field); + if (n_elems) { + gsize size = n_elems * field->size; + *((void **)base_addr + field->start) = g_malloc(size); + } + } base_addr = *(void **)base_addr + field->start; } @@ -81,7 +88,7 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, field->field_exists(opaque, version_id)) || (!field->field_exists && field->version_id <= version_id)) { - void *base_addr = vmstate_base_addr(opaque, field); + void *base_addr = vmstate_base_addr(opaque, field, true); int i, n_elems = vmstate_n_elems(opaque, field); int size = vmstate_size(opaque, field); @@ -135,7 +142,7 @@ void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd, while (field->name) { if (!field->field_exists || field->field_exists(opaque, vmsd->version_id)) { - void *base_addr = vmstate_base_addr(opaque, field); + void *base_addr = vmstate_base_addr(opaque, field, false); int i, n_elems = vmstate_n_elems(opaque, field); int size = vmstate_size(opaque, field); |