author    Richard Henderson <richard.henderson@linaro.org>  2022-06-23 14:52:30 -0700
committer Richard Henderson <richard.henderson@linaro.org>  2022-06-23 14:52:30 -0700
commit    3a821c52e1a30ecd9a436f2c67cc66b5628c829f
tree      57873bbe6aa3ddb6cc420350ed29e1aa438c14cd
parent    7db86fe2ed220c196061824e652b94e7a2acbabf
parent    98836e8e012a959ec515c041e4fdd7f2ae87ae16
Merge tag 'nvme-next-pull-request' of git://git.infradead.org/qemu-nvme into staging
hw/nvme updates

- sriov functionality
- odd fixes

# -----BEGIN PGP SIGNATURE-----
#
# iQEzBAABCAAdFiEEUigzqnXi3OaiR2bATeGvMW1PDekFAmK02wUACgkQTeGvMW1P
# DenNPwgAwhQCXXacTb+6vEdxN30QoWygzQj5BLm//SiXlj7hBX7P/JqCxYF5vUDU
# EaZkl4n3ry5T1xqlUWIBFdIAmKyrsWz2eKTrX41g64i/L+/nfJXZ+IgQc3WkM/FK
# 5NwwAE8q/JGiRczLesF/9QvQq/90L6QtyC48bsS8AIcl5IcqHCKGwEJS7LErltex
# YZDJyTNU4wB2XFophylJUL43GrHa/kUFA2ZHgs9iuH0p5LGG6UM3KoinBKcbwn47
# iEWKccvsHSyfE8VpJJS5STMEeGGaBPziZ654ElLmzVq6EXDKMCoX03naQ9Q8oSpl
# FiktbxllCYdmECb44PNBEd/nLdpCdQ==
# =o54a
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 23 Jun 2022 02:28:37 PM PDT
# gpg:                using RSA key 522833AA75E2DCE6A24766C04DE1AF316D4F0DE9
# gpg: Good signature from "Klaus Jensen <its@irrelevant.dk>" [unknown]
# gpg:                 aka "Klaus Jensen <k.jensen@samsung.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DDCA 4D9C 9EF9 31CC 3468 4272 63D5 6FC5 E55D A838
#      Subkey fingerprint: 5228 33AA 75E2 DCE6 A247 66C0 4DE1 AF31 6D4F 0DE9

* tag 'nvme-next-pull-request' of git://git.infradead.org/qemu-nvme:
  hw/nvme: clear aen mask on reset
  Revert "hw/block/nvme: add support for sgl bit bucket descriptor"
  hw/nvme: clean up CC register write logic
  hw/acpi: Make the PCI hot-plug aware of SR-IOV
  hw/nvme: Update the initalization place for the AER queue
  docs: Add documentation for SR-IOV and Virtualization Enhancements
  hw/nvme: Add support for the Virtualization Management command
  hw/nvme: Initialize capability structures for primary/secondary controllers
  hw/nvme: Calculate BAR attributes in a function
  hw/nvme: Remove reg_size variable and update BAR0 size calculation
  hw/nvme: Make max_ioqpairs and msix_qsize configurable in runtime
  hw/nvme: Implement the Function Level Reset
  hw/nvme: Add support for Secondary Controller List
  hw/nvme: Add support for Primary Controller Capabilities
  hw/nvme: Add support for SR-IOV

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r--  docs/system/devices/nvme.rst |  82
-rw-r--r--  hw/acpi/pcihp.c              |   6
-rw-r--r--  hw/nvme/ctrl.c               | 741
-rw-r--r--  hw/nvme/ns.c                 |   2
-rw-r--r--  hw/nvme/nvme.h               |  55
-rw-r--r--  hw/nvme/subsys.c             |  75
-rw-r--r--  hw/nvme/trace-events         |   6
-rw-r--r--  include/block/nvme.h         |  65
-rw-r--r--  include/hw/pci/pci_ids.h     |   1
9 files changed, 932 insertions(+), 101 deletions(-)
diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index b5acb2a9c1..aba253304e 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -239,3 +239,85 @@ The virtual namespace device supports DIF- and DIX-based protection information
to ``1`` to transfer protection information as the first eight bytes of
metadata. Otherwise, the protection information is transferred as the last
eight bytes.
+
+Virtualization Enhancements and SR-IOV (Experimental Support)
+-------------------------------------------------------------
+
+The ``nvme`` device supports Single Root I/O Virtualization and Sharing
+along with Virtualization Enhancements. The controller has to be linked to
+an NVM Subsystem device (``nvme-subsys``) for use with SR-IOV.
+
+A number of parameters are present (**please note that they may be
+subject to change**):
+
+``sriov_max_vfs`` (default: ``0``)
+ Indicates the maximum number of PCIe virtual functions supported
+ by the controller. Specifying a non-zero value enables reporting of both
+ SR-IOV and ARI (Alternative Routing-ID Interpretation) capabilities
+ by the NVMe device. Virtual function controllers will not report SR-IOV.
+
+``sriov_vq_flexible``
+ Indicates the total number of flexible queue resources assignable to all
+ the secondary controllers. Implicitly sets the number of primary
+ controller's private resources to ``(max_ioqpairs - sriov_vq_flexible)``.
+
+``sriov_vi_flexible``
+ Indicates the total number of flexible interrupt resources assignable to
+ all the secondary controllers. Implicitly sets the number of primary
+ controller's private resources to ``(msix_qsize - sriov_vi_flexible)``.
+
+``sriov_max_vi_per_vf`` (default: ``0``)
+ Indicates the maximum number of virtual interrupt resources assignable
+ to a secondary controller. The default ``0`` resolves to
+ ``(sriov_vi_flexible / sriov_max_vfs)``.
+
+``sriov_max_vq_per_vf`` (default: ``0``)
+ Indicates the maximum number of virtual queue resources assignable to
+ a secondary controller. The default ``0`` resolves to
+ ``(sriov_vq_flexible / sriov_max_vfs)``.
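+
+As an example of how these parameters interact (the values are
+illustrative): with ``max_ioqpairs=64``, ``msix_qsize=65``,
+``sriov_max_vfs=4``, ``sriov_vq_flexible=8`` and ``sriov_vi_flexible=4``,
+the primary controller keeps ``64 - 8 = 56`` private queue pairs and
+``65 - 4 = 61`` private interrupt resources, while each secondary
+controller may be assigned at most ``8 / 4 = 2`` queue and ``4 / 4 = 1``
+interrupt resources (the resolved defaults of ``sriov_max_vq_per_vf`` and
+``sriov_max_vi_per_vf``).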
+
+The simplest possible invocation enables the capability to set up one VF
+controller and assign an admin queue, an IO queue, and an MSI-X interrupt.
+
+.. code-block:: console
+
+ -device nvme-subsys,id=subsys0
+ -device nvme,serial=deadbeef,subsys=subsys0,sriov_max_vfs=1,
+ sriov_vq_flexible=2,sriov_vi_flexible=1
+
+The minimum steps required to configure a functional NVMe secondary
+controller are:
+
+ * unbind flexible resources from the primary controller
+
+.. code-block:: console
+
+ nvme virt-mgmt /dev/nvme0 -c 0 -r 1 -a 1 -n 0
+ nvme virt-mgmt /dev/nvme0 -c 0 -r 0 -a 1 -n 0
+
+ * perform a Function Level Reset on the primary controller to actually
+ release the resources
+
+.. code-block:: console
+
+ echo 1 > /sys/bus/pci/devices/0000:01:00.0/reset
+
+ * enable VF
+
+.. code-block:: console
+
+ echo 1 > /sys/bus/pci/devices/0000:01:00.0/sriov_numvfs
+
+ * assign the flexible resources to the VF and set it ONLINE
+
+.. code-block:: console
+
+ nvme virt-mgmt /dev/nvme0 -c 1 -r 1 -a 8 -n 1
+ nvme virt-mgmt /dev/nvme0 -c 1 -r 0 -a 8 -n 2
+ nvme virt-mgmt /dev/nvme0 -c 1 -r 0 -a 9 -n 0
+
+ * bind the NVMe driver to the VF
+
+.. code-block:: console
+
+ echo 0000:01:00.1 > /sys/bus/pci/drivers/nvme/bind
\ No newline at end of file
diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
index bf65bbea49..84d75e6b84 100644
--- a/hw/acpi/pcihp.c
+++ b/hw/acpi/pcihp.c
@@ -192,8 +192,12 @@ static bool acpi_pcihp_pc_no_hotplug(AcpiPciHpState *s, PCIDevice *dev)
* ACPI doesn't allow hotplug of bridge devices. Don't allow
* hot-unplug of bridge devices unless they were added by hotplug
* (and so, not described by acpi).
+ *
+ * Don't allow hot-unplug of SR-IOV Virtual Functions, as they
+ * will be removed implicitly when the Physical Function is unplugged.
*/
- return (pc->is_bridge && !dev->qdev.hotplugged) || !dc->hotpluggable;
+ return (pc->is_bridge && !dev->qdev.hotplugged) || !dc->hotpluggable ||
+ pci_is_vf(dev);
}
static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slots)
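The pci_is_vf() predicate used above is what makes the hot-plug code SR-IOV
aware. A minimal sketch of it, assuming QEMU's convention that a virtual
function carries a back-pointer to its physical function in
PCIDevice.exp.sriov_vf.pf (the real definition lives in the PCI headers):

    /* Sketch: a device is a VF iff pcie_sriov created it and recorded
     * its parent physical function. */
    static inline int pci_is_vf(PCIDevice *d)
    {
        return d->exp.sriov_vf.pf != NULL;
    }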
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 1e6e0fcad9..d349b3e426 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -35,6 +35,11 @@
* mdts=<N[optional]>,vsl=<N[optional]>, \
* zoned.zasl=<N[optional]>, \
* zoned.auto_transition=<on|off[optional]>, \
+ * sriov_max_vfs=<N[optional]> \
+ * sriov_vq_flexible=<N[optional]> \
+ * sriov_vi_flexible=<N[optional]> \
+ * sriov_max_vi_per_vf=<N[optional]> \
+ * sriov_max_vq_per_vf=<N[optional]> \
* subsys=<subsys_id>
* -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
* zoned=<true|false[optional]>, \
@@ -106,6 +111,35 @@
* transitioned to zone state closed for resource management purposes.
* Defaults to 'on'.
*
+ * - `sriov_max_vfs`
+ * Indicates the maximum number of PCIe virtual functions supported
+ * by the controller. The default value is 0. Specifying a non-zero value
+ * enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
+ * Virtual function controllers will not report SR-IOV capability.
+ *
+ * NOTE: Single Root I/O Virtualization support is experimental.
+ * All the related parameters may be subject to change.
+ *
+ * - `sriov_vq_flexible`
+ * Indicates the total number of flexible queue resources assignable to all
+ * the secondary controllers. Implicitly sets the number of primary
+ * controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
+ *
+ * - `sriov_vi_flexible`
+ * Indicates the total number of flexible interrupt resources assignable to
+ * all the secondary controllers. Implicitly sets the number of primary
+ * controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
+ *
+ * - `sriov_max_vi_per_vf`
+ * Indicates the maximum number of virtual interrupt resources assignable
+ * to a secondary controller. The default 0 resolves to
+ * `(sriov_vi_flexible / sriov_max_vfs)`.
+ *
+ * - `sriov_max_vq_per_vf`
+ * Indicates the maximum number of virtual queue resources assignable to
+ * a secondary controller. The default 0 resolves to
+ * `(sriov_vq_flexible / sriov_max_vfs)`.
+ *
* nvme namespace device parameters
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* - `shared`
@@ -154,12 +188,14 @@
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
+#include "qemu/range.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
+#include "hw/pci/pcie_sriov.h"
#include "migration/vmstate.h"
#include "nvme.h"
@@ -176,6 +212,10 @@
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+#define NVME_MAX_VFS 127
+#define NVME_VF_RES_GRANULARITY 1
+#define NVME_VF_OFFSET 0x1
+#define NVME_VF_STRIDE 1
#define NVME_GUEST_ERR(trace, fmt, ...) \
do { \
@@ -223,6 +263,7 @@ static const uint32_t nvme_cse_acs[256] = {
[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
[NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
+ [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
[NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};
@@ -254,6 +295,7 @@ static const uint32_t nvme_cse_iocs_zoned[256] = {
};
static void nvme_process_sq(void *opaque);
+static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
static uint16_t nvme_sqid(NvmeRequest *req)
{
@@ -437,12 +479,12 @@ static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
- return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
+ return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}
static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
- return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
+ return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}
static void nvme_inc_cq_tail(NvmeCQueue *cq)
@@ -808,10 +850,6 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
uint8_t type = NVME_SGL_TYPE(segment[i].type);
switch (type) {
- case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
- if (cmd->opcode == NVME_CMD_WRITE) {
- continue;
- }
case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
break;
case NVME_SGL_DESCR_TYPE_SEGMENT:
@@ -844,10 +882,6 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
trans_len = MIN(*len, dlen);
- if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
- goto next;
- }
-
addr = le64_to_cpu(segment[i].addr);
if (UINT64_MAX - addr < dlen) {
@@ -859,7 +893,6 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
return status;
}
-next:
*len -= trans_len;
}
@@ -917,8 +950,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
seg_len = le32_to_cpu(sgld->len);
/* check the length of the (Last) Segment descriptor */
- if ((!seg_len || seg_len & 0xf) &&
- (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
+ if (!seg_len || seg_len & 0xf) {
return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
}
@@ -956,26 +988,20 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
last_sgld = &segment[nsgld - 1];
/*
- * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
- * then we are done.
+ * If the segment ends with a Data Block, then we are done.
*/
- switch (NVME_SGL_TYPE(last_sgld->type)) {
- case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
- case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
+ if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
if (status) {
goto unmap;
}
goto out;
-
- default:
- break;
}
/*
- * If the last descriptor was not a Data Block or Bit Bucket, then the
- * current segment must not be a Last Segment.
+ * If the last descriptor was not a Data Block, then the current
+ * segment must not be a Last Segment.
*/
if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
@@ -4284,8 +4310,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
return NVME_INVALID_CQID | NVME_DNR;
}
- if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
- n->sq[sqid] != NULL)) {
+ if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
return NVME_INVALID_QID | NVME_DNR;
}
@@ -4637,8 +4662,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
NVME_CQ_FLAGS_IEN(qflags) != 0);
- if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
- n->cq[cqid] != NULL)) {
+ if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
return NVME_INVALID_QID | NVME_DNR;
}
@@ -4654,7 +4678,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
trace_pci_nvme_err_invalid_create_cq_vector(vector);
return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
}
- if (unlikely(vector >= n->params.msix_qsize)) {
+ if (unlikely(vector >= n->conf_msix_qsize)) {
trace_pci_nvme_err_invalid_create_cq_vector(vector);
return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
}
@@ -4793,6 +4817,37 @@ static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
}
+static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
+{
+ trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
+
+ return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
+ sizeof(NvmePriCtrlCap), req);
+}
+
+static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+ uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
+ uint16_t min_id = le16_to_cpu(c->ctrlid);
+ uint8_t num_sec_ctrl = n->sec_ctrl_list.numcntl;
+ NvmeSecCtrlList list = {0};
+ uint8_t i;
+
+ for (i = 0; i < num_sec_ctrl; i++) {
+ if (n->sec_ctrl_list.sec[i].scid >= min_id) {
+ list.numcntl = num_sec_ctrl - i;
+ memcpy(&list.sec, n->sec_ctrl_list.sec + i,
+ list.numcntl * sizeof(NvmeSecCtrlEntry));
+ break;
+ }
+ }
+
+ trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
+
+ return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
+}
+
static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
bool active)
{
@@ -5009,6 +5064,10 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
return nvme_identify_ctrl_list(n, req, true);
case NVME_ID_CNS_CTRL_LIST:
return nvme_identify_ctrl_list(n, req, false);
+ case NVME_ID_CNS_PRIMARY_CTRL_CAP:
+ return nvme_identify_pri_ctrl_cap(n, req);
+ case NVME_ID_CNS_SECONDARY_CTRL_LIST:
+ return nvme_identify_sec_ctrl_list(n, req);
case NVME_ID_CNS_CS_NS:
return nvme_identify_ns_csi(n, req, true);
case NVME_ID_CNS_CS_NS_PRESENT:
@@ -5217,13 +5276,12 @@ defaults:
break;
case NVME_NUMBER_OF_QUEUES:
- result = (n->params.max_ioqpairs - 1) |
- ((n->params.max_ioqpairs - 1) << 16);
+ result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
trace_pci_nvme_getfeat_numq(result);
break;
case NVME_INTERRUPT_VECTOR_CONF:
iv = dw11 & 0xffff;
- if (iv >= n->params.max_ioqpairs + 1) {
+ if (iv >= n->conf_ioqpairs + 1) {
return NVME_INVALID_FIELD | NVME_DNR;
}
@@ -5379,10 +5437,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
((dw11 >> 16) & 0xffff) + 1,
- n->params.max_ioqpairs,
- n->params.max_ioqpairs);
- req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
- ((n->params.max_ioqpairs - 1) << 16));
+ n->conf_ioqpairs,
+ n->conf_ioqpairs);
+ req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
+ ((n->conf_ioqpairs - 1) << 16));
break;
case NVME_ASYNCHRONOUS_EVENT_CONF:
n->features.async_config = dw11;
@@ -5769,6 +5827,167 @@ out:
return status;
}
+static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
+ int *num_prim, int *num_sec)
+{
+ *num_total = le32_to_cpu(rt ?
+ n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
+ *num_prim = le16_to_cpu(rt ?
+ n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
+ *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
+}
+
+static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
+ uint16_t cntlid, uint8_t rt,
+ int nr)
+{
+ int num_total, num_prim, num_sec;
+
+ if (cntlid != n->cntlid) {
+ return NVME_INVALID_CTRL_ID | NVME_DNR;
+ }
+
+ nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
+
+ if (nr > num_total) {
+ return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
+ }
+
+ if (nr > num_total - num_sec) {
+ return NVME_INVALID_RESOURCE_ID | NVME_DNR;
+ }
+
+ if (rt) {
+ n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
+ } else {
+ n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
+ }
+
+ req->cqe.result = cpu_to_le32(nr);
+ return req->status;
+}
+
+static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
+ uint8_t rt, int nr)
+{
+ int prev_nr, prev_total;
+
+ if (rt) {
+ prev_nr = le16_to_cpu(sctrl->nvi);
+ prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
+ sctrl->nvi = cpu_to_le16(nr);
+ n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
+ } else {
+ prev_nr = le16_to_cpu(sctrl->nvq);
+ prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
+ sctrl->nvq = cpu_to_le16(nr);
+ n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
+ }
+}
+
+static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
+ uint16_t cntlid, uint8_t rt, int nr)
+{
+ int num_total, num_prim, num_sec, num_free, diff, limit;
+ NvmeSecCtrlEntry *sctrl;
+
+ sctrl = nvme_sctrl_for_cntlid(n, cntlid);
+ if (!sctrl) {
+ return NVME_INVALID_CTRL_ID | NVME_DNR;
+ }
+
+ if (sctrl->scs) {
+ return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
+ }
+
+ limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
+ if (nr > limit) {
+ return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
+ }
+
+ nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
+ num_free = num_total - num_prim - num_sec;
+ diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
+
+ if (diff > num_free) {
+ return NVME_INVALID_RESOURCE_ID | NVME_DNR;
+ }
+
+ nvme_update_virt_res(n, sctrl, rt, nr);
+ req->cqe.result = cpu_to_le32(nr);
+
+ return req->status;
+}
+
+static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
+{
+ NvmeCtrl *sn = NULL;
+ NvmeSecCtrlEntry *sctrl;
+ int vf_index;
+
+ sctrl = nvme_sctrl_for_cntlid(n, cntlid);
+ if (!sctrl) {
+ return NVME_INVALID_CTRL_ID | NVME_DNR;
+ }
+
+ if (!pci_is_vf(&n->parent_obj)) {
+ vf_index = le16_to_cpu(sctrl->vfn) - 1;
+ sn = NVME(pcie_sriov_get_vf_at_index(&n->parent_obj, vf_index));
+ }
+
+ if (online) {
+ if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
+ return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
+ }
+
+ if (!sctrl->scs) {
+ sctrl->scs = 0x1;
+ nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
+ }
+ } else {
+ nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
+ nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
+
+ if (sctrl->scs) {
+ sctrl->scs = 0x0;
+ if (sn) {
+ nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
+ }
+ }
+ }
+
+ return NVME_SUCCESS;
+}
+
+static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
+{
+ uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+ uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
+ uint8_t act = dw10 & 0xf;
+ uint8_t rt = (dw10 >> 8) & 0x7;
+ uint16_t cntlid = (dw10 >> 16) & 0xffff;
+ int nr = dw11 & 0xffff;
+
+ trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
+
+ if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
+ return NVME_INVALID_RESOURCE_ID | NVME_DNR;
+ }
+
+ switch (act) {
+ case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
+ return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
+ case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
+ return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
+ case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
+ return nvme_virt_set_state(n, cntlid, true);
+ case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
+ return nvme_virt_set_state(n, cntlid, false);
+ default:
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+}
+
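For reference, the dword layout decoded by nvme_virt_mngmt() above can be
reproduced host-side. A self-contained sketch (the helper name and example
values are illustrative, not part of the patch) that builds cdw10/cdw11 for
the "nvme virt-mgmt /dev/nvme0 -c 1 -r 0 -a 8 -n 2" step from the
documentation:

    #include <stdint.h>
    #include <stdio.h>

    /* Encode Virtualization Management cdw10/cdw11, mirroring the
     * decoding in nvme_virt_mngmt(): act in bits 3:0, rt in bits 10:8,
     * cntlid in bits 31:16 of cdw10; nr in bits 15:0 of cdw11. */
    static void virt_mngmt_cdws(uint8_t act, uint8_t rt, uint16_t cntlid,
                                uint16_t nr, uint32_t *cdw10, uint32_t *cdw11)
    {
        *cdw10 = (uint32_t)(act & 0xf) | ((uint32_t)(rt & 0x7) << 8) |
                 ((uint32_t)cntlid << 16);
        *cdw11 = nr;
    }

    int main(void)
    {
        uint32_t cdw10, cdw11;

        /* Secondary Controller Assign (action 0x8) of 2 VQ resources
         * (resource type 0) to controller 1. */
        virt_mngmt_cdws(0x08, 0x00, 1, 2, &cdw10, &cdw11);
        printf("cdw10=0x%08x cdw11=0x%08x\n",
               (unsigned)cdw10, (unsigned)cdw11);
        /* prints cdw10=0x00010008 cdw11=0x00000002 */
        return 0;
    }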
static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
{
trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -5811,6 +6030,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
return nvme_aer(n, req);
case NVME_ADM_CMD_NS_ATTACHMENT:
return nvme_ns_attachment(n, req);
+ case NVME_ADM_CMD_VIRT_MNGMT:
+ return nvme_virt_mngmt(n, req);
case NVME_ADM_CMD_FORMAT_NVM:
return nvme_format(n, req);
default:
@@ -5857,8 +6078,48 @@ static void nvme_process_sq(void *opaque)
}
}
-static void nvme_ctrl_reset(NvmeCtrl *n)
+static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
{
+ uint8_t *config;
+
+ if (!msix_present(pci_dev)) {
+ return;
+ }
+
+ assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
+
+ config = pci_dev->config + pci_dev->msix_cap;
+ pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
+ table_size - 1);
+}
+
+static void nvme_activate_virt_res(NvmeCtrl *n)
+{
+ PCIDevice *pci_dev = &n->parent_obj;
+ NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
+ NvmeSecCtrlEntry *sctrl;
+
+ /* -1 to account for the admin queue */
+ if (pci_is_vf(pci_dev)) {
+ sctrl = nvme_sctrl(n);
+ cap->vqprt = sctrl->nvq;
+ cap->viprt = sctrl->nvi;
+ n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
+ n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
+ } else {
+ cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
+ cap->virfap = n->next_pri_ctrl_cap.virfap;
+ n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
+ le16_to_cpu(cap->vqrfap) - 1;
+ n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
+ le16_to_cpu(cap->virfap);
+ }
+}
+
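To make the arithmetic above concrete: for a PF with max_ioqpairs=64 and
sriov_vq_flexible=8, nvme_init_state() below sets vqprt = 1 + 64 - 8 = 57
(the +1 being the admin queue) and initially allocates all flexible
resources to the primary (vqrfap = vqfrt = 8), so the PF branch yields
conf_ioqpairs = 57 + 8 - 1 = 64, i.e. the full original I/O queue count
until some flexible resources are handed over to secondary controllers.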
+static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
+{
+ PCIDevice *pci_dev = &n->parent_obj;
+ NvmeSecCtrlEntry *sctrl;
NvmeNamespace *ns;
int i;
@@ -5888,9 +6149,41 @@ static void nvme_ctrl_reset(NvmeCtrl *n)
g_free(event);
}
+ if (n->params.sriov_max_vfs) {
+ if (!pci_is_vf(pci_dev)) {
+ for (i = 0; i < n->sec_ctrl_list.numcntl; i++) {
+ sctrl = &n->sec_ctrl_list.sec[i];
+ nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
+ }
+
+ if (rst != NVME_RESET_CONTROLLER) {
+ pcie_sriov_pf_disable_vfs(pci_dev);
+ }
+ }
+
+ if (rst != NVME_RESET_CONTROLLER) {
+ nvme_activate_virt_res(n);
+ }
+ }
+
n->aer_queued = 0;
+ n->aer_mask = 0;
n->outstanding_aers = 0;
n->qs_created = false;
+
+ nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
+
+ if (pci_is_vf(pci_dev)) {
+ sctrl = nvme_sctrl(n);
+
+ stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
+ } else {
+ stl_le_p(&n->bar.csts, 0);
+ }
+
+ stl_le_p(&n->bar.intms, 0);
+ stl_le_p(&n->bar.intmc, 0);
+ stl_le_p(&n->bar.cc, 0);
}
static void nvme_ctrl_shutdown(NvmeCtrl *n)
@@ -5936,7 +6229,15 @@ static int nvme_start_ctrl(NvmeCtrl *n)
uint64_t acq = ldq_le_p(&n->bar.acq);
uint32_t page_bits = NVME_CC_MPS(cc) + 12;
uint32_t page_size = 1 << page_bits;
+ NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
+ if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
+ trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
+ le16_to_cpu(sctrl->nvq),
+ sctrl->scs ? "ONLINE" :
+ "OFFLINE");
+ return -1;
+ }
if (unlikely(n->cq[0])) {
trace_pci_nvme_err_startfail_cq();
return -1;
@@ -6017,8 +6318,6 @@ static int nvme_start_ctrl(NvmeCtrl *n)
nvme_set_timestamp(n, 0ULL);
- QTAILQ_INIT(&n->aer_queue);
-
nvme_select_iocs(n);
return 0;
@@ -6096,20 +6395,21 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
nvme_irq_check(n);
break;
case NVME_REG_CC:
+ stl_le_p(&n->bar.cc, data);
+
trace_pci_nvme_mmio_cfg(data & 0xffffffff);
- /* Windows first sends data, then sends enable bit */
- if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
- !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
- {
- cc = data;
+ if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
+ trace_pci_nvme_mmio_shutdown_set();
+ nvme_ctrl_shutdown(n);
+ csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
+ csts |= NVME_CSTS_SHST_COMPLETE;
+ } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
+ trace_pci_nvme_mmio_shutdown_cleared();
+ csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
}
if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
- cc = data;
-
- /* flush CC since nvme_start_ctrl() needs the value */
- stl_le_p(&n->bar.cc, cc);
if (unlikely(nvme_start_ctrl(n))) {
trace_pci_nvme_err_startfail();
csts = NVME_CSTS_FAILED;
@@ -6119,23 +6419,11 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
}
} else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
trace_pci_nvme_mmio_stopped();
- nvme_ctrl_reset(n);
- cc = 0;
- csts &= ~NVME_CSTS_READY;
- }
+ nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
- if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
- trace_pci_nvme_mmio_shutdown_set();
- nvme_ctrl_shutdown(n);
- cc = data;
- csts |= NVME_CSTS_SHST_COMPLETE;
- } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
- trace_pci_nvme_mmio_shutdown_cleared();
- csts &= ~NVME_CSTS_SHST_COMPLETE;
- cc = data;
+ break;
}
- stl_le_p(&n->bar.cc, cc);
stl_le_p(&n->bar.csts, csts);
break;
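The reworked logic keys off edges of CC.EN and CC.SHN instead of caching the
whole register. For orientation, the bit positions come from the NVMe spec
(CC.EN is bit 0, CC.SHN bits 15:14, CSTS.SHST bits 3:2). A tiny
self-contained sketch of the shutdown-edge handling, with simplified macro
names standing in for the QEMU ones:

    #include <stdint.h>
    #include <stdio.h>

    #define CC_EN(cc)     ((cc) & 0x1)          /* CC.EN: bit 0 */
    #define CC_SHN(cc)    (((cc) >> 14) & 0x3)  /* CC.SHN: bits 15:14 */
    #define SHST_SHIFT    2                     /* CSTS.SHST: bits 3:2 */
    #define SHST_MASK     0x3
    #define SHST_COMPLETE 0x2

    int main(void)
    {
        uint32_t cc = 0x1;                 /* enabled, SHN = 0 */
        uint32_t data = cc | (0x1 << 14);  /* guest requests normal shutdown */
        uint32_t csts = 0x1;               /* CSTS.RDY set */

        if (CC_SHN(data) && !CC_SHN(cc)) { /* 0 -> nonzero SHN edge */
            csts &= ~(SHST_MASK << SHST_SHIFT);
            csts |= SHST_COMPLETE << SHST_SHIFT;
        }
        printf("csts=0x%x\n", (unsigned)csts); /* 0x9: RDY + SHST complete */
        return 0;
    }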
@@ -6319,6 +6607,12 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
return 0;
}
+ if (pci_is_vf(&n->parent_obj) && !nvme_sctrl(n)->scs &&
+ addr != NVME_REG_CSTS) {
+ trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
+ return 0;
+ }
+
/*
* When PMRWBM bit 1 is set then read from
* from PMRSTS should ensure prior writes
@@ -6468,6 +6762,12 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
trace_pci_nvme_mmio_write(addr, data, size);
+ if (pci_is_vf(&n->parent_obj) && !nvme_sctrl(n)->scs &&
+ addr != NVME_REG_CSTS) {
+ trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
+ return;
+ }
+
if (addr < sizeof(n->bar)) {
nvme_write_bar(n, addr, data, size);
} else {
@@ -6569,19 +6869,140 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
error_setg(errp, "vsl must be non-zero");
return;
}
+
+ if (params->sriov_max_vfs) {
+ if (!n->subsys) {
+ error_setg(errp, "subsystem is required for the use of SR-IOV");
+ return;
+ }
+
+ if (params->sriov_max_vfs > NVME_MAX_VFS) {
+ error_setg(errp, "sriov_max_vfs must be between 0 and %d",
+ NVME_MAX_VFS);
+ return;
+ }
+
+ if (params->cmb_size_mb) {
+ error_setg(errp, "CMB is not supported with SR-IOV");
+ return;
+ }
+
+ if (n->pmr.dev) {
+ error_setg(errp, "PMR is not supported with SR-IOV");
+ return;
+ }
+
+ if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
+ error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
+ " must be set for the use of SR-IOV");
+ return;
+ }
+
+ if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
+ error_setg(errp, "sriov_vq_flexible must be greater than or equal"
+ " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
+ return;
+ }
+
+ if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
+ error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
+ " greater than or equal to 2");
+ return;
+ }
+
+ if (params->sriov_vi_flexible < params->sriov_max_vfs) {
+ error_setg(errp, "sriov_vi_flexible must be greater than or equal"
+ " to %d (sriov_max_vfs)", params->sriov_max_vfs);
+ return;
+ }
+
+ if (params->msix_qsize < params->sriov_vi_flexible + 1) {
+ error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
+ " greater than or equal to 1");
+ return;
+ }
+
+ if (params->sriov_max_vi_per_vf &&
+ (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
+ error_setg(errp, "sriov_max_vi_per_vf must meet:"
+ " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
+ " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
+ return;
+ }
+
+ if (params->sriov_max_vq_per_vf &&
+ (params->sriov_max_vq_per_vf < 2 ||
+ (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
+ error_setg(errp, "sriov_max_vq_per_vf must meet:"
+ " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
+ " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
+ return;
+ }
+ }
}
static void nvme_init_state(NvmeCtrl *n)
{
- /* add one to max_ioqpairs to account for the admin queue pair */
- n->reg_size = pow2ceil(sizeof(NvmeBar) +
- 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
+ NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
+ NvmeSecCtrlList *list = &n->sec_ctrl_list;
+ NvmeSecCtrlEntry *sctrl;
+ uint8_t max_vfs;
+ int i;
+
+ if (pci_is_vf(&n->parent_obj)) {
+ sctrl = nvme_sctrl(n);
+ max_vfs = 0;
+ n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
+ n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
+ } else {
+ max_vfs = n->params.sriov_max_vfs;
+ n->conf_ioqpairs = n->params.max_ioqpairs;
+ n->conf_msix_qsize = n->params.msix_qsize;
+ }
+
n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
n->temperature = NVME_TEMPERATURE;
n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
+ QTAILQ_INIT(&n->aer_queue);
+
+ list->numcntl = cpu_to_le16(max_vfs);
+ for (i = 0; i < max_vfs; i++) {
+ sctrl = &list->sec[i];
+ sctrl->pcid = cpu_to_le16(n->cntlid);
+ sctrl->vfn = cpu_to_le16(i + 1);
+ }
+
+ cap->cntlid = cpu_to_le16(n->cntlid);
+ cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
+
+ if (pci_is_vf(&n->parent_obj)) {
+ cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
+ } else {
+ cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
+ n->params.sriov_vq_flexible);
+ cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
+ cap->vqrfap = cap->vqfrt;
+ cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
+ cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
+ cpu_to_le16(n->params.sriov_max_vq_per_vf) :
+ cap->vqfrt / MAX(max_vfs, 1);
+ }
+
+ if (pci_is_vf(&n->parent_obj)) {
+ cap->viprt = cpu_to_le16(n->conf_msix_qsize);
+ } else {
+ cap->viprt = cpu_to_le16(n->params.msix_qsize -
+ n->params.sriov_vi_flexible);
+ cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
+ cap->virfap = cap->vifrt;
+ cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
+ cap->vifrsm = n->params.sriov_max_vi_per_vf ?
+ cpu_to_le16(n->params.sriov_max_vi_per_vf) :
+ cap->vifrt / MAX(max_vfs, 1);
+ }
}
static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -6626,10 +7047,77 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
memory_region_set_enabled(&n->pmr.dev->mr, false);
}
+static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs,
+ unsigned *msix_table_offset,
+ unsigned *msix_pba_offset)
+{
+ uint64_t bar_size, msix_table_size, msix_pba_size;
+
+ bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
+ bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+
+ if (msix_table_offset) {
+ *msix_table_offset = bar_size;
+ }
+
+ msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
+ bar_size += msix_table_size;
+ bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+
+ if (msix_pba_offset) {
+ *msix_pba_offset = bar_size;
+ }
+
+ msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8;
+ bar_size += msix_pba_size;
+
+ bar_size = pow2ceil(bar_size);
+ return bar_size;
+}
+
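As a worked example of the layout this helper produces, a self-contained
sketch (assuming hw/nvme's 4-byte doorbell stride NVME_DB_SIZE, the standard
16-byte MSI-X entry size, and the 4 KiB alignment used above) for 65 queues
and 65 interrupt vectors:

    #include <stdint.h>
    #include <stdio.h>

    #define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

    static uint64_t pow2ceil(uint64_t v)
    {
        uint64_t p = 1;
        while (p < v) {
            p <<= 1;
        }
        return p;
    }

    int main(void)
    {
        unsigned total_queues = 65, total_irqs = 65; /* admin + 64 I/O */
        uint64_t bar = 4096 + 2 * total_queues * 4;  /* regs + doorbells: 4616 */

        bar = ALIGN_UP(bar, 4096);           /* 8192 = MSI-X table offset */
        bar += 16 * total_irqs;              /* + table: 9232 */
        bar = ALIGN_UP(bar, 4096);           /* 12288 = PBA offset */
        bar += ALIGN_UP(total_irqs, 64) / 8; /* + PBA: 12304 */

        printf("BAR0 size: %llu\n", (unsigned long long)pow2ceil(bar));
        /* prints 16384 */
        return 0;
    }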
+static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
+{
+ uint16_t vf_dev_id = n->params.use_intel_id ?
+ PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
+ NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
+ uint64_t bar_size = nvme_bar_size(le16_to_cpu(cap->vqfrsm),
+ le16_to_cpu(cap->vifrsm),
+ NULL, NULL);
+
+ pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
+ n->params.sriov_max_vfs, n->params.sriov_max_vfs,
+ NVME_VF_OFFSET, NVME_VF_STRIDE);
+
+ pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
+}
+
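Note that with NVME_VF_OFFSET 0x1 and NVME_VF_STRIDE 1 the VFs occupy
consecutive routing IDs right after the PF, which is why a PF at
0000:01:00.0 exposes its first VF at 0000:01:00.1 in the documentation
above. The VF BAR0 is sized from the per-VF resource maxima (vqfrsm and
vifrsm), so a secondary controller always has room for the largest
assignment it can legally receive.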
+static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
+{
+ Error *err = NULL;
+ int ret;
+
+ ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
+ PCI_PM_SIZEOF, &err);
+ if (err) {
+ error_report_err(err);
+ return ret;
+ }
+
+ pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
+ PCI_PM_CAP_VER_1_2);
+ pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
+ PCI_PM_CTRL_NO_SOFT_RESET);
+ pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
+ PCI_PM_CTRL_STATE_MASK);
+
+ return 0;
+}
+
static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
{
uint8_t *pci_conf = pci_dev->config;
- uint64_t bar_size, msix_table_size, msix_pba_size;
+ uint64_t bar_size;
unsigned msix_table_offset, msix_pba_offset;
int ret;
@@ -6640,34 +7128,35 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
if (n->params.use_intel_id) {
pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
- pci_config_set_device_id(pci_conf, 0x5845);
+ pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
} else {
pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
}
pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
+ nvme_add_pm_capability(pci_dev, 0x60);
pcie_endpoint_cap_init(pci_dev, 0x80);
+ pcie_cap_flr_init(pci_dev);
+ if (n->params.sriov_max_vfs) {
+ pcie_ari_init(pci_dev, 0x100, 1);
+ }
- bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
- msix_table_offset = bar_size;
- msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
-
- bar_size += msix_table_size;
- bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
- msix_pba_offset = bar_size;
- msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
-
- bar_size += msix_pba_size;
- bar_size = pow2ceil(bar_size);
+ /* add one to max_ioqpairs to account for the admin queue pair */
+ bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize,
+ &msix_table_offset, &msix_pba_offset);
memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
- n->reg_size);
+ msix_table_offset);
memory_region_add_subregion(&n->bar0, 0, &n->iomem);
- pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
- PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
+ if (pci_is_vf(pci_dev)) {
+ pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
+ } else {
+ pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
+ PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
+ }
ret = msix_init(pci_dev, n->params.msix_qsize,
&n->bar0, 0, msix_table_offset,
&n->bar0, 0, msix_pba_offset, 0, &err);
@@ -6680,6 +7169,8 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
}
}
+ nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
+
if (n->params.cmb_size_mb) {
nvme_init_cmb(n, pci_dev);
}
@@ -6688,6 +7179,10 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
nvme_init_pmr(n, pci_dev);
}
+ if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
+ nvme_init_sriov(n, pci_dev, 0x120);
+ }
+
return 0;
}
@@ -6709,6 +7204,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
NvmeIdCtrl *id = &n->id_ctrl;
uint8_t *pci_conf = pci_dev->config;
uint64_t cap = ldq_le_p(&n->bar.cap);
+ NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
@@ -6775,8 +7271,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1);
- id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
- NVME_CTRL_SGLS_BITBUCKET);
+ id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
nvme_init_subnqn(n);
@@ -6801,6 +7296,10 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
stl_le_p(&n->bar.vs, NVME_SPEC_VER);
n->bar.intmc = n->bar.intms = 0;
+
+ if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
+ stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
+ }
}
static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
@@ -6838,6 +7337,16 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
NvmeCtrl *n = NVME(pci_dev);
NvmeNamespace *ns;
Error *local_err = NULL;
+ NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
+
+ if (pci_is_vf(pci_dev)) {
+ /*
+ * VFs derive settings from the parent. The PF's lifespan exceeds
+ * that of the VFs, so it's safe to share params.serial.
+ */
+ memcpy(&n->params, &pn->params, sizeof(NvmeParams));
+ n->subsys = pn->subsys;
+ }
nvme_check_constraints(n, &local_err);
if (local_err) {
@@ -6848,15 +7357,14 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
&pci_dev->qdev, n->parent_obj.qdev.id);
- nvme_init_state(n);
- if (nvme_init_pci(n, pci_dev, errp)) {
- return;
- }
-
if (nvme_init_subsys(n, errp)) {
error_propagate(errp, local_err);
return;
}
+ nvme_init_state(n);
+ if (nvme_init_pci(n, pci_dev, errp)) {
+ return;
+ }
nvme_init_ctrl(n, pci_dev);
/* setup a namespace if the controller drive property was given */
@@ -6878,7 +7386,7 @@ static void nvme_exit(PCIDevice *pci_dev)
NvmeNamespace *ns;
int i;
- nvme_ctrl_reset(n);
+ nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
if (n->subsys) {
for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
@@ -6902,6 +7410,11 @@ static void nvme_exit(PCIDevice *pci_dev)
if (n->pmr.dev) {
host_memory_backend_set_mapped(n->pmr.dev, false);
}
+
+ if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
+ pcie_sriov_pf_exit(pci_dev);
+ }
+
msix_uninit(pci_dev, &n->bar0, &n->bar0);
memory_region_del_subregion(&n->bar0, &n->iomem);
}
@@ -6926,6 +7439,15 @@ static Property nvme_props[] = {
DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
params.auto_transition_zones, true),
+ DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
+ DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
+ params.sriov_vq_flexible, 0),
+ DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
+ params.sriov_vi_flexible, 0),
+ DEFINE_PROP_UINT8("sriov_max_vi_per_vf", NvmeCtrl,
+ params.sriov_max_vi_per_vf, 0),
+ DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
+ params.sriov_max_vq_per_vf, 0),
DEFINE_PROP_END_OF_LIST(),
};
@@ -6971,6 +7493,47 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
}
}
+static void nvme_pci_reset(DeviceState *qdev)
+{
+ PCIDevice *pci_dev = PCI_DEVICE(qdev);
+ NvmeCtrl *n = NVME(pci_dev);
+
+ trace_pci_nvme_pci_reset();
+ nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
+}
+
+static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address,
+ uint32_t val, int len)
+{
+ NvmeCtrl *n = NVME(dev);
+ NvmeSecCtrlEntry *sctrl;
+ uint16_t sriov_cap = dev->exp.sriov_cap;
+ uint32_t off = address - sriov_cap;
+ int i, num_vfs;
+
+ if (!sriov_cap) {
+ return;
+ }
+
+ if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
+ if (!(val & PCI_SRIOV_CTRL_VFE)) {
+ num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
+ for (i = 0; i < num_vfs; i++) {
+ sctrl = &n->sec_ctrl_list.sec[i];
+ nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
+ }
+ }
+ }
+}
+
+static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
+ uint32_t val, int len)
+{
+ nvme_sriov_pre_write_ctrl(dev, address, val, len);
+ pci_default_write_config(dev, address, val, len);
+ pcie_cap_flr_write_config(dev, address, val, len);
+}
+
static const VMStateDescription nvme_vmstate = {
.name = "nvme",
.unmigratable = 1,
@@ -6982,6 +7545,7 @@ static void nvme_class_init(ObjectClass *oc, void *data)
PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
pc->realize = nvme_realize;
+ pc->config_write = nvme_pci_write_config;
pc->exit = nvme_exit;
pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
pc->revision = 2;
@@ -6990,6 +7554,7 @@ static void nvme_class_init(ObjectClass *oc, void *data)
dc->desc = "Non-Volatile Memory Express";
device_class_set_props(dc, nvme_props);
dc->vmsd = &nvme_vmstate;
+ dc->reset = nvme_pci_reset;
}
static void nvme_instance_init(Object *obj)
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 1b9c9d1156..870c3ca1a2 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -597,7 +597,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) {
NvmeCtrl *ctrl = subsys->ctrls[i];
- if (ctrl) {
+ if (ctrl && ctrl != SUBSYS_SLOT_RSVD) {
nvme_attach_ns(ctrl, ns);
}
}
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index e41771604f..99437d39bb 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -24,7 +24,7 @@
#include "block/nvme.h"
-#define NVME_MAX_CONTROLLERS 32
+#define NVME_MAX_CONTROLLERS 256
#define NVME_MAX_NAMESPACES 256
#define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
@@ -43,6 +43,7 @@ typedef struct NvmeBus {
#define TYPE_NVME_SUBSYS "nvme-subsys"
#define NVME_SUBSYS(obj) \
OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
+#define SUBSYS_SLOT_RSVD (void *)0xFFFF
typedef struct NvmeSubsystem {
DeviceState parent_obj;
@@ -68,6 +69,10 @@ static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys,
return NULL;
}
+ if (subsys->ctrls[cntlid] == SUBSYS_SLOT_RSVD) {
+ return NULL;
+ }
+
return subsys->ctrls[cntlid];
}
@@ -335,6 +340,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES";
case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT";
+ case NVME_ADM_CMD_VIRT_MNGMT: return "NVME_ADM_CMD_VIRT_MNGMT";
case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM";
default: return "NVME_ADM_CMD_UNKNOWN";
}
@@ -406,6 +412,11 @@ typedef struct NvmeParams {
uint8_t zasl;
bool auto_transition_zones;
bool legacy_cmb;
+ uint8_t sriov_max_vfs;
+ uint16_t sriov_vq_flexible;
+ uint16_t sriov_vi_flexible;
+ uint8_t sriov_max_vq_per_vf;
+ uint8_t sriov_max_vi_per_vf;
} NvmeParams;
typedef struct NvmeCtrl {
@@ -423,7 +434,6 @@ typedef struct NvmeCtrl {
uint16_t max_prp_ents;
uint16_t cqe_size;
uint16_t sqe_size;
- uint32_t reg_size;
uint32_t max_q_ents;
uint8_t outstanding_aers;
uint32_t irq_status;
@@ -433,6 +443,8 @@ typedef struct NvmeCtrl {
uint64_t starttime_ms;
uint16_t temperature;
uint8_t smart_critical_warning;
+ uint32_t conf_msix_qsize;
+ uint32_t conf_ioqpairs;
struct {
MemoryRegion mem;
@@ -477,8 +489,20 @@ typedef struct NvmeCtrl {
uint32_t async_config;
NvmeHostBehaviorSupport hbs;
} features;
+
+ NvmePriCtrlCap pri_ctrl_cap;
+ NvmeSecCtrlList sec_ctrl_list;
+ struct {
+ uint16_t vqrfap;
+ uint16_t virfap;
+ } next_pri_ctrl_cap; /* These override pri_ctrl_cap after reset */
} NvmeCtrl;
+typedef enum NvmeResetType {
+ NVME_RESET_FUNCTION = 0,
+ NVME_RESET_CONTROLLER = 1,
+} NvmeResetType;
+
static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
{
if (!nsid || nsid > NVME_MAX_NAMESPACES) {
@@ -511,6 +535,33 @@ static inline uint16_t nvme_cid(NvmeRequest *req)
return le16_to_cpu(req->cqe.cid);
}
+static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
+{
+ PCIDevice *pci_dev = &n->parent_obj;
+ NvmeCtrl *pf = NVME(pcie_sriov_get_pf(pci_dev));
+
+ if (pci_is_vf(pci_dev)) {
+ return &pf->sec_ctrl_list.sec[pcie_sriov_vf_number(pci_dev)];
+ }
+
+ return NULL;
+}
+
+static inline NvmeSecCtrlEntry *nvme_sctrl_for_cntlid(NvmeCtrl *n,
+ uint16_t cntlid)
+{
+ NvmeSecCtrlList *list = &n->sec_ctrl_list;
+ uint8_t i;
+
+ for (i = 0; i < list->numcntl; i++) {
+ if (le16_to_cpu(list->sec[i].scid) == cntlid) {
+ return &list->sec[i];
+ }
+ }
+
+ return NULL;
+}
+
void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns);
uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
NvmeTxDirection dir, NvmeRequest *req);
diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
index 691a90d209..9d2643678b 100644
--- a/hw/nvme/subsys.c
+++ b/hw/nvme/subsys.c
@@ -11,20 +11,71 @@
#include "nvme.h"
-int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp)
+static int nvme_subsys_reserve_cntlids(NvmeCtrl *n, int start, int num)
{
NvmeSubsystem *subsys = n->subsys;
- int cntlid, nsid;
+ NvmeSecCtrlList *list = &n->sec_ctrl_list;
+ NvmeSecCtrlEntry *sctrl;
+ int i, cnt = 0;
+
+ for (i = start; i < ARRAY_SIZE(subsys->ctrls) && cnt < num; i++) {
+ if (!subsys->ctrls[i]) {
+ sctrl = &list->sec[cnt];
+ sctrl->scid = cpu_to_le16(i);
+ subsys->ctrls[i] = SUBSYS_SLOT_RSVD;
+ cnt++;
+ }
+ }
+
+ return cnt;
+}
- for (cntlid = 0; cntlid < ARRAY_SIZE(subsys->ctrls); cntlid++) {
- if (!subsys->ctrls[cntlid]) {
- break;
+static void nvme_subsys_unreserve_cntlids(NvmeCtrl *n)
+{
+ NvmeSubsystem *subsys = n->subsys;
+ NvmeSecCtrlList *list = &n->sec_ctrl_list;
+ NvmeSecCtrlEntry *sctrl;
+ int i, cntlid;
+
+ for (i = 0; i < n->params.sriov_max_vfs; i++) {
+ sctrl = &list->sec[i];
+ cntlid = le16_to_cpu(sctrl->scid);
+
+ if (cntlid) {
+ assert(subsys->ctrls[cntlid] == SUBSYS_SLOT_RSVD);
+ subsys->ctrls[cntlid] = NULL;
+ sctrl->scid = 0;
}
}
+}
- if (cntlid == ARRAY_SIZE(subsys->ctrls)) {
- error_setg(errp, "no more free controller id");
- return -1;
+int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp)
+{
+ NvmeSubsystem *subsys = n->subsys;
+ NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
+ int cntlid, nsid, num_rsvd, num_vfs = n->params.sriov_max_vfs;
+
+ if (pci_is_vf(&n->parent_obj)) {
+ cntlid = le16_to_cpu(sctrl->scid);
+ } else {
+ for (cntlid = 0; cntlid < ARRAY_SIZE(subsys->ctrls); cntlid++) {
+ if (!subsys->ctrls[cntlid]) {
+ break;
+ }
+ }
+
+ if (cntlid == ARRAY_SIZE(subsys->ctrls)) {
+ error_setg(errp, "no more free controller id");
+ return -1;
+ }
+
+ num_rsvd = nvme_subsys_reserve_cntlids(n, cntlid + 1, num_vfs);
+ if (num_rsvd != num_vfs) {
+ nvme_subsys_unreserve_cntlids(n);
+ error_setg(errp,
+ "no more free controller ids for secondary controllers");
+ return -1;
+ }
}
if (!subsys->serial) {
@@ -48,7 +99,13 @@ int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp)
void nvme_subsys_unregister_ctrl(NvmeSubsystem *subsys, NvmeCtrl *n)
{
- subsys->ctrls[n->cntlid] = NULL;
+ if (pci_is_vf(&n->parent_obj)) {
+ subsys->ctrls[n->cntlid] = SUBSYS_SLOT_RSVD;
+ } else {
+ subsys->ctrls[n->cntlid] = NULL;
+ nvme_subsys_unreserve_cntlids(n);
+ }
+
n->cntlid = -1;
}
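The reservation scheme above means that, for example, a PF registering as
cntlid 0 with sriov_max_vfs=2 immediately marks cntlid 1 and 2 as
SUBSYS_SLOT_RSVD for its future secondary controllers; a VF later claims
the slot pre-assigned in its NvmeSecCtrlEntry instead of searching for a
free one. This guaranteed-slot model is also why NVME_MAX_CONTROLLERS grows
from 32 to 256 in nvme.h.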
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index ff1b458969..065e1c891d 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -56,6 +56,8 @@ pci_nvme_identify_ctrl(void) "identify controller"
pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8""
pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32""
pci_nvme_identify_ctrl_list(uint8_t cns, uint16_t cntid) "cns 0x%"PRIx8" cntid %"PRIu16""
+pci_nvme_identify_pri_ctrl_cap(uint16_t cntlid) "identify primary controller capabilities cntlid=%"PRIu16""
+pci_nvme_identify_sec_ctrl_list(uint16_t cntlid, uint8_t numcntl) "identify secondary controller list cntlid=%"PRIu16" numcntl=%"PRIu8""
pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", csi=0x%"PRIx8""
pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8""
@@ -108,6 +110,8 @@ pci_nvme_zd_extension_set(uint32_t zone_idx) "set descriptor extension for zone_
pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state"
pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state"
pci_nvme_zoned_zrwa_implicit_flush(uint64_t zslba, uint32_t nlb) "zslba 0x%"PRIx64" nlb %"PRIu32""
+pci_nvme_pci_reset(void) "PCI Function Level Reset"
+pci_nvme_virt_mngmt(uint16_t cid, uint16_t act, uint16_t cntlid, const char* rt, uint16_t nr) "cid %"PRIu16", act=0x%"PRIx16", ctrlid=%"PRIu16" %s nr=%"PRIu16""
# error conditions
pci_nvme_err_mdts(size_t len) "len %zu"
@@ -177,7 +181,9 @@ pci_nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the
pci_nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero"
pci_nvme_err_startfail_zasl_too_small(uint32_t zasl, uint32_t pagesz) "nvme_start_ctrl failed because zone append size limit %"PRIu32" is too small, needs to be >= %"PRIu32""
pci_nvme_err_startfail(void) "setting controller enable bit failed"
+pci_nvme_err_startfail_virt_state(uint16_t vq, uint16_t vi, const char *state) "nvme_start_ctrl failed due to ctrl state: vi=%u vq=%u %s"
pci_nvme_err_invalid_mgmt_action(uint8_t action) "action=0x%"PRIx8""
+pci_nvme_err_ignored_mmio_vf_offline(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
# undefined behavior
pci_nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3737351cc8..373c70b5ca 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -595,6 +595,7 @@ enum NvmeAdminCommands {
NVME_ADM_CMD_ACTIVATE_FW = 0x10,
NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
NVME_ADM_CMD_NS_ATTACHMENT = 0x15,
+ NVME_ADM_CMD_VIRT_MNGMT = 0x1c,
NVME_ADM_CMD_FORMAT_NVM = 0x80,
NVME_ADM_CMD_SECURITY_SEND = 0x81,
NVME_ADM_CMD_SECURITY_RECV = 0x82,
@@ -899,6 +900,10 @@ enum NvmeStatusCodes {
NVME_NS_PRIVATE = 0x0119,
NVME_NS_NOT_ATTACHED = 0x011a,
NVME_NS_CTRL_LIST_INVALID = 0x011c,
+ NVME_INVALID_CTRL_ID = 0x011f,
+ NVME_INVALID_SEC_CTRL_STATE = 0x0120,
+ NVME_INVALID_NUM_RESOURCES = 0x0121,
+ NVME_INVALID_RESOURCE_ID = 0x0122,
NVME_CONFLICTING_ATTRS = 0x0180,
NVME_INVALID_PROT_INFO = 0x0181,
NVME_WRITE_TO_RO = 0x0182,
@@ -1033,6 +1038,8 @@ enum NvmeIdCns {
NVME_ID_CNS_NS_PRESENT = 0x11,
NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12,
NVME_ID_CNS_CTRL_LIST = 0x13,
+ NVME_ID_CNS_PRIMARY_CTRL_CAP = 0x14,
+ NVME_ID_CNS_SECONDARY_CTRL_LIST = 0x15,
NVME_ID_CNS_CS_NS_PRESENT_LIST = 0x1a,
NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
NVME_ID_CNS_IO_COMMAND_SET = 0x1c,
@@ -1553,6 +1560,61 @@ typedef enum NvmeZoneState {
NVME_ZONE_STATE_OFFLINE = 0x0f,
} NvmeZoneState;
+typedef struct QEMU_PACKED NvmePriCtrlCap {
+ uint16_t cntlid;
+ uint16_t portid;
+ uint8_t crt;
+ uint8_t rsvd5[27];
+ uint32_t vqfrt;
+ uint32_t vqrfa;
+ uint16_t vqrfap;
+ uint16_t vqprt;
+ uint16_t vqfrsm;
+ uint16_t vqgran;
+ uint8_t rsvd48[16];
+ uint32_t vifrt;
+ uint32_t virfa;
+ uint16_t virfap;
+ uint16_t viprt;
+ uint16_t vifrsm;
+ uint16_t vigran;
+ uint8_t rsvd80[4016];
+} NvmePriCtrlCap;
+
+typedef enum NvmePriCtrlCapCrt {
+ NVME_CRT_VQ = 1 << 0,
+ NVME_CRT_VI = 1 << 1,
+} NvmePriCtrlCapCrt;
+
+typedef struct QEMU_PACKED NvmeSecCtrlEntry {
+ uint16_t scid;
+ uint16_t pcid;
+ uint8_t scs;
+ uint8_t rsvd5[3];
+ uint16_t vfn;
+ uint16_t nvq;
+ uint16_t nvi;
+ uint8_t rsvd14[18];
+} NvmeSecCtrlEntry;
+
+typedef struct QEMU_PACKED NvmeSecCtrlList {
+ uint8_t numcntl;
+ uint8_t rsvd1[31];
+ NvmeSecCtrlEntry sec[127];
+} NvmeSecCtrlList;
+
+typedef enum NvmeVirtMngmtAction {
+ NVME_VIRT_MNGMT_ACTION_PRM_ALLOC = 0x01,
+ NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE = 0x07,
+ NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN = 0x08,
+ NVME_VIRT_MNGMT_ACTION_SEC_ONLINE = 0x09,
+} NvmeVirtMngmtAction;
+
+typedef enum NvmeVirtualResourceType {
+ NVME_VIRT_RES_QUEUE = 0x00,
+ NVME_VIRT_RES_INTERRUPT = 0x01,
+} NvmeVirtualResourceType;
+
static inline void _nvme_check_size(void)
{
QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096);
@@ -1588,5 +1650,8 @@ static inline void _nvme_check_size(void)
QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4);
QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64);
QEMU_BUILD_BUG_ON(sizeof(NvmeDifTuple) != 16);
+ QEMU_BUILD_BUG_ON(sizeof(NvmePriCtrlCap) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeSecCtrlEntry) != 32);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeSecCtrlList) != 4096);
}
#endif
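For readers unfamiliar with the Virtualization Enhancements naming, the
NvmePriCtrlCap fields above follow the NVMe specification's abbreviations:
VQFRT/VIFRT are the VQ/VI Flexible Resources Total, VQRFA/VIRFA the
flexible resources currently assigned to secondary controllers,
VQRFAP/VIRFAP the flexible resources allocated to the primary, VQPRT/VIPRT
the primary's private resource totals, VQFRSM/VIFRSM the per-secondary
maximum, and VQGRAN/VIGRAN the preferred allocation granularity.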
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index 898083b86f..d5ddea558b 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -238,6 +238,7 @@
#define PCI_DEVICE_ID_INTEL_82801BA_11 0x244e
#define PCI_DEVICE_ID_INTEL_82801D 0x24CD
#define PCI_DEVICE_ID_INTEL_ESB_9 0x25ab
+#define PCI_DEVICE_ID_INTEL_NVME 0x5845
#define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000
#define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010
#define PCI_DEVICE_ID_INTEL_82371SB_2 0x7020