[Yakkety][PATCH] UBUNTU: SAUCE: (no-up) NVMe: only setup MSIX once

Dan Streetman <dan.streetman at canonical.com>
Fri Jan 6 19:27:05 UTC 2017


BugLink: http://bugs.launchpad.net/bugs/1648449
BugLink: http://bugs.launchpad.net/bugs/1651602

The NVMe driver currently configures a single MSI/MSIX interrupt for its
admin queue. Later, after initializing the NVMe controller and detecting
the number of queues the controller can handle, the driver releases that
MSI/MSIX interrupt and immediately re-configures enough MSI/MSIX
interrupts for all the queues.  However, on Amazon AWS instances with
passthrough NVMe drives, this MSI/MSIX release/request cycle causes some
of the interrupts to fail, resulting in some of the NVMe drive(s) failing
to initialize (usually all but one or two fail, though sometimes none do).

This patch combines the two MSI/MSIX configuration steps so that all
MSI/MSIX setup is done once, at controller enable time.  This avoids the
interrupt failures, at the cost of configuring more MSI/MSIX interrupts
than may actually be needed.
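For reference, the consolidated allocation reduces to the following shape
(a condensed sketch of the nvme_pci_enable() hunk below; the helper name
is illustrative only, error unwinding is elided, and struct nvme_dev is
the driver's own device structure):

    static int nvme_alloc_vectors_once(struct nvme_dev *dev, int nr_io_queues)
    {
    	struct pci_dev *pdev = to_pci_dev(dev->dev);
    	int i, vecs;

    	for (i = 0; i < nr_io_queues; i++)
    		dev->entry[i].entry = i;

    	/* Accept as few as 1 vector (the folded-in xenial regression fix). */
    	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
    	if (vecs < 0) {
    		/* No MSIX: fall back to plain MSI, capped at 32 vectors. */
    		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
    		if (vecs < 0)
    			return vecs;
    		for (i = 0; i < vecs; i++)
    			dev->entry[i].vector = i + pdev->irq;
    	}

    	dev->max_qid = vecs;
    	return 0;
    }

Because the vectors are never released and re-requested, the admin queue's
interrupt stays registered for the controller's whole lifetime, which is
what sidesteps the failure on the AWS passthrough devices.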

This patch for yakkety also includes the regression fix from xenial:
8fb7c1f39c99 ("nvme: only require 1 interrupt vector, not 2+")

Signed-off-by: Dan Streetman <dan.streetman at canonical.com>
(forward-ported from xenial commit 96fce9e4025b96b08bfe5196d3380ab9215cb64b)
---
 drivers/nvme/host/pci.c | 70 +++++++++++++++----------------------------------
 1 file changed, 21 insertions(+), 49 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3a4078a..9206ff1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1490,7 +1490,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	int result, i, vecs, nr_io_queues, size;
 
-	nr_io_queues = num_online_cpus();
+	nr_io_queues = dev->max_qid;
 	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
 	if (result < 0)
 		return result;
@@ -1522,45 +1522,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 		adminq->q_db = dev->dbs;
 	}
 
-	/* Deregister the admin queue's interrupt */
-	free_irq(dev->entry[0].vector, adminq);
-
-	/*
-	 * If we enable msix early due to not intx, disable it again before
-	 * setting up the full range we need.
-	 */
-	if (pdev->msi_enabled)
-		pci_disable_msi(pdev);
-	else if (pdev->msix_enabled)
-		pci_disable_msix(pdev);
-
-	for (i = 0; i < nr_io_queues; i++)
-		dev->entry[i].entry = i;
-	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
-	if (vecs < 0) {
-		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
-		if (vecs < 0) {
-			vecs = 1;
-		} else {
-			for (i = 0; i < vecs; i++)
-				dev->entry[i].vector = i + pdev->irq;
-		}
-	}
-
-	/*
-	 * Should investigate if there's a performance win from allocating
-	 * more queues than interrupt vectors; it might allow the submission
-	 * path to scale better, even if the receive path is limited by the
-	 * number of interrupts.
-	 */
-	nr_io_queues = vecs;
 	dev->max_qid = nr_io_queues;
 
-	result = queue_request_irq(dev, adminq, adminq->irqname);
-	if (result) {
-		adminq->cq_vector = -1;
-		goto free_queues;
-	}
 	return nvme_create_io_queues(dev);
 
  free_queues:
@@ -1712,7 +1675,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 static int nvme_pci_enable(struct nvme_dev *dev)
 {
 	u64 cap;
-	int result = -ENOMEM;
+	int result = -ENOMEM, nr_io_queues, i, vecs;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	if (pci_enable_device_mem(pdev))
@@ -1729,21 +1692,30 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 		goto disable;
 	}
 
-	/*
-	 * Some devices and/or platforms don't advertise or work with INTx
-	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
-	 * adjust this later.
-	 */
-	if (pci_enable_msix(pdev, dev->entry, 1)) {
-		pci_enable_msi(pdev);
-		dev->entry[0].vector = pdev->irq;
+	nr_io_queues = num_possible_cpus();
+
+	for (i = 0; i < nr_io_queues; i++)
+		dev->entry[i].entry = i;
+	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
+	if (vecs < 0) {
+		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
+		if (vecs < 0) {
+			result = vecs;
+			goto disable;
+		} else {
+			for (i = 0; i < vecs; i++)
+				dev->entry[i].vector = i + pdev->irq;
+		}
 	}
 
-	if (!dev->entry[0].vector) {
-		result = -ENODEV;
+	if (vecs < 1) {
+		dev_err(dev->ctrl.device, "Failed to get any MSI/MSIX interrupts\n");
+		result = -ENOSPC;
 		goto disable;
 	}
 
+	dev->max_qid = vecs;
+
 	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
 
 	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
-- 
2.9.3