[SRU][J:linux-bluefield][PATCH v7 4/6] UBUNTU: SAUCE: vfio/pci: Allow MMIO regions to be exported through dma-buf
William Tu
witu at nvidia.com
Wed Sep 25 17:22:12 UTC 2024
From: Jason Gunthorpe <jgg at nvidia.com>
BugLink: https://bugs.launchpad.net/bugs/2077887
dma-buf has become a way to safely acquire a handle to non-struct page
memory that can still have lifetime controlled by the exporter. Notably
RDMA can now import dma-buf FDs and build them into MRs which allows for
PCI P2P operations. Extend this to allow vfio-pci to export MMIO memory
from PCI device BARs.
The patch design loosely follows the pattern in commit
db1a8dd916aa ("habanalabs: add support for dma-buf exporter") except this
does not support pinning.
Instead, this implements what, in the past, we've called a revocable
attachment using move. In normal situations the attachment is pinned, as a
BAR does not change physical address. However when the VFIO device is
closed, or a PCI reset is issued, access to the MMIO memory is revoked.
Revoked means that move occurs, but an attempt to immediately re-map the
memory will fail. In the reset case a future move will be triggered when
MMIO access returns. As both close and reset are under userspace control
it is expected that userspace will suspend use of the dma-buf before doing
these operations, the revoke is purely for kernel self-defense against a
hostile userspace.
Co-authored-by: William Tu <witu at nvidia.com>
[witu: Add new ioctl uAPI for P2P dma buf]
Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
Signed-off-by: William Tu <witu at nvidia.com>
---
drivers/vfio/pci/Makefile | 1 +
drivers/vfio/pci/dma_buf.c | 262 +++++++++++++++++++++++++++++
drivers/vfio/pci/vfio_pci_config.c | 10 +-
drivers/vfio/pci/vfio_pci_core.c | 22 ++-
drivers/vfio/pci/vfio_pci_priv.h | 21 +++
include/linux/vfio_pci_core.h | 1 +
include/uapi/linux/vfio.h | 17 ++
7 files changed, 331 insertions(+), 3 deletions(-)
create mode 100644 drivers/vfio/pci/dma_buf.c
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 74ee55f9c261..14f198c9d9ea 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,6 +2,7 @@
vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+vfio-pci-core-$(CONFIG_DMA_SHARED_BUFFER) += dma_buf.o
obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
vfio-pci-y := vfio_pci.o
diff --git a/drivers/vfio/pci/dma_buf.c b/drivers/vfio/pci/dma_buf.c
new file mode 100644
index 000000000000..14d32a580190
--- /dev/null
+++ b/drivers/vfio/pci/dma_buf.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.
+ */
+#include <linux/dma-buf.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/dma-resv.h>
+
+#include "vfio_pci_priv.h"
+
+MODULE_IMPORT_NS(DMA_BUF);
+
+struct vfio_pci_dma_buf {
+ struct dma_buf *dmabuf;
+ struct vfio_pci_core_device *vdev;
+ struct list_head dmabufs_elm;
+ unsigned int index;
+ unsigned int orig_nents;
+ size_t offset;
+ bool revoked;
+};
+
+static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
+ struct dma_buf_attachment *attachment)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+ int rc;
+
+ rc = pci_p2pdma_distance_many(priv->vdev->pdev, &attachment->dev, 1,
+ true);
+ if (rc < 0)
+ attachment->peer2peer = false;
+ return 0;
+}
+
+static void vfio_pci_dma_buf_unpin(struct dma_buf_attachment *attachment)
+{
+}
+
+static int vfio_pci_dma_buf_pin(struct dma_buf_attachment *attachment)
+{
+ /*
+ * Uses the dynamic interface but must always allow for
+ * dma_buf_move_notify() to do revoke
+ */
+ return -EINVAL;
+}
+
+static struct sg_table *
+vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
+ enum dma_data_direction dir)
+{
+ size_t sgl_size = dma_get_max_seg_size(attachment->dev);
+ struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+ struct scatterlist *sgl;
+ struct sg_table *sgt;
+ dma_addr_t dma_addr;
+ unsigned int nents;
+ size_t offset;
+ int ret;
+
+ dma_resv_assert_held(priv->dmabuf->resv);
+
+ if (!attachment->peer2peer)
+ return ERR_PTR(-EPERM);
+
+ if (priv->revoked)
+ return ERR_PTR(-ENODEV);
+
+ sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+ if (!sgt)
+ return ERR_PTR(-ENOMEM);
+
+ nents = DIV_ROUND_UP(priv->dmabuf->size, sgl_size);
+ ret = sg_alloc_table(sgt, nents, GFP_KERNEL);
+ if (ret)
+ goto err_kfree_sgt;
+
+ /*
+ * Since the memory being mapped is a device memory it could never be in
+ * CPU caches.
+ */
+ dma_addr = dma_map_resource(
+ attachment->dev,
+ pci_resource_start(priv->vdev->pdev, priv->index) +
+ priv->offset,
+ priv->dmabuf->size, dir, DMA_ATTR_SKIP_CPU_SYNC);
+ ret = dma_mapping_error(attachment->dev, dma_addr);
+ if (ret)
+ goto err_free_sgt;
+
+ /*
+ * Break the BAR's physical range up into max sized SGL's according to
+ * the device's requirement.
+ */
+ sgl = sgt->sgl;
+ for (offset = 0; offset != priv->dmabuf->size;) {
+ size_t chunk_size = min(priv->dmabuf->size - offset, sgl_size);
+
+ sg_set_page(sgl, NULL, chunk_size, 0);
+ sg_dma_address(sgl) = dma_addr + offset;
+ sg_dma_len(sgl) = chunk_size;
+ sgl = sg_next(sgl);
+ offset += chunk_size;
+ }
+
+ /*
+ * Because we are not going to include a CPU list we want to have some
+ * chance that other users will detect this by setting the orig_nents to
+ * 0 and using only nents (length of DMA list) when going over the sgl
+ */
+ priv->orig_nents = sgt->orig_nents;
+ sgt->orig_nents = 0;
+ return sgt;
+
+err_free_sgt:
+ sg_free_table(sgt);
+err_kfree_sgt:
+ kfree(sgt);
+ return ERR_PTR(ret);
+}
+
+static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
+ struct sg_table *sgt,
+ enum dma_data_direction dir)
+{
+ struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
+
+ sgt->orig_nents = priv->orig_nents;
+ dma_unmap_resource(attachment->dev, sg_dma_address(sgt->sgl),
+ priv->dmabuf->size, dir, DMA_ATTR_SKIP_CPU_SYNC);
+ sg_free_table(sgt);
+ kfree(sgt);
+}
+
+static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
+{
+ struct vfio_pci_dma_buf *priv = dmabuf->priv;
+
+ /*
+ * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
+ * The refcount prevents both.
+ */
+ if (priv->vdev) {
+ down_write(&priv->vdev->memory_lock);
+ list_del_init(&priv->dmabufs_elm);
+ up_write(&priv->vdev->memory_lock);
+ vfio_device_put(&priv->vdev->vdev);
+ }
+ kfree(priv);
+}
+
+static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
+ .attach = vfio_pci_dma_buf_attach,
+ .map_dma_buf = vfio_pci_dma_buf_map,
+ .pin = vfio_pci_dma_buf_pin,
+ .unpin = vfio_pci_dma_buf_unpin,
+ .release = vfio_pci_dma_buf_release,
+ .unmap_dma_buf = vfio_pci_dma_buf_unmap,
+};
+
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev,
+ struct vfio_device_p2p_dma_buf *p2p_dma_buf)
+{
+ struct vfio_device_p2p_dma_buf get_dma_buf;
+ DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+ struct vfio_pci_dma_buf *priv;
+ int ret;
+
+ memcpy(&get_dma_buf, p2p_dma_buf, sizeof(get_dma_buf));
+
+ /* For PCI the region_index is the BAR number like everything else */
+ if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX)
+ return -EINVAL;
+
+ exp_info.ops = &vfio_pci_dmabuf_ops;
+ exp_info.size = pci_resource_len(vdev->pdev, get_dma_buf.region_index);
+ if (!exp_info.size)
+ return -EINVAL;
+ if (get_dma_buf.offset || get_dma_buf.length) {
+ if (get_dma_buf.length > exp_info.size ||
+ get_dma_buf.offset >= exp_info.size ||
+ get_dma_buf.length > exp_info.size - get_dma_buf.offset ||
+ get_dma_buf.offset % PAGE_SIZE ||
+ get_dma_buf.length % PAGE_SIZE)
+ return -EINVAL;
+ exp_info.size = get_dma_buf.length;
+ }
+ exp_info.flags = get_dma_buf.open_flags;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&priv->dmabufs_elm);
+ priv->offset = get_dma_buf.offset;
+ priv->index = get_dma_buf.region_index;
+
+ exp_info.priv = priv;
+ priv->dmabuf = dma_buf_export(&exp_info);
+ if (IS_ERR(priv->dmabuf)) {
+ ret = PTR_ERR(priv->dmabuf);
+ kfree(priv);
+ return ret;
+ }
+
+ /* dma_buf_put() now frees priv */
+
+ down_write(&vdev->memory_lock);
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->revoked = !__vfio_pci_memory_enabled(vdev);
+ priv->vdev = vdev;
+ vfio_device_get(&vdev->vdev);
+ list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
+ dma_resv_unlock(priv->dmabuf->resv);
+ up_write(&vdev->memory_lock);
+
+ /*
+ * dma_buf_fd() consumes the reference, when the file closes the dmabuf
+ * will be released.
+ */
+ return dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
+}
+
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
+{
+ struct vfio_pci_dma_buf *priv;
+ struct vfio_pci_dma_buf *tmp;
+
+ lockdep_assert_held_write(&vdev->memory_lock);
+
+ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ if (!dma_buf_try_get(priv->dmabuf))
+ continue;
+ if (priv->revoked != revoked) {
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ priv->revoked = revoked;
+ dma_buf_move_notify(priv->dmabuf);
+ dma_resv_unlock(priv->dmabuf->resv);
+ }
+ dma_buf_put(priv->dmabuf);
+ }
+}
+
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+ struct vfio_pci_dma_buf *priv;
+ struct vfio_pci_dma_buf *tmp;
+
+ down_write(&vdev->memory_lock);
+ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
+ if (!dma_buf_try_get(priv->dmabuf))
+ continue;
+ dma_resv_lock(priv->dmabuf->resv, NULL);
+ list_del_init(&priv->dmabufs_elm);
+ priv->vdev = NULL;
+ priv->revoked = true;
+ dma_buf_move_notify(priv->dmabuf);
+ dma_resv_unlock(priv->dmabuf->resv);
+ vfio_device_put(&vdev->vdev);
+ dma_buf_put(priv->dmabuf);
+ }
+ up_write(&vdev->memory_lock);
+}
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 78c626b8907d..f14a8b81fe35 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -28,6 +28,8 @@
#include <linux/vfio_pci_core.h>
+#include "vfio_pci_priv.h"
+
/* Fake capability ID for standard config space */
#define PCI_CAP_ID_BASIC 0
@@ -581,10 +583,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY);
new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
- if (!new_mem)
+ if (!new_mem) {
vfio_pci_zap_and_down_write_memory_lock(vdev);
- else
+ vfio_pci_dma_buf_move(vdev, true);
+ } else {
down_write(&vdev->memory_lock);
+ }
/*
* If the user is writing mem/io enable (new_mem/io) and we
@@ -619,6 +623,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos,
*virt_cmd &= cpu_to_le16(~mask);
*virt_cmd |= cpu_to_le16(new_cmd & mask);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
}
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index a45cdd3cfb84..54ff7630947c 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -465,6 +465,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
vfio_spapr_pci_eeh_release(vdev->pdev);
vfio_pci_core_disable(vdev);
+ vfio_pci_dma_buf_cleanup(vdev);
+
mutex_lock(&vdev->igate);
if (vdev->err_trigger) {
eventfd_ctx_put(vdev->err_trigger);
@@ -671,7 +673,10 @@ int vfio_pci_try_reset_function(struct vfio_pci_core_device *vdev)
*/
vfio_pci_set_power_state(vdev, PCI_D0);
+ vfio_pci_dma_buf_move(vdev, true);
ret = pci_try_reset_function(vdev->pdev);
+ if (__vfio_pci_memory_enabled(vdev))
+ vfio_pci_dma_buf_move(vdev, false);
up_write(&vdev->memory_lock);
return ret;
@@ -1206,6 +1211,14 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
default:
return -ENOTTY;
}
+ } else if (cmd == VFIO_DEVICE_P2P_DMA_BUF) {
+ struct vfio_device_p2p_dma_buf p2p_dma_buf;
+
+ if (copy_from_user(&p2p_dma_buf, (void __user *)arg,
+ sizeof(p2p_dma_buf)))
+ return -EFAULT;
+
+ return vfio_pci_core_feature_dma_buf(vdev, &p2p_dma_buf);
}
return -ENOTTY;
@@ -1839,6 +1852,7 @@ void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev,
INIT_LIST_HEAD(&vdev->vma_list);
INIT_LIST_HEAD(&vdev->sriov_pfs_item);
init_rwsem(&vdev->memory_lock);
+ INIT_LIST_HEAD(&vdev->dmabufs);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_init_device);
@@ -2151,11 +2165,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
* cause the PCI config space reset without restoring the original
* state (saved locally in 'vdev->pm_save').
*/
- list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+ list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
vfio_pci_set_power_state(cur, PCI_D0);
+ vfio_pci_dma_buf_move(cur, true);
+ }
ret = pci_reset_bus(pdev);
+ list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+ if (__vfio_pci_memory_enabled(cur))
+ vfio_pci_dma_buf_move(cur, false);
+
err_undo:
list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
if (cur == cur_mem)
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 4971cd95b431..d60e9cb99140 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -4,4 +4,25 @@
int vfio_pci_try_reset_function(struct vfio_pci_core_device *vdev);
+#ifdef CONFIG_DMA_SHARED_BUFFER
+int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev,
+ struct vfio_device_p2p_dma_buf *arg);
+void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev);
+void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked);
+#else
+static int
+vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev,
+ struct vfio_device_p2p_dma_buf *arg)
+{
+ return -ENOTTY;
+}
+static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
+{
+}
+static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev,
+ bool revoked)
+{
+}
+#endif
+
#endif
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f22d5a382c15..1136ed4a08f9 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -139,6 +139,7 @@ struct vfio_pci_core_device {
struct mutex vma_lock;
struct list_head vma_list;
struct rw_semaphore memory_lock;
+ struct list_head dmabufs;
};
#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ef33ea002b0b..0f74ddcfc6e3 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1002,6 +1002,23 @@ struct vfio_device_feature {
*/
#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0)
+/**
+ * VFIO_DEVICE_P2P_DMA_BUF - _IORW(VFIO_TYPE, VFIO_BASE + 22,
+ * struct vfio_device_p2p_dma_buf)
+ *
+ * Set the region index, open flags, offset and length to create a dma_buf
+ * for p2p dma.
+ *
+ * Return 0 on success, -errno on failure.
+ */
+struct vfio_device_p2p_dma_buf {
+ __u32 region_index;
+ __u32 open_flags;
+ __u32 offset;
+ __u64 length;
+};
+#define VFIO_DEVICE_P2P_DMA_BUF _IO(VFIO_TYPE, VFIO_BASE + 22)
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
--
2.34.1
More information about the kernel-team
mailing list