[PATCH 3/3][SRU Focal] UBUNTU: SAUCE: RDMA/core: Updated ib_peer_memory
dann frazier
dann.frazier at canonical.com
Thu Oct 14 23:04:06 UTC 2021
From: Jack Morgenstein <jackm at dev.mellanox.co.il>
BugLink: https://launchpad.net/bugs/1947206
- Allow clients to opt out of unmap during invalidation (a sketch
  of the client-side opt-in follows this list)
- Fix a race condition with clients when deregistering a peer MR
- Enable ATS for peer memory
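As a reference for peer-client authors, a minimal sketch of the new
opt-in (the client name and the registration variables here are
hypothetical; struct peer_memory_client_ex, its ex_size/flags members,
the PEER_MEM_INVALIDATE_UNMAPS flag and the version-byte marker all
come from this patch):

  /* Hypothetical peer client opting out of unmap on invalidate. */
  static struct peer_memory_client_ex my_client_ex = {
          .client = {
                  .name    = "my_peer",
                  .version = "1.0",
                  /* .acquire, .get_pages, .dma_map, ... as before */
          },
          /* Tell the core how much of the extended struct is valid. */
          .ex_size = sizeof(struct peer_memory_client_ex),
          /* Invalidation already unmaps on the client side, so the
           * core must not call dma_unmap()/put_pages() again; it
           * moves the umem to UMEM_PEER_INVALIDATED instead. */
          .flags   = PEER_MEM_INVALIDATE_UNMAPS,
  };

  /* A non-zero last version byte marks the struct as extended
   * (checked by ib_peer_unmap_on_invalidate() below). */
  my_client_ex.client.version[IB_PEER_MEMORY_VER_MAX - 1] = 1;
  reg_handle = ib_register_peer_memory_client(&my_client_ex.client,
                                              &invalidate_cb);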
To fix the race condition, we disable the client invalidation
notifier before calling destroy_mkey().
The race-condition fix depends on mlx5_mr_cache_invalidate() being
safe to call twice on the same MR: the second call must not fail
when the first call succeeds.
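In sketch form, the resulting deregistration order (simplified from
the dereg_mr() hunk below; error handling elided):

  if (umem && umem->is_peer) {
          /* 1. Revoke the mkey so the HCA stops DMA. A later
           *    mlx5_mr_cache_invalidate() on the same MR must
           *    still succeed, hence the idempotency requirement. */
          if (unreg_umr(mr->dev, mr))
                  return;
          /* 2. Fence the peer invalidation callback; after this no
           *    invalidation can race with mkey destruction. */
          ib_umem_stop_invalidation_notifier(umem);
  }
  /* 3. Only now destroy the mkey (done inside clean_mr()). */
  clean_mr(dev, mr);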
Based on an upstream kernel patch from Jason Gunthorpe, adapted for
MLNX_OFED 5.2-2 and Ubuntu 20.04.
Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>
Signed-off-by: Amir Tzin <amirtz at nvidia.com>
(provided by Nvidia via private email)
Signed-off-by: dann frazier <dann.frazier at canonical.com>
---
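For reviewers, the mapped_state transitions introduced here, derived
from ib_unmap_peer_client() below:

  MAPPED      -> UNMAPPED     dma_unmap + put_pages, dealloc and
                              dereg statistics updated
  MAPPED      -> INVALIDATED  sg table cleared and dealloc counted,
                              but no dma_unmap/put_pages (the client
                              already unmapped on its side)
  INVALIDATED -> UNMAPPED     dereg statistics only
  UNMAPPED    -> UNMAPPED     no-op, which is what makes the double
                              unmap in the dereg path safe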
drivers/infiniband/core/ib_peer_mem.h | 8 +-
drivers/infiniband/core/peer_mem.c | 137 ++++++++++++++++++++------
drivers/infiniband/hw/mlx5/devx.c | 2 +
drivers/infiniband/hw/mlx5/mr.c | 20 +++-
include/linux/mlx5/mlx5_ifc.h | 11 ++-
include/rdma/ib_umem.h | 5 +
include/rdma/peer_mem.h | 10 ++
7 files changed, 157 insertions(+), 36 deletions(-)
diff --git a/drivers/infiniband/core/ib_peer_mem.h b/drivers/infiniband/core/ib_peer_mem.h
index bb38ffee724a..3ceecb63127e 100644
--- a/drivers/infiniband/core/ib_peer_mem.h
+++ b/drivers/infiniband/core/ib_peer_mem.h
@@ -32,6 +32,12 @@ struct ib_peer_memory_client {
bool invalidation_required;
};
+enum ib_umem_mapped_state {
+ UMEM_PEER_UNMAPPED,
+ UMEM_PEER_MAPPED,
+ UMEM_PEER_INVALIDATED,
+};
+
struct ib_umem_peer {
struct ib_umem umem;
struct kref kref;
@@ -41,7 +47,7 @@ struct ib_umem_peer {
umem_invalidate_func_t invalidation_func;
void *invalidation_private;
struct mutex mapping_lock;
- bool mapped;
+ enum ib_umem_mapped_state mapped_state;
u32 xa_id;
};
diff --git a/drivers/infiniband/core/peer_mem.c b/drivers/infiniband/core/peer_mem.c
index 833865578cb0..0ee80e97474c 100644
--- a/drivers/infiniband/core/peer_mem.c
+++ b/drivers/infiniband/core/peer_mem.c
@@ -277,32 +277,60 @@ static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
static void ib_peer_umem_kref_release(struct kref *kref)
{
- kfree(container_of(kref, struct ib_umem_peer, kref));
+ struct ib_umem_peer *umem_p =
+ container_of(kref, struct ib_umem_peer, kref);
+
+ mutex_destroy(&umem_p->mapping_lock);
+ kfree(umem_p);
}
-static void ib_unmap_peer_client(struct ib_umem_peer *umem_p)
+static void ib_unmap_peer_client(struct ib_umem_peer *umem_p,
+ enum ib_umem_mapped_state cur_state,
+ enum ib_umem_mapped_state to_state)
{
struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client;
const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem;
struct ib_umem *umem = &umem_p->umem;
- lockdep_assert_held(&umem_p->mapping_lock);
-
- peer_mem->dma_unmap(&umem_p->umem.sg_head, umem_p->peer_client_context,
+ if (cur_state == UMEM_PEER_MAPPED &&
+ (to_state == UMEM_PEER_UNMAPPED ||
+ to_state == UMEM_PEER_INVALIDATED)) {
+ if (to_state == UMEM_PEER_UNMAPPED) {
+ peer_mem->dma_unmap(&umem_p->umem.sg_head, umem_p->peer_client_context,
umem_p->umem.ibdev->dma_device);
- peer_mem->put_pages(&umem_p->umem.sg_head, umem_p->peer_client_context);
- memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+ peer_mem->put_pages(&umem_p->umem.sg_head, umem_p->peer_client_context);
+ }
+ memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+ atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
+ }
- atomic64_add(umem->nmap, &ib_peer_client->stats.num_dereg_pages);
- atomic64_add(umem->length, &ib_peer_client->stats.num_dereg_bytes);
- atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
+ if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) ||
+ (cur_state == UMEM_PEER_INVALIDATED &&
+ to_state == UMEM_PEER_UNMAPPED)) {
+ atomic64_add(umem->nmap, &ib_peer_client->stats.num_dereg_pages);
+ atomic64_add(umem->length, &ib_peer_client->stats.num_dereg_bytes);
+ }
- if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
- xa_store(&ib_peer_client->umem_xa, umem_p->xa_id, NULL,
- GFP_KERNEL);
- umem_p->mapped = false;
+ umem_p->mapped_state = to_state;
}
+static bool ib_peer_unmap_on_invalidate(struct ib_umem_peer *umem_p)
+{
+ const struct peer_memory_client *peer_mem =
+ umem_p->ib_peer_client->peer_mem;
+ const struct peer_memory_client_ex *peer_mem_ex;
+
+ if (peer_mem->version[IB_PEER_MEMORY_VER_MAX - 1] == 0)
+ return false;
+ peer_mem_ex = container_of(peer_mem, const struct peer_memory_client_ex,
+ client);
+ if (peer_mem_ex->ex_size <
+ offsetofend(struct peer_memory_client_ex, flags))
+ return false;
+ return peer_mem_ex->flags & PEER_MEM_INVALIDATE_UNMAPS;
+}
+
+
static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
{
struct ib_peer_memory_client *ib_peer_client = reg_handle;
@@ -323,16 +351,22 @@ static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
kref_get(&umem_p->kref);
xa_unlock(&ib_peer_client->umem_xa);
mutex_lock(&umem_p->mapping_lock);
- if (umem_p->mapped) {
- /*
- * At this point the invalidation_func must be !NULL as the get
- * flow does not unlock mapping_lock until it is set, and umems
- * that do not require invalidation are not in the xarray.
- */
+ /*
+ * For flows that require invalidation the invalidation_func should not
+ * be NULL while the device can be doing DMA. The mapping_lock ensures
+ * that the device is ready to receive an invalidation before one is
+ * triggered here.
+ */
+ if (umem_p->mapped_state == UMEM_PEER_MAPPED &&
+ umem_p->invalidation_func)
umem_p->invalidation_func(&umem_p->umem,
umem_p->invalidation_private);
- ib_unmap_peer_client(umem_p);
- }
+ if (ib_peer_unmap_on_invalidate(umem_p))
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_INVALIDATED);
+ else
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_UNMAPPED);
mutex_unlock(&umem_p->mapping_lock);
kref_put(&umem_p->kref, ib_peer_umem_kref_release);
return 0;
@@ -362,6 +396,47 @@ void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
}
EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
+/*
+ * Caller has blocked DMA and will no longer be able to handle invalidate
+ * callbacks. Callers using invalidation must call this function before calling
+ * ib_peer_umem_release(). ib_umem_activate_invalidation_notifier() is optional
+ * before doing this.
+ */
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+ struct ib_umem_peer *umem_p =
+ container_of(umem, struct ib_umem_peer, umem);
+ bool unmap_on_invalidate = ib_peer_unmap_on_invalidate(umem_p);
+ enum ib_umem_mapped_state cur_state;
+
+ if (umem_p->invalidation_func) {
+ mutex_lock(&umem_p->mapping_lock);
+ umem_p->invalidation_func = NULL;
+ } else if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) {
+ mutex_lock(&umem_p->mapping_lock);
+ } else {
+ /*
+ * Haven't called ib_umem_activate_invalidation_notifier() yet,
+ * still have the lock
+ */
+ }
+
+ if (!unmap_on_invalidate) {
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_UNMAPPED);
+ } else {
+ /* Block ib_invalidate_peer_memory() */
+ cur_state = umem_p->mapped_state;
+ umem_p->mapped_state = UMEM_PEER_UNMAPPED;
+ }
+ mutex_unlock(&umem_p->mapping_lock);
+
+ if (unmap_on_invalidate)
+ ib_unmap_peer_client(umem_p, cur_state, UMEM_PEER_UNMAPPED);
+
+}
+EXPORT_SYMBOL(ib_umem_stop_invalidation_notifier);
+
struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
unsigned long peer_mem_flags)
{
@@ -424,7 +499,7 @@ struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
if (ret)
goto err_pages;
- umem_p->mapped = true;
+ umem_p->mapped_state = UMEM_PEER_MAPPED;
atomic64_add(umem_p->umem.nmap, &ib_peer_client->stats.num_reg_pages);
atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes);
atomic64_inc(&ib_peer_client->stats.num_alloc_mrs);
@@ -461,15 +536,14 @@ void ib_peer_umem_release(struct ib_umem *umem)
{
struct ib_umem_peer *umem_p =
container_of(umem, struct ib_umem_peer, umem);
+ /*
+ * If ib_umem_activate_invalidation_notifier() is called then
+ * ib_umem_stop_invalidation_notifier() must be called before release.
+ */
+ WARN_ON(umem_p->invalidation_func);
- /* invalidation_func being set indicates activate was called */
- if (umem_p->xa_id == PEER_NO_INVALIDATION_ID ||
- umem_p->invalidation_func)
- mutex_lock(&umem_p->mapping_lock);
-
- if (umem_p->mapped)
- ib_unmap_peer_client(umem_p);
- mutex_unlock(&umem_p->mapping_lock);
+ /* For no invalidation cases, make sure it is unmapped */
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state, UMEM_PEER_UNMAPPED);
if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
@@ -482,3 +556,4 @@ void ib_peer_umem_release(struct ib_umem *umem)
kref_put(&umem_p->kref, ib_peer_umem_kref_release);
}
+
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index f8f8507c7938..35c7b1d0b7fd 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2190,6 +2190,8 @@ static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev,
mlx5_ib_populate_pas(dev, obj->umem, obj->page_shift, mtt,
(obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
MLX5_IB_MTT_READ);
+ if (obj->umem->is_peer && MLX5_CAP_GEN(dev->mdev, ats))
+ MLX5_SET(umem, umem, ats, 1);
}
static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 2d075ca40bfc..17037c34c4aa 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1117,6 +1117,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
get_octo_len(virt_addr, length, page_shift));
}
+ if (umem->is_peer && MLX5_CAP_GEN(dev->mdev, ats))
+ MLX5_SET(mkc, mkc, ma_translation_mode, 1);
+
err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
if (err) {
mlx5_ib_warn(dev, "create mkey failed\n");
@@ -1295,7 +1298,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
use_umr = mlx5_ib_can_use_umr(dev, true);
- if (order <= mr_cache_max_order(dev) && use_umr) {
+ if (umem->is_peer && MLX5_CAP_GEN(dev->mdev, ats)) {
+ use_umr = false;
+ } else if (order <= mr_cache_max_order(dev) && use_umr) {
mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
page_shift, order, access_flags);
if (PTR_ERR(mr) == -EAGAIN) {
@@ -1320,6 +1325,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (IS_ERR(mr)) {
err = PTR_ERR(mr);
+ if (umem->is_peer)
+ ib_umem_stop_invalidation_notifier(umem);
goto error;
}
@@ -1626,6 +1633,17 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
/* Avoid double-freeing the umem. */
umem = NULL;
}
+ else {
+ /*
+ * For peers, need to disable the invalidation notifier
+ * before calling destroy_mkey().
+ */
+ if (umem && umem->is_peer) {
+ if (unreg_umr(mr->dev, mr))
+ return;
+ ib_umem_stop_invalidation_notifier(umem);
+ }
+ }
clean_mr(dev, mr);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 800eb70f3490..939bc1d3fa50 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1430,7 +1430,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 steering_format_version[0x4];
u8 create_qp_start_hint[0x18];
- u8 reserved_at_460[0x3];
+ u8 reserved_at_460[0x1];
+ u8 ats[0x1];
+ u8 reserved_at_462[0x1];
u8 log_max_uctx[0x5];
u8 reserved_at_468[0x3];
u8 log_max_umem[0x5];
@@ -3258,7 +3260,9 @@ struct mlx5_ifc_mkc_bits {
u8 lw[0x1];
u8 lr[0x1];
u8 access_mode_1_0[0x2];
- u8 reserved_at_18[0x8];
+ u8 reserved_at_18[0x2];
+ u8 ma_translation_mode[0x2];
+ u8 reserved_at_1c[0x4];
u8 qpn[0x18];
u8 mkey_7_0[0x8];
@@ -9951,7 +9955,8 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
struct mlx5_ifc_umem_bits {
u8 reserved_at_0[0x80];
- u8 reserved_at_80[0x1b];
+ u8 ats[0x1];
+ u8 reserved_at_81[0x1a];
u8 log_page_size[0x5];
u8 page_offset[0x20];
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index ec9824cbf49d..3a8175a8ed70 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -94,6 +94,7 @@ struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr,
void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
umem_invalidate_func_t func,
void *cookie);
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem);
#else /* CONFIG_INFINIBAND_USER_MEM */
@@ -131,6 +132,10 @@ static inline void ib_umem_activate_invalidation_notifier(
{
}
+static inline void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+}
+
#endif /* CONFIG_INFINIBAND_USER_MEM */
#endif /* IB_UMEM_H */
diff --git a/include/rdma/peer_mem.h b/include/rdma/peer_mem.h
index 563a820dbc32..cefee23c439f 100644
--- a/include/rdma/peer_mem.h
+++ b/include/rdma/peer_mem.h
@@ -137,6 +137,16 @@ struct peer_memory_client {
void (*release)(void *client_context);
};
+enum {
+ PEER_MEM_INVALIDATE_UNMAPS = 1 << 0,
+};
+
+struct peer_memory_client_ex {
+ struct peer_memory_client client;
+ size_t ex_size;
+ u32 flags;
+};
+
/*
* If invalidate_callback() is non-NULL then the client will only support
* umems which can be invalidated. The caller may call the
--
2.33.0