[Resubmit][PATCH][SRU Impish] UBUNTU: SAUCE: RDMA/core: Updated ib_peer_memory
dann frazier
dann.frazier at canonical.com
Wed Nov 17 15:49:21 UTC 2021
From: Jason Gunthorpe <jgg at nvidia.com>
BugLink: https://launchpad.net/bugs/1947206
- Allow clients to opt out of unmap during invalidation (see the sketch below)
- Fix bugs in the sequencing of mlx5 MR creation, rereg and teardown
- Enable ATS for peer memory
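
A client opts in to the new behavior by registering the extended descriptor
with the PEER_MEM_INVALIDATE_UNMAPS flag set, as documented by the example
comment this patch adds at the end of peer_mem.c. A minimal sketch (the
"nvmem_client" name is illustrative only; the usual callbacks are elided):

static struct peer_memory_client_ex nvmem_client = {
        .client = {
                /* name and the client's mapping callbacks are set as usual */
                .version = "1.0",
                /* a non-zero last version byte marks the extended layout */
                .version[IB_PEER_MEMORY_VER_MAX - 1] = 1,
        },
        .ex_size = sizeof(struct peer_memory_client_ex),
        /* the client's own invalidate path already unmaps and puts pages */
        .flags = PEER_MEM_INVALIDATE_UNMAPS,
};

On the driver side this pairs with the new ib_umem_stop_invalidation_notifier():
mlx5 now calls it once DMA has been revoked and before ib_umem_release(), so the
peer mapping is torn down exactly once whether or not the client's invalidate
callback ran first.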
Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
(provided by Nvidia via private email)
Signed-off-by: dann frazier <dann.frazier at canonical.com>
---
 drivers/infiniband/core/ib_peer_mem.h |   8 +-
 drivers/infiniband/core/peer_mem.c    | 211 +++++++++++++++++++++-----
 drivers/infiniband/hw/mlx5/devx.c     |   2 +
 drivers/infiniband/hw/mlx5/mr.c       |  47 ++++--
 include/linux/mlx5/mlx5_ifc.h         |  11 +-
 include/rdma/ib_umem.h                |   4 +
 include/rdma/peer_mem.h               |  10 ++
 7 files changed, 235 insertions(+), 58 deletions(-)
diff --git a/drivers/infiniband/core/ib_peer_mem.h b/drivers/infiniband/core/ib_peer_mem.h
index 684bcb8ff33e..248530a7f931 100644
--- a/drivers/infiniband/core/ib_peer_mem.h
+++ b/drivers/infiniband/core/ib_peer_mem.h
@@ -32,6 +32,12 @@ struct ib_peer_memory_client {
bool invalidation_required;
};
+enum ib_umem_mapped_state {
+ UMEM_PEER_UNMAPPED,
+ UMEM_PEER_MAPPED,
+ UMEM_PEER_INVALIDATED,
+};
+
struct ib_umem_peer {
struct ib_umem umem;
struct kref kref;
@@ -41,7 +47,7 @@ struct ib_umem_peer {
umem_invalidate_func_t invalidation_func;
void *invalidation_private;
struct mutex mapping_lock;
- bool mapped;
+ enum ib_umem_mapped_state mapped_state;
u32 xa_id;
struct scatterlist *first_sg;
dma_addr_t first_dma_address;
diff --git a/drivers/infiniband/core/peer_mem.c b/drivers/infiniband/core/peer_mem.c
index f01bf924bc3e..7f0ea9479728 100644
--- a/drivers/infiniband/core/peer_mem.c
+++ b/drivers/infiniband/core/peer_mem.c
@@ -296,40 +296,111 @@ static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
static void ib_peer_umem_kref_release(struct kref *kref)
{
- kfree(container_of(kref, struct ib_umem_peer, kref));
+ struct ib_umem_peer *umem_p =
+ container_of(kref, struct ib_umem_peer, kref);
+
+ mutex_destroy(&umem_p->mapping_lock);
+ kfree(umem_p);
}
-static void ib_unmap_peer_client(struct ib_umem_peer *umem_p)
+static void ib_unmap_peer_client(struct ib_umem_peer *umem_p,
+ enum ib_umem_mapped_state cur_state,
+ enum ib_umem_mapped_state to_state)
{
struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client;
const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem;
struct ib_umem *umem = &umem_p->umem;
- lockdep_assert_held(&umem_p->mapping_lock);
+ if (cur_state == UMEM_PEER_MAPPED &&
+ (to_state == UMEM_PEER_UNMAPPED ||
+ to_state == UMEM_PEER_INVALIDATED)) {
+ /*
+ * In the invalidated state we will never touch the sg again,
+ * but the client might, so fix it anyhow.
+ */
+ if (umem_p->last_sg) {
+ umem_p->last_sg->length = umem_p->last_length;
+ sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length;
+ }
- if (umem_p->last_sg) {
- umem_p->last_sg->length = umem_p->last_length;
- sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length;
- }
+ if (umem_p->first_sg) {
+ umem_p->first_sg->dma_address =
+ umem_p->first_dma_address;
+ umem_p->first_sg->length = umem_p->first_length;
+ sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length;
+ }
+
+ if (to_state == UMEM_PEER_UNMAPPED) {
+ peer_mem->dma_unmap(&umem_p->umem.sg_head,
+ umem_p->peer_client_context,
+ umem_p->umem.ibdev->dma_device);
+ peer_mem->put_pages(&umem_p->umem.sg_head,
+ umem_p->peer_client_context);
+ }
- if (umem_p->first_sg) {
- umem_p->first_sg->dma_address = umem_p->first_dma_address;
- umem_p->first_sg->length = umem_p->first_length;
- sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length;
+ memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+ atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
}
- peer_mem->dma_unmap(&umem_p->umem.sg_head, umem_p->peer_client_context,
- umem_p->umem.ibdev->dma_device);
- peer_mem->put_pages(&umem_p->umem.sg_head, umem_p->peer_client_context);
- memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+ if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) ||
+ (cur_state == UMEM_PEER_INVALIDATED &&
+ to_state == UMEM_PEER_UNMAPPED)) {
+ atomic64_add(umem->sg_head.nents,
+ &ib_peer_client->stats.num_dereg_pages);
+ atomic64_add(umem->length,
+ &ib_peer_client->stats.num_dereg_bytes);
+ }
+ umem_p->mapped_state = to_state;
+}
- atomic64_add(umem->nmap, &ib_peer_client->stats.num_dereg_pages);
- atomic64_add(umem->length, &ib_peer_client->stats.num_dereg_bytes);
- atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
- if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
- xa_store(&ib_peer_client->umem_xa, umem_p->xa_id, NULL,
- GFP_KERNEL);
- umem_p->mapped = false;
+/*
+ * True if the client should do unmap itself after the invalidate callback
+ * returns. Clients operating in this mode need to use this locking pattern:
+ *
+ * client_invalidate:
+ * mutex_lock(&client_lock)
+ * invalidate_callback():
+ * mutex_lock(mapping_lock)
+ * mutex_unlock(mapping_lock)
+ * client_dma_unmap()
+ * client_put_pages()
+ * mutex_unlock(&client_lock)
+ *
+ * ib_umem_stop_invalidation_notifier():
+ * mutex_lock(mapping_lock)
+ * mutex_unlock(mapping_lock)
+ * peer_mem->dma_unmap():
+ * mutex_lock(&client_lock)
+ * client_dma_unmap()
+ * mutex_unlock(&client_lock)
+ * peer_mem->put_pages():
+ * mutex_lock(&client_lock)
+ * client_put_pages()
+ * mutex_unlock(&client_lock)
+ *
+ * ib_peer_umem_release():
+ * peer_mem->release():
+ * mutex_lock(&client_lock)
+ * mutex_unlock(&client_lock)
+ *
+ * Noting that dma_unmap/put_pages can be called even though invalidate has
+ * already done the unmap, and release() can be called concurrently with
+ * invalidate. The client must protect itself against these races.
+ */
+static bool ib_peer_unmap_on_invalidate(struct ib_umem_peer *umem_p)
+{
+ const struct peer_memory_client *peer_mem =
+ umem_p->ib_peer_client->peer_mem;
+ const struct peer_memory_client_ex *peer_mem_ex;
+
+ if (peer_mem->version[IB_PEER_MEMORY_VER_MAX - 1] == 0)
+ return false;
+ peer_mem_ex = container_of(peer_mem, const struct peer_memory_client_ex,
+ client);
+ if (peer_mem_ex->ex_size <
+ offsetofend(struct peer_memory_client_ex, flags))
+ return false;
+ return peer_mem_ex->flags & PEER_MEM_INVALIDATE_UNMAPS;
}
static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
@@ -353,16 +424,22 @@ static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
xa_unlock(&ib_peer_client->umem_xa);
mutex_lock(&umem_p->mapping_lock);
- if (umem_p->mapped) {
- /*
- * At this point the invalidation_func must be !NULL as the get
- * flow does not unlock mapping_lock until it is set, and umems
- * that do not require invalidation are not in the xarray.
- */
+ /*
+ * For flows that require invalidation the invalidation_func should not
+ * be NULL while the device can be doing DMA. The mapping_lock ensures
+ * that the device is ready to receive an invalidation before one is
+ * triggered here.
+ */
+ if (umem_p->mapped_state == UMEM_PEER_MAPPED &&
+ umem_p->invalidation_func)
umem_p->invalidation_func(&umem_p->umem,
umem_p->invalidation_private);
- ib_unmap_peer_client(umem_p);
- }
+ if (ib_peer_unmap_on_invalidate(umem_p))
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_INVALIDATED);
+ else
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_UNMAPPED);
mutex_unlock(&umem_p->mapping_lock);
kref_put(&umem_p->kref, ib_peer_umem_kref_release);
return 0;
@@ -393,6 +470,47 @@ void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
}
EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
+/*
+ * Caller has blocked DMA and will no longer be able to handle invalidate
+ * callbacks. Callers using invalidation must call this function before calling
+ * ib_peer_umem_release(). ib_umem_activate_invalidation_notifier() is optional
+ * before doing this.
+ */
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+ struct ib_umem_peer *umem_p =
+ container_of(umem, struct ib_umem_peer, umem);
+ bool unmap_on_invalidate = ib_peer_unmap_on_invalidate(umem_p);
+ enum ib_umem_mapped_state cur_state;
+
+ if (umem_p->invalidation_func) {
+ mutex_lock(&umem_p->mapping_lock);
+ umem_p->invalidation_func = NULL;
+ } else if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) {
+ mutex_lock(&umem_p->mapping_lock);
+ } else {
+ /*
+ * Haven't called ib_umem_activate_invalidation_notifier() yet,
+ * still have the lock
+ */
+ }
+
+ if (!unmap_on_invalidate) {
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+ UMEM_PEER_UNMAPPED);
+ } else {
+ /* Block ib_invalidate_peer_memory() */
+ cur_state = umem_p->mapped_state;
+ umem_p->mapped_state = UMEM_PEER_UNMAPPED;
+ }
+ mutex_unlock(&umem_p->mapping_lock);
+
+ if (unmap_on_invalidate)
+ ib_unmap_peer_client(umem_p, cur_state, UMEM_PEER_UNMAPPED);
+
+}
+EXPORT_SYMBOL(ib_umem_stop_invalidation_notifier);
+
static void fix_peer_sgls(struct ib_umem_peer *umem_p, unsigned long peer_page_size)
{
struct ib_umem *umem = &umem_p->umem;
@@ -497,7 +615,7 @@ struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
if (peer_page_size != PAGE_SIZE)
fix_peer_sgls(umem_p, peer_page_size);
- umem_p->mapped = true;
+ umem_p->mapped_state = UMEM_PEER_MAPPED;
atomic64_add(umem_p->umem.nmap, &ib_peer_client->stats.num_reg_pages);
atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes);
atomic64_inc(&ib_peer_client->stats.num_alloc_mrs);
@@ -505,9 +623,9 @@ struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
/*
* If invalidation is allowed then the caller must call
* ib_umem_activate_invalidation_notifier() or ib_peer_umem_release() to
- * unlock this mutex. The call to should be done after the last
- * read to sg_head, once the caller is ready for the invalidation
- * function to be called.
+ * unlock this mutex. This call should be done after the last read to
+ * sg_head, once the caller is ready for the invalidation function to be
+ * called.
*/
if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
mutex_unlock(&umem_p->mapping_lock);
@@ -537,14 +655,14 @@ void ib_peer_umem_release(struct ib_umem *umem)
struct ib_umem_peer *umem_p =
container_of(umem, struct ib_umem_peer, umem);
- /* invalidation_func being set indicates activate was called */
- if (umem_p->xa_id == PEER_NO_INVALIDATION_ID ||
- umem_p->invalidation_func)
- mutex_lock(&umem_p->mapping_lock);
+ /*
+ * If ib_umem_activate_invalidation_notifier() is called then
+ * ib_umem_stop_invalidation_notifier() must be called before release.
+ */
+ WARN_ON(umem_p->invalidation_func);
- if (umem_p->mapped)
- ib_unmap_peer_client(umem_p);
- mutex_unlock(&umem_p->mapping_lock);
+ /* For no invalidation cases, make sure it is unmapped */
+ ib_unmap_peer_client(umem_p, umem_p->mapped_state, UMEM_PEER_UNMAPPED);
if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
@@ -557,3 +675,14 @@ void ib_peer_umem_release(struct ib_umem *umem)
kref_put(&umem_p->kref, ib_peer_umem_kref_release);
}
+
+/* Use it like this:
+struct peer_memory_client_ex peer_memory_test = {
+ .client = {
+ .version = "1.0",
+ .version[IB_PEER_MEMORY_VER_MAX-1] = 1,
+ },
+ .ex_size = sizeof(struct peer_memory_client_ex),
+ .flags = PEER_MEM_INVALIDATE_UNMAPS,
+};
+*/
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index d056e75b4e9b..4ec319be6938 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2265,6 +2265,8 @@ static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT);
MLX5_SET(umem, umem, page_offset,
ib_umem_dma_offset(obj->umem, page_size));
+ if (obj->umem->is_peer)
+ MLX5_SET(umem, umem, ats, MLX5_CAP_GEN(dev->mdev, ats));
mlx5_ib_populate_pas(obj->umem, page_size, mtt,
(obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 4e17a540774a..bf1a018b526c 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1326,6 +1326,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
MLX5_SET(mkc, mkc, translations_octword_size,
get_octo_len(iova, umem->length, mr->page_shift));
MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
+ if (umem->is_peer)
+ MLX5_SET(mkc, mkc, ma_translation_mode,
+ MLX5_CAP_GEN(dev->mdev, ats));
if (populate) {
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
get_octo_len(iova, umem->length, mr->page_shift));
@@ -1456,17 +1459,20 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
int err;
xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
- if (xlt_with_umr) {
+ if (xlt_with_umr && !umem->is_peer) {
mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
} else {
unsigned int page_size = mlx5_umem_find_best_pgsz(
umem, mkc, log_page_size, 0, iova);
mutex_lock(&dev->slow_path_mutex);
- mr = reg_create(pd, umem, iova, access_flags, page_size, true);
+ mr = reg_create(pd, umem, iova, access_flags, page_size,
+ !xlt_with_umr);
mutex_unlock(&dev->slow_path_mutex);
}
if (IS_ERR(mr)) {
+ if (umem->is_peer)
+ ib_umem_stop_invalidation_notifier(umem);
ib_umem_release(umem);
return ERR_CAST(mr);
}
@@ -1827,8 +1833,13 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
return ERR_PTR(err);
return NULL;
}
- /* DM or ODP MR's don't have a normal umem so we can't re-use it */
- if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
+ /*
+ * DM or ODP MR's don't have a normal umem so we can't re-use it.
+ * Peer umems cannot have their MR's changed once created due
+ * to races with invalidation.
+ */
+ if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) ||
+ mr->umem->is_peer)
goto recreate;
/*
@@ -1847,10 +1858,11 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
}
/*
- * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
- * but the logic around releasing the umem is different
+ * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does but
+ * the logic around releasing the umem is different, peer memory
+ * invalidation semantics are incompatible.
*/
- if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
+ if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) || mr->umem->is_peer)
goto recreate;
if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
@@ -1980,14 +1992,23 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
}
/* Stop DMA */
- if (mr->cache_ent) {
- if (revoke_mr(mr)) {
- spin_lock_irq(&mr->cache_ent->lock);
- mr->cache_ent->total_mrs--;
- spin_unlock_irq(&mr->cache_ent->lock);
- mr->cache_ent = NULL;
+ rc = 0;
+ if (mr->cache_ent || (mr->umem && mr->umem->is_peer)) {
+ rc = revoke_mr(mr);
+ if (mr->umem && mr->umem->is_peer) {
+ if (rc)
+ return rc;
+ ib_umem_stop_invalidation_notifier(mr->umem);
}
}
+
+ if (mr->cache_ent && rc) {
+ spin_lock_irq(&mr->cache_ent->lock);
+ mr->cache_ent->total_mrs--;
+ spin_unlock_irq(&mr->cache_ent->lock);
+ mr->cache_ent = NULL;
+ }
+
if (!mr->cache_ent) {
rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
if (rc)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 857529a5568d..1cd44fb39311 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1630,7 +1630,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 steering_format_version[0x4];
u8 create_qp_start_hint[0x18];
- u8 reserved_at_460[0x3];
+ u8 reserved_at_460[0x1];
+ u8 ats[0x1];
+ u8 reserved_at_462[0x1];
u8 log_max_uctx[0x5];
u8 reserved_at_468[0x2];
u8 ipsec_offload[0x1];
@@ -3642,7 +3644,9 @@ struct mlx5_ifc_mkc_bits {
u8 lw[0x1];
u8 lr[0x1];
u8 access_mode_1_0[0x2];
- u8 reserved_at_18[0x8];
+ u8 reserved_at_18[0x2];
+ u8 ma_translation_mode[0x2];
+ u8 reserved_at_1c[0x4];
u8 qpn[0x18];
u8 mkey_7_0[0x8];
@@ -10620,7 +10624,8 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
struct mlx5_ifc_umem_bits {
u8 reserved_at_0[0x80];
- u8 reserved_at_80[0x1b];
+ u8 ats[0x1];
+ u8 reserved_at_81[0x1a];
u8 log_page_size[0x5];
u8 page_offset[0x20];
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 383be28b3f06..c418f91c2886 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -157,6 +157,7 @@ struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr,
void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
umem_invalidate_func_t func,
void *cookie);
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem);
#else /* CONFIG_INFINIBAND_USER_MEM */
@@ -211,6 +212,9 @@ static inline void ib_umem_activate_invalidation_notifier(
struct ib_umem *umem, umem_invalidate_func_t func, void *cookie)
{
}
+static inline void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+}
#endif /* CONFIG_INFINIBAND_USER_MEM */
#endif /* IB_UMEM_H */
diff --git a/include/rdma/peer_mem.h b/include/rdma/peer_mem.h
index 4c2e633cb4a7..e3dc2ca630d6 100644
--- a/include/rdma/peer_mem.h
+++ b/include/rdma/peer_mem.h
@@ -137,6 +137,16 @@ struct peer_memory_client {
void (*release)(void *client_context);
};
+enum {
+ PEER_MEM_INVALIDATE_UNMAPS = 1 << 0,
+};
+
+struct peer_memory_client_ex {
+ struct peer_memory_client client;
+ size_t ex_size;
+ u32 flags;
+};
+
/*
* If invalidate_callback() is non-NULL then the client will only support
* umems which can be invalidated. The caller may call the
--
2.33.1