[PATCH 3/3][SRU Focal] UBUNTU: SAUCE: RDMA/core: Updated ib_peer_memory

dann frazier dann.frazier at canonical.com
Thu Oct 14 23:04:06 UTC 2021


From: Jack Morgenstein <jackm at dev.mellanox.co.il>

BugLink: https://launchpad.net/bugs/1947206

- Allow clients to opt out of unmapping during invalidation (sketched below)
- Fix a race condition with clients when deregistering a peer MR
- Enable ATS for peer memory
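
The opt-out in the first item is signalled through the new
peer_memory_client_ex wrapper added by this patch. A minimal sketch of
a client opting in follows; "my_client_ex" is a hypothetical client and
only the struct, the ex_size/flags fields and the
PEER_MEM_INVALIDATE_UNMAPS flag come from this patch:

    static struct peer_memory_client_ex my_client_ex = {
        /* .client holds the usual peer_memory_client callbacks; the core
         * only honours the _ex fields when the last byte of the version
         * string is non-zero (see ib_peer_unmap_on_invalidate() below) */
        .ex_size = sizeof(struct peer_memory_client_ex),
        .flags   = PEER_MEM_INVALIDATE_UNMAPS,
    };

With the flag set, an invalidation moves the umem to
UMEM_PEER_INVALIDATED instead of UMEM_PEER_UNMAPPED, and the core skips
its own dma_unmap()/put_pages() calls on final teardown because the
client has already unmapped.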

To fix the race condition, we disable the client invalidation
notifier before calling destroy_mkey().
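
For peer MRs this gives the following teardown order (a simplified
sketch of the dereg_mr() change further down in this patch; error
handling and the surrounding branches are elided):

    if (umem && umem->is_peer) {
        if (unreg_umr(mr->dev, mr))    /* revoke the mkey first */
            return;
        /* after this, no further invalidation callbacks can run */
        ib_umem_stop_invalidation_notifier(umem);
    }
    /* only then is the mkey destroyed and the umem released */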

The race-condition fix depends on being able to call
mlx5_mr_cache_invalidate() twice on the same MR without the second
call failing when the first call succeeds.

Based on an upstream kernel patch from Jason Gunthorpe, adapted for
mlnx_ofed 5.2-2 and Ubuntu 20.04.

Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>
Signed-off-by: Amir Tzin <amirtz at nvidia.com>
(provided by Nvidia via private email)
Signed-off-by: dann frazier <dann.frazier at canonical.com>
---
 drivers/infiniband/core/ib_peer_mem.h |   8 +-
 drivers/infiniband/core/peer_mem.c    | 137 ++++++++++++++++++++------
 drivers/infiniband/hw/mlx5/devx.c     |   2 +
 drivers/infiniband/hw/mlx5/mr.c       |  20 +++-
 include/linux/mlx5/mlx5_ifc.h         |  11 ++-
 include/rdma/ib_umem.h                |   5 +
 include/rdma/peer_mem.h               |  10 ++
 7 files changed, 157 insertions(+), 36 deletions(-)

diff --git a/drivers/infiniband/core/ib_peer_mem.h b/drivers/infiniband/core/ib_peer_mem.h
index bb38ffee724a..3ceecb63127e 100644
--- a/drivers/infiniband/core/ib_peer_mem.h
+++ b/drivers/infiniband/core/ib_peer_mem.h
@@ -32,6 +32,12 @@ struct ib_peer_memory_client {
 	bool invalidation_required;
 };
 
+enum ib_umem_mapped_state {
+	UMEM_PEER_UNMAPPED,
+	UMEM_PEER_MAPPED,
+	UMEM_PEER_INVALIDATED,
+};
+
 struct ib_umem_peer {
 	struct ib_umem umem;
 	struct kref kref;
@@ -41,7 +47,7 @@ struct ib_umem_peer {
 	umem_invalidate_func_t invalidation_func;
 	void *invalidation_private;
 	struct mutex mapping_lock;
-	bool mapped;
+	enum ib_umem_mapped_state mapped_state;
 	u32 xa_id;
 };
 
diff --git a/drivers/infiniband/core/peer_mem.c b/drivers/infiniband/core/peer_mem.c
index 833865578cb0..0ee80e97474c 100644
--- a/drivers/infiniband/core/peer_mem.c
+++ b/drivers/infiniband/core/peer_mem.c
@@ -277,32 +277,60 @@ static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
 
 static void ib_peer_umem_kref_release(struct kref *kref)
 {
-	kfree(container_of(kref, struct ib_umem_peer, kref));
+	struct ib_umem_peer *umem_p =
+		container_of(kref, struct ib_umem_peer, kref);
+
+	mutex_destroy(&umem_p->mapping_lock);
+	kfree(umem_p);
 }
 
-static void ib_unmap_peer_client(struct ib_umem_peer *umem_p)
+static void ib_unmap_peer_client(struct ib_umem_peer *umem_p,
+				 enum ib_umem_mapped_state cur_state,
+				 enum ib_umem_mapped_state to_state)
 {
 	struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client;
 	const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem;
 	struct ib_umem *umem = &umem_p->umem;
 
-	lockdep_assert_held(&umem_p->mapping_lock);
-
-	peer_mem->dma_unmap(&umem_p->umem.sg_head, umem_p->peer_client_context,
+	if (cur_state == UMEM_PEER_MAPPED &&
+	    (to_state == UMEM_PEER_UNMAPPED ||
+	     to_state == UMEM_PEER_INVALIDATED)) {
+		if (to_state == UMEM_PEER_UNMAPPED) {
+			peer_mem->dma_unmap(&umem_p->umem.sg_head, umem_p->peer_client_context,
 			    umem_p->umem.ibdev->dma_device);
-	peer_mem->put_pages(&umem_p->umem.sg_head, umem_p->peer_client_context);
-	memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+			peer_mem->put_pages(&umem_p->umem.sg_head, umem_p->peer_client_context);
+		}
+		memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+		atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
+	}
 
-	atomic64_add(umem->nmap, &ib_peer_client->stats.num_dereg_pages);
-	atomic64_add(umem->length, &ib_peer_client->stats.num_dereg_bytes);
-	atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
+	if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) ||
+	    (cur_state == UMEM_PEER_INVALIDATED &&
+	     to_state == UMEM_PEER_UNMAPPED)) {
+		atomic64_add(umem->nmap, &ib_peer_client->stats.num_dereg_pages);
+		atomic64_add(umem->length, &ib_peer_client->stats.num_dereg_bytes);
+	}
 
-	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
-		xa_store(&ib_peer_client->umem_xa, umem_p->xa_id, NULL,
-			 GFP_KERNEL);
-	umem_p->mapped = false;
+	umem_p->mapped_state = to_state;
 }
 
+static bool ib_peer_unmap_on_invalidate(struct ib_umem_peer *umem_p)
+{
+	const struct peer_memory_client *peer_mem =
+		umem_p->ib_peer_client->peer_mem;
+	const struct peer_memory_client_ex *peer_mem_ex;
+
+	if (peer_mem->version[IB_PEER_MEMORY_VER_MAX - 1] == 0)
+		return false;
+	peer_mem_ex = container_of(peer_mem, const struct peer_memory_client_ex,
+				   client);
+	if (peer_mem_ex->ex_size <
+	    offsetofend(struct peer_memory_client_ex, flags))
+		return false;
+	return peer_mem_ex->flags & PEER_MEM_INVALIDATE_UNMAPS;
+}
+
+
 static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
 {
 	struct ib_peer_memory_client *ib_peer_client = reg_handle;
@@ -323,16 +351,22 @@ static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
 	kref_get(&umem_p->kref);
 	xa_unlock(&ib_peer_client->umem_xa);
 	mutex_lock(&umem_p->mapping_lock);
-	if (umem_p->mapped) {
-		/*
-		 * At this point the invalidation_func must be !NULL as the get
-		 * flow does not unlock mapping_lock until it is set, and umems
-		 * that do not require invalidation are not in the xarray.
-		 */
+	/*
+	 * For flows that require invalidation the invalidation_func should not
+	 * be NULL while the device can be doing DMA. The mapping_lock ensures
+	 * that the device is ready to receive an invalidation before one is
+	 * triggered here.
+	 */
+	if (umem_p->mapped_state == UMEM_PEER_MAPPED &&
+	    umem_p->invalidation_func)
 		umem_p->invalidation_func(&umem_p->umem,
 					  umem_p->invalidation_private);
-		ib_unmap_peer_client(umem_p);
-	}
+	if (ib_peer_unmap_on_invalidate(umem_p))
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_INVALIDATED);
+	else
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_UNMAPPED);
 	mutex_unlock(&umem_p->mapping_lock);
 	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
 	return 0;
@@ -362,6 +396,47 @@ void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
 }
 EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
 
+/*
+ * Caller has blocked DMA and will no longer be able to handle invalidate
+ * callbacks. Callers using invalidation must call this function before calling
+ * ib_peer_umem_release(). ib_umem_activate_invalidation_notifier() is optional
+ * before doing this.
+ */
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+	struct ib_umem_peer *umem_p =
+		container_of(umem, struct ib_umem_peer, umem);
+	bool unmap_on_invalidate = ib_peer_unmap_on_invalidate(umem_p);
+	enum ib_umem_mapped_state cur_state;
+
+	if (umem_p->invalidation_func) {
+		mutex_lock(&umem_p->mapping_lock);
+		umem_p->invalidation_func = NULL;
+	} else if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) {
+		mutex_lock(&umem_p->mapping_lock);
+	} else {
+		/*
+		 * Haven't called ib_umem_activate_invalidation_notifier() yet,
+		 * still have the lock
+		 */
+	}
+
+	if (!unmap_on_invalidate) {
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_UNMAPPED);
+	} else {
+		/* Block ib_invalidate_peer_memory() */
+		cur_state = umem_p->mapped_state;
+		umem_p->mapped_state = UMEM_PEER_UNMAPPED;
+	}
+	mutex_unlock(&umem_p->mapping_lock);
+
+	if (unmap_on_invalidate)
+		ib_unmap_peer_client(umem_p, cur_state, UMEM_PEER_UNMAPPED);
+
+}
+EXPORT_SYMBOL(ib_umem_stop_invalidation_notifier);
+
 struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
 				 unsigned long peer_mem_flags)
 {
@@ -424,7 +499,7 @@ struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
 	if (ret)
 		goto err_pages;
 
-	umem_p->mapped = true;
+	umem_p->mapped_state = UMEM_PEER_MAPPED;
 	atomic64_add(umem_p->umem.nmap, &ib_peer_client->stats.num_reg_pages);
 	atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes);
 	atomic64_inc(&ib_peer_client->stats.num_alloc_mrs);
@@ -461,15 +536,14 @@ void ib_peer_umem_release(struct ib_umem *umem)
 {
 	struct ib_umem_peer *umem_p =
 		container_of(umem, struct ib_umem_peer, umem);
+	/*
+	 * If ib_umem_activate_invalidation_notifier() is called then
+	 * ib_umem_stop_invalidation_notifier() must be called before release.
+	 */
+	WARN_ON(umem_p->invalidation_func);
 
-	/* invalidation_func being set indicates activate was called */
-	if (umem_p->xa_id == PEER_NO_INVALIDATION_ID ||
-	    umem_p->invalidation_func)
-		mutex_lock(&umem_p->mapping_lock);
-
-	if (umem_p->mapped)
-		ib_unmap_peer_client(umem_p);
-	mutex_unlock(&umem_p->mapping_lock);
+	/* For no invalidation cases, make sure it is unmapped */
+	ib_unmap_peer_client(umem_p, umem_p->mapped_state, UMEM_PEER_UNMAPPED);
 
 	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
 		xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
@@ -482,3 +556,4 @@ void ib_peer_umem_release(struct ib_umem *umem)
 
 	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
 }
+
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index f8f8507c7938..35c7b1d0b7fd 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2190,6 +2190,8 @@ static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev,
 	mlx5_ib_populate_pas(dev, obj->umem, obj->page_shift, mtt,
 			     (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
 			     MLX5_IB_MTT_READ);
+	if (obj->umem->is_peer && MLX5_CAP_GEN(dev->mdev, ats))
+		MLX5_SET(umem, umem, ats, 1);
 }
 
 static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)(
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 2d075ca40bfc..17037c34c4aa 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1117,6 +1117,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
 			 get_octo_len(virt_addr, length, page_shift));
 	}
 
+	if (umem->is_peer && MLX5_CAP_GEN(dev->mdev, ats))
+		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
+
 	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
 	if (err) {
 		mlx5_ib_warn(dev, "create mkey failed\n");
@@ -1295,7 +1298,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	use_umr = mlx5_ib_can_use_umr(dev, true);
 
-	if (order <= mr_cache_max_order(dev) && use_umr) {
+	if (umem->is_peer && MLX5_CAP_GEN(dev->mdev, ats)) {
+		use_umr = false;
+	} else if (order <= mr_cache_max_order(dev) && use_umr) {
 		mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
 					 page_shift, order, access_flags);
 		if (PTR_ERR(mr) == -EAGAIN) {
@@ -1320,6 +1325,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	if (IS_ERR(mr)) {
 		err = PTR_ERR(mr);
+		if (umem->is_peer)
+			ib_umem_stop_invalidation_notifier(umem);
 		goto error;
 	}
 
@@ -1626,6 +1633,17 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		/* Avoid double-freeing the umem. */
 		umem = NULL;
 	}
+	else {
+		/*
+		 * For peers, need to disable the invalidation notifier
+		 * before calling destroy_mkey().
+		 */
+		if (umem && umem->is_peer) {
+			if (unreg_umr(mr->dev, mr))
+				return;
+			ib_umem_stop_invalidation_notifier(umem);
+		}
+	}
 
 	clean_mr(dev, mr);
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 800eb70f3490..939bc1d3fa50 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1430,7 +1430,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         steering_format_version[0x4];
 	u8         create_qp_start_hint[0x18];
 
-	u8         reserved_at_460[0x3];
+	u8         reserved_at_460[0x1];
+	u8         ats[0x1];
+	u8         reserved_at_462[0x1];
 	u8         log_max_uctx[0x5];
 	u8         reserved_at_468[0x3];
 	u8         log_max_umem[0x5];
@@ -3258,7 +3260,9 @@ struct mlx5_ifc_mkc_bits {
 	u8         lw[0x1];
 	u8         lr[0x1];
 	u8         access_mode_1_0[0x2];
-	u8         reserved_at_18[0x8];
+	u8         reserved_at_18[0x2];
+	u8         ma_translation_mode[0x2];
+	u8         reserved_at_1c[0x4];
 
 	u8         qpn[0x18];
 	u8         mkey_7_0[0x8];
@@ -9951,7 +9955,8 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
 struct mlx5_ifc_umem_bits {
 	u8         reserved_at_0[0x80];
 
-	u8         reserved_at_80[0x1b];
+	u8         ats[0x1];
+	u8         reserved_at_81[0x1a];
 	u8         log_page_size[0x5];
 
 	u8         page_offset[0x20];
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index ec9824cbf49d..3a8175a8ed70 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -94,6 +94,7 @@ struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr,
 void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
 					    umem_invalidate_func_t func,
 					    void *cookie);
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -131,6 +132,10 @@ static inline void ib_umem_activate_invalidation_notifier(
 {
 }
 
+static inline void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+}
+
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 
 #endif /* IB_UMEM_H */
diff --git a/include/rdma/peer_mem.h b/include/rdma/peer_mem.h
index 563a820dbc32..cefee23c439f 100644
--- a/include/rdma/peer_mem.h
+++ b/include/rdma/peer_mem.h
@@ -137,6 +137,16 @@ struct peer_memory_client {
 	void (*release)(void *client_context);
 };
 
+enum {
+	PEER_MEM_INVALIDATE_UNMAPS = 1 << 0,
+};
+
+struct peer_memory_client_ex {
+	struct peer_memory_client client;
+	size_t ex_size;
+	u32 flags;
+};
+
 /*
  * If invalidate_callback() is non-NULL then the client will only support
  * umems which can be invalidated. The caller may call the
-- 
2.33.0