[Resubmit][PATCH][SRU Impish] UBUNTU: SAUCE: RDMA/core: Updated ib_peer_memory

dann frazier dann.frazier at canonical.com
Wed Nov 17 15:49:21 UTC 2021


From: Jason Gunthorpe <jgg at nvidia.com>

BugLink: https://launchpad.net/bugs/1947206

- Allow clients to opt out of unmap during invalidation
- Fix some bugs in the sequencing of mlx5 MRs
- Enable ATS for peer memory

Signed-off-by: Jason Gunthorpe <jgg at nvidia.com>
(provided by Nvidia via private email)
Signed-off-by: dann frazier <dann.frazier at canonical.com>
---
 drivers/infiniband/core/ib_peer_mem.h |   8 +-
 drivers/infiniband/core/peer_mem.c    | 211 +++++++++++++++++++++-----
 drivers/infiniband/hw/mlx5/devx.c     |   2 +
 drivers/infiniband/hw/mlx5/mr.c       |  47 ++++--
 include/linux/mlx5/mlx5_ifc.h         |  11 +-
 include/rdma/ib_umem.h                |   4 +
 include/rdma/peer_mem.h               |  10 ++
 7 files changed, 235 insertions(+), 58 deletions(-)
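
For peer memory client authors, the opt-in added by this patch looks
roughly like the sketch below. This is an illustration only:
"my_peer_client", my_invalidate_mapping(), my_dma_unmap(), my_put_pages(),
my_client_lock, my_reg_handle and core_invalidate_cb are placeholder names
for the client's own state; the real pieces are peer_memory_client_ex,
PEER_MEM_INVALIDATE_UNMAPS and the invalidate callback of type
(void *reg_handle, u64 core_context) obtained at registration. A non-zero
last byte in version[], a valid ex_size and the PEER_MEM_INVALIDATE_UNMAPS
flag tell the core that the client performs its own dma_unmap/put_pages
from inside its invalidate path, following the locking pattern documented
above ib_peer_unmap_on_invalidate() in peer_mem.c:

  static struct peer_memory_client_ex my_peer_client = {
          .client = {
                  .version = "1.0",
                  /* non-zero last byte marks this as a peer_memory_client_ex */
                  .version[IB_PEER_MEMORY_VER_MAX - 1] = 1,
                  /* .acquire, .get_pages, .dma_map, etc. as before */
          },
          .ex_size = sizeof(struct peer_memory_client_ex),
          .flags = PEER_MEM_INVALIDATE_UNMAPS,
  };

  /* Client-initiated invalidation, serialized by the client's own lock */
  static void my_invalidate_mapping(struct my_mapping *map)
  {
          mutex_lock(&my_client_lock);
          /* Returns after the RDMA side has been invalidated and stopped DMA */
          core_invalidate_cb(my_reg_handle, map->core_context);
          /* With PEER_MEM_INVALIDATE_UNMAPS the client unmaps itself */
          my_dma_unmap(map);
          my_put_pages(map);
          mutex_unlock(&my_client_lock);
  }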

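On the consumer side, the mlx5 hunks below establish the teardown ordering
that any driver using peer umems with invalidation now has to follow:
fence DMA first, then call ib_umem_stop_invalidation_notifier(), and only
then release the umem. A condensed sketch, where my_destroy_peer_mr() and
revoke_dma_access() stand in for driver-specific code such as mlx5's
revoke_mr():

  static int my_destroy_peer_mr(struct my_mr *mr)
  {
          int rc;

          /* 1. Fence the HCA so it can no longer DMA through this MR */
          rc = revoke_dma_access(mr);
          if (rc)
                  return rc;

          /* 2. Tell the core we will no longer service invalidate callbacks */
          if (mr->umem && mr->umem->is_peer)
                  ib_umem_stop_invalidation_notifier(mr->umem);

          /* 3. Only now drop the umem (reaches ib_peer_umem_release()) */
          ib_umem_release(mr->umem);
          return 0;
  }
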
diff --git a/drivers/infiniband/core/ib_peer_mem.h b/drivers/infiniband/core/ib_peer_mem.h
index 684bcb8ff33e..248530a7f931 100644
--- a/drivers/infiniband/core/ib_peer_mem.h
+++ b/drivers/infiniband/core/ib_peer_mem.h
@@ -32,6 +32,12 @@ struct ib_peer_memory_client {
 	bool invalidation_required;
 };
 
+enum ib_umem_mapped_state {
+	UMEM_PEER_UNMAPPED,
+	UMEM_PEER_MAPPED,
+	UMEM_PEER_INVALIDATED,
+};
+
 struct ib_umem_peer {
 	struct ib_umem umem;
 	struct kref kref;
@@ -41,7 +47,7 @@ struct ib_umem_peer {
 	umem_invalidate_func_t invalidation_func;
 	void *invalidation_private;
 	struct mutex mapping_lock;
-	bool mapped;
+	enum ib_umem_mapped_state mapped_state;
 	u32 xa_id;
 	struct scatterlist *first_sg;
 	dma_addr_t first_dma_address;
diff --git a/drivers/infiniband/core/peer_mem.c b/drivers/infiniband/core/peer_mem.c
index f01bf924bc3e..7f0ea9479728 100644
--- a/drivers/infiniband/core/peer_mem.c
+++ b/drivers/infiniband/core/peer_mem.c
@@ -296,40 +296,111 @@ static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
 
 static void ib_peer_umem_kref_release(struct kref *kref)
 {
-	kfree(container_of(kref, struct ib_umem_peer, kref));
+	struct ib_umem_peer *umem_p =
+		container_of(kref, struct ib_umem_peer, kref);
+
+	mutex_destroy(&umem_p->mapping_lock);
+	kfree(umem_p);
 }
 
-static void ib_unmap_peer_client(struct ib_umem_peer *umem_p)
+static void ib_unmap_peer_client(struct ib_umem_peer *umem_p,
+				 enum ib_umem_mapped_state cur_state,
+				 enum ib_umem_mapped_state to_state)
 {
 	struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client;
 	const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem;
 	struct ib_umem *umem = &umem_p->umem;
 
-	lockdep_assert_held(&umem_p->mapping_lock);
+	if (cur_state == UMEM_PEER_MAPPED &&
+	    (to_state == UMEM_PEER_UNMAPPED ||
+	     to_state == UMEM_PEER_INVALIDATED)) {
+		/*
+		 * In the invalidated state we will never touch the sg again,
+		 * but the client might, so fix it anyhow.
+		 */
+		if (umem_p->last_sg) {
+			umem_p->last_sg->length = umem_p->last_length;
+			sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length;
+		}
 
-	if (umem_p->last_sg) {
-		umem_p->last_sg->length = umem_p->last_length;
-		sg_dma_len(umem_p->last_sg) = umem_p->last_dma_length;
-	}
+		if (umem_p->first_sg) {
+			umem_p->first_sg->dma_address =
+				umem_p->first_dma_address;
+			umem_p->first_sg->length = umem_p->first_length;
+			sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length;
+		}
+
+		if (to_state == UMEM_PEER_UNMAPPED) {
+			peer_mem->dma_unmap(&umem_p->umem.sg_head,
+					    umem_p->peer_client_context,
+					    umem_p->umem.ibdev->dma_device);
+			peer_mem->put_pages(&umem_p->umem.sg_head,
+					    umem_p->peer_client_context);
+		}
 
-	if (umem_p->first_sg) {
-		umem_p->first_sg->dma_address = umem_p->first_dma_address;
-		umem_p->first_sg->length = umem_p->first_length;
-		sg_dma_len(umem_p->first_sg) = umem_p->first_dma_length;
+		memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+		atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
 	}
 
-	peer_mem->dma_unmap(&umem_p->umem.sg_head, umem_p->peer_client_context,
-			    umem_p->umem.ibdev->dma_device);
-	peer_mem->put_pages(&umem_p->umem.sg_head, umem_p->peer_client_context);
-	memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+	if ((cur_state == UMEM_PEER_MAPPED && to_state == UMEM_PEER_UNMAPPED) ||
+	    (cur_state == UMEM_PEER_INVALIDATED &&
+	     to_state == UMEM_PEER_UNMAPPED)) {
+		atomic64_add(umem->sg_head.nents,
+			     &ib_peer_client->stats.num_dereg_pages);
+		atomic64_add(umem->length,
+			     &ib_peer_client->stats.num_dereg_bytes);
+	}
+	umem_p->mapped_state = to_state;
+}
 
-	atomic64_add(umem->nmap, &ib_peer_client->stats.num_dereg_pages);
-	atomic64_add(umem->length, &ib_peer_client->stats.num_dereg_bytes);
-	atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
-	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
-		xa_store(&ib_peer_client->umem_xa, umem_p->xa_id, NULL,
-			 GFP_KERNEL);
-	umem_p->mapped = false;
+/*
+ * True if the client should do unmap itself after the invalidate callback
+ * returns. Clients operating in this mode need to use this locking pattern:
+ *
+ * client_invalidate:
+ *    mutex_lock(&client_lock)
+ *     invalidate_callback():
+ *       mutex_lock(mapping_lock)
+ *       mutex_unlock(mapping_lock)
+ *     client_dma_unmap()
+ *     client_put_pages()
+ *    mutex_unlock(&client_lock)
+ *
+ * ib_umem_stop_invalidation_notifier():
+ *  mutex_lock(mapping_lock)
+ *  mutex_unlock(mapping_lock)
+ *  peer_mem->dma_unmap():
+ *    mutex_lock(&client_lock)
+ *     client_dma_unmap()
+ *    mutex_unlock(&client_lock)
+ *  peer_mem->put_pages():
+ *    mutex_lock(&client_lock)
+ *     client_put_pages()
+ *    mutex_unlock(&client_lock)
+ *
+ * ib_peer_umem_release():
+ *  peer_mem->release():
+ *    mutex_lock(&client_lock)
+ *    mutex_unlock(&client_lock)
+ *
+ * Noting that dma_unmap/put_pages can be called even though invalidate has
+ * already done the unmap, and release() can be called concurrently with
+ * invalidate. The client must protect itself against these races.
+ */
+static bool ib_peer_unmap_on_invalidate(struct ib_umem_peer *umem_p)
+{
+	const struct peer_memory_client *peer_mem =
+		umem_p->ib_peer_client->peer_mem;
+	const struct peer_memory_client_ex *peer_mem_ex;
+
+	if (peer_mem->version[IB_PEER_MEMORY_VER_MAX - 1] == 0)
+		return false;
+	peer_mem_ex = container_of(peer_mem, const struct peer_memory_client_ex,
+				   client);
+	if (peer_mem_ex->ex_size <
+	    offsetofend(struct peer_memory_client_ex, flags))
+		return false;
+	return peer_mem_ex->flags & PEER_MEM_INVALIDATE_UNMAPS;
 }
 
 static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
@@ -353,16 +424,22 @@ static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
 	xa_unlock(&ib_peer_client->umem_xa);
 
 	mutex_lock(&umem_p->mapping_lock);
-	if (umem_p->mapped) {
-		/*
-		 * At this point the invalidation_func must be !NULL as the get
-		 * flow does not unlock mapping_lock until it is set, and umems
-		 * that do not require invalidation are not in the xarray.
-		 */
+	/*
+	 * For flows that require invalidation the invalidation_func should not
+	 * be NULL while the device can be doing DMA. The mapping_lock ensures
+	 * that the device is ready to receive an invalidation before one is
+	 * triggered here.
+	 */
+	if (umem_p->mapped_state == UMEM_PEER_MAPPED &&
+	    umem_p->invalidation_func)
 		umem_p->invalidation_func(&umem_p->umem,
 					  umem_p->invalidation_private);
-		ib_unmap_peer_client(umem_p);
-	}
+	if (ib_peer_unmap_on_invalidate(umem_p))
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_INVALIDATED);
+	else
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_UNMAPPED);
 	mutex_unlock(&umem_p->mapping_lock);
 	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
 	return 0;
@@ -393,6 +470,47 @@ void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
 }
 EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
 
+/*
+ * Caller has blocked DMA and will no longer be able to handle invalidate
+ * callbacks. Callers using invalidation must call this function before calling
+ * ib_peer_umem_release(). ib_umem_activate_invalidation_notifier() is optional
+ * before doing this.
+ */
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+	struct ib_umem_peer *umem_p =
+		container_of(umem, struct ib_umem_peer, umem);
+	bool unmap_on_invalidate = ib_peer_unmap_on_invalidate(umem_p);
+	enum ib_umem_mapped_state cur_state;
+
+	if (umem_p->invalidation_func) {
+		mutex_lock(&umem_p->mapping_lock);
+		umem_p->invalidation_func = NULL;
+	} else if (umem_p->xa_id != PEER_NO_INVALIDATION_ID) {
+		mutex_lock(&umem_p->mapping_lock);
+	} else {
+		/*
+		 * Haven't called ib_umem_activate_invalidation_notifier() yet,
+		 * still have the lock
+		 */
+	}
+
+	if (!unmap_on_invalidate) {
+		ib_unmap_peer_client(umem_p, umem_p->mapped_state,
+				     UMEM_PEER_UNMAPPED);
+	} else {
+		/* Block ib_invalidate_peer_memory() */
+		cur_state = umem_p->mapped_state;
+		umem_p->mapped_state = UMEM_PEER_UNMAPPED;
+	}
+	mutex_unlock(&umem_p->mapping_lock);
+
+	if (unmap_on_invalidate)
+		ib_unmap_peer_client(umem_p, cur_state, UMEM_PEER_UNMAPPED);
+
+}
+EXPORT_SYMBOL(ib_umem_stop_invalidation_notifier);
+
 static void fix_peer_sgls(struct ib_umem_peer *umem_p, unsigned long peer_page_size)
 {
 	struct ib_umem *umem = &umem_p->umem;
@@ -497,7 +615,7 @@ struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
 	if (peer_page_size != PAGE_SIZE)
 		fix_peer_sgls(umem_p, peer_page_size);
 
-	umem_p->mapped = true;
+	umem_p->mapped_state = UMEM_PEER_MAPPED;
 	atomic64_add(umem_p->umem.nmap, &ib_peer_client->stats.num_reg_pages);
 	atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes);
 	atomic64_inc(&ib_peer_client->stats.num_alloc_mrs);
@@ -505,9 +623,9 @@ struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
 	/*
 	 * If invalidation is allowed then the caller must call
 	 * ib_umem_activate_invalidation_notifier() or ib_peer_umem_release() to
-	 * unlock this mutex. The call to  should be done after the last
-	 * read to sg_head, once the caller is ready for the invalidation
-	 * function to be called.
+	 * unlock this mutex. This call should be done after the last read to
+	 * sg_head, once the caller is ready for the invalidation function to be
+	 * called.
 	 */
 	if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
 		mutex_unlock(&umem_p->mapping_lock);
@@ -537,14 +655,14 @@ void ib_peer_umem_release(struct ib_umem *umem)
 	struct ib_umem_peer *umem_p =
 		container_of(umem, struct ib_umem_peer, umem);
 
-	/* invalidation_func being set indicates activate was called */
-	if (umem_p->xa_id == PEER_NO_INVALIDATION_ID ||
-	    umem_p->invalidation_func)
-		mutex_lock(&umem_p->mapping_lock);
+	/*
+	 * If ib_umem_activate_invalidation_notifier() is called then
+	 * ib_umem_stop_invalidation_notifier() must be called before release.
+	 */
+	WARN_ON(umem_p->invalidation_func);
 
-	if (umem_p->mapped)
-		ib_unmap_peer_client(umem_p);
-	mutex_unlock(&umem_p->mapping_lock);
+	/* For no invalidation cases, make sure it is unmapped */
+	ib_unmap_peer_client(umem_p, umem_p->mapped_state, UMEM_PEER_UNMAPPED);
 
 	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
 		xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
@@ -557,3 +675,14 @@ void ib_peer_umem_release(struct ib_umem *umem)
 
 	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
 }
+
+/* Use it like this:
+struct peer_memory_client_ex peer_memory_test = {
+	.client = {
+		.version = "1.0",
+		.version[IB_PEER_MEMORY_VER_MAX-1] = 1,
+	},
+	.ex_size = sizeof(struct peer_memory_client_ex),
+	.flags = PEER_MEM_INVALIDATE_UNMAPS,
+};
+*/
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index d056e75b4e9b..4ec319be6938 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2265,6 +2265,8 @@ static int devx_umem_reg_cmd_alloc(struct mlx5_ib_dev *dev,
 		 order_base_2(page_size) - MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET(umem, umem, page_offset,
 		 ib_umem_dma_offset(obj->umem, page_size));
+	if (obj->umem->is_peer)
+		MLX5_SET(umem, umem, ats, MLX5_CAP_GEN(dev->mdev, ats));
 
 	mlx5_ib_populate_pas(obj->umem, page_size, mtt,
 			     (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) |
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 4e17a540774a..bf1a018b526c 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1326,6 +1326,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
 	MLX5_SET(mkc, mkc, translations_octword_size,
 		 get_octo_len(iova, umem->length, mr->page_shift));
 	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
+	if (umem->is_peer)
+		MLX5_SET(mkc, mkc, ma_translation_mode,
+			 MLX5_CAP_GEN(dev->mdev, ats));
 	if (populate) {
 		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
 			 get_octo_len(iova, umem->length, mr->page_shift));
@@ -1456,17 +1459,20 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
 	int err;
 
 	xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length);
-	if (xlt_with_umr) {
+	if (xlt_with_umr && !umem->is_peer) {
 		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
 	} else {
 		unsigned int page_size = mlx5_umem_find_best_pgsz(
 			umem, mkc, log_page_size, 0, iova);
 
 		mutex_lock(&dev->slow_path_mutex);
-		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
+		mr = reg_create(pd, umem, iova, access_flags, page_size,
+				!xlt_with_umr);
 		mutex_unlock(&dev->slow_path_mutex);
 	}
 	if (IS_ERR(mr)) {
+		if (umem->is_peer)
+			ib_umem_stop_invalidation_notifier(umem);
 		ib_umem_release(umem);
 		return ERR_CAST(mr);
 	}
@@ -1827,8 +1833,13 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 				return ERR_PTR(err);
 			return NULL;
 		}
-		/* DM or ODP MR's don't have a normal umem so we can't re-use it */
-		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
+		/*
+		 * DM or ODP MR's don't have a normal umem so we can't re-use it.
+		 * Peer umems cannot have their MR's changed once created due
+		 * to races with invalidation.
+		 */
+		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) ||
+		    mr->umem->is_peer)
 			goto recreate;
 
 		/*
@@ -1847,10 +1858,11 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	}
 
 	/*
-	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
-	 * but the logic around releasing the umem is different
+	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does but
+	 * the logic around releasing the umem is different, peer memory
+	 * invalidation semantics are incompatible.
 	 */
-	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
+	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr) || mr->umem->is_peer)
 		goto recreate;
 
 	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
@@ -1980,14 +1992,23 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	}
 
 	/* Stop DMA */
-	if (mr->cache_ent) {
-		if (revoke_mr(mr)) {
-			spin_lock_irq(&mr->cache_ent->lock);
-			mr->cache_ent->total_mrs--;
-			spin_unlock_irq(&mr->cache_ent->lock);
-			mr->cache_ent = NULL;
+	rc = 0;
+	if (mr->cache_ent || (mr->umem && mr->umem->is_peer)) {
+		rc = revoke_mr(mr);
+		if (mr->umem && mr->umem->is_peer) {
+			if (rc)
+				return rc;
+			ib_umem_stop_invalidation_notifier(mr->umem);
 		}
 	}
+
+	if (mr->cache_ent && rc) {
+		spin_lock_irq(&mr->cache_ent->lock);
+		mr->cache_ent->total_mrs--;
+		spin_unlock_irq(&mr->cache_ent->lock);
+		mr->cache_ent = NULL;
+	}
+
 	if (!mr->cache_ent) {
 		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
 		if (rc)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 857529a5568d..1cd44fb39311 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1630,7 +1630,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         steering_format_version[0x4];
 	u8         create_qp_start_hint[0x18];
 
-	u8         reserved_at_460[0x3];
+	u8         reserved_at_460[0x1];
+	u8         ats[0x1];
+	u8         reserved_at_462[0x1];
 	u8         log_max_uctx[0x5];
 	u8         reserved_at_468[0x2];
 	u8         ipsec_offload[0x1];
@@ -3642,7 +3644,9 @@ struct mlx5_ifc_mkc_bits {
 	u8         lw[0x1];
 	u8         lr[0x1];
 	u8         access_mode_1_0[0x2];
-	u8         reserved_at_18[0x8];
+	u8         reserved_at_18[0x2];
+	u8         ma_translation_mode[0x2];
+	u8         reserved_at_1c[0x4];
 
 	u8         qpn[0x18];
 	u8         mkey_7_0[0x8];
@@ -10620,7 +10624,8 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
 struct mlx5_ifc_umem_bits {
 	u8         reserved_at_0[0x80];
 
-	u8         reserved_at_80[0x1b];
+	u8         ats[0x1];
+	u8         reserved_at_81[0x1a];
 	u8         log_page_size[0x5];
 
 	u8         page_offset[0x20];
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 383be28b3f06..c418f91c2886 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -157,6 +157,7 @@ struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr,
 void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
 					   umem_invalidate_func_t func,
 					   void *cookie);
+void ib_umem_stop_invalidation_notifier(struct ib_umem *umem);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -211,6 +212,9 @@ static inline void ib_umem_activate_invalidation_notifier(
 	struct ib_umem *umem, umem_invalidate_func_t func, void *cookie)
 {
 }
+static inline void ib_umem_stop_invalidation_notifier(struct ib_umem *umem)
+{
+}
 
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 #endif /* IB_UMEM_H */
diff --git a/include/rdma/peer_mem.h b/include/rdma/peer_mem.h
index 4c2e633cb4a7..e3dc2ca630d6 100644
--- a/include/rdma/peer_mem.h
+++ b/include/rdma/peer_mem.h
@@ -137,6 +137,16 @@ struct peer_memory_client {
 	void (*release)(void *client_context);
 };
 
+enum {
+	PEER_MEM_INVALIDATE_UNMAPS = 1 << 0,
+};
+
+struct peer_memory_client_ex {
+	struct peer_memory_client client;
+	size_t ex_size;
+	u32 flags;
+};
+
 /*
  * If invalidate_callback() is non-NULL then the client will only support
  * umems which can be invalidated. The caller may call the
-- 
2.33.1