[RFC v2 2/2][Focal] UBUNTU: SAUCE: RDMA/core: Introduce peer memory interface

dann frazier dann.frazier at canonical.com
Tue Aug 31 22:39:06 UTC 2021


From: Feras Daoud <ferasda at mellanox.com>

BugLink: https://bugs.launchpad.net/bugs/1923104

The peer_memory_client scheme allows a driver to register with the ib_umem
system that it has the ability to understand user virtual address ranges
that are not compatible with get_user_pages(). For instance, VMAs created
with io_remap_pfn_range(), or other driver-special VMAs.

For ranges the interface understands, it can provide a DMA mapped sg_table
for use by the ib_umem, allowing user virtual ranges that cannot be
supported by get_user_pages() to be used as umems for RDMA.
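
As a rough illustration only (this snippet is not part of the diff), a peer
memory client is expected to plug in along these lines; every example_* name
is hypothetical and the callback bodies are stubs:

#include <linux/module.h>
#include <linux/mm.h>
#include <rdma/peer_mem.h>

/*
 * Set by ib_register_peer_memory_client(); the client calls it to ask the
 * core to invalidate the mapping identified by a core_context.
 */
static invalidate_peer_memory example_invalidate_cb;
static void *example_reg_handle;

static int example_acquire(unsigned long addr, size_t size,
			   void *peer_mem_private_data, char *peer_mem_name,
			   void **client_context)
{
	/* Return 1 only when the whole range is owned by this driver */
	return 0;
}

static int example_get_pages(unsigned long addr, size_t size, int write,
			     int force, struct sg_table *sg_head,
			     void *client_context, u64 core_context)
{
	/* Pin the peer pages; remember core_context for later invalidation */
	return -EOPNOTSUPP;
}

static int example_dma_map(struct sg_table *sg_head, void *client_context,
			   struct device *dma_device, int dmasync, int *nmap)
{
	/* Fill sg_head with DMA mapped SGLs for dma_device and set *nmap */
	return -EOPNOTSUPP;
}

static int example_dma_unmap(struct sg_table *sg_head, void *client_context,
			     struct device *dma_device)
{
	return 0;
}

static void example_put_pages(struct sg_table *sg_head, void *client_context)
{
}

static unsigned long example_get_page_size(void *client_context)
{
	/* ib_peer_umem_get() derives umem->page_shift from this value */
	return PAGE_SIZE;
}

static const struct peer_memory_client example_client = {
	.name		= "example",
	.version	= "1.0",
	.acquire	= example_acquire,
	.get_pages	= example_get_pages,
	.put_pages	= example_put_pages,
	.dma_map	= example_dma_map,
	.dma_unmap	= example_dma_unmap,
	.get_page_size	= example_get_page_size,
};

static int __init example_init(void)
{
	example_reg_handle =
		ib_register_peer_memory_client(&example_client,
					       &example_invalidate_cb);
	return example_reg_handle ? 0 : -ENODEV;
}

static void __exit example_exit(void)
{
	/* Blocks until every umem built on this client has been destroyed */
	ib_unregister_peer_memory_client(example_reg_handle);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");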

This is designed to preserve the kABI: no functions or structures are
changed, only new symbols are added:

 ib_register_peer_memory_client
 ib_unregister_peer_memory_client
 ib_umem_activate_invalidation_notifier
 ib_umem_get_peer

And a bitfield in struct ib_umem uses more bits.
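
On the umem side, a hypothetical consumer would use the new symbols roughly
as below (mirroring what the mlx5 changes in this patch do; all example_*
names are placeholders):

#include <linux/err.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

static void example_umem_invalidate(struct ib_umem *umem, void *priv)
{
	/*
	 * Called with the peer mapping lock held; HW DMA to this umem must
	 * be stopped before returning (mlx5 does this via unreg_umr()).
	 */
}

static int example_register_range(struct ib_device *ibdev, unsigned long addr,
				  size_t len, int access, void *priv)
{
	struct ib_umem *umem;

	/* Falls back to the peer clients when get_user_pages() cannot pin */
	umem = ib_umem_get_peer(ibdev, addr, len, access,
				IB_PEER_MEM_INVAL_SUPP);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	/* ... build the HW translation tables from umem->sg_head ... */

	if (umem->is_peer)
		ib_umem_activate_invalidation_notifier(umem,
						       example_umem_invalidate,
						       priv);
	/* example_umem_invalidate() may run asynchronously from this point */
	return 0;
}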

This interface is compatible with the two out-of-tree GPU drivers:
 https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/master/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
 https://github.com/Mellanox/nv_peer_memory/blob/master/nv_peer_mem.c

NOTES (remove before sending):
 - The exact locking semantics from the GPU side during invalidation
   are confusing. I've made them sane, but this may still hit locking
   problems. Test with lockdep and test invalidation.

   The main difference here is that get_pages and dma_map are called
   from a context that will block progress of invalidation.

   The old design blocked progress of invalidation using a completion
   around unmap and unpin, so those paths should already be proven safe.

   Since the old design used a completion, lockdep could not see it, even
   though it has basically the same blocking semantics.

 - The API exported to the GPU side is crufty and makes very little
   sense. Functionally it should still be the same, but many useless
   things were dropped.

 - I rewrote all the comments; please check spelling/grammar.

 - Compile tested only

Signed-off-by: Yishai Hadas <yishaih at mellanox.com>
Signed-off-by: Feras Daoud <ferasda at mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg at mellanox.com>
(backported from commit a42989294cf39d6e829424734ab0e7ec48bebcef
 git://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma.git)
[ dannf: Backport provided to Ubuntu's 5.4 by Feras; confirmed
  to pass Nvidia's internal testing. ]
Signed-off-by: dann frazier <dann.frazier at canonical.com>
---
 drivers/infiniband/core/Makefile      |   2 +-
 drivers/infiniband/core/ib_peer_mem.h |  52 +++
 drivers/infiniband/core/peer_mem.c    | 484 ++++++++++++++++++++++++++
 drivers/infiniband/core/umem.c        |  44 ++-
 drivers/infiniband/hw/mlx5/cq.c       |  11 +-
 drivers/infiniband/hw/mlx5/devx.c     |   2 +-
 drivers/infiniband/hw/mlx5/doorbell.c |   4 +-
 drivers/infiniband/hw/mlx5/mem.c      |  11 +-
 drivers/infiniband/hw/mlx5/mr.c       |  47 ++-
 drivers/infiniband/hw/mlx5/qp.c       |   2 +-
 drivers/infiniband/hw/mlx5/srq.c      |   2 +-
 include/rdma/ib_umem.h                |  29 ++
 include/rdma/peer_mem.h               | 165 +++++++++
 13 files changed, 828 insertions(+), 27 deletions(-)
 create mode 100644 drivers/infiniband/core/ib_peer_mem.h
 create mode 100644 drivers/infiniband/core/peer_mem.c
 create mode 100644 include/rdma/peer_mem.h

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 9a8871e21545..4b7838ff6e90 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -34,5 +34,5 @@ ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
 				uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
 				uverbs_std_types_mr.o uverbs_std_types_counters.o \
 				uverbs_uapi.o uverbs_std_types_device.o
-ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o peer_mem.o
 ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/ib_peer_mem.h b/drivers/infiniband/core/ib_peer_mem.h
new file mode 100644
index 000000000000..bb38ffee724a
--- /dev/null
+++ b/drivers/infiniband/core/ib_peer_mem.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2014-2020,  Mellanox Technologies. All rights reserved.
+ */
+#ifndef RDMA_IB_PEER_MEM_H
+#define RDMA_IB_PEER_MEM_H
+
+#include <rdma/peer_mem.h>
+#include <linux/kobject.h>
+#include <linux/xarray.h>
+#include <rdma/ib_umem.h>
+
+struct ib_peer_memory_statistics {
+	atomic64_t num_alloc_mrs;
+	atomic64_t num_dealloc_mrs;
+	atomic64_t num_reg_pages;
+	atomic64_t num_dereg_pages;
+	atomic64_t num_reg_bytes;
+	atomic64_t num_dereg_bytes;
+	unsigned long num_free_callbacks;
+};
+
+struct ib_peer_memory_client {
+	struct kobject kobj;
+	refcount_t usecnt;
+	struct completion usecnt_zero;
+	const struct peer_memory_client *peer_mem;
+	struct list_head core_peer_list;
+	struct ib_peer_memory_statistics stats;
+	struct xarray umem_xa;
+	u32 xa_cyclic_next;
+	bool invalidation_required;
+};
+
+struct ib_umem_peer {
+	struct ib_umem umem;
+	struct kref kref;
+	/* peer memory that manages this umem */
+	struct ib_peer_memory_client *ib_peer_client;
+	void *peer_client_context;
+	umem_invalidate_func_t invalidation_func;
+	void *invalidation_private;
+	struct mutex mapping_lock;
+	bool mapped;
+	u32 xa_id;
+};
+
+struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
+				 unsigned long peer_mem_flags);
+void ib_peer_umem_release(struct ib_umem *umem);
+
+#endif
diff --git a/drivers/infiniband/core/peer_mem.c b/drivers/infiniband/core/peer_mem.c
new file mode 100644
index 000000000000..833865578cb0
--- /dev/null
+++ b/drivers/infiniband/core/peer_mem.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2014-2020,  Mellanox Technologies. All rights reserved.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <linux/sched/mm.h>
+#include "ib_peer_mem.h"
+static DEFINE_MUTEX(peer_memory_mutex);
+static LIST_HEAD(peer_memory_list);
+static struct kobject *peers_kobj;
+#define PEER_NO_INVALIDATION_ID U32_MAX
+static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context);
+struct peer_mem_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct ib_peer_memory_client *ib_peer_client,
+			struct peer_mem_attribute *attr, char *buf);
+	ssize_t (*store)(struct ib_peer_memory_client *ib_peer_client,
+			 struct peer_mem_attribute *attr, const char *buf,
+			 size_t count);
+};
+
+#define PEER_ATTR_RO(_name)                                                    \
+	struct peer_mem_attribute peer_attr_ ## _name = __ATTR_RO(_name)
+
+static ssize_t version_show(struct ib_peer_memory_client *ib_peer_client,
+			    struct peer_mem_attribute *attr, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%s\n",
+			 ib_peer_client->peer_mem->version);
+}
+
+static PEER_ATTR_RO(version);
+static ssize_t num_alloc_mrs_show(struct ib_peer_memory_client *ib_peer_client,
+				  struct peer_mem_attribute *attr, char *buf)
+{
+	return scnprintf(
+		buf, PAGE_SIZE, "%llu\n",
+		(u64)atomic64_read(&ib_peer_client->stats.num_alloc_mrs));
+}
+
+static PEER_ATTR_RO(num_alloc_mrs);
+static ssize_t
+num_dealloc_mrs_show(struct ib_peer_memory_client *ib_peer_client,
+		     struct peer_mem_attribute *attr, char *buf)
+{
+	return scnprintf(
+		buf, PAGE_SIZE, "%llu\n",
+		(u64)atomic64_read(&ib_peer_client->stats.num_dealloc_mrs));
+}
+
+static PEER_ATTR_RO(num_dealloc_mrs);
+static ssize_t num_reg_pages_show(struct ib_peer_memory_client *ib_peer_client,
+				  struct peer_mem_attribute *attr, char *buf)
+{
+	return scnprintf(
+		buf, PAGE_SIZE, "%llu\n",
+		(u64)atomic64_read(&ib_peer_client->stats.num_reg_pages));
+}
+
+static PEER_ATTR_RO(num_reg_pages);
+static ssize_t
+num_dereg_pages_show(struct ib_peer_memory_client *ib_peer_client,
+		     struct peer_mem_attribute *attr, char *buf)
+{
+	return scnprintf(
+		buf, PAGE_SIZE, "%llu\n",
+		(u64)atomic64_read(&ib_peer_client->stats.num_dereg_pages));
+}
+
+static PEER_ATTR_RO(num_dereg_pages);
+static ssize_t num_reg_bytes_show(struct ib_peer_memory_client *ib_peer_client,
+				  struct peer_mem_attribute *attr, char *buf)
+{
+	return scnprintf(
+		buf, PAGE_SIZE, "%llu\n",
+		(u64)atomic64_read(&ib_peer_client->stats.num_reg_bytes));
+}
+
+static PEER_ATTR_RO(num_reg_bytes);
+static ssize_t
+num_dereg_bytes_show(struct ib_peer_memory_client *ib_peer_client,
+		     struct peer_mem_attribute *attr, char *buf)
+{
+	return scnprintf(
+		buf, PAGE_SIZE, "%llu\n",
+		(u64)atomic64_read(&ib_peer_client->stats.num_dereg_bytes));
+}
+
+static PEER_ATTR_RO(num_dereg_bytes);
+static ssize_t
+num_free_callbacks_show(struct ib_peer_memory_client *ib_peer_client,
+			struct peer_mem_attribute *attr, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%lu\n",
+			 ib_peer_client->stats.num_free_callbacks);
+}
+
+static PEER_ATTR_RO(num_free_callbacks);
+static struct attribute *peer_mem_attrs[] = {
+			&peer_attr_version.attr,
+			&peer_attr_num_alloc_mrs.attr,
+			&peer_attr_num_dealloc_mrs.attr,
+			&peer_attr_num_reg_pages.attr,
+			&peer_attr_num_dereg_pages.attr,
+			&peer_attr_num_reg_bytes.attr,
+			&peer_attr_num_dereg_bytes.attr,
+			&peer_attr_num_free_callbacks.attr,
+			NULL,
+};
+
+static const struct attribute_group peer_mem_attr_group = {
+	.attrs = peer_mem_attrs,
+};
+
+static ssize_t peer_attr_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct peer_mem_attribute *peer_attr =
+		container_of(attr, struct peer_mem_attribute, attr);
+	if (!peer_attr->show)
+		return -EIO;
+	return peer_attr->show(container_of(kobj, struct ib_peer_memory_client,
+					    kobj),
+			       peer_attr, buf);
+}
+
+static const struct sysfs_ops peer_mem_sysfs_ops = {
+	.show = peer_attr_show,
+};
+
+static void ib_peer_memory_client_release(struct kobject *kobj)
+{
+	struct ib_peer_memory_client *ib_peer_client =
+		container_of(kobj, struct ib_peer_memory_client, kobj);
+	kfree(ib_peer_client);
+}
+
+static struct kobj_type peer_mem_type = {
+	.sysfs_ops = &peer_mem_sysfs_ops,
+	.release = ib_peer_memory_client_release,
+};
+
+static int ib_memory_peer_check_mandatory(const struct peer_memory_client
+						     *peer_client)
+{
+#define PEER_MEM_MANDATORY_FUNC(x) {offsetof(struct peer_memory_client, x), #x}
+	int i;
+	static const struct {
+		size_t offset;
+		char *name;
+	} mandatory_table[] = {
+		PEER_MEM_MANDATORY_FUNC(acquire),
+		PEER_MEM_MANDATORY_FUNC(get_pages),
+		PEER_MEM_MANDATORY_FUNC(put_pages),
+		PEER_MEM_MANDATORY_FUNC(dma_map),
+		PEER_MEM_MANDATORY_FUNC(dma_unmap),
+	};
+	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+		if (!*(void **)((void *)peer_client +
+				mandatory_table[i].offset)) {
+			pr_err("Peer memory %s is missing mandatory function %s\n",
+			       peer_client->name, mandatory_table[i].name);
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+void *
+ib_register_peer_memory_client(const struct peer_memory_client *peer_client,
+			       invalidate_peer_memory *invalidate_callback)
+{
+	struct ib_peer_memory_client *ib_peer_client;
+	int ret;
+	if (ib_memory_peer_check_mandatory(peer_client))
+		return NULL;
+	ib_peer_client = kzalloc(sizeof(*ib_peer_client), GFP_KERNEL);
+	if (!ib_peer_client)
+		return NULL;
+	kobject_init(&ib_peer_client->kobj, &peer_mem_type);
+	refcount_set(&ib_peer_client->usecnt, 1);
+	init_completion(&ib_peer_client->usecnt_zero);
+	ib_peer_client->peer_mem = peer_client;
+	xa_init_flags(&ib_peer_client->umem_xa, XA_FLAGS_ALLOC);
+	/*
+	 * If the peer wants the invalidation_callback then all memory users
+	 * linked to that peer must support invalidation.
+	 */
+	if (invalidate_callback) {
+		*invalidate_callback = ib_invalidate_peer_memory;
+		ib_peer_client->invalidation_required = true;
+	}
+	mutex_lock(&peer_memory_mutex);
+	if (!peers_kobj) {
+		/* Created under /sys/kernel/mm */
+		peers_kobj = kobject_create_and_add("memory_peers", mm_kobj);
+		if (!peers_kobj)
+			goto err_unlock;
+	}
+	ret = kobject_add(&ib_peer_client->kobj, peers_kobj, peer_client->name);
+	if (ret)
+		goto err_parent;
+	ret = sysfs_create_group(&ib_peer_client->kobj,
+				 &peer_mem_attr_group);
+	if (ret)
+		goto err_parent;
+	list_add_tail(&ib_peer_client->core_peer_list, &peer_memory_list);
+	mutex_unlock(&peer_memory_mutex);
+	return ib_peer_client;
+err_parent:
+	if (list_empty(&peer_memory_list)) {
+		kobject_put(peers_kobj);
+		peers_kobj = NULL;
+	}
+err_unlock:
+	mutex_unlock(&peer_memory_mutex);
+	kobject_put(&ib_peer_client->kobj);
+	return NULL;
+}
+EXPORT_SYMBOL(ib_register_peer_memory_client);
+
+void ib_unregister_peer_memory_client(void *reg_handle)
+{
+	struct ib_peer_memory_client *ib_peer_client = reg_handle;
+	mutex_lock(&peer_memory_mutex);
+	list_del(&ib_peer_client->core_peer_list);
+	if (list_empty(&peer_memory_list)) {
+		kobject_put(peers_kobj);
+		peers_kobj = NULL;
+	}
+	mutex_unlock(&peer_memory_mutex);
+	/*
+	 * Wait for all umems to be destroyed before returning. Once
+	 * ib_unregister_peer_memory_client() returns no umems will call any
+	 * peer_mem ops.
+	 */
+	if (refcount_dec_and_test(&ib_peer_client->usecnt))
+		complete(&ib_peer_client->usecnt_zero);
+	wait_for_completion(&ib_peer_client->usecnt_zero);
+	kobject_put(&ib_peer_client->kobj);
+}
+EXPORT_SYMBOL(ib_unregister_peer_memory_client);
+
+static struct ib_peer_memory_client *
+ib_get_peer_client(unsigned long addr, size_t size,
+		   unsigned long peer_mem_flags, void **peer_client_context)
+{
+	struct ib_peer_memory_client *ib_peer_client;
+	int ret = 0;
+	mutex_lock(&peer_memory_mutex);
+	list_for_each_entry(ib_peer_client, &peer_memory_list,
+			    core_peer_list) {
+		if (ib_peer_client->invalidation_required &&
+		    (!(peer_mem_flags & IB_PEER_MEM_INVAL_SUPP)))
+			continue;
+		ret = ib_peer_client->peer_mem->acquire(addr, size, NULL, NULL,
+							peer_client_context);
+		if (ret > 0) {
+			refcount_inc(&ib_peer_client->usecnt);
+			mutex_unlock(&peer_memory_mutex);
+			return ib_peer_client;
+		}
+	}
+	mutex_unlock(&peer_memory_mutex);
+	return NULL;
+}
+
+static void ib_put_peer_client(struct ib_peer_memory_client *ib_peer_client,
+			       void *peer_client_context)
+{
+	if (ib_peer_client->peer_mem->release)
+		ib_peer_client->peer_mem->release(peer_client_context);
+	if (refcount_dec_and_test(&ib_peer_client->usecnt))
+		complete(&ib_peer_client->usecnt_zero);
+}
+
+static void ib_peer_umem_kref_release(struct kref *kref)
+{
+	kfree(container_of(kref, struct ib_umem_peer, kref));
+}
+
+static void ib_unmap_peer_client(struct ib_umem_peer *umem_p)
+{
+	struct ib_peer_memory_client *ib_peer_client = umem_p->ib_peer_client;
+	const struct peer_memory_client *peer_mem = ib_peer_client->peer_mem;
+	struct ib_umem *umem = &umem_p->umem;
+
+	lockdep_assert_held(&umem_p->mapping_lock);
+
+	peer_mem->dma_unmap(&umem_p->umem.sg_head, umem_p->peer_client_context,
+			    umem_p->umem.ibdev->dma_device);
+	peer_mem->put_pages(&umem_p->umem.sg_head, umem_p->peer_client_context);
+	memset(&umem->sg_head, 0, sizeof(umem->sg_head));
+
+	atomic64_add(umem->nmap, &ib_peer_client->stats.num_dereg_pages);
+	atomic64_add(umem->length, &ib_peer_client->stats.num_dereg_bytes);
+	atomic64_inc(&ib_peer_client->stats.num_dealloc_mrs);
+
+	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
+		xa_store(&ib_peer_client->umem_xa, umem_p->xa_id, NULL,
+			 GFP_KERNEL);
+	umem_p->mapped = false;
+}
+
+static int ib_invalidate_peer_memory(void *reg_handle, u64 core_context)
+{
+	struct ib_peer_memory_client *ib_peer_client = reg_handle;
+	struct ib_umem_peer *umem_p;
+
+	/*
+	 * The client is not required to fence against invalidation during
+	 * put_pages() as that would deadlock when we call put_pages() here.
+	 * Thus the core_context cannot be a umem pointer as we have no control
+	 * over the lifetime. Since we won't change the kABI for this to add a
+	 * proper kref, an xarray is used.
+	 */
+	xa_lock(&ib_peer_client->umem_xa);
+	ib_peer_client->stats.num_free_callbacks += 1;
+	umem_p = xa_load(&ib_peer_client->umem_xa, core_context);
+	if (!umem_p)
+		goto out_unlock;
+	kref_get(&umem_p->kref);
+	xa_unlock(&ib_peer_client->umem_xa);
+	mutex_lock(&umem_p->mapping_lock);
+	if (umem_p->mapped) {
+		/*
+		 * At this point the invalidation_func must be !NULL as the get
+		 * flow does not unlock mapping_lock until it is set, and umems
+		 * that do not require invalidation are not in the xarray.
+		 */
+		umem_p->invalidation_func(&umem_p->umem,
+					  umem_p->invalidation_private);
+		ib_unmap_peer_client(umem_p);
+	}
+	mutex_unlock(&umem_p->mapping_lock);
+	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+	return 0;
+out_unlock:
+	xa_unlock(&ib_peer_client->umem_xa);
+	return 0;
+}
+
+void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
+					    umem_invalidate_func_t func,
+					    void *priv)
+{
+	struct ib_umem_peer *umem_p =
+		container_of(umem, struct ib_umem_peer, umem);
+
+	if (WARN_ON(!umem->is_peer))
+		return;
+	if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
+		return;
+
+	umem_p->invalidation_func = func;
+	umem_p->invalidation_private = priv;
+	/* Pairs with the lock in ib_peer_umem_get() */
+	mutex_unlock(&umem_p->mapping_lock);
+
+	/* At this point func can be called asynchronously */
+}
+EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier);
+
+struct ib_umem *ib_peer_umem_get(struct ib_umem *old_umem, int old_ret,
+				 unsigned long peer_mem_flags)
+{
+	struct ib_peer_memory_client *ib_peer_client;
+	void *peer_client_context;
+	struct ib_umem_peer *umem_p;
+	int ret;
+	ib_peer_client =
+		ib_get_peer_client(old_umem->address, old_umem->length,
+				   peer_mem_flags, &peer_client_context);
+	if (!ib_peer_client)
+		return ERR_PTR(old_ret);
+	umem_p = kzalloc(sizeof(*umem_p), GFP_KERNEL);
+	if (!umem_p) {
+		ret = -ENOMEM;
+		goto err_client;
+	}
+
+	kref_init(&umem_p->kref);
+	umem_p->umem = *old_umem;
+	memset(&umem_p->umem.sg_head, 0, sizeof(umem_p->umem.sg_head));
+	umem_p->umem.is_peer = 1;
+	umem_p->ib_peer_client = ib_peer_client;
+	umem_p->peer_client_context = peer_client_context;
+	mutex_init(&umem_p->mapping_lock);
+	umem_p->xa_id = PEER_NO_INVALIDATION_ID;
+
+	mutex_lock(&umem_p->mapping_lock);
+	if (ib_peer_client->invalidation_required) {
+		ret = xa_alloc_cyclic(&ib_peer_client->umem_xa, &umem_p->xa_id,
+				      umem_p,
+				      XA_LIMIT(0, PEER_NO_INVALIDATION_ID - 1),
+				      &ib_peer_client->xa_cyclic_next,
+				      GFP_KERNEL);
+		if (ret < 0)
+			goto err_umem;
+	}
+
+	/*
+	 * We always request write permissions to the pages, to force breaking
+	 * of any CoW during the registration of the MR. For read-only MRs we
+	 * use the "force" flag to indicate that CoW breaking is required but
+	 * the registration should not fail if referencing read-only areas.
+	 */
+	ret = ib_peer_client->peer_mem->get_pages(umem_p->umem.address,
+						  umem_p->umem.length, 1,
+						  !umem_p->umem.writable, NULL,
+						  peer_client_context,
+						  umem_p->xa_id);
+	if (ret)
+		goto err_xa;
+
+	umem_p->umem.page_shift =
+		ilog2(ib_peer_client->peer_mem->get_page_size(peer_client_context));
+
+	ret = ib_peer_client->peer_mem->dma_map(&umem_p->umem.sg_head,
+						peer_client_context,
+						umem_p->umem.ibdev->dma_device,
+						0, &umem_p->umem.nmap);
+	if (ret)
+		goto err_pages;
+
+	umem_p->mapped = true;
+	atomic64_add(umem_p->umem.nmap, &ib_peer_client->stats.num_reg_pages);
+	atomic64_add(umem_p->umem.length, &ib_peer_client->stats.num_reg_bytes);
+	atomic64_inc(&ib_peer_client->stats.num_alloc_mrs);
+
+	/*
+	 * If invalidation is allowed then the caller must call
+	 * ib_umem_activate_invalidation_notifier() or ib_peer_umem_release() to
+	 * unlock this mutex. This call should be done after the last read of
+	 * sg_head, once the caller is ready for the invalidation function to
+	 * be called.
+	 */
+	if (umem_p->xa_id == PEER_NO_INVALIDATION_ID)
+		mutex_unlock(&umem_p->mapping_lock);
+	/*
+	 * On success the old umem is replaced with the new, larger, allocation
+	 */
+	kfree(old_umem);
+	return &umem_p->umem;
+err_pages:
+	ib_peer_client->peer_mem->put_pages(&umem_p->umem.sg_head,
+					    umem_p->peer_client_context);
+err_xa:
+	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
+		xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
+err_umem:
+	mutex_unlock(&umem_p->mapping_lock);
+	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+err_client:
+	ib_put_peer_client(ib_peer_client, peer_client_context);
+	return ERR_PTR(ret);
+}
+
+void ib_peer_umem_release(struct ib_umem *umem)
+{
+	struct ib_umem_peer *umem_p =
+		container_of(umem, struct ib_umem_peer, umem);
+
+	/* invalidation_func being set indicates activate was called */
+	if (umem_p->xa_id == PEER_NO_INVALIDATION_ID ||
+	    umem_p->invalidation_func)
+		mutex_lock(&umem_p->mapping_lock);
+
+	if (umem_p->mapped)
+		ib_unmap_peer_client(umem_p);
+	mutex_unlock(&umem_p->mapping_lock);
+
+	if (umem_p->xa_id != PEER_NO_INVALIDATION_ID)
+		xa_erase(&umem_p->ib_peer_client->umem_xa, umem_p->xa_id);
+	ib_put_peer_client(umem_p->ib_peer_client, umem_p->peer_client_context);
+	umem_p->ib_peer_client = NULL;
+
+	/* Must match ib_umem_release() */
+	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
+	mmdrop(umem->owning_mm);
+
+	kref_put(&umem_p->kref, ib_peer_umem_kref_release);
+}
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 698c5359f643..e7473285e470 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -42,6 +42,7 @@
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
+#include "ib_peer_mem.h"
 
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
@@ -193,15 +194,17 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 EXPORT_SYMBOL(ib_umem_find_best_pgsz);
 
 /**
- * ib_umem_get - Pin and DMA map userspace memory.
+ * __ib_umem_get - Pin and DMA map userspace memory.
  *
  * @device: IB device to connect UMEM
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @peer_mem_flags: IB_PEER_MEM_xxx flags for memory being used
  */
-struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
-			    size_t size, int access)
+struct ib_umem *__ib_umem_get(struct ib_device *device,
+			      unsigned long addr, size_t size, int access,
+			      unsigned long peer_mem_flags)
 {
 	struct ib_umem *umem;
 	struct page **page_list;
@@ -309,6 +312,24 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
 
 umem_release:
 	__ib_umem_release(device, umem, 0);
+	/*
+	 * If the address belongs to a peer memory client, then the first
+	 * call to get_user_pages will fail. In this case, try to get
+	 * these pages from the peers.
+	 */
+	//FIXME: this placement is horrible
+	if (ret < 0 && peer_mem_flags & IB_PEER_MEM_ALLOW) {
+		struct ib_umem *new_umem;
+
+		new_umem = ib_peer_umem_get(umem, ret, peer_mem_flags);
+		if (IS_ERR(new_umem)) {
+			ret = PTR_ERR(new_umem);
+			goto vma;
+		}
+		umem = new_umem;
+		ret = 0;
+		goto out;
+	}
 vma:
 	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
 out:
@@ -320,8 +341,23 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
 	}
 	return ret ? ERR_PTR(ret) : umem;
 }
+
+struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
+			    size_t size, int access)
+{
+	return __ib_umem_get(device, addr, size, access, 0);
+}
 EXPORT_SYMBOL(ib_umem_get);
 
+struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr,
+				 size_t size, int access,
+				 unsigned long peer_mem_flags)
+{
+	return __ib_umem_get(device, addr, size, access,
+			     IB_PEER_MEM_ALLOW | peer_mem_flags);
+}
+EXPORT_SYMBOL(ib_umem_get_peer);
+
 /**
  * ib_umem_release - release memory pinned with ib_umem_get
  * @umem: umem struct to release
@@ -333,6 +369,8 @@ void ib_umem_release(struct ib_umem *umem)
 	if (umem->is_odp)
 		return ib_umem_odp_release(to_ib_umem_odp(umem));
 
+	if (umem->is_peer)
+		return ib_peer_umem_release(umem);
 	__ib_umem_release(umem->ibdev, umem, 1);
 
 	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 2f5ee37c252b..cd2241bb865a 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -733,8 +733,9 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	*cqe_size = ucmd.cqe_size;
 
 	cq->buf.umem =
-		ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
-			    entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE);
+		ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr,
+				 entries * ucmd.cqe_size,
+				 IB_ACCESS_LOCAL_WRITE, 0);
 	if (IS_ERR(cq->buf.umem)) {
 		err = PTR_ERR(cq->buf.umem);
 		return err;
@@ -1132,9 +1133,9 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
 	if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
 		return -EINVAL;
 
-	umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
-			   (size_t)ucmd.cqe_size * entries,
-			   IB_ACCESS_LOCAL_WRITE);
+	umem = ib_umem_get_peer(&dev->ib_dev, ucmd.buf_addr,
+				(size_t)ucmd.cqe_size * entries,
+				IB_ACCESS_LOCAL_WRITE, 0);
 	if (IS_ERR(umem)) {
 		err = PTR_ERR(umem);
 		return err;
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index c3b4b6586d17..f8f8507c7938 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -2143,7 +2143,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
 	if (err)
 		return err;
 
-	obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
+	obj->umem = ib_umem_get_peer(&dev->ib_dev, addr, size, access, 0);
 	if (IS_ERR(obj->umem))
 		return PTR_ERR(obj->umem);
 
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
index 61475b571531..a2a7e121ee5f 100644
--- a/drivers/infiniband/hw/mlx5/doorbell.c
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -64,8 +64,8 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
 
 	page->user_virt = (virt & PAGE_MASK);
 	page->refcnt    = 0;
-	page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
-				 PAGE_SIZE, 0);
+	page->umem = ib_umem_get_peer(context->ibucontext.device, virt & PAGE_MASK,
+				      PAGE_SIZE, 0, 0);
 	if (IS_ERR(page->umem)) {
 		err = PTR_ERR(page->umem);
 		kfree(page);
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index b5aece786b36..174567af5ddd 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -55,16 +55,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 	int i = 0;
 	struct scatterlist *sg;
 	int entry;
+	int page_shift = umem->is_peer ? umem->page_shift : PAGE_SHIFT;
 
-	addr = addr >> PAGE_SHIFT;
+	addr = addr >> page_shift;
 	tmp = (unsigned long)addr;
 	m = find_first_bit(&tmp, BITS_PER_LONG);
 	if (max_page_shift)
-		m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
+		m = min_t(unsigned long, max_page_shift - page_shift, m);
 
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
-		len = sg_dma_len(sg) >> PAGE_SHIFT;
-		pfn = sg_dma_address(sg) >> PAGE_SHIFT;
+		len = sg_dma_len(sg) >> page_shift;
+		pfn = sg_dma_address(sg) >> page_shift;
 		if (base + p != pfn) {
 			/* If either the offset or the new
 			 * base are unaligned update m
@@ -96,7 +97,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 
 		*ncont = 0;
 	}
-	*shift = PAGE_SHIFT + m;
+	*shift = page_shift + m;
 	*count = i;
 }
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 24daf420317e..2d075ca40bfc 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -41,6 +41,8 @@
 #include <rdma/ib_verbs.h>
 #include "mlx5_ib.h"
 
+static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv);
+
 enum {
 	MAX_PENDING_REG_MR = 8,
 };
@@ -754,7 +756,7 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev)
 
 static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length,
 		       int access_flags, struct ib_umem **umem, int *npages,
-		       int *page_shift, int *ncont, int *order)
+		       int *page_shift, int *ncont, int *order, bool allow_peer)
 {
 	struct ib_umem *u;
 
@@ -779,7 +781,13 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length,
 		if (order)
 			*order = ilog2(roundup_pow_of_two(*ncont));
 	} else {
-		u = ib_umem_get(&dev->ib_dev, start, length, access_flags);
+		if (allow_peer)
+			u = ib_umem_get_peer(&dev->ib_dev, start, length,
+					     access_flags,
+					     IB_PEER_MEM_INVAL_SUPP);
+		else
+			u = ib_umem_get(&dev->ib_dev, start, length,
+					access_flags);
 		if (IS_ERR(u)) {
 			mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
 			return PTR_ERR(u);
@@ -1280,7 +1288,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	}
 
 	err = mr_umem_get(dev, start, length, access_flags, &umem,
-			  &npages, &page_shift, &ncont, &order);
+			  &npages, &page_shift, &ncont, &order, true);
 
 	if (err < 0)
 		return ERR_PTR(err);
@@ -1335,6 +1343,12 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		}
 	}
 
+	if (umem->is_peer) {
+		ib_umem_activate_invalidation_notifier(
+			umem, mlx5_invalidate_umem, mr);
+		/* After this point the MR can be invalidated */
+	}
+
 	if (is_odp_mr(mr)) {
 		to_ib_umem_odp(mr->umem)->private = mr;
 		atomic_set(&mr->num_pending_prefetch, 0);
@@ -1412,6 +1426,10 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 
 	atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
 
+	/* Peer memory isn't supported */
+	if (mr->umem->is_peer)
+		return -EOPNOTSUPP;
+
 	if (!mr->umem)
 		return -EINVAL;
 
@@ -1435,7 +1453,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 		ib_umem_release(mr->umem);
 		mr->umem = NULL;
 		err = mr_umem_get(dev, addr, len, access_flags, &mr->umem,
-				  &npages, &page_shift, &ncont, &order);
+				  &npages, &page_shift, &ncont, &order, false);
 		if (err)
 			goto err;
 	}
@@ -1615,13 +1633,14 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	 * We should unregister the DMA address from the HCA before
 	 * remove the DMA mapping.
 	 */
-	mlx5_mr_cache_free(dev, mr);
+	if (mr->allocated_from_cache)
+		mlx5_mr_cache_free(dev, mr);
+	else
+		kfree(mr);
+
 	ib_umem_release(umem);
 	if (umem)
 		atomic_sub(npages, &dev->mdev->priv.reg_pages);
-
-	if (!mr->allocated_from_cache)
-		kfree(mr);
 }
 
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
@@ -2331,3 +2350,15 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 
 	return n;
 }
+
+static void mlx5_invalidate_umem(struct ib_umem *umem, void *priv)
+{
+	struct mlx5_ib_mr *mr = priv;
+
+	/*
+	 * DMA is turned off for the mkey, but the mkey remains otherwise
+	 * untouched until the normal flow of dereg_mr happens. Any access to
+	 * this mkey will generate CQEs.
+	 */
+	unreg_umr(mr->dev, mr);
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 45faab9e1313..be59c6d5ba1c 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -749,7 +749,7 @@ static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 {
 	int err;
 
-	*umem = ib_umem_get(&dev->ib_dev, addr, size, 0);
+	*umem = ib_umem_get_peer(&dev->ib_dev, addr, size, 0, 0);
 	if (IS_ERR(*umem)) {
 		mlx5_ib_dbg(dev, "umem_get failed\n");
 		return PTR_ERR(*umem);
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 6d1ff13d2283..2f55f7e1923d 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -80,7 +80,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
 
 	srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
 
-	srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0);
+	srq->umem = ib_umem_get_peer(pd->device, ucmd.buf_addr, buf_size, 0, 0);
 	if (IS_ERR(srq->umem)) {
 		mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
 		err = PTR_ERR(srq->umem);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 9353910915d4..ec9824cbf49d 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -48,10 +48,19 @@ struct ib_umem {
 	unsigned long		address;
 	u32 writable : 1;
 	u32 is_odp : 1;
+	/* Placing at the end of the bitfield list is ABI preserving on LE */
+	u32 is_peer : 1;
 	struct work_struct	work;
 	struct sg_table sg_head;
 	int             nmap;
 	unsigned int    sg_nents;
+	unsigned int    page_shift;
+};
+
+typedef void (*umem_invalidate_func_t)(struct ib_umem *umem, void *priv);
+enum ib_peer_mem_flags {
+	IB_PEER_MEM_ALLOW = 1 << 0,
+	IB_PEER_MEM_INVAL_SUPP = 1 << 1,
 };
 
 /* Returns the offset of the umem start relative to the first page. */
@@ -79,6 +88,13 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 				     unsigned long pgsz_bitmap,
 				     unsigned long virt);
 
+struct ib_umem *ib_umem_get_peer(struct ib_device *device, unsigned long addr,
+				 size_t size, int access,
+				 unsigned long peer_mem_flags);
+void ib_umem_activate_invalidation_notifier(struct ib_umem *umem,
+					    umem_invalidate_func_t func,
+					    void *cookie);
+
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
 #include <linux/err.h>
@@ -102,6 +118,19 @@ static inline unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 	return 0;
 }
 
+static inline struct ib_umem *ib_umem_get_peer(struct ib_device *device,
+					       unsigned long addr, size_t size,
+					       int access,
+					       unsigned long peer_mem_flags)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline void ib_umem_activate_invalidation_notifier(
+	struct ib_umem *umem, umem_invalidate_func_t func, void *cookie)
+{
+}
+
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 
 #endif /* IB_UMEM_H */
diff --git a/include/rdma/peer_mem.h b/include/rdma/peer_mem.h
new file mode 100644
index 000000000000..563a820dbc32
--- /dev/null
+++ b/include/rdma/peer_mem.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2014-2020,  Mellanox Technologies. All rights reserved.
+ */
+#ifndef RDMA_PEER_MEM_H
+#define RDMA_PEER_MEM_H
+
+#include <linux/scatterlist.h>
+
+#define IB_PEER_MEMORY_NAME_MAX 64
+#define IB_PEER_MEMORY_VER_MAX 16
+
+/*
+ * Prior versions used a void * for core_context, at some point this was
+ * switched to use u64. Be careful if compiling this as 32 bit. To help, the
+ * value of core_context is limited to u32, so it should work OK despite the
+ * type change.
+ */
+#define PEER_MEM_U64_CORE_CONTEXT
+
+struct device;
+
+/**
+ *  struct peer_memory_client - registration information for user virtual
+ *                              memory handlers
+ *
+ * The peer_memory_client scheme allows a driver to register with the ib_umem
+ * system that it has the ability to understand user virtual address ranges
+ * that are not compatible with get_user_pages(). For instance, VMAs created
+ * with io_remap_pfn_range(), or other driver-special VMAs.
+ *
+ * For ranges the interface understands, it can provide a DMA mapped sg_table
+ * for use by the ib_umem, allowing user virtual ranges that cannot be
+ * supported by get_user_pages() to be used as umems.
+ */
+struct peer_memory_client {
+	char name[IB_PEER_MEMORY_NAME_MAX];
+	char version[IB_PEER_MEMORY_VER_MAX];
+
+	/**
+	 * acquire - Begin working with a user space virtual address range
+	 *
+	 * @addr - Virtual address to be checked whether it belongs to the peer.
+	 * @size - Length of the virtual memory area starting at addr.
+	 * @peer_mem_private_data - Obsolete, always NULL
+	 * @peer_mem_name - Obsolete, always NULL
+	 * @client_context - Returns an opaque value for this acquire use in
+	 *                   other APIs
+	 *
+	 * Returns 1 if the peer_memory_client supports the entire virtual
+	 * address range, 0 or -ERRNO otherwise. If 1 is returned then
+	 * release() will be called to release the acquire().
+	 */
+	int (*acquire)(unsigned long addr, size_t size,
+		       void *peer_mem_private_data, char *peer_mem_name,
+		       void **client_context);
+	/**
+	 * get_pages - Fill in the first part of a sg_table for a virtual
+	 *             address range
+	 *
+	 * @addr - Virtual address to be checked whether it belongs to the peer.
+	 * @size - Length of the virtual memory area starting at addr.
+	 * @write - Always 1
+	 * @force - 1 if write is required
+	 * @sg_head - Obsolete, always NULL
+	 * @client_context - Value returned by acquire()
+	 * @core_context - Value to be passed to invalidate_peer_memory for
+	 *                 this get
+	 *
+	 * addr/size are passed as the raw virtual address range requested by
+	 * the user; it is not aligned to any page size. get_pages() is always
+	 * followed by dma_map().
+	 *
+	 * Upon return the caller can call the invalidate_callback().
+	 *
+	 * Returns 0 on success, -ERRNO on failure. After success put_pages()
+	 * will be called to return the pages.
+	 */
+	int (*get_pages)(unsigned long addr, size_t size, int write, int force,
+			 struct sg_table *sg_head, void *client_context,
+			 u64 core_context);
+	/**
+	 * dma_map - Create a DMA mapped sg_table
+	 *
+	 * @sg_head - The sg_table to allocate
+	 * @client_context - Value returned by acquire()
+	 * @dma_device - The device that will be doing DMA from these addresses
+	 * @dmasync - Obsolete, always 0
+	 * @nmap - Returns the number of dma mapped entries in the sg_head
+	 *
+	 * Must be called after get_pages(). This must fill in the sg_head with
+	 * DMA mapped SGLs for dma_device. Each SGL start and end must meet a
+	 * minimum alignment of at least PAGE_SIZE, though individual sgls can
+	 * be multiples of PAGE_SIZE, in any mixture. Since the user virtual
+	 * address/size are not page aligned, the implementation must increase
+	 * it to the logical alignment when building the SGLs.
+	 *
+	 * Returns 0 on success, -ERRNO on failure. After success dma_unmap()
+	 * will be called to unmap the pages. On failure sg_head must be left
+	 * untouched or point to a valid sg_table.
+	 */
+	int (*dma_map)(struct sg_table *sg_head, void *client_context,
+		       struct device *dma_device, int dmasync, int *nmap);
+	/**
+	 * dma_unmap - Unmap a DMA mapped sg_table
+	 *
+	 * @sg_head - The sg_table to unmap
+	 * @client_context - Value returned by acquire()
+	 * @dma_device - The device that will be doing DMA from these addresses
+	 *
+	 * sg_head will not be touched after this function returns.
+	 *
+	 * Must return 0.
+	 */
+	int (*dma_unmap)(struct sg_table *sg_head, void *client_context,
+			 struct device *dma_device);
+	/**
+	 * put_pages - Unpin a SGL
+	 *
+	 * @sg_head - The sg_table to unpin
+	 * @client_context - Value returned by acquire()
+	 *
+	 * sg_head must be freed on return.
+	 */
+	void (*put_pages)(struct sg_table *sg_head, void *client_context);
+	/* Returns the peer page size; used to set the umem page_shift */
+	unsigned long (*get_page_size)(void *client_context);
+	/**
+	 * release - Undo acquire
+	 *
+	 * @client_context - Value returned by acquire()
+	 *
+	 * If acquire() returns 1 then release() must be called. All
+	 * get_pages() and dma_map()'s must be undone before calling this
+	 * function.
+	 */
+	void (*release)(void *client_context);
+};
+
+/*
+ * If invalidate_callback() is non-NULL then the client will only support
+ * umems which can be invalidated. The caller may call the
+ * invalidate_callback() after acquire(); on return the range will no longer
+ * have DMA active, and release() will have been called.
+ *
+ * Note: The implementation locking must ensure that get_pages() and
+ * dma_map() do not have locking dependencies with invalidate_callback(). The
+ * ib_core will wait until any concurrent get_pages() or dma_map() completes
+ * before returning.
+ *
+ * Similarly, this can call dma_unmap(), put_pages() and release() from within
+ * the callback, or will wait for another thread doing those operations to
+ * complete.
+ *
+ * For these reasons the user of invalidate_callback() must be careful with
+ * locking.
+ */
+typedef int (*invalidate_peer_memory)(void *reg_handle, u64 core_context);
+
+void *
+ib_register_peer_memory_client(const struct peer_memory_client *peer_client,
+			       invalidate_peer_memory *invalidate_callback);
+void ib_unregister_peer_memory_client(void *reg_handle);
+
+#endif
-- 
2.33.0