summaryrefslogtreecommitdiff
path: root/drivers/iommu
diff options
context:
space:
mode:
authorJason Gunthorpe <jgg@nvidia.com>2022-11-29 16:29:34 -0400
committerJason Gunthorpe <jgg@nvidia.com>2022-11-30 20:16:49 -0400
commitaad37e71d5c4dc1d3c25734f0bcd51c324f94b5e (patch)
treec835636d8c2f03f720aaa0549dd97bc46fed8060 /drivers/iommu
parent51fe6141f0f64ae0bbc096a41a07572273e8c0ef (diff)
downloadlwn-aad37e71d5c4dc1d3c25734f0bcd51c324f94b5e.tar.gz
lwn-aad37e71d5c4dc1d3c25734f0bcd51c324f94b5e.zip
iommufd: IOCTLs for the io_pagetable
Connect the IOAS to its IOCTL interface. This exposes most of the functionality in the io_pagetable to userspace. This is intended to be the core of the generic interface that IOMMUFD will provide. Every IOMMU driver should be able to implement an iommu_domain that is compatible with this generic mechanism. It is also designed to be easy to use for simple non virtual machine monitor users, like DPDK: - Universal simple support for all IOMMUs (no PPC special path) - An IOVA allocator that considers the aperture and the allowed/reserved ranges - io_pagetable allows any number of iommu_domains to be connected to the IOAS - Automatic allocation and re-use of iommu_domains Along with room in the design to add non-generic features to cater to specific HW functionality. Link: https://lore.kernel.org/r/11-v6-a196d26f289e+11787-iommufd_jgg@nvidia.com Tested-by: Nicolin Chen <nicolinc@nvidia.com> Tested-by: Yi Liu <yi.l.liu@intel.com> Tested-by: Lixiao Yang <lixiao.yang@intel.com> Tested-by: Matthew Rosato <mjrosato@linux.ibm.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Diffstat (limited to 'drivers/iommu')
-rw-r--r--drivers/iommu/iommufd/Makefile1
-rw-r--r--drivers/iommu/iommufd/ioas.c392
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h33
-rw-r--r--drivers/iommu/iommufd/main.c48
4 files changed, 474 insertions, 0 deletions
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index b66a8c47ff55..2b4f36f1b72f 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
iommufd-y := \
io_pagetable.o \
+ ioas.o \
main.o \
pages.o
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
new file mode 100644
index 000000000000..6ff97dafc891
--- /dev/null
+++ b/drivers/iommu/iommufd/ioas.c
@@ -0,0 +1,392 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
+ */
+#include <linux/interval_tree.h>
+#include <linux/iommufd.h>
+#include <linux/iommu.h>
+#include <uapi/linux/iommufd.h>
+
+#include "io_pagetable.h"
+
+void iommufd_ioas_destroy(struct iommufd_object *obj)
+{
+ struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj);
+ int rc;
+
+ rc = iopt_unmap_all(&ioas->iopt, NULL);
+ WARN_ON(rc && rc != -ENOENT);
+ iopt_destroy_table(&ioas->iopt);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx)
+{
+ struct iommufd_ioas *ioas;
+
+ ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS);
+ if (IS_ERR(ioas))
+ return ioas;
+
+ iopt_init_table(&ioas->iopt);
+ return ioas;
+}
+
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_alloc *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ int rc;
+
+ if (cmd->flags)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_ioas_alloc(ucmd->ictx);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ cmd->out_ioas_id = ioas->obj.id;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_table;
+ iommufd_object_finalize(ucmd->ictx, &ioas->obj);
+ return 0;
+
+out_table:
+ iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj);
+ return rc;
+}
+
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_iova_range __user *ranges;
+ struct iommu_ioas_iova_ranges *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ struct interval_tree_span_iter span;
+ u32 max_iovas;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ down_read(&ioas->iopt.iova_rwsem);
+ max_iovas = cmd->num_iovas;
+ ranges = u64_to_user_ptr(cmd->allowed_iovas);
+ cmd->num_iovas = 0;
+ cmd->out_iova_alignment = ioas->iopt.iova_alignment;
+ interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
+ ULONG_MAX) {
+ if (!span.is_hole)
+ continue;
+ if (cmd->num_iovas < max_iovas) {
+ struct iommu_iova_range elm = {
+ .start = span.start_hole,
+ .last = span.last_hole,
+ };
+
+ if (copy_to_user(&ranges[cmd->num_iovas], &elm,
+ sizeof(elm))) {
+ rc = -EFAULT;
+ goto out_put;
+ }
+ }
+ cmd->num_iovas++;
+ }
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+ if (rc)
+ goto out_put;
+ if (cmd->num_iovas > max_iovas)
+ rc = -EMSGSIZE;
+out_put:
+ up_read(&ioas->iopt.iova_rwsem);
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int iommufd_ioas_load_iovas(struct rb_root_cached *itree,
+ struct iommu_iova_range __user *ranges,
+ u32 num)
+{
+ u32 i;
+
+ for (i = 0; i != num; i++) {
+ struct iommu_iova_range range;
+ struct iopt_allowed *allowed;
+
+ if (copy_from_user(&range, ranges + i, sizeof(range)))
+ return -EFAULT;
+
+ if (range.start >= range.last)
+ return -EINVAL;
+
+ if (interval_tree_iter_first(itree, range.start, range.last))
+ return -EINVAL;
+
+ allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT);
+ if (!allowed)
+ return -ENOMEM;
+ allowed->node.start = range.start;
+ allowed->node.last = range.last;
+
+ interval_tree_insert(&allowed->node, itree);
+ }
+ return 0;
+}
+
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_allow_iovas *cmd = ucmd->cmd;
+ struct rb_root_cached allowed_iova = RB_ROOT_CACHED;
+ struct interval_tree_node *node;
+ struct iommufd_ioas *ioas;
+ struct io_pagetable *iopt;
+ int rc = 0;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+ iopt = &ioas->iopt;
+
+ rc = iommufd_ioas_load_iovas(&allowed_iova,
+ u64_to_user_ptr(cmd->allowed_iovas),
+ cmd->num_iovas);
+ if (rc)
+ goto out_free;
+
+ /*
+ * We want the allowed tree update to be atomic, so we have to keep the
+ * original nodes around, and keep track of the new nodes as we allocate
+ * memory for them. The simplest solution is to have a new/old tree and
+ * then swap new for old. On success we free the old tree, on failure we
+ * free the new tree.
+ */
+ rc = iopt_set_allow_iova(iopt, &allowed_iova);
+out_free:
+ while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) {
+ interval_tree_remove(node, &allowed_iova);
+ kfree(container_of(node, struct iopt_allowed, node));
+ }
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+static int conv_iommu_prot(u32 map_flags)
+{
+ /*
+ * We provide no manual cache coherency ioctls to userspace and most
+ * architectures make the CPU ops for cache flushing privileged.
+ * Therefore we require the underlying IOMMU to support CPU coherent
+ * operation. Support for IOMMU_CACHE is enforced by the
+ * IOMMU_CAP_CACHE_COHERENCY test during bind.
+ */
+ int iommu_prot = IOMMU_CACHE;
+
+ if (map_flags & IOMMU_IOAS_MAP_WRITEABLE)
+ iommu_prot |= IOMMU_WRITE;
+ if (map_flags & IOMMU_IOAS_MAP_READABLE)
+ iommu_prot |= IOMMU_READ;
+ return iommu_prot;
+}
+
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_map *cmd = ucmd->cmd;
+ unsigned long iova = cmd->iova;
+ struct iommufd_ioas *ioas;
+ unsigned int flags = 0;
+ int rc;
+
+ if ((cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE)) ||
+ cmd->__reserved)
+ return -EOPNOTSUPP;
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+ rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova,
+ u64_to_user_ptr(cmd->user_va), cmd->length,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put;
+
+ cmd->iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_copy *cmd = ucmd->cmd;
+ struct iommufd_ioas *src_ioas;
+ struct iommufd_ioas *dst_ioas;
+ unsigned int flags = 0;
+ LIST_HEAD(pages_list);
+ unsigned long iova;
+ int rc;
+
+ if ((cmd->flags &
+ ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE |
+ IOMMU_IOAS_MAP_READABLE)))
+ return -EOPNOTSUPP;
+ if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX ||
+ cmd->dst_iova >= ULONG_MAX)
+ return -EOVERFLOW;
+
+ src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id);
+ if (IS_ERR(src_ioas))
+ return PTR_ERR(src_ioas);
+ rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length,
+ &pages_list);
+ iommufd_put_object(&src_ioas->obj);
+ if (rc)
+ return rc;
+
+ dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id);
+ if (IS_ERR(dst_ioas)) {
+ rc = PTR_ERR(dst_ioas);
+ goto out_pages;
+ }
+
+ if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA))
+ flags = IOPT_ALLOC_IOVA;
+ iova = cmd->dst_iova;
+ rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova,
+ conv_iommu_prot(cmd->flags), flags);
+ if (rc)
+ goto out_put_dst;
+
+ cmd->dst_iova = iova;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_put_dst:
+ iommufd_put_object(&dst_ioas->obj);
+out_pages:
+ iopt_free_pages_list(&pages_list);
+ return rc;
+}
+
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_ioas_unmap *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ unsigned long unmapped = 0;
+ int rc;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->ioas_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ if (cmd->iova == 0 && cmd->length == U64_MAX) {
+ rc = iopt_unmap_all(&ioas->iopt, &unmapped);
+ if (rc)
+ goto out_put;
+ } else {
+ if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) {
+ rc = -EOVERFLOW;
+ goto out_put;
+ }
+ rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length,
+ &unmapped);
+ if (rc)
+ goto out_put;
+ }
+
+ cmd->length = unmapped;
+ rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put:
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
+
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+ struct iommufd_ctx *ictx)
+{
+ if (cmd->object_id)
+ return -EOPNOTSUPP;
+
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM;
+ return 0;
+ }
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ int rc = 0;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ xa_lock(&ictx->objects);
+ if (!xa_empty(&ictx->objects)) {
+ rc = -EBUSY;
+ } else {
+ if (cmd->val64 == 0)
+ ictx->account_mode = IOPT_PAGES_ACCOUNT_USER;
+ else if (cmd->val64 == 1)
+ ictx->account_mode = IOPT_PAGES_ACCOUNT_MM;
+ else
+ rc = -EINVAL;
+ }
+ xa_unlock(&ictx->objects);
+
+ return rc;
+ }
+ return -EOPNOTSUPP;
+}
+
+static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd,
+ struct iommufd_ioas *ioas)
+{
+ if (cmd->op == IOMMU_OPTION_OP_GET) {
+ cmd->val64 = !ioas->iopt.disable_large_pages;
+ return 0;
+ }
+ if (cmd->op == IOMMU_OPTION_OP_SET) {
+ if (cmd->val64 == 0)
+ return iopt_disable_large_pages(&ioas->iopt);
+ if (cmd->val64 == 1) {
+ iopt_enable_large_pages(&ioas->iopt);
+ return 0;
+ }
+ return -EINVAL;
+ }
+ return -EOPNOTSUPP;
+}
+
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_option *cmd = ucmd->cmd;
+ struct iommufd_ioas *ioas;
+ int rc = 0;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ ioas = iommufd_get_ioas(ucmd, cmd->object_id);
+ if (IS_ERR(ioas))
+ return PTR_ERR(ioas);
+
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_HUGE_PAGES:
+ rc = iommufd_ioas_option_huge_pages(cmd, ioas);
+ break;
+ default:
+ rc = -EOPNOTSUPP;
+ }
+
+ iommufd_put_object(&ioas->obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f7ab6c6edafd..1a13c54a8def 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -11,6 +11,7 @@
struct iommu_domain;
struct iommu_group;
+struct iommu_option;
struct iommufd_ctx {
struct file *file;
@@ -102,6 +103,7 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd,
enum iommufd_object_type {
IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
+ IOMMUFD_OBJ_IOAS,
};
/* Base struct for all objects with a userspace ID handle. */
@@ -174,6 +176,37 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
type), \
typeof(*(ptr)), obj)
+/*
+ * The IO Address Space (IOAS) pagetable is a virtual page table backed by the
+ * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
+ * mapping is copied into all of the associated domains and made available to
+ * in-kernel users.
+ */
+struct iommufd_ioas {
+ struct iommufd_object obj;
+ struct io_pagetable iopt;
+};
+
+static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd,
+ u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_IOAS),
+ struct iommufd_ioas, obj);
+}
+
+struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx);
+int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_ioas_destroy(struct iommufd_object *obj);
+int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
+int iommufd_option_rlimit_mode(struct iommu_option *cmd,
+ struct iommufd_ctx *ictx);
+
struct iommufd_access {
unsigned long iova_alignment;
u32 iopt_access_list_id;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index dfbc68b97506..1c0a1f499378 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -204,8 +204,39 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
return 0;
}
+static int iommufd_option(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_option *cmd = ucmd->cmd;
+ int rc;
+
+ if (cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ switch (cmd->option_id) {
+ case IOMMU_OPTION_RLIMIT_MODE:
+ rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx);
+ break;
+ case IOMMU_OPTION_HUGE_PAGES:
+ rc = iommufd_ioas_option(ucmd);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ if (rc)
+ return rc;
+ if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64,
+ &cmd->val64, sizeof(cmd->val64)))
+ return -EFAULT;
+ return 0;
+}
+
union ucmd_buffer {
struct iommu_destroy destroy;
+ struct iommu_ioas_alloc alloc;
+ struct iommu_ioas_allow_iovas allow_iovas;
+ struct iommu_ioas_iova_ranges iova_ranges;
+ struct iommu_ioas_map map;
+ struct iommu_ioas_unmap unmap;
};
struct iommufd_ioctl_op {
@@ -226,6 +257,20 @@ struct iommufd_ioctl_op {
}
static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id),
+ IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
+ struct iommu_ioas_alloc, out_ioas_id),
+ IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
+ struct iommu_ioas_allow_iovas, allowed_iovas),
+ IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
+ src_iova),
+ IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
+ struct iommu_ioas_iova_ranges, out_iova_alignment),
+ IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map,
+ iova),
+ IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap,
+ length),
+ IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option,
+ val64),
};
static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
@@ -312,6 +357,9 @@ void iommufd_ctx_put(struct iommufd_ctx *ictx)
EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD);
static const struct iommufd_object_ops iommufd_object_ops[] = {
+ [IOMMUFD_OBJ_IOAS] = {
+ .destroy = iommufd_ioas_destroy,
+ },
};
static struct miscdevice iommu_misc_dev = {