// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2018-2020 Intel Corporation. * Copyright (C) 2020 Red Hat, Inc. * * Author: Tiwei Bie <tiwei.bie@intel.com> * Jason Wang <jasowang@redhat.com> * * Thanks Michael S. Tsirkin for the valuable comments and * suggestions. And thanks to Cunming Liang and Zhihong Wang for all * their supports. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/cdev.h> #include <linux/device.h> #include <linux/iommu.h> #include <linux/uuid.h> #include <linux/vdpa.h> #include <linux/nospec.h> #include <linux/vhost.h> #include <linux/virtio_net.h> #include "vhost.h" enum { VHOST_VDPA_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | (1ULL << VIRTIO_F_ANY_LAYOUT) | (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_F_IOMMU_PLATFORM) | (1ULL << VIRTIO_F_RING_PACKED) | (1ULL << VIRTIO_F_ORDER_PLATFORM) | (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | (1ULL << VIRTIO_RING_F_EVENT_IDX), VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES | (1ULL << VIRTIO_NET_F_CSUM) | (1ULL << VIRTIO_NET_F_GUEST_CSUM) | (1ULL << VIRTIO_NET_F_MTU) | (1ULL << VIRTIO_NET_F_MAC) | (1ULL << VIRTIO_NET_F_GUEST_TSO4) | (1ULL << VIRTIO_NET_F_GUEST_TSO6) | (1ULL << VIRTIO_NET_F_GUEST_ECN) | (1ULL << VIRTIO_NET_F_GUEST_UFO) | (1ULL << VIRTIO_NET_F_HOST_TSO4) | (1ULL << VIRTIO_NET_F_HOST_TSO6) | (1ULL << VIRTIO_NET_F_HOST_ECN) | (1ULL << VIRTIO_NET_F_HOST_UFO) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_NET_F_STATUS) | (1ULL << VIRTIO_NET_F_SPEED_DUPLEX), }; /* Currently, only network backend w/o multiqueue is supported. */ #define VHOST_VDPA_VQ_MAX 2 #define VHOST_VDPA_DEV_MAX (1U << MINORBITS) struct vhost_vdpa { struct vhost_dev vdev; struct iommu_domain *domain; struct vhost_virtqueue *vqs; struct completion completion; struct vdpa_device *vdpa; struct device dev; struct cdev cdev; atomic_t opened; int nvqs; int virtio_id; int minor; }; static DEFINE_IDA(vhost_vdpa_ida); static dev_t vhost_vdpa_major; static const u64 vhost_vdpa_features[] = { [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES, }; static void handle_vq_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev); const struct vdpa_config_ops *ops = v->vdpa->config; ops->kick_vq(v->vdpa, vq - v->vqs); } static irqreturn_t vhost_vdpa_virtqueue_cb(void *private) { struct vhost_virtqueue *vq = private; struct eventfd_ctx *call_ctx = vq->call_ctx; if (call_ctx) eventfd_signal(call_ctx, 1); return IRQ_HANDLED; } static void vhost_vdpa_reset(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; ops->set_status(vdpa, 0); } static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; u32 device_id; device_id = ops->get_device_id(vdpa); if (copy_to_user(argp, &device_id, sizeof(device_id))) return -EFAULT; return 0; } static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; u8 status; status = ops->get_status(vdpa); if (copy_to_user(statusp, &status, sizeof(status))) return -EFAULT; return 0; } static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; u8 status; if (copy_from_user(&status, statusp, sizeof(status))) return -EFAULT; /* * Userspace shouldn't remove status bits unless reset the * status to 0. */ if (status != 0 && (ops->get_status(vdpa) & ~status) != 0) return -EINVAL; ops->set_status(vdpa, status); return 0; } static int vhost_vdpa_config_validate(struct vhost_vdpa *v, struct vhost_vdpa_config *c) { long size = 0; switch (v->virtio_id) { case VIRTIO_ID_NET: size = sizeof(struct virtio_net_config); break; } if (c->len == 0) return -EINVAL; if (c->len > size - c->off) return -E2BIG; return 0; } static long vhost_vdpa_get_config(struct vhost_vdpa *v, struct vhost_vdpa_config __user *c) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; struct vhost_vdpa_config config; unsigned long size = offsetof(struct vhost_vdpa_config, buf); u8 *buf; if (copy_from_user(&config, c, size)) return -EFAULT; if (vhost_vdpa_config_validate(v, &config)) return -EINVAL; buf = kvzalloc(config.len, GFP_KERNEL); if (!buf) return -ENOMEM; ops->get_config(vdpa, config.off, buf, config.len); if (copy_to_user(c->buf, buf, config.len)) { kvfree(buf); return -EFAULT; } kvfree(buf); return 0; } static long vhost_vdpa_set_config(struct vhost_vdpa *v, struct vhost_vdpa_config __user *c) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; struct vhost_vdpa_config config; unsigned long size = offsetof(struct vhost_vdpa_config, buf); u8 *buf; if (copy_from_user(&config, c, size)) return -EFAULT; if (vhost_vdpa_config_validate(v, &config)) return -EINVAL; buf = kvzalloc(config.len, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, c->buf, config.len)) { kvfree(buf); return -EFAULT; } ops->set_config(vdpa, config.off, buf, config.len); kvfree(buf); return 0; } static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; u64 features; features = ops->get_features(vdpa); features &= vhost_vdpa_features[v->virtio_id]; if (copy_to_user(featurep, &features, sizeof(features))) return -EFAULT; return 0; } static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; u64 features; /* * It's not allowed to change the features after they have * been negotiated. */ if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK) return -EBUSY; if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; if (features & ~vhost_vdpa_features[v->virtio_id]) return -EINVAL; if (ops->set_features(vdpa, features)) return -EINVAL; return 0; } static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; u16 num; num = ops->get_vq_num_max(vdpa); if (copy_to_user(argp, &num, sizeof(num))) return -EFAULT; return 0; } static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; struct vdpa_callback cb; struct vhost_virtqueue *vq; struct vhost_vring_state s; u8 status; u32 idx; long r; r = get_user(idx, (u32 __user *)argp); if (r < 0) return r; if (idx >= v->nvqs) return -ENOBUFS; idx = array_index_nospec(idx, v->nvqs); vq = &v->vqs[idx]; status = ops->get_status(vdpa); if (cmd == VHOST_VDPA_SET_VRING_ENABLE) { if (copy_from_user(&s, argp, sizeof(s))) return -EFAULT; ops->set_vq_ready(vdpa, idx, s.num); return 0; } if (cmd == VHOST_GET_VRING_BASE) vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx); r = vhost_vring_ioctl(&v->vdev, cmd, argp); if (r) return r; switch (cmd) { case VHOST_SET_VRING_ADDR: if (ops->set_vq_address(vdpa, idx, (u64)(uintptr_t)vq->desc, (u64)(uintptr_t)vq->avail, (u64)(uintptr_t)vq->used)) r = -EINVAL; break; case VHOST_SET_VRING_BASE: if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx)) r = -EINVAL; break; case VHOST_SET_VRING_CALL: if (vq->call_ctx) { cb.callback = vhost_vdpa_virtqueue_cb; cb.private = vq; } else { cb.callback = NULL; cb.private = NULL; } ops->set_vq_cb(vdpa, idx, &cb); break; case VHOST_SET_VRING_NUM: ops->set_vq_num(vdpa, idx, vq->num); break; } return r; } static long vhost_vdpa_unlocked_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vhost_vdpa *v = filep->private_data; struct vhost_dev *d = &v->vdev; void __user *argp = (void __user *)arg; long r; mutex_lock(&d->mutex); switch (cmd) { case VHOST_VDPA_GET_DEVICE_ID: r = vhost_vdpa_get_device_id(v, argp); break; case VHOST_VDPA_GET_STATUS: r = vhost_vdpa_get_status(v, argp); break; case VHOST_VDPA_SET_STATUS: r = vhost_vdpa_set_status(v, argp); break; case VHOST_VDPA_GET_CONFIG: r = vhost_vdpa_get_config(v, argp); break; case VHOST_VDPA_SET_CONFIG: r = vhost_vdpa_set_config(v, argp); break; case VHOST_GET_FEATURES: r = vhost_vdpa_get_features(v, argp); break; case VHOST_SET_FEATURES: r = vhost_vdpa_set_features(v, argp); break; case VHOST_VDPA_GET_VRING_NUM: r = vhost_vdpa_get_vring_num(v, argp); break; case VHOST_SET_LOG_BASE: case VHOST_SET_LOG_FD: r = -ENOIOCTLCMD; break; default: r = vhost_dev_ioctl(&v->vdev, cmd, argp); if (r == -ENOIOCTLCMD) r = vhost_vdpa_vring_ioctl(v, cmd, argp); break; } mutex_unlock(&d->mutex); return r; } static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last) { struct vhost_dev *dev = &v->vdev; struct vhost_iotlb *iotlb = dev->iotlb; struct vhost_iotlb_map *map; struct page *page; unsigned long pfn, pinned; while ((map = vhost_iotlb_itree_first(iotlb, start, last)) != NULL) { pinned = map->size >> PAGE_SHIFT; for (pfn = map->addr >> PAGE_SHIFT; pinned > 0; pfn++, pinned--) { page = pfn_to_page(pfn); if (map->perm & VHOST_ACCESS_WO) set_page_dirty_lock(page); unpin_user_page(page); } atomic64_sub(map->size >> PAGE_SHIFT, &dev->mm->pinned_vm); vhost_iotlb_map_free(iotlb, map); } } static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) { struct vhost_dev *dev = &v->vdev; vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1); kfree(dev->iotlb); dev->iotlb = NULL; } static int perm_to_iommu_flags(u32 perm) { int flags = 0; switch (perm) { case VHOST_ACCESS_WO: flags |= IOMMU_WRITE; break; case VHOST_ACCESS_RO: flags |= IOMMU_READ; break; case VHOST_ACCESS_RW: flags |= (IOMMU_WRITE | IOMMU_READ); break; default: WARN(1, "invalidate vhost IOTLB permission\n"); break; } return flags | IOMMU_CACHE; } static int vhost_vdpa_map(struct vhost_vdpa *v, u64 iova, u64 size, u64 pa, u32 perm) { struct vhost_dev *dev = &v->vdev; struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; int r = 0; r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1, pa, perm); if (r) return r; if (ops->dma_map) r = ops->dma_map(vdpa, iova, size, pa, perm); else if (ops->set_map) r = ops->set_map(vdpa, dev->iotlb); else r = iommu_map(v->domain, iova, pa, size, perm_to_iommu_flags(perm)); return r; } static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) { struct vhost_dev *dev = &v->vdev; struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1); if (ops->dma_map) ops->dma_unmap(vdpa, iova, size); else if (ops->set_map) ops->set_map(vdpa, dev->iotlb); else iommu_unmap(v->domain, iova, size); } static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, struct vhost_iotlb_msg *msg) { struct vhost_dev *dev = &v->vdev; struct vhost_iotlb *iotlb = dev->iotlb; struct page **page_list; unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; unsigned long locked, lock_limit, pinned, i; u64 iova = msg->iova; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; if (msg->perm & VHOST_ACCESS_WO) gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; if (!npages) return -EINVAL; down_read(&dev->mm->mmap_sem); locked = atomic64_add_return(npages, &dev->mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked > lock_limit) { ret = -ENOMEM; goto out; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; while (npages) { pinned = min_t(unsigned long, npages, list_size); ret = pin_user_pages(cur_base, pinned, gup_flags, page_list, NULL); if (ret != pinned) goto out; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); for (i = 0; i < ret; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; if (vhost_vdpa_map(v, iova, csize, map_pfn << PAGE_SHIFT, msg->perm)) goto out; map_pfn = this_pfn; iova += csize; } last_pfn = this_pfn; } cur_base += ret << PAGE_SHIFT; npages -= ret; } /* Pin the rest chunk */ ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT, map_pfn << PAGE_SHIFT, msg->perm); out: if (ret) { vhost_vdpa_unmap(v, msg->iova, msg->size); atomic64_sub(npages, &dev->mm->pinned_vm); } up_read(&dev->mm->mmap_sem); free_page((unsigned long)page_list); return ret; } static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, struct vhost_iotlb_msg *msg) { struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); int r = 0; r = vhost_dev_check_owner(dev); if (r) return r; switch (msg->type) { case VHOST_IOTLB_UPDATE: r = vhost_vdpa_process_iotlb_update(v, msg); break; case VHOST_IOTLB_INVALIDATE: vhost_vdpa_unmap(v, msg->iova, msg->size); break; default: r = -EINVAL; break; } return r; } static ssize_t vhost_vdpa_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_vdpa *v = file->private_data; struct vhost_dev *dev = &v->vdev; return vhost_chr_write_iter(dev, from); } static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; struct device *dma_dev = vdpa_get_dma_dev(vdpa); struct bus_type *bus; int ret; /* Device want to do DMA by itself */ if (ops->set_map || ops->dma_map) return 0; bus = dma_dev->bus; if (!bus) return -EFAULT; if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) return -ENOTSUPP; v->domain = iommu_domain_alloc(bus); if (!v->domain) return -EIO; ret = iommu_attach_device(v->domain, dma_dev); if (ret) goto err_attach; return 0; err_attach: iommu_domain_free(v->domain); return ret; } static void vhost_vdpa_free_domain(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; struct device *dma_dev = vdpa_get_dma_dev(vdpa); if (v->domain) { iommu_detach_device(v->domain, dma_dev); iommu_domain_free(v->domain); } v->domain = NULL; } static int vhost_vdpa_open(struct inode *inode, struct file *filep) { struct vhost_vdpa *v; struct vhost_dev *dev; struct vhost_virtqueue **vqs; int nvqs, i, r, opened; v = container_of(inode->i_cdev, struct vhost_vdpa, cdev); if (!v) return -ENODEV; opened = atomic_cmpxchg(&v->opened, 0, 1); if (opened) return -EBUSY; nvqs = v->nvqs; vhost_vdpa_reset(v); vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL); if (!vqs) { r = -ENOMEM; goto err; } dev = &v->vdev; for (i = 0; i < nvqs; i++) { vqs[i] = &v->vqs[i]; vqs[i]->handle_kick = handle_vq_kick; } vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, vhost_vdpa_process_iotlb_msg); dev->iotlb = vhost_iotlb_alloc(0, 0); if (!dev->iotlb) { r = -ENOMEM; goto err_init_iotlb; } r = vhost_vdpa_alloc_domain(v); if (r) goto err_init_iotlb; filep->private_data = v; return 0; err_init_iotlb: vhost_dev_cleanup(&v->vdev); err: atomic_dec(&v->opened); return r; } static int vhost_vdpa_release(struct inode *inode, struct file *filep) { struct vhost_vdpa *v = filep->private_data; struct vhost_dev *d = &v->vdev; mutex_lock(&d->mutex); filep->private_data = NULL; vhost_vdpa_reset(v); vhost_dev_stop(&v->vdev); vhost_vdpa_iotlb_free(v); vhost_vdpa_free_domain(v); vhost_dev_cleanup(&v->vdev); kfree(v->vdev.vqs); mutex_unlock(&d->mutex); atomic_dec(&v->opened); complete(&v->completion); return 0; } static const struct file_operations vhost_vdpa_fops = { .owner = THIS_MODULE, .open = vhost_vdpa_open, .release = vhost_vdpa_release, .write_iter = vhost_vdpa_chr_write_iter, .unlocked_ioctl = vhost_vdpa_unlocked_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static void vhost_vdpa_release_dev(struct device *device) { struct vhost_vdpa *v = container_of(device, struct vhost_vdpa, dev); ida_simple_remove(&vhost_vdpa_ida, v->minor); kfree(v->vqs); kfree(v); } static int vhost_vdpa_probe(struct vdpa_device *vdpa) { const struct vdpa_config_ops *ops = vdpa->config; struct vhost_vdpa *v; int minor, nvqs = VHOST_VDPA_VQ_MAX; int r; /* Currently, we only accept the network devices. */ if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) return -ENOTSUPP; v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!v) return -ENOMEM; minor = ida_simple_get(&vhost_vdpa_ida, 0, VHOST_VDPA_DEV_MAX, GFP_KERNEL); if (minor < 0) { kfree(v); return minor; } atomic_set(&v->opened, 0); v->minor = minor; v->vdpa = vdpa; v->nvqs = nvqs; v->virtio_id = ops->get_device_id(vdpa); device_initialize(&v->dev); v->dev.release = vhost_vdpa_release_dev; v->dev.parent = &vdpa->dev; v->dev.devt = MKDEV(MAJOR(vhost_vdpa_major), minor); v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue), GFP_KERNEL); if (!v->vqs) { r = -ENOMEM; goto err; } r = dev_set_name(&v->dev, "vhost-vdpa-%u", minor); if (r) goto err; cdev_init(&v->cdev, &vhost_vdpa_fops); v->cdev.owner = THIS_MODULE; r = cdev_device_add(&v->cdev, &v->dev); if (r) goto err; init_completion(&v->completion); vdpa_set_drvdata(vdpa, v); return 0; err: put_device(&v->dev); return r; } static void vhost_vdpa_remove(struct vdpa_device *vdpa) { struct vhost_vdpa *v = vdpa_get_drvdata(vdpa); int opened; cdev_device_del(&v->cdev, &v->dev); do { opened = atomic_cmpxchg(&v->opened, 0, 1); if (!opened) break; wait_for_completion(&v->completion); } while (1); put_device(&v->dev); } static struct vdpa_driver vhost_vdpa_driver = { .driver = { .name = "vhost_vdpa", }, .probe = vhost_vdpa_probe, .remove = vhost_vdpa_remove, }; static int __init vhost_vdpa_init(void) { int r; r = alloc_chrdev_region(&vhost_vdpa_major, 0, VHOST_VDPA_DEV_MAX, "vhost-vdpa"); if (r) goto err_alloc_chrdev; r = vdpa_register_driver(&vhost_vdpa_driver); if (r) goto err_vdpa_register_driver; return 0; err_vdpa_register_driver: unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX); err_alloc_chrdev: return r; } module_init(vhost_vdpa_init); static void __exit vhost_vdpa_exit(void) { vdpa_unregister_driver(&vhost_vdpa_driver); unregister_chrdev_region(vhost_vdpa_major, VHOST_VDPA_DEV_MAX); } module_exit(vhost_vdpa_exit); MODULE_VERSION("0.0.1"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("vDPA-based vhost backend for virtio");