From 89a216ed973e49d6f39a6976bcead3b631171b64 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 9 Jul 2025 16:55:31 -0400 Subject: virtio: fix comments, readability Fix a couple of comments to match reality. Initialize config_driver_disabled to be consistent with other fields (note: the structure is already zero initialized, so this is not a bugfix as such). Signed-off-by: Michael S. Tsirkin Message-Id: <7b74a55a5f3dc066d954472f5b68c29022f11b43.1752094439.git.mst@redhat.com> Acked-by: Jason Wang --- drivers/virtio/virtio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 95d5d7993e5b..c441c8cc71ef 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -147,7 +147,7 @@ EXPORT_SYMBOL_GPL(virtio_config_changed); /** * virtio_config_driver_disable - disable config change reporting by drivers - * @dev: the device to reset + * @dev: the device to disable * * This is only allowed to be called by a driver and disabling can't * be nested. @@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(virtio_config_driver_disable); /** * virtio_config_driver_enable - enable config change reporting by drivers - * @dev: the device to reset + * @dev: the device to enable * * This is only allowed to be called by a driver and enabling can't * be nested. @@ -530,6 +530,7 @@ int register_virtio_device(struct virtio_device *dev) goto out_ida_remove; spin_lock_init(&dev->config_lock); + dev->config_driver_disabled = false; dev->config_core_enabled = false; dev->config_change_pending = false; -- cgit v1.2.3 From 2507789a724d607fa9e162dcadeb9f51b071fc49 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 7 May 2025 10:28:21 +0200 Subject: drm/virtio: implement virtio_gpu_shutdown Calling drm_dev_unplug() is the drm way to say the device is gone and can not be accessed any more. Cc: Michael S. Tsirkin Signed-off-by: Gerd Hoffmann Reviewed-by: Eric Auger Tested-by: Eric Auger Message-Id: <20250507082821.2710706-1-kraxel@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/gpu/drm/virtio/virtgpu_drv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c index e32e680c7197..71c6ccad4b99 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.c +++ b/drivers/gpu/drm/virtio/virtgpu_drv.c @@ -130,10 +130,10 @@ static void virtio_gpu_remove(struct virtio_device *vdev) static void virtio_gpu_shutdown(struct virtio_device *vdev) { - /* - * drm does its own synchronization on shutdown. - * Do nothing here, opt out of device reset. - */ + struct drm_device *dev = vdev->priv; + + /* stop talking to the device */ + drm_dev_unplug(dev); } static void virtio_gpu_config_changed(struct virtio_device *vdev) -- cgit v1.2.3 From 482bd84f1fab20ac6c4d112945ae2d1bdb36839f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 27 May 2025 10:26:29 -0400 Subject: virtio: document ENOSPC drivers handle ENOSPC specially since it's an error one can get from a working VQ. Document the semantics. Message-Id: <2e6ec46b8d5e6755be291cec8e2ec57ef286e97b.1748356035.git.mst@redhat.com> Reported-by: Parav Pandit Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Parav Pandit --- drivers/virtio/virtio_ring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 4397392bfef0..f5062061c408 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2296,6 +2296,10 @@ static inline int virtqueue_add(struct virtqueue *_vq, * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + * + * NB: ENOSPC is a special code that is only returned on an attempt to add a + * buffer to a full VQ. It indicates that some buffers are outstanding and that + * the operation can be retried after some buffers have been used. */ int virtqueue_add_sgs(struct virtqueue *_vq, struct scatterlist *sgs[], -- cgit v1.2.3 From 564a69ad90d15c782176e1a8c9e1c95661e1aed0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 21 May 2025 17:03:46 +0530 Subject: virtio-mmio: Remove virtqueue list from mmio device The MMIO transport implementation creates a list of virtqueues for a virtio device, while the same is already available in the struct virtio_device. Don't create a duplicate list, and use the other one instead. While at it, fix the virtio_device_for_each_vq() macro to accept an argument like "&vm_dev->vdev" (which currently fails to build). Signed-off-by: Viresh Kumar Message-Id: <3e56c6f74002987e22f364d883cbad177cd9ad9c.1747827066.git.viresh.kumar@linaro.org> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/virtio/virtio_mmio.c | 52 +++----------------------------------------- include/linux/virtio.h | 2 +- 2 files changed, 4 insertions(+), 50 deletions(-) diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 5d78c2d572ab..b152a1eca05a 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -65,7 +65,6 @@ #include #include #include -#include #include #include #include @@ -88,22 +87,8 @@ struct virtio_mmio_device { void __iomem *base; unsigned long version; - - /* a list of queues so we can dispatch IRQs */ - spinlock_t lock; - struct list_head virtqueues; -}; - -struct virtio_mmio_vq_info { - /* the actual virtqueue */ - struct virtqueue *vq; - - /* the list node for the virtqueues list */ - struct list_head node; }; - - /* Configuration interface */ static u64 vm_get_features(struct virtio_device *vdev) @@ -300,9 +285,8 @@ static bool vm_notify_with_data(struct virtqueue *vq) static irqreturn_t vm_interrupt(int irq, void *opaque) { struct virtio_mmio_device *vm_dev = opaque; - struct virtio_mmio_vq_info *info; + struct virtqueue *vq; unsigned long status; - unsigned long flags; irqreturn_t ret = IRQ_NONE; /* Read and acknowledge interrupts */ @@ -315,10 +299,8 @@ static irqreturn_t vm_interrupt(int irq, void *opaque) } if (likely(status & VIRTIO_MMIO_INT_VRING)) { - spin_lock_irqsave(&vm_dev->lock, flags); - list_for_each_entry(info, &vm_dev->virtqueues, node) - ret |= vring_interrupt(irq, info->vq); - spin_unlock_irqrestore(&vm_dev->lock, flags); + virtio_device_for_each_vq(&vm_dev->vdev, vq) + ret |= vring_interrupt(irq, vq); } return ret; @@ -329,14 +311,8 @@ static irqreturn_t vm_interrupt(int irq, void *opaque) static void vm_del_vq(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); - struct virtio_mmio_vq_info *info = vq->priv; - unsigned long flags; unsigned int index = vq->index; - spin_lock_irqsave(&vm_dev->lock, flags); - list_del(&info->node); - spin_unlock_irqrestore(&vm_dev->lock, flags); - /* Select and deactivate the queue */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); if (vm_dev->version == 1) { @@ -347,8 +323,6 @@ static void vm_del_vq(struct virtqueue *vq) } vring_del_virtqueue(vq); - - kfree(info); } static void vm_del_vqs(struct virtio_device *vdev) @@ -375,9 +349,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); bool (*notify)(struct virtqueue *vq); - struct virtio_mmio_vq_info *info; struct virtqueue *vq; - unsigned long flags; unsigned int num; int err; @@ -399,13 +371,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in goto error_available; } - /* Allocate and fill out our active queue description */ - info = kmalloc(sizeof(*info), GFP_KERNEL); - if (!info) { - err = -ENOMEM; - goto error_kmalloc; - } - num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); if (num == 0) { err = -ENOENT; @@ -463,13 +428,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); } - vq->priv = info; - info->vq = vq; - - spin_lock_irqsave(&vm_dev->lock, flags); - list_add(&info->node, &vm_dev->virtqueues); - spin_unlock_irqrestore(&vm_dev->lock, flags); - return vq; error_bad_pfn: @@ -481,8 +439,6 @@ error_new_virtqueue: writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); } - kfree(info); -error_kmalloc: error_available: return ERR_PTR(err); } @@ -627,8 +583,6 @@ static int virtio_mmio_probe(struct platform_device *pdev) vm_dev->vdev.dev.release = virtio_mmio_release_dev; vm_dev->vdev.config = &virtio_mmio_config_ops; vm_dev->pdev = pdev; - INIT_LIST_HEAD(&vm_dev->virtqueues); - spin_lock_init(&vm_dev->lock); vm_dev->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(vm_dev->base)) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 64cb4b04be7a..8b745ce0cf5f 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -196,7 +196,7 @@ int virtio_device_reset_done(struct virtio_device *dev); size_t virtio_max_dma_size(const struct virtio_device *vdev); #define virtio_device_for_each_vq(vdev, vq) \ - list_for_each_entry(vq, &vdev->vqs, list) + list_for_each_entry(vq, &(vdev)->vqs, list) /** * struct virtio_driver - operations for a virtio I/O driver -- cgit v1.2.3 From 4d0efa600ecf30aa61c14681164290f75c328f8a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 29 May 2025 13:00:27 +0530 Subject: virtio-vdpa: Remove virtqueue list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The virtio vdpa implementation creates a list of virtqueues, while the same is already available in the struct virtio_device. This list is never traversed though, and only the pointer to the struct virtio_vdpa_vq_info is used in the callback, where the virtqueue pointer could be directly used. Remove the unwanted code to simplify the driver. Signed-off-by: Viresh Kumar Message-Id: <7808f2f7e484987b95f172fffb6c71a5da20ed1e.1748503784.git.viresh.kumar@linaro.org> Signed-off-by: Michael S. Tsirkin Acked-by: Eugenio Pérez --- drivers/virtio/virtio_vdpa.c | 44 +++----------------------------------------- 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 1f60c9d5cb18..e25610e3393a 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -28,19 +28,6 @@ struct virtio_vdpa_device { struct virtio_device vdev; struct vdpa_device *vdpa; u64 features; - - /* The lock to protect virtqueue list */ - spinlock_t lock; - /* List of virtio_vdpa_vq_info */ - struct list_head virtqueues; -}; - -struct virtio_vdpa_vq_info { - /* the actual virtqueue */ - struct virtqueue *vq; - - /* the list node for the virtqueues list */ - struct list_head node; }; static inline struct virtio_vdpa_device * @@ -135,9 +122,9 @@ static irqreturn_t virtio_vdpa_config_cb(void *private) static irqreturn_t virtio_vdpa_virtqueue_cb(void *private) { - struct virtio_vdpa_vq_info *info = private; + struct virtqueue *vq = private; - return vring_interrupt(0, info->vq); + return vring_interrupt(0, vq); } static struct virtqueue * @@ -145,18 +132,15 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx) { - struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev); struct vdpa_device *vdpa = vd_get_vdpa(vdev); struct device *dma_dev; const struct vdpa_config_ops *ops = vdpa->config; - struct virtio_vdpa_vq_info *info; bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify; struct vdpa_callback cb; struct virtqueue *vq; u64 desc_addr, driver_addr, device_addr; /* Assume split virtqueue, switch to packed if necessary */ struct vdpa_vq_state state = {0}; - unsigned long flags; u32 align, max_num, min_num = 1; bool may_reduce_num = true; int err; @@ -179,10 +163,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, if (ops->get_vq_ready(vdpa, index)) return ERR_PTR(-ENOENT); - /* Allocate and fill out our active queue description */ - info = kmalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return ERR_PTR(-ENOMEM); if (ops->get_vq_size) max_num = ops->get_vq_size(vdpa, index); else @@ -217,7 +197,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, /* Setup virtqueue callback */ cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL; - cb.private = info; + cb.private = vq; cb.trigger = NULL; ops->set_vq_cb(vdpa, index, &cb); ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq)); @@ -248,13 +228,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, ops->set_vq_ready(vdpa, index, 1); - vq->priv = info; - info->vq = vq; - - spin_lock_irqsave(&vd_dev->lock, flags); - list_add(&info->node, &vd_dev->virtqueues); - spin_unlock_irqrestore(&vd_dev->lock, flags); - return vq; err_vq: @@ -263,7 +236,6 @@ error_new_virtqueue: ops->set_vq_ready(vdpa, index, 0); /* VDPA driver should make sure vq is stopeed here */ WARN_ON(ops->get_vq_ready(vdpa, index)); - kfree(info); return ERR_PTR(err); } @@ -272,20 +244,12 @@ static void virtio_vdpa_del_vq(struct virtqueue *vq) struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev); struct vdpa_device *vdpa = vd_dev->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - struct virtio_vdpa_vq_info *info = vq->priv; unsigned int index = vq->index; - unsigned long flags; - - spin_lock_irqsave(&vd_dev->lock, flags); - list_del(&info->node); - spin_unlock_irqrestore(&vd_dev->lock, flags); /* Select and deactivate the queue (best effort) */ ops->set_vq_ready(vdpa, index, 0); vring_del_virtqueue(vq); - - kfree(info); } static void virtio_vdpa_del_vqs(struct virtio_device *vdev) @@ -501,8 +465,6 @@ static int virtio_vdpa_probe(struct vdpa_device *vdpa) vd_dev->vdev.dev.release = virtio_vdpa_release_dev; vd_dev->vdev.config = &virtio_vdpa_config_ops; vd_dev->vdpa = vdpa; - INIT_LIST_HEAD(&vd_dev->virtqueues); - spin_lock_init(&vd_dev->lock); vd_dev->vdev.id.device = ops->get_device_id(vdpa); if (vd_dev->vdev.id.device == 0) -- cgit v1.2.3 From c0883c1af14c5d351201ace00b1a46df2b157329 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Thu, 29 May 2025 01:42:39 -0700 Subject: virtio: Fix typo in register_virtio_device() doc comment Corrected "suceess" to "success" in the function documentation for clarity. Signed-off-by: Alok Tiwari Acked-by: Jason Wang Message-Id: <20250529084350.3145699-1-alok.a.tiwari@oracle.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index c441c8cc71ef..b25eadf59477 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -506,7 +506,7 @@ out: * On error, the caller must call put_device on &@dev->dev (and not kfree), * as another code path may have obtained a reference to @dev. * - * Returns: 0 on suceess, -error on failure + * Returns: 0 on success, -error on failure */ int register_virtio_device(struct virtio_device *dev) { -- cgit v1.2.3 From 32d89a405adc204d82bea6ae2ba27a62d35568b4 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Wed, 4 Jun 2025 14:55:48 +0800 Subject: vhost: Use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)) cocci warning: ./kernel/vhost_task.c:148:9-16: WARNING: ERR_CAST can be used with tsk Use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)). Signed-off-by: Pei Xiao Message-Id: <1a8499a5da53e4f72cf21aca044ae4b26db8b2ad.1749020055.git.xiaopei01@kylinos.cn> Signed-off-by: Michael S. Tsirkin --- kernel/vhost_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 2f844c279a3e..bc738fa90c1d 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return ERR_PTR(PTR_ERR(tsk)); + return ERR_CAST(tsk); } vtsk->task = tsk; -- cgit v1.2.3 From 6f0f3d7fc4e05797b801ded4910a64d16db230e9 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 4 Jun 2025 21:48:01 +0300 Subject: vdpa/mlx5: Fix needs_teardown flag calculation needs_teardown is a device flag that indicates when virtual queues need to be recreated. This happens for certain configuration changes: queue size and some specific features. Currently, the needs_teardown state can be incorrectly reset by subsequent .set_vq_num() calls. For example, for 1 rx VQ with size 512 and 1 tx VQ with size 256: .set_vq_num(0, 512) -> sets needs_teardown to true (rx queue has a non-default size) .set_vq_num(1, 256) -> sets needs_teardown to false (tx queue has a default size) This change takes into account the previous value of the needs_teardown flag when re-calculating it during VQ size configuration. Fixes: 0fe963d6fc16 ("vdpa/mlx5: Re-create HW VQs under certain conditions") Signed-off-by: Dragos Tatulea Reviewed-by: Shahar Shitrit Reviewed-by: Si-Wei Liu Tested-by: Si-Wei Liu Message-Id: <20250604184802.2625300-1-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index cccc49a08a1a..efb5fa694f1e 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2491,7 +2491,7 @@ static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num) } mvq = &ndev->vqs[idx]; - ndev->needs_teardown = num != mvq->num_ent; + ndev->needs_teardown |= num != mvq->num_ent; mvq->num_ent = num; } -- cgit v1.2.3 From 652abad08571a067b16c5bb47ad5ea7478517e7d Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Wed, 11 Jun 2025 07:39:21 -0700 Subject: vhost-scsi: Fix typos and formatting in comments and logs This patch corrects several minor typos and formatting issues. Changes include: Fixing misspellings like in comments - "explict" -> "explicit" - "infight" -> "inflight", - "with generate" -> "will generate" formatting in logs - Correcting log formatting specifier from "%dd" to "%d" - Adding a missing space in the sysfs emit string to prevent misinterpreted output like "X86_64on ". changing to "X86_64 on " - Cleaning up stray semicolons in struct definition endings These changes improve code readability and consistency. no functionality changes. Signed-off-by: Alok Tiwari Message-Id: <20250611143932.2443796-1-alok.a.tiwari@oracle.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Mike Christie --- drivers/vhost/scsi.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index c12a0d4e6386..508ff3b29f39 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -152,7 +152,7 @@ struct vhost_scsi_nexus { struct vhost_scsi_tpg { /* Vhost port target portal group tag for TCM */ u16 tport_tpgt; - /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */ + /* Used to track number of TPG Port/Lun Links wrt to explicit I_T Nexus shutdown */ int tv_tpg_port_count; /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ int tv_tpg_vhost_count; @@ -311,12 +311,12 @@ static void vhost_scsi_init_inflight(struct vhost_scsi *vs, mutex_lock(&vq->mutex); - /* store old infight */ + /* store old inflight */ idx = vs->vqs[i].inflight_idx; if (old_inflight) old_inflight[i] = &vs->vqs[i].inflights[idx]; - /* setup new infight */ + /* setup new inflight */ vs->vqs[i].inflight_idx = idx ^ 1; new_inflight = &vs->vqs[i].inflights[idx ^ 1]; kref_init(&new_inflight->kref); @@ -1249,7 +1249,7 @@ vhost_scsi_setup_resp_iovs(struct vhost_scsi_cmd *cmd, struct iovec *in_iovs, if (!in_iovs_cnt) return 0; /* - * Initiator's normally just put the virtio_scsi_cmd_resp in the first + * Initiators normally just put the virtio_scsi_cmd_resp in the first * iov, but just in case they wedged in some data with it we check for * greater than or equal to the response struct. */ @@ -1457,7 +1457,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) cmd = vhost_scsi_get_cmd(vq, tag); if (IS_ERR(cmd)) { ret = PTR_ERR(cmd); - vq_err(vq, "vhost_scsi_get_tag failed %dd\n", ret); + vq_err(vq, "vhost_scsi_get_tag failed %d\n", ret); goto err; } cmd->tvc_vq = vq; @@ -2609,7 +2609,7 @@ static int vhost_scsi_make_nexus(struct vhost_scsi_tpg *tpg, return -ENOMEM; } /* - * Since we are running in 'demo mode' this call with generate a + * Since we are running in 'demo mode' this call will generate a * struct se_node_acl for the vhost_scsi struct se_portal_group with * the SCSI Initiator port name of the passed configfs group 'name'. */ @@ -2915,7 +2915,7 @@ static ssize_t vhost_scsi_wwn_version_show(struct config_item *item, char *page) { return sysfs_emit(page, "TCM_VHOST fabric module %s on %s/%s" - "on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname, + " on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname, utsname()->machine); } @@ -2983,13 +2983,13 @@ out_vhost_scsi_deregister: vhost_scsi_deregister(); out: return ret; -}; +} static void vhost_scsi_exit(void) { target_unregister_template(&vhost_scsi_ops); vhost_scsi_deregister(); -}; +} MODULE_DESCRIPTION("VHOST_SCSI series fabric driver"); MODULE_ALIAS("tcm_vhost"); -- cgit v1.2.3 From 69cd720a8a5e9ef0f05ce5dd8c9ea6e018245c82 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 11 Jun 2025 16:01:13 -0500 Subject: vhost-scsi: Fix log flooding with target does not exist errors As part of the normal initiator side scanning the guest's scsi layer will loop over all possible targets and send an inquiry. Since the max number of targets for virtio-scsi is 256, this can result in 255 error messages about targets not existing if you only have a single target. When there's more than 1 vhost-scsi device each with a single target, then you get N * 255 log messages. It looks like the log message was added by accident in: commit 3f8ca2e115e5 ("vhost/scsi: Extract common handling code from control queue handler") when we added common helpers. Then in: commit 09d7583294aa ("vhost/scsi: Use common handling code in request queue handler") we converted the scsi command processing path to use the new helpers so we started to see the extra log messages during scanning. The patches were just making some code common but added the vq_err call and I'm guessing the patch author forgot to enable the vq_err call (vq_err is implemented by pr_debug which defaults to off). So this patch removes the call since it's expected to hit this path during device discovery. Fixes: 09d7583294aa ("vhost/scsi: Use common handling code in request queue handler") Signed-off-by: Mike Christie Reviewed-by: Stefan Hajnoczi Reviewed-by: Stefano Garzarella Message-Id: <20250611210113.10912-1-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/scsi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 508ff3b29f39..fd9a517dfe13 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1226,10 +1226,8 @@ vhost_scsi_get_req(struct vhost_virtqueue *vq, struct vhost_scsi_ctx *vc, /* validated at handler entry */ vs_tpg = vhost_vq_get_backend(vq); tpg = READ_ONCE(vs_tpg[*vc->target]); - if (unlikely(!tpg)) { - vq_err(vq, "Target 0x%x does not exist\n", *vc->target); + if (unlikely(!tpg)) goto out; - } } if (tpgp) -- cgit v1.2.3 From 569c392e191361cd05fba1fd87ed02ef0d130ef7 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 17 Jun 2025 01:18:36 +0100 Subject: vhost: vringh: Remove unused iotlb functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions: vringh_abandon_iotlb() vringh_notify_disable_iotlb() and vringh_notify_enable_iotlb() were added in 2020 by commit 9ad9c49cfe97 ("vringh: IOTLB support") but have remained unused. Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Message-Id: <20250617001838.114457-2-linux@treblig.org> Signed-off-by: Michael S. Tsirkin Acked-by: Eugenio Pérez Tested-by: Lei Yang --- drivers/vhost/vringh.c | 43 ------------------------------------------- include/linux/vringh.h | 5 ----- 2 files changed, 48 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index bbce65452701..67a028d6fb5f 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -1534,23 +1534,6 @@ ssize_t vringh_iov_push_iotlb(struct vringh *vrh, } EXPORT_SYMBOL(vringh_iov_push_iotlb); -/** - * vringh_abandon_iotlb - we've decided not to handle the descriptor(s). - * @vrh: the vring. - * @num: the number of descriptors to put back (ie. num - * vringh_get_iotlb() to undo). - * - * The next vringh_get_iotlb() will return the old descriptor(s) again. - */ -void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num) -{ - /* We only update vring_avail_event(vr) when we want to be notified, - * so we haven't changed that yet. - */ - vrh->last_avail_idx -= num; -} -EXPORT_SYMBOL(vringh_abandon_iotlb); - /** * vringh_complete_iotlb - we've finished with descriptor, publish it. * @vrh: the vring. @@ -1571,32 +1554,6 @@ int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len) } EXPORT_SYMBOL(vringh_complete_iotlb); -/** - * vringh_notify_enable_iotlb - we want to know if something changes. - * @vrh: the vring. - * - * This always enables notifications, but returns false if there are - * now more buffers available in the vring. - */ -bool vringh_notify_enable_iotlb(struct vringh *vrh) -{ - return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb); -} -EXPORT_SYMBOL(vringh_notify_enable_iotlb); - -/** - * vringh_notify_disable_iotlb - don't tell us if something changes. - * @vrh: the vring. - * - * This is our normal running state: we disable and then only enable when - * we're going to sleep. - */ -void vringh_notify_disable_iotlb(struct vringh *vrh) -{ - __vringh_notify_disable(vrh, putu16_iotlb); -} -EXPORT_SYMBOL(vringh_notify_disable_iotlb); - /** * vringh_need_notify_iotlb - must we tell the other side about used buffers? * @vrh: the vring we've called vringh_complete_iotlb() on. diff --git a/include/linux/vringh.h b/include/linux/vringh.h index c3a8117dabe8..af8bd2695a7b 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -319,13 +319,8 @@ ssize_t vringh_iov_push_iotlb(struct vringh *vrh, struct vringh_kiov *wiov, const void *src, size_t len); -void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num); - int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len); -bool vringh_notify_enable_iotlb(struct vringh *vrh); -void vringh_notify_disable_iotlb(struct vringh *vrh); - int vringh_need_notify_iotlb(struct vringh *vrh); #endif /* CONFIG_VHOST_IOTLB */ -- cgit v1.2.3 From 6e9ef6937c726b97d4a6d49332d06e999acc15f5 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 17 Jun 2025 01:18:37 +0100 Subject: vhost: vringh: Remove unused functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions: vringh_abandon_kern() vringh_abandon_user() vringh_iov_pull_kern() and vringh_iov_push_kern() were all added in 2013 by commit f87d0fbb5798 ("vringh: host-side implementation of virtio rings.") but have remained unused. Remove them and the two helper functions they used. Signed-off-by: Dr. David Alan Gilbert Message-Id: <20250617001838.114457-3-linux@treblig.org> Signed-off-by: Michael S. Tsirkin Acked-by: Eugenio Pérez Tested-by: Lei Yang Reviewed-by: Simon Horman --- drivers/vhost/vringh.c | 75 -------------------------------------------------- include/linux/vringh.h | 7 ----- 2 files changed, 82 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 67a028d6fb5f..9f27c3f6091b 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -779,22 +779,6 @@ ssize_t vringh_iov_push_user(struct vringh_iov *wiov, } EXPORT_SYMBOL(vringh_iov_push_user); -/** - * vringh_abandon_user - we've decided not to handle the descriptor(s). - * @vrh: the vring. - * @num: the number of descriptors to put back (ie. num - * vringh_get_user() to undo). - * - * The next vringh_get_user() will return the old descriptor(s) again. - */ -void vringh_abandon_user(struct vringh *vrh, unsigned int num) -{ - /* We only update vring_avail_event(vr) when we want to be notified, - * so we haven't changed that yet. */ - vrh->last_avail_idx -= num; -} -EXPORT_SYMBOL(vringh_abandon_user); - /** * vringh_complete_user - we've finished with descriptor, publish it. * @vrh: the vring. @@ -900,20 +884,6 @@ static inline int putused_kern(const struct vringh *vrh, return 0; } -static inline int xfer_kern(const struct vringh *vrh, void *src, - void *dst, size_t len) -{ - memcpy(dst, src, len); - return 0; -} - -static inline int kern_xfer(const struct vringh *vrh, void *dst, - void *src, size_t len) -{ - memcpy(dst, src, len); - return 0; -} - /** * vringh_init_kern - initialize a vringh for a kernelspace vring. * @vrh: the vringh to initialize. @@ -998,51 +968,6 @@ int vringh_getdesc_kern(struct vringh *vrh, } EXPORT_SYMBOL(vringh_getdesc_kern); -/** - * vringh_iov_pull_kern - copy bytes from vring_iov. - * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) - * @dst: the place to copy. - * @len: the maximum length to copy. - * - * Returns the bytes copied <= len or a negative errno. - */ -ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) -{ - return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern); -} -EXPORT_SYMBOL(vringh_iov_pull_kern); - -/** - * vringh_iov_push_kern - copy bytes into vring_iov. - * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) - * @src: the place to copy from. - * @len: the maximum length to copy. - * - * Returns the bytes copied <= len or a negative errno. - */ -ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, - const void *src, size_t len) -{ - return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer); -} -EXPORT_SYMBOL(vringh_iov_push_kern); - -/** - * vringh_abandon_kern - we've decided not to handle the descriptor(s). - * @vrh: the vring. - * @num: the number of descriptors to put back (ie. num - * vringh_get_kern() to undo). - * - * The next vringh_get_kern() will return the old descriptor(s) again. - */ -void vringh_abandon_kern(struct vringh *vrh, unsigned int num) -{ - /* We only update vring_avail_event(vr) when we want to be notified, - * so we haven't changed that yet. */ - vrh->last_avail_idx -= num; -} -EXPORT_SYMBOL(vringh_abandon_kern); - /** * vringh_complete_kern - we've finished with descriptor, publish it. * @vrh: the vring. diff --git a/include/linux/vringh.h b/include/linux/vringh.h index af8bd2695a7b..49e7cbc9697a 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -175,9 +175,6 @@ int vringh_complete_multi_user(struct vringh *vrh, const struct vring_used_elem used[], unsigned num_used); -/* Pretend we've never seen descriptor (for easy error handling). */ -void vringh_abandon_user(struct vringh *vrh, unsigned int num); - /* Do we need to fire the eventfd to notify the other side? */ int vringh_need_notify_user(struct vringh *vrh); @@ -235,10 +232,6 @@ int vringh_getdesc_kern(struct vringh *vrh, u16 *head, gfp_t gfp); -ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len); -ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, - const void *src, size_t len); -void vringh_abandon_kern(struct vringh *vrh, unsigned int num); int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len); bool vringh_notify_enable_kern(struct vringh *vrh); -- cgit v1.2.3 From 8a0d18a9348f61c63b33a23b9eece1f66af1d70c Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Sun, 15 Jun 2025 10:39:11 -0700 Subject: vhost: Fix typos Fix multiple typos and improve comment clarity across vhost.c. Spelling errors: "thead" -> "thread", "RUNNUNG" -> "RUNNING" and "available". Signed-off-by: Alok Tiwari Message-Id: <20250615173933.1610324-1-alok.a.tiwari@oracle.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Simon Horman --- drivers/vhost/vhost.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 3a5ebb973dba..4390e3a14218 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -594,10 +594,10 @@ static void vhost_attach_mm(struct vhost_dev *dev) if (dev->use_worker) { dev->mm = get_task_mm(current); } else { - /* vDPA device does not use worker thead, so there's - * no need to hold the address space for mm. This help + /* vDPA device does not use worker thread, so there's + * no need to hold the address space for mm. This helps * to avoid deadlock in the case of mmap() which may - * held the refcnt of the file and depends on release + * hold the refcnt of the file and depends on release * method to remove vma. */ dev->mm = current->mm; @@ -731,7 +731,7 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, * We don't want to call synchronize_rcu for every vq during setup * because it will slow down VM startup. If we haven't done * VHOST_SET_VRING_KICK and not done the driver specific - * SET_ENDPOINT/RUNNUNG then we can skip the sync since there will + * SET_ENDPOINT/RUNNING then we can skip the sync since there will * not be any works queued for scsi and net. */ mutex_lock(&vq->mutex); @@ -2860,7 +2860,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev, } EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); -/* return true if we're sure that avaiable ring is empty */ +/* return true if we're sure that available ring is empty */ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) { int r; -- cgit v1.2.3 From 95109b46764665e3de4a118185e4f732e8e849fd Mon Sep 17 00:00:00 2001 From: WangYuli Date: Mon, 23 Jun 2025 14:52:10 +0800 Subject: virtio: virtio_dma_buf: fix missing parameter documentation Add missing parameter documentation for virtio_dma_buf_attach() function to fix kernel-doc warnings: Warning: drivers/virtio/virtio_dma_buf.c:41 function parameter 'dma_buf' not described in 'virtio_dma_buf_attach' Warning: drivers/virtio/virtio_dma_buf.c:41 function parameter 'attach' not described in 'virtio_dma_buf_attach' The function documentation was missing descriptions for both the 'dma_buf' and 'attach' parameters. Add proper parameter documentation following kernel-doc format. Signed-off-by: WangYuli Message-Id: <241C7118259DA110+20250623065210.270237-1-wangyuli@uniontech.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Xuan Zhuo --- drivers/virtio/virtio_dma_buf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/virtio/virtio_dma_buf.c b/drivers/virtio/virtio_dma_buf.c index 3fe1d03b0645..95c10632f84a 100644 --- a/drivers/virtio/virtio_dma_buf.c +++ b/drivers/virtio/virtio_dma_buf.c @@ -36,6 +36,8 @@ EXPORT_SYMBOL(virtio_dma_buf_export); /** * virtio_dma_buf_attach - mandatory attach callback for virtio dma-bufs + * @dma_buf: [in] buffer to attach + * @attach: [in] attachment structure */ int virtio_dma_buf_attach(struct dma_buf *dma_buf, struct dma_buf_attachment *attach) -- cgit v1.2.3 From 400cad513c78f9af72c5a20f3611c1f1dc71d465 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Sat, 28 Jun 2025 11:33:53 -0700 Subject: vhost-scsi: Fix check for inline_sg_cnt exceeding preallocated limit The condition comparing ret to VHOST_SCSI_PREALLOC_SGLS was incorrect, as ret holds the result of kstrtouint() (typically 0 on success), not the parsed value. Update the check to use cnt, which contains the actual user-provided value. prevents silently accepting values exceeding the maximum inline_sg_cnt. Fixes: bca939d5bcd0 ("vhost-scsi: Dynamically allocate scatterlists") Signed-off-by: Alok Tiwari Reviewed-by: Mike Christie Reviewed-by: Stefan Hajnoczi Message-Id: <20250628183405.3979538-1-alok.a.tiwari@oracle.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index fd9a517dfe13..abf51332a5c5 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -71,7 +71,7 @@ static int vhost_scsi_set_inline_sg_cnt(const char *buf, if (ret) return ret; - if (ret > VHOST_SCSI_PREALLOC_SGLS) { + if (cnt > VHOST_SCSI_PREALLOC_SGLS) { pr_err("Max inline_sg_cnt is %u\n", VHOST_SCSI_PREALLOC_SGLS); return -EINVAL; } -- cgit v1.2.3 From cc51a66815999afb7e9cd845968de4fdf07567b7 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Tue, 8 Jul 2025 12:04:24 +0000 Subject: vdpa/mlx5: Fix release of uninitialized resources on error path The commit in the fixes tag made sure that mlx5_vdpa_free() is the single entrypoint for removing the vdpa device resources added in mlx5_vdpa_dev_add(), even in the cleanup path of mlx5_vdpa_dev_add(). This means that all functions from mlx5_vdpa_free() should be able to handle uninitialized resources. This was not the case though: mlx5_vdpa_destroy_mr_resources() and mlx5_cmd_cleanup_async_ctx() were not able to do so. This caused the splat below when adding a vdpa device without a MAC address. This patch fixes these remaining issues: - Makes mlx5_vdpa_destroy_mr_resources() return early if called on uninitialized resources. - Moves mlx5_cmd_init_async_ctx() early on during device addition because it can't fail. This means that mlx5_cmd_cleanup_async_ctx() also can't fail. To mirror this, move the call site of mlx5_cmd_cleanup_async_ctx() in mlx5_vdpa_free(). An additional comment was added in mlx5_vdpa_free() to document the expectations of functions called from this context. Splat: mlx5_core 0000:b5:03.2: mlx5_vdpa_dev_add:3950:(pid 2306) warning: No mac address provisioned? ------------[ cut here ]------------ WARNING: CPU: 13 PID: 2306 at kernel/workqueue.c:4207 __flush_work+0x9a/0xb0 [...] Call Trace: ? __try_to_del_timer_sync+0x61/0x90 ? __timer_delete_sync+0x2b/0x40 mlx5_vdpa_destroy_mr_resources+0x1c/0x40 [mlx5_vdpa] mlx5_vdpa_free+0x45/0x160 [mlx5_vdpa] vdpa_release_dev+0x1e/0x50 [vdpa] device_release+0x31/0x90 kobject_cleanup+0x37/0x130 mlx5_vdpa_dev_add+0x327/0x890 [mlx5_vdpa] vdpa_nl_cmd_dev_add_set_doit+0x2c1/0x4d0 [vdpa] genl_family_rcv_msg_doit+0xd8/0x130 genl_family_rcv_msg+0x14b/0x220 ? __pfx_vdpa_nl_cmd_dev_add_set_doit+0x10/0x10 [vdpa] genl_rcv_msg+0x47/0xa0 ? __pfx_genl_rcv_msg+0x10/0x10 netlink_rcv_skb+0x53/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x27b/0x3b0 netlink_sendmsg+0x1f7/0x430 __sys_sendto+0x1fa/0x210 ? ___pte_offset_map+0x17/0x160 ? next_uptodate_folio+0x85/0x2b0 ? percpu_counter_add_batch+0x51/0x90 ? filemap_map_pages+0x515/0x660 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x7b/0x2c0 ? do_read_fault+0x108/0x220 ? do_pte_missing+0x14a/0x3e0 ? __handle_mm_fault+0x321/0x730 ? count_memcg_events+0x13f/0x180 ? handle_mm_fault+0x1fb/0x2d0 ? do_user_addr_fault+0x20c/0x700 ? syscall_exit_work+0x104/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f0c25b0feca [...] ---[ end trace 0000000000000000 ]--- Signed-off-by: Dragos Tatulea Fixes: 83e445e64f48 ("vdpa/mlx5: Fix error path during device add") Reported-by: Wenli Quan Closes: https://lore.kernel.org/virtualization/CADZSLS0r78HhZAStBaN1evCSoPqRJU95Lt8AqZNJ6+wwYQ6vPQ@mail.gmail.com/ Reviewed-by: Tariq Toukan Reviewed-by: Cosmin Ratiu Message-Id: <20250708120424.2363354-2-dtatulea@nvidia.com> Tested-by: Wenli Quan Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mr.c | 3 +++ drivers/vdpa/mlx5/net/mlx5_vnet.c | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 61424342c096..c7a20278bc3c 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -908,6 +908,9 @@ void mlx5_vdpa_destroy_mr_resources(struct mlx5_vdpa_dev *mvdev) { struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; + if (!mres->wq_gc) + return; + atomic_set(&mres->shutdown, 1); flush_delayed_work(&mres->gc_dwork_ent); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index efb5fa694f1e..0ed2fc28e1ce 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3432,15 +3432,17 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev) ndev = to_mlx5_vdpa_ndev(mvdev); + /* Functions called here should be able to work with + * uninitialized resources. + */ free_fixed_resources(ndev); mlx5_vdpa_clean_mrs(mvdev); mlx5_vdpa_destroy_mr_resources(&ndev->mvdev); - mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx); - if (!is_zero_ether_addr(ndev->config.mac)) { pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); } + mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx); mlx5_vdpa_free_resources(&ndev->mvdev); free_irqs(ndev); kfree(ndev->event_cbs); @@ -3888,6 +3890,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, mvdev->actual_features = (device_features & BIT_ULL(VIRTIO_F_VERSION_1)); + mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx); + ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL); ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL); if (!ndev->vqs || !ndev->event_cbs) { @@ -3960,8 +3964,6 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, ndev->rqt_size = 1; } - mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx); - ndev->mvdev.mlx_features = device_features; mvdev->vdev.dma_dev = &mdev->pdev->dev; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); -- cgit v1.2.3 From d9ea58b5dc6b4b50fbb6a10c73f840e8b10442b7 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 4 Jul 2025 14:53:35 +0200 Subject: vdpa: Fix IDR memory leak in VDUSE module exit Add missing idr_destroy() call in vduse_exit() to properly free the vduse_idr radix tree nodes. Without this, module load/unload cycles leak 576-byte radix tree node allocations, detectable by kmemleak as: unreferenced object (size 576): backtrace: [] radix_tree_node_alloc+0xa0/0xf0 [] idr_get_free+0x128/0x280 The vduse_idr is initialized via DEFINE_IDR() at line 136 and used throughout the VDUSE (vDPA Device in Userspace) driver for device ID management. The fix follows the documented pattern in lib/idr.c and matches the cleanup approach used by other drivers. This leak was discovered through comprehensive module testing with cumulative kmemleak detection across 10 load/unload iterations per module. Fixes: c8a6153b6c59 ("vduse: Introduce VDUSE - vDPA Device in Userspace") Signed-off-by: Anders Roxell Message-Id: <20250704125335.1084649-1-anders.roxell@linaro.org> Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_user/vduse_dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 6a9a37351310..04620bb77203 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -2216,6 +2216,7 @@ static void vduse_exit(void) cdev_del(&vduse_ctrl_cdev); unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); class_unregister(&vduse_class); + idr_destroy(&vduse_idr); } module_exit(vduse_exit); -- cgit v1.2.3 From 7d9896e9f6d02d8aa85e63f736871f96c59a5263 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Mon, 14 Jul 2025 15:12:32 +0800 Subject: vhost: Reintroduce kthread API and add mode selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads"), the vhost uses vhost_task and operates as a child of the owner thread. This is required for correct CPU usage accounting, especially when using containers. However, this change has caused confusion for some legacy userspace applications, and we didn't notice until it's too late. Unfortunately, it's too late to revert - we now have userspace depending both on old and new behaviour :( To address the issue, reintroduce kthread mode for vhost workers and provide a configuration to select between kthread and task worker. - Add 'fork_owner' parameter to vhost_dev to let users select kthread or task mode. Default mode is task mode(VHOST_FORK_OWNER_TASK). - Reintroduce kthread mode support: * Bring back the original vhost_worker() implementation, and renamed to vhost_run_work_kthread_list(). * Add cgroup support for the kthread * Introduce struct vhost_worker_ops: - Encapsulates create / stop / wake‑up callbacks. - vhost_worker_create() selects the proper ops according to inherit_owner. - Userspace configuration interface: * New IOCTLs: - VHOST_SET_FORK_FROM_OWNER lets userspace select task mode (VHOST_FORK_OWNER_TASK) or kthread mode (VHOST_FORK_OWNER_KTHREAD) - VHOST_GET_FORK_FROM_OWNER reads the current worker mode * Expose module parameter 'fork_from_owner_default' to allow system administrators to configure the default mode for vhost workers * Kconfig option CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL controls whether these IOCTLs and the parameter are available - The VHOST_NEW_WORKER functionality requires fork_owner to be set to true, with validation added to ensure proper configuration This partially reverts or improves upon: commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads") commit 1cdaafa1b8b4 ("vhost: replace single worker pointer with xarray") Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads"), Signed-off-by: Cindy Lu Message-Id: <20250714071333.59794-2-lulu@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Tested-by: Lei Yang --- drivers/vhost/Kconfig | 18 ++++ drivers/vhost/vhost.c | 244 +++++++++++++++++++++++++++++++++++++++++---- drivers/vhost/vhost.h | 22 ++++ include/uapi/linux/vhost.h | 29 ++++++ 4 files changed, 295 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 020d4fbb947c..bc0f38574497 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -95,4 +95,22 @@ config VHOST_CROSS_ENDIAN_LEGACY If unsure, say "N". +config VHOST_ENABLE_FORK_OWNER_CONTROL + bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL" + default y + help + This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and + VHOST_GET_FORK_FROM_OWNER. These allow userspace applications + to modify the vhost worker mode for vhost devices. + + Also expose module parameter 'fork_from_owner_default' to allow users + to configure the default mode for vhost workers. + + By default, `VHOST_ENABLE_FORK_OWNER_CONTROL` is set to `y`, + users can change the worker thread mode as needed. + If this config is disabled (n),the related IOCTLs and parameters will + be unavailable. + + If unsure, say "Y". + endif diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4390e3a14218..f4c1bc6adeda 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,13 @@ static int max_iotlb_entries = 2048; module_param(max_iotlb_entries, int, 0444); MODULE_PARM_DESC(max_iotlb_entries, "Maximum number of iotlb entries. (default: 2048)"); +static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK; + +#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL +module_param(fork_from_owner_default, bool, 0444); +MODULE_PARM_DESC(fork_from_owner_default, + "Set task mode as the default(default: Y)"); +#endif enum { VHOST_MEMORY_F_LOG = 0x1, @@ -242,7 +250,7 @@ static void vhost_worker_queue(struct vhost_worker *worker, * test_and_set_bit() implies a memory barrier. */ llist_add(&work->node, &worker->work_list); - vhost_task_wake(worker->vtsk); + worker->ops->wakeup(worker); } } @@ -388,6 +396,44 @@ static void vhost_vq_reset(struct vhost_dev *dev, __vhost_vq_meta_reset(vq); } +static int vhost_run_work_kthread_list(void *data) +{ + struct vhost_worker *worker = data; + struct vhost_work *work, *work_next; + struct vhost_dev *dev = worker->dev; + struct llist_node *node; + + kthread_use_mm(dev->mm); + + for (;;) { + /* mb paired w/ kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + node = llist_del_all(&worker->work_list); + if (!node) + schedule(); + + node = llist_reverse_order(node); + /* make sure flag is seen after deletion */ + smp_wmb(); + llist_for_each_entry_safe(work, work_next, node, node) { + clear_bit(VHOST_WORK_QUEUED, &work->flags); + __set_current_state(TASK_RUNNING); + kcov_remote_start_common(worker->kcov_handle); + work->fn(work); + kcov_remote_stop(); + cond_resched(); + } + } + kthread_unuse_mm(dev->mm); + + return 0; +} + static bool vhost_run_work_list(void *data) { struct vhost_worker *worker = data; @@ -552,6 +598,7 @@ void vhost_dev_init(struct vhost_dev *dev, dev->byte_weight = byte_weight; dev->use_worker = use_worker; dev->msg_handler = msg_handler; + dev->fork_owner = fork_from_owner_default; init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); INIT_LIST_HEAD(&dev->pending_list); @@ -581,6 +628,46 @@ long vhost_dev_check_owner(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_check_owner); +struct vhost_attach_cgroups_struct { + struct vhost_work work; + struct task_struct *owner; + int ret; +}; + +static void vhost_attach_cgroups_work(struct vhost_work *work) +{ + struct vhost_attach_cgroups_struct *s; + + s = container_of(work, struct vhost_attach_cgroups_struct, work); + s->ret = cgroup_attach_task_all(s->owner, current); +} + +static int vhost_attach_task_to_cgroups(struct vhost_worker *worker) +{ + struct vhost_attach_cgroups_struct attach; + int saved_cnt; + + attach.owner = current; + + vhost_work_init(&attach.work, vhost_attach_cgroups_work); + vhost_worker_queue(worker, &attach.work); + + mutex_lock(&worker->mutex); + + /* + * Bypass attachment_cnt check in __vhost_worker_flush: + * Temporarily change it to INT_MAX to bypass the check + */ + saved_cnt = worker->attachment_cnt; + worker->attachment_cnt = INT_MAX; + __vhost_worker_flush(worker); + worker->attachment_cnt = saved_cnt; + + mutex_unlock(&worker->mutex); + + return attach.ret; +} + /* Caller should have device mutex */ bool vhost_dev_has_owner(struct vhost_dev *dev) { @@ -626,7 +713,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev, WARN_ON(!llist_empty(&worker->work_list)); xa_erase(&dev->worker_xa, worker->id); - vhost_task_stop(worker->vtsk); + worker->ops->stop(worker); kfree(worker); } @@ -649,42 +736,115 @@ static void vhost_workers_free(struct vhost_dev *dev) xa_destroy(&dev->worker_xa); } +static void vhost_task_wakeup(struct vhost_worker *worker) +{ + return vhost_task_wake(worker->vtsk); +} + +static void vhost_kthread_wakeup(struct vhost_worker *worker) +{ + wake_up_process(worker->kthread_task); +} + +static void vhost_task_do_stop(struct vhost_worker *worker) +{ + return vhost_task_stop(worker->vtsk); +} + +static void vhost_kthread_do_stop(struct vhost_worker *worker) +{ + kthread_stop(worker->kthread_task); +} + +static int vhost_task_worker_create(struct vhost_worker *worker, + struct vhost_dev *dev, const char *name) +{ + struct vhost_task *vtsk; + u32 id; + int ret; + + vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, + worker, name); + if (IS_ERR(vtsk)) + return PTR_ERR(vtsk); + + worker->vtsk = vtsk; + vhost_task_start(vtsk); + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); + if (ret < 0) { + vhost_task_do_stop(worker); + return ret; + } + worker->id = id; + return 0; +} + +static int vhost_kthread_worker_create(struct vhost_worker *worker, + struct vhost_dev *dev, const char *name) +{ + struct task_struct *task; + u32 id; + int ret; + + task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name); + if (IS_ERR(task)) + return PTR_ERR(task); + + worker->kthread_task = task; + wake_up_process(task); + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); + if (ret < 0) + goto stop_worker; + + ret = vhost_attach_task_to_cgroups(worker); + if (ret) + goto stop_worker; + + worker->id = id; + return 0; + +stop_worker: + vhost_kthread_do_stop(worker); + return ret; +} + +static const struct vhost_worker_ops kthread_ops = { + .create = vhost_kthread_worker_create, + .stop = vhost_kthread_do_stop, + .wakeup = vhost_kthread_wakeup, +}; + +static const struct vhost_worker_ops vhost_task_ops = { + .create = vhost_task_worker_create, + .stop = vhost_task_do_stop, + .wakeup = vhost_task_wakeup, +}; + static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) { struct vhost_worker *worker; - struct vhost_task *vtsk; char name[TASK_COMM_LEN]; int ret; - u32 id; + const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops : + &kthread_ops; worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT); if (!worker) return NULL; worker->dev = dev; + worker->ops = ops; snprintf(name, sizeof(name), "vhost-%d", current->pid); - vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, - worker, name); - if (IS_ERR(vtsk)) - goto free_worker; - mutex_init(&worker->mutex); init_llist_head(&worker->work_list); worker->kcov_handle = kcov_common_handle(); - worker->vtsk = vtsk; - - vhost_task_start(vtsk); - - ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); + ret = ops->create(worker, dev, name); if (ret < 0) - goto stop_worker; - worker->id = id; + goto free_worker; return worker; -stop_worker: - vhost_task_stop(vtsk); free_worker: kfree(worker); return NULL; @@ -865,6 +1025,14 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, switch (ioctl) { /* dev worker ioctls */ case VHOST_NEW_WORKER: + /* + * vhost_tasks will account for worker threads under the parent's + * NPROC value but kthreads do not. To avoid userspace overflowing + * the system with worker threads fork_owner must be true. + */ + if (!dev->fork_owner) + return -EFAULT; + ret = vhost_new_worker(dev, &state); if (!ret && copy_to_user(argp, &state, sizeof(state))) ret = -EFAULT; @@ -982,6 +1150,7 @@ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem) vhost_dev_cleanup(dev); + dev->fork_owner = fork_from_owner_default; dev->umem = umem; /* We don't need VQ locks below since vhost_dev_cleanup makes sure * VQs aren't running. @@ -2135,6 +2304,45 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) goto done; } +#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL + if (ioctl == VHOST_SET_FORK_FROM_OWNER) { + /* Only allow modification before owner is set */ + if (vhost_dev_has_owner(d)) { + r = -EBUSY; + goto done; + } + u8 fork_owner_val; + + if (get_user(fork_owner_val, (u8 __user *)argp)) { + r = -EFAULT; + goto done; + } + if (fork_owner_val != VHOST_FORK_OWNER_TASK && + fork_owner_val != VHOST_FORK_OWNER_KTHREAD) { + r = -EINVAL; + goto done; + } + d->fork_owner = !!fork_owner_val; + r = 0; + goto done; + } + if (ioctl == VHOST_GET_FORK_FROM_OWNER) { + u8 fork_owner_val = d->fork_owner; + + if (fork_owner_val != VHOST_FORK_OWNER_TASK && + fork_owner_val != VHOST_FORK_OWNER_KTHREAD) { + r = -EINVAL; + goto done; + } + if (put_user(fork_owner_val, (u8 __user *)argp)) { + r = -EFAULT; + goto done; + } + r = 0; + goto done; + } +#endif + /* You must be the owner to do anything else */ r = vhost_dev_check_owner(d); if (r) diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index bb75a292d50c..ab704d84fb34 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -26,7 +26,18 @@ struct vhost_work { unsigned long flags; }; +struct vhost_worker; +struct vhost_dev; + +struct vhost_worker_ops { + int (*create)(struct vhost_worker *worker, struct vhost_dev *dev, + const char *name); + void (*stop)(struct vhost_worker *worker); + void (*wakeup)(struct vhost_worker *worker); +}; + struct vhost_worker { + struct task_struct *kthread_task; struct vhost_task *vtsk; struct vhost_dev *dev; /* Used to serialize device wide flushing with worker swapping. */ @@ -36,6 +47,7 @@ struct vhost_worker { u32 id; int attachment_cnt; bool killed; + const struct vhost_worker_ops *ops; }; /* Poll a file (eventfd or socket) */ @@ -176,6 +188,16 @@ struct vhost_dev { int byte_weight; struct xarray worker_xa; bool use_worker; + /* + * If fork_owner is true we use vhost_tasks to create + * the worker so all settings/limits like cgroups, NPROC, + * scheduler, etc are inherited from the owner. If false, + * we use kthreads and only attach to the same cgroups + * as the owner for compat with older kernels. + * here we use true as default value. + * The default value is set by fork_from_owner_default + */ + bool fork_owner; int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg); }; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index d4b3e2ae1314..e72f2655459e 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -235,4 +235,33 @@ */ #define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) + +/* fork_owner values for vhost */ +#define VHOST_FORK_OWNER_KTHREAD 0 +#define VHOST_FORK_OWNER_TASK 1 + +/** + * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device, + * This ioctl must called before VHOST_SET_OWNER. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @param fork_owner: An 8-bit value that determines the vhost thread mode + * + * When fork_owner is set to VHOST_FORK_OWNER_TASK(default value): + * - Vhost will create vhost worker as tasks forked from the owner, + * inheriting all of the owner's attributes. + * + * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD: + * - Vhost will create vhost workers as kernel threads. + */ +#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8) + +/** + * VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @return: An 8-bit value indicating the current thread mode. + */ +#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8) + #endif -- cgit v1.2.3 From b4ba1207d45adaafa2982c035898b36af2d3e518 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 14 Jul 2025 16:47:53 +0800 Subject: vhost: fail early when __vhost_add_used() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fails vhost_add_used_n() early when __vhost_add_used() fails to make sure used idx is not updated with stale used ring information. Reported-by: Eugenio Pérez Signed-off-by: Jason Wang Message-Id: <20250714084755.11921-2-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang --- drivers/vhost/vhost.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f4c1bc6adeda..b38e39242fb9 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2983,6 +2983,9 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, } r = __vhost_add_used_n(vq, heads, count); + if (r < 0) + return r; + /* Make sure buffer is written before we update index. */ smp_wmb(); if (vhost_put_used_idx(vq)) { -- cgit v1.2.3 From 67a873df0c410915275f735fedb401b9637d6faf Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 14 Jul 2025 16:47:54 +0800 Subject: vhost: basic in order support This patch adds basic in order support for vhost. Two optimizations are implemented in this patch: 1) Since driver uses descriptor in order, vhost can deduce the next avail ring head by counting the number of descriptors that has been used in next_avail_head. This eliminate the need to access the available ring in vhost. 2) vhost_add_used_and_singal_n() is extended to accept the number of batched buffers per used elem. While this increases the times of userspace memory access but it helps to reduce the chance of used ring access of both the driver and vhost. Vhost-net will be the first user for this. Acked-by: Jonah Palmer Signed-off-by: Jason Wang Message-Id: <20250714084755.11921-3-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang --- drivers/vhost/net.c | 6 ++- drivers/vhost/vhost.c | 120 +++++++++++++++++++++++++++++++++++++++++--------- drivers/vhost/vhost.h | 8 +++- 3 files changed, 109 insertions(+), 25 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 7cbfc7d718b3..4f9c67f17b49 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -374,7 +374,8 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, while (j) { add = min(UIO_MAXIOV - nvq->done_idx, j); vhost_add_used_and_signal_n(vq->dev, vq, - &vq->heads[nvq->done_idx], add); + &vq->heads[nvq->done_idx], + NULL, add); nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; j -= add; } @@ -457,7 +458,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) if (!nvq->done_idx) return; - vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx); + vhost_add_used_and_signal_n(dev, vq, vq->heads, NULL, + nvq->done_idx); nvq->done_idx = 0; } diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b38e39242fb9..a4873d116df1 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -372,6 +372,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->avail = NULL; vq->used = NULL; vq->last_avail_idx = 0; + vq->next_avail_head = 0; vq->avail_idx = 0; vq->last_used_idx = 0; vq->signalled_used = 0; @@ -501,6 +502,8 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq) vq->log = NULL; kfree(vq->heads); vq->heads = NULL; + kfree(vq->nheads); + vq->nheads = NULL; } /* Helper to allocate iovec buffers for all vqs. */ @@ -518,7 +521,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev) GFP_KERNEL); vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads), GFP_KERNEL); - if (!vq->indirect || !vq->log || !vq->heads) + vq->nheads = kmalloc_array(dev->iov_limit, sizeof(*vq->nheads), + GFP_KERNEL); + if (!vq->indirect || !vq->log || !vq->heads || !vq->nheads) goto err_nomem; } return 0; @@ -2159,14 +2164,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg break; } if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { - vq->last_avail_idx = s.num & 0xffff; + vq->next_avail_head = vq->last_avail_idx = + s.num & 0xffff; vq->last_used_idx = (s.num >> 16) & 0xffff; } else { if (s.num > 0xffff) { r = -EINVAL; break; } - vq->last_avail_idx = s.num; + vq->next_avail_head = vq->last_avail_idx = s.num; } /* Forget the cached index value. */ vq->avail_idx = vq->last_avail_idx; @@ -2798,11 +2804,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, unsigned int *out_num, unsigned int *in_num, struct vhost_log *log, unsigned int *log_num) { + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); struct vring_desc desc; unsigned int i, head, found = 0; u16 last_avail_idx = vq->last_avail_idx; __virtio16 ring_head; - int ret, access; + int ret, access, c = 0; if (vq->avail_idx == vq->last_avail_idx) { ret = vhost_get_avail_idx(vq); @@ -2813,17 +2820,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, return vq->num; } - /* Grab the next descriptor number they're advertising, and increment - * the index we've seen. */ - if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) { - vq_err(vq, "Failed to read head: idx %d address %p\n", - last_avail_idx, - &vq->avail->ring[last_avail_idx % vq->num]); - return -EFAULT; + if (in_order) + head = vq->next_avail_head & (vq->num - 1); + else { + /* Grab the next descriptor number they're + * advertising, and increment the index we've seen. */ + if (unlikely(vhost_get_avail_head(vq, &ring_head, + last_avail_idx))) { + vq_err(vq, "Failed to read head: idx %d address %p\n", + last_avail_idx, + &vq->avail->ring[last_avail_idx % vq->num]); + return -EFAULT; + } + head = vhost16_to_cpu(vq, ring_head); } - head = vhost16_to_cpu(vq, ring_head); - /* If their number is silly, that's an error. */ if (unlikely(head >= vq->num)) { vq_err(vq, "Guest says index %u > %u is available", @@ -2866,6 +2877,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, "in indirect descriptor at idx %d\n", i); return ret; } + ++c; continue; } @@ -2901,10 +2913,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, } *out_num += ret; } + ++c; } while ((i = next_desc(vq, &desc)) != -1); /* On success, increment avail index. */ vq->last_avail_idx++; + vq->next_avail_head += c; /* Assume notifications from guest are disabled at this point, * if they aren't we would need to update avail_event index. */ @@ -2928,8 +2942,9 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) cpu_to_vhost32(vq, head), cpu_to_vhost32(vq, len) }; + u16 nheads = 1; - return vhost_add_used_n(vq, &heads, 1); + return vhost_add_used_n(vq, &heads, &nheads, 1); } EXPORT_SYMBOL_GPL(vhost_add_used); @@ -2965,10 +2980,9 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, return 0; } -/* After we've used one of their buffers, we tell them about it. We'll then - * want to notify the guest, using eventfd. */ -int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, - unsigned count) +static int vhost_add_used_n_ooo(struct vhost_virtqueue *vq, + struct vring_used_elem *heads, + unsigned count) { int start, n, r; @@ -2981,7 +2995,69 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, heads += n; count -= n; } - r = __vhost_add_used_n(vq, heads, count); + return __vhost_add_used_n(vq, heads, count); +} + +static int vhost_add_used_n_in_order(struct vhost_virtqueue *vq, + struct vring_used_elem *heads, + const u16 *nheads, + unsigned count) +{ + vring_used_elem_t __user *used; + u16 old, new = vq->last_used_idx; + int start, i; + + if (!nheads) + return -EINVAL; + + start = vq->last_used_idx & (vq->num - 1); + used = vq->used->ring + start; + + for (i = 0; i < count; i++) { + if (vhost_put_used(vq, &heads[i], start, 1)) { + vq_err(vq, "Failed to write used"); + return -EFAULT; + } + start += nheads[i]; + new += nheads[i]; + if (start >= vq->num) + start -= vq->num; + } + + if (unlikely(vq->log_used)) { + /* Make sure data is seen before log. */ + smp_wmb(); + /* Log used ring entry write. */ + log_used(vq, ((void __user *)used - (void __user *)vq->used), + (vq->num - start) * sizeof *used); + if (start + count > vq->num) + log_used(vq, 0, + (start + count - vq->num) * sizeof *used); + } + + old = vq->last_used_idx; + vq->last_used_idx = new; + /* If the driver never bothers to signal in a very long while, + * used index might wrap around. If that happens, invalidate + * signalled_used index we stored. TODO: make sure driver + * signals at least once in 2^16 and remove this. */ + if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old))) + vq->signalled_used_valid = false; + return 0; +} + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to notify the guest, using eventfd. */ +int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, + u16 *nheads, unsigned count) +{ + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); + int r; + + if (!in_order || !nheads) + r = vhost_add_used_n_ooo(vq, heads, count); + else + r = vhost_add_used_n_in_order(vq, heads, nheads, count); if (r < 0) return r; @@ -3064,9 +3140,11 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal); /* multi-buffer version of vhost_add_used_and_signal */ void vhost_add_used_and_signal_n(struct vhost_dev *dev, struct vhost_virtqueue *vq, - struct vring_used_elem *heads, unsigned count) + struct vring_used_elem *heads, + u16 *nheads, + unsigned count) { - vhost_add_used_n(vq, heads, count); + vhost_add_used_n(vq, heads, nheads, count); vhost_signal(dev, vq); } EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index ab704d84fb34..24f3540b08a2 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -115,6 +115,8 @@ struct vhost_virtqueue { * Values are limited to 0x7fff, and the high bit is used as * a wrap counter when using VIRTIO_F_RING_PACKED. */ u16 last_avail_idx; + /* Next avail ring head when VIRTIO_F_IN_ORDER is negoitated */ + u16 next_avail_head; /* Caches available index value from user. */ u16 avail_idx; @@ -141,6 +143,7 @@ struct vhost_virtqueue { struct iovec iotlb_iov[64]; struct iovec *indirect; struct vring_used_elem *heads; + u16 *nheads; /* Protected by virtqueue mutex. */ struct vhost_iotlb *umem; struct vhost_iotlb *iotlb; @@ -235,11 +238,12 @@ bool vhost_vq_is_setup(struct vhost_virtqueue *vq); int vhost_vq_init_access(struct vhost_virtqueue *); int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads, - unsigned count); + u16 *nheads, unsigned count); void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, unsigned int id, int len); void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *, - struct vring_used_elem *heads, unsigned count); + struct vring_used_elem *heads, u16 *nheads, + unsigned count); void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *); bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *); -- cgit v1.2.3 From 45347e79b544928d8ace9eb07c4d8f4fcc525752 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Mon, 14 Jul 2025 16:47:55 +0800 Subject: vhost_net: basic in_order support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces basic in-order support for vhost-net. By recording the number of batched buffers in an array when calling `vhost_add_used_and_signal_n()`, we can reduce the number of userspace accesses. Note that the vhost-net batching logic is kept as we still count the number of buffers there. Testing Results: With testpmd: - TX: txonly mode + vhost_net with XDP_DROP on TAP shows a 17.5% improvement, from 4.75 Mpps to 5.35 Mpps. - RX: No obvious improvements were observed. With virtio-ring in-order experimental code in the guest: - TX: pktgen in the guest + XDP_DROP on TAP shows a 19% improvement, from 5.2 Mpps to 6.2 Mpps. - RX: pktgen on TAP with vhost_net + XDP_DROP in the guest achieves a 6.1% improvement, from 3.47 Mpps to 3.61 Mpps. Acked-by: Jonah Palmer Acked-by: Eugenio Pérez Signed-off-by: Jason Wang Message-Id: <20250714084755.11921-4-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang --- drivers/vhost/net.c | 86 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 4f9c67f17b49..8ac994b3228a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -74,7 +74,8 @@ enum { (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | - (1ULL << VIRTIO_F_RING_RESET) + (1ULL << VIRTIO_F_RING_RESET) | + (1ULL << VIRTIO_F_IN_ORDER) }; enum { @@ -450,7 +451,8 @@ static int vhost_net_enable_vq(struct vhost_net *n, return vhost_poll_start(poll, sock->file); } -static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) +static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq, + unsigned int count) { struct vhost_virtqueue *vq = &nvq->vq; struct vhost_dev *dev = vq->dev; @@ -458,8 +460,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) if (!nvq->done_idx) return; - vhost_add_used_and_signal_n(dev, vq, vq->heads, NULL, - nvq->done_idx); + vhost_add_used_and_signal_n(dev, vq, vq->heads, + vq->nheads, count); nvq->done_idx = 0; } @@ -468,6 +470,8 @@ static void vhost_tx_batch(struct vhost_net *net, struct socket *sock, struct msghdr *msghdr) { + struct vhost_virtqueue *vq = &nvq->vq; + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); struct tun_msg_ctl ctl = { .type = TUN_MSG_PTR, .num = nvq->batched_xdp, @@ -475,6 +479,11 @@ static void vhost_tx_batch(struct vhost_net *net, }; int i, err; + if (in_order) { + vq->heads[0].len = 0; + vq->nheads[0] = nvq->done_idx; + } + if (nvq->batched_xdp == 0) goto signal_used; @@ -496,7 +505,7 @@ static void vhost_tx_batch(struct vhost_net *net, } signal_used: - vhost_net_signal_used(nvq); + vhost_net_signal_used(nvq, in_order ? 1 : nvq->done_idx); nvq->batched_xdp = 0; } @@ -758,6 +767,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); bool busyloop_intr; + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); do { busyloop_intr = false; @@ -794,11 +804,13 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) break; } - /* We can't build XDP buff, go for single - * packet path but let's flush batched - * packets. - */ - vhost_tx_batch(net, nvq, sock, &msg); + if (nvq->batched_xdp) { + /* We can't build XDP buff, go for single + * packet path but let's flush batched + * packets. + */ + vhost_tx_batch(net, nvq, sock, &msg); + } msg.msg_control = NULL; } else { if (tx_can_batch(vq, total_len)) @@ -819,8 +831,12 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) pr_debug("Truncated TX packet: len %d != %zd\n", err, len); done: - vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); - vq->heads[nvq->done_idx].len = 0; + if (in_order) { + vq->heads[0].id = cpu_to_vhost32(vq, head); + } else { + vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); + vq->heads[nvq->done_idx].len = 0; + } ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); @@ -999,7 +1015,7 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk) } static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, - bool *busyloop_intr) + bool *busyloop_intr, unsigned int count) { struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; @@ -1009,7 +1025,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, if (!len && rvq->busyloop_timeout) { /* Flush batched heads first */ - vhost_net_signal_used(rnvq); + vhost_net_signal_used(rnvq, count); /* Both tx vq and rx socket were polled here */ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); @@ -1021,7 +1037,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, /* This is a multi-buffer version of vhost_get_desc, that works if * vq has read descriptors only. - * @vq - the relevant virtqueue + * @nvq - the relevant vhost_net virtqueue * @datalen - data length we'll be reading * @iovcount - returned count of io vectors we fill * @log - vhost log @@ -1029,14 +1045,17 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, * @quota - headcount quota, 1 for big buffer * returns number of buffer heads allocated, negative on error */ -static int get_rx_bufs(struct vhost_virtqueue *vq, +static int get_rx_bufs(struct vhost_net_virtqueue *nvq, struct vring_used_elem *heads, + u16 *nheads, int datalen, unsigned *iovcount, struct vhost_log *log, unsigned *log_num, unsigned int quota) { + struct vhost_virtqueue *vq = &nvq->vq; + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); unsigned int out, in; int seg = 0; int headcount = 0; @@ -1073,14 +1092,16 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, nlogs += *log_num; log += *log_num; } - heads[headcount].id = cpu_to_vhost32(vq, d); len = iov_length(vq->iov + seg, in); - heads[headcount].len = cpu_to_vhost32(vq, len); - datalen -= len; + if (!in_order) { + heads[headcount].id = cpu_to_vhost32(vq, d); + heads[headcount].len = cpu_to_vhost32(vq, len); + } ++headcount; + datalen -= len; seg += in; } - heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); + *iovcount = seg; if (unlikely(log)) *log_num = nlogs; @@ -1090,6 +1111,15 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, r = UIO_MAXIOV + 1; goto err; } + + if (!in_order) + heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); + else { + heads[0].len = cpu_to_vhost32(vq, len + datalen); + heads[0].id = cpu_to_vhost32(vq, d); + nheads[0] = headcount; + } + return headcount; err: vhost_discard_vq_desc(vq, headcount); @@ -1102,6 +1132,8 @@ static void handle_rx(struct vhost_net *net) { struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; struct vhost_virtqueue *vq = &nvq->vq; + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); + unsigned int count = 0; unsigned in, log; struct vhost_log *vq_log; struct msghdr msg = { @@ -1149,12 +1181,13 @@ static void handle_rx(struct vhost_net *net) do { sock_len = vhost_net_rx_peek_head_len(net, sock->sk, - &busyloop_intr); + &busyloop_intr, count); if (!sock_len) break; sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; - headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, + headcount = get_rx_bufs(nvq, vq->heads + count, + vq->nheads + count, vhost_len, &in, vq_log, &log, likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ @@ -1230,8 +1263,11 @@ static void handle_rx(struct vhost_net *net) goto out; } nvq->done_idx += headcount; - if (nvq->done_idx > VHOST_NET_BATCH) - vhost_net_signal_used(nvq); + count += in_order ? 1 : headcount; + if (nvq->done_idx > VHOST_NET_BATCH) { + vhost_net_signal_used(nvq, count); + count = 0; + } if (unlikely(vq_log)) vhost_log_write(vq, vq_log, log, vhost_len, vq->iov, in); @@ -1243,7 +1279,7 @@ static void handle_rx(struct vhost_net *net) else if (!sock_len) vhost_net_enable_vq(net, vq); out: - vhost_net_signal_used(nvq); + vhost_net_signal_used(nvq, count); mutex_unlock(&vq->mutex); } -- cgit v1.2.3 From 10a886aaed293c4db3417951f396827216299e3d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:08 +0100 Subject: vhost/vsock: Avoid allocating arbitrarily-sized SKBs vhost_vsock_alloc_skb() returns NULL for packets advertising a length larger than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE in the packet header. However, this is only checked once the SKB has been allocated and, if the length in the packet header is zero, the SKB may not be freed immediately. Hoist the size check before the SKB allocation so that an iovec larger than VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + the header size is rejected outright. The subsequent check on the length field in the header can then simply check that the allocated SKB is indeed large enough to hold the packet. Cc: Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Reviewed-by: Stefano Garzarella Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-2-will@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vsock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 802153e23073..66a0f060770e 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -344,6 +344,9 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, len = iov_length(vq->iov, out); + if (len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM) + return NULL; + /* len contains both payload and hdr */ skb = virtio_vsock_alloc_skb(len, GFP_KERNEL); if (!skb) @@ -367,8 +370,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, return skb; /* The pkt is too big or the length in the header is invalid */ - if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || - payload_len + sizeof(*hdr) > len) { + if (payload_len + sizeof(*hdr) > len) { kfree_skb(skb); return NULL; } -- cgit v1.2.3 From 0dab92484474587b82e8e0455839eaf5ac7bf894 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:09 +0100 Subject: vsock/virtio: Validate length in packet header before skb_put() When receiving a vsock packet in the guest, only the virtqueue buffer size is validated prior to virtio_vsock_skb_rx_put(). Unfortunately, virtio_vsock_skb_rx_put() uses the length from the packet header as the length argument to skb_put(), potentially resulting in SKB overflow if the host has gone wonky. Validate the length as advertised by the packet header before calling virtio_vsock_skb_rx_put(). Cc: Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-3-will@kernel.org> Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- net/vmw_vsock/virtio_transport.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index f0e48e6911fc..eb08a393413d 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -624,8 +624,9 @@ static void virtio_transport_rx_work(struct work_struct *work) do { virtqueue_disable_cb(vq); for (;;) { + unsigned int len, payload_len; + struct virtio_vsock_hdr *hdr; struct sk_buff *skb; - unsigned int len; if (!virtio_transport_more_replies(vsock)) { /* Stop rx until the device processes already @@ -642,12 +643,19 @@ static void virtio_transport_rx_work(struct work_struct *work) vsock->rx_buf_nr--; /* Drop short/long packets */ - if (unlikely(len < sizeof(struct virtio_vsock_hdr) || + if (unlikely(len < sizeof(*hdr) || len > virtio_vsock_skb_len(skb))) { kfree_skb(skb); continue; } + hdr = virtio_vsock_hdr(skb); + payload_len = le32_to_cpu(hdr->len); + if (unlikely(payload_len > len - sizeof(*hdr))) { + kfree_skb(skb); + continue; + } + virtio_vsock_skb_rx_put(skb); virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&virtio_transport, skb); -- cgit v1.2.3 From 87dbae5e36613a6020f3d64a2eaeac0a1e0e6dc6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:10 +0100 Subject: vsock/virtio: Move length check to callers of virtio_vsock_skb_rx_put() virtio_vsock_skb_rx_put() only calls skb_put() if the length in the packet header is not zero even though skb_put() handles this case gracefully. Remove the functionally redundant check from virtio_vsock_skb_rx_put() and, on the assumption that this is a worthwhile optimisation for handling credit messages, augment the existing length checks in virtio_transport_rx_work() to elide the call for zero-length payloads. Since the callers all have the length, extend virtio_vsock_skb_rx_put() to take it as an additional parameter rather than fish it back out of the packet header. Note that the vhost code already has similar logic in vhost_vsock_alloc_skb(). Reviewed-by: Stefano Garzarella Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-4-will@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vsock.c | 2 +- include/linux/virtio_vsock.h | 9 ++------- net/vmw_vsock/virtio_transport.c | 4 +++- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 66a0f060770e..4c4a642945eb 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -375,7 +375,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, return NULL; } - virtio_vsock_skb_rx_put(skb); + virtio_vsock_skb_rx_put(skb, payload_len); nbytes = copy_from_iter(skb->data, payload_len, &iov_iter); if (nbytes != payload_len) { diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 36fb3edfa403..97465f378ade 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -47,14 +47,9 @@ static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb) VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false; } -static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb) +static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb, u32 len) { - u32 len; - - len = le32_to_cpu(virtio_vsock_hdr(skb)->len); - - if (len > 0) - skb_put(skb, len); + skb_put(skb, len); } static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index eb08a393413d..0166919f8705 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -656,7 +656,9 @@ static void virtio_transport_rx_work(struct work_struct *work) continue; } - virtio_vsock_skb_rx_put(skb); + if (payload_len) + virtio_vsock_skb_rx_put(skb, payload_len); + virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&virtio_transport, skb); } -- cgit v1.2.3 From 03a92f036a04fed2b00d69f5f46f1a486e70dc5c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:11 +0100 Subject: vsock/virtio: Resize receive buffers so that each SKB fits in a 4K page When allocating receive buffers for the vsock virtio RX virtqueue, an SKB is allocated with a 4140 data payload (the 44-byte packet header + VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE). Even when factoring in the SKB overhead, the resulting 8KiB allocation thanks to the rounding in kmalloc_reserve() is wasteful (~3700 unusable bytes) and results in a higher-order page allocation on systems with 4KiB pages just for the sake of a few hundred bytes of packet data. Limit the vsock virtio RX buffers to 4KiB per SKB, resulting in much better memory utilisation and removing the need to allocate higher-order pages entirely. Reviewed-by: Stefano Garzarella Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-5-will@kernel.org> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_vsock.h | 7 ++++++- net/vmw_vsock/virtio_transport.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 97465f378ade..879f1dfa7d3a 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -106,7 +106,12 @@ static inline size_t virtio_vsock_skb_len(struct sk_buff *skb) return (size_t)(skb_end_pointer(skb) - skb->head); } -#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) +/* Dimension the RX SKB so that the entire thing fits exactly into + * a single 4KiB page. This avoids wasting memory due to alloc_skb() + * rounding up to the next page order and also means that we + * don't leave higher-order pages sitting around in the RX queue. + */ +#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE SKB_WITH_OVERHEAD(1024 * 4) #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 0166919f8705..39f346890f7f 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -307,7 +307,7 @@ out_rcu: static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) { - int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM; + int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; struct scatterlist pkt, *p; struct virtqueue *vq; struct sk_buff *skb; -- cgit v1.2.3 From 2304c64a2866c58534560c63dc6e79d09b8f8d8d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:12 +0100 Subject: vsock/virtio: Rename virtio_vsock_alloc_skb() In preparation for nonlinear allocations for large SKBs, rename virtio_vsock_alloc_skb() to virtio_vsock_alloc_linear_skb() to indicate that it returns linear SKBs unconditionally and switch all callers over to this new interface for now. No functional change. Reviewed-by: Stefano Garzarella Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-6-will@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vsock.c | 2 +- include/linux/virtio_vsock.h | 3 ++- net/vmw_vsock/virtio_transport.c | 2 +- net/vmw_vsock/virtio_transport_common.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 4c4a642945eb..1ad96613680e 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -348,7 +348,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, return NULL; /* len contains both payload and hdr */ - skb = virtio_vsock_alloc_skb(len, GFP_KERNEL); + skb = virtio_vsock_alloc_linear_skb(len, GFP_KERNEL); if (!skb) return NULL; diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 879f1dfa7d3a..4504ea29ff82 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -52,7 +52,8 @@ static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb, u32 len) skb_put(skb, len); } -static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask) +static inline struct sk_buff * +virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask) { struct sk_buff *skb; diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 39f346890f7f..80dcf6ac1e72 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -316,7 +316,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) vq = vsock->vqs[VSOCK_VQ_RX]; do { - skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL); + skb = virtio_vsock_alloc_linear_skb(total_len, GFP_KERNEL); if (!skb) break; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 1b5d9896edae..c9eb7f7ac00d 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -261,7 +261,7 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info * if (!zcopy) skb_len += payload_len; - skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); + skb = virtio_vsock_alloc_linear_skb(skb_len, GFP_KERNEL); if (!skb) return NULL; -- cgit v1.2.3 From fac6b82e0f3eaca33c8c67ec401681b21143ae17 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:13 +0100 Subject: vsock/virtio: Move SKB allocation lower-bound check to callers virtio_vsock_alloc_linear_skb() checks that the requested size is at least big enough for the packet header (VIRTIO_VSOCK_SKB_HEADROOM). Of the three callers of virtio_vsock_alloc_linear_skb(), only vhost_vsock_alloc_skb() can potentially pass a packet smaller than the header size and, as it already has a check against the maximum packet size, extend its bounds checking to consider the minimum packet size and remove the check from virtio_vsock_alloc_linear_skb(). Reviewed-by: Stefano Garzarella Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-7-will@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vsock.c | 3 ++- include/linux/virtio_vsock.h | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 1ad96613680e..24b7547b05a6 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -344,7 +344,8 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, len = iov_length(vq->iov, out); - if (len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM) + if (len < VIRTIO_VSOCK_SKB_HEADROOM || + len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM) return NULL; /* len contains both payload and hdr */ diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 4504ea29ff82..36dd0cd55368 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -57,9 +57,6 @@ virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask) { struct sk_buff *skb; - if (size < VIRTIO_VSOCK_SKB_HEADROOM) - return NULL; - skb = alloc_skb(size, mask); if (!skb) return NULL; -- cgit v1.2.3 From ab9aa2f3afc2713c14f6c4c6b90c9a0933b837f1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:14 +0100 Subject: vhost/vsock: Allocate nonlinear SKBs for handling large receive buffers When receiving a packet from a guest, vhost_vsock_handle_tx_kick() calls vhost_vsock_alloc_linear_skb() to allocate and fill an SKB with the receive data. Unfortunately, these are always linear allocations and can therefore result in significant pressure on kmalloc() considering that the maximum packet size (VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM) is a little over 64KiB, resulting in a 128KiB allocation for each packet. Rework the vsock SKB allocation so that, for sizes with page order greater than PAGE_ALLOC_COSTLY_ORDER, a nonlinear SKB is allocated instead with the packet header in the SKB and the receive data in the fragments. Finally, add a debug warning if virtio_vsock_skb_rx_put() is ever called on an SKB with a non-zero length, as this would be destructive for the nonlinear case. Reviewed-by: Stefano Garzarella Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-8-will@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vsock.c | 8 +++----- include/linux/virtio_vsock.h | 32 +++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 24b7547b05a6..0679a706ebc0 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -349,7 +349,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, return NULL; /* len contains both payload and hdr */ - skb = virtio_vsock_alloc_linear_skb(len, GFP_KERNEL); + skb = virtio_vsock_alloc_skb(len, GFP_KERNEL); if (!skb) return NULL; @@ -378,10 +378,8 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, virtio_vsock_skb_rx_put(skb, payload_len); - nbytes = copy_from_iter(skb->data, payload_len, &iov_iter); - if (nbytes != payload_len) { - vq_err(vq, "Expected %zu byte payload, got %zu bytes\n", - payload_len, nbytes); + if (skb_copy_datagram_from_iter(skb, 0, &iov_iter, payload_len)) { + vq_err(vq, "Failed to copy %zu byte payload\n", payload_len); kfree_skb(skb); return NULL; } diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 36dd0cd55368..fa5934ea9c81 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -49,22 +49,48 @@ static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb) static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb, u32 len) { - skb_put(skb, len); + DEBUG_NET_WARN_ON_ONCE(skb->len); + + if (skb_is_nonlinear(skb)) + skb->len = len; + else + skb_put(skb, len); } static inline struct sk_buff * -virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask) +__virtio_vsock_alloc_skb_with_frags(unsigned int header_len, + unsigned int data_len, + gfp_t mask) { struct sk_buff *skb; + int err; - skb = alloc_skb(size, mask); + skb = alloc_skb_with_frags(header_len, data_len, + PAGE_ALLOC_COSTLY_ORDER, &err, mask); if (!skb) return NULL; skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM); + skb->data_len = data_len; return skb; } +static inline struct sk_buff * +virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask) +{ + return __virtio_vsock_alloc_skb_with_frags(size, 0, mask); +} + +static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask) +{ + if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + return virtio_vsock_alloc_linear_skb(size, mask); + + size -= VIRTIO_VSOCK_SKB_HEADROOM; + return __virtio_vsock_alloc_skb_with_frags(VIRTIO_VSOCK_SKB_HEADROOM, + size, mask); +} + static inline void virtio_vsock_skb_queue_head(struct sk_buff_head *list, struct sk_buff *skb) { -- cgit v1.2.3 From 8ca76151d2c8219edea82f1925a2a25907ff6a9d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:15 +0100 Subject: vsock/virtio: Rename virtio_vsock_skb_rx_put() In preparation for using virtio_vsock_skb_rx_put() when populating SKBs on the vsock TX path, rename virtio_vsock_skb_rx_put() to virtio_vsock_skb_put(). No functional change. Reviewed-by: Stefano Garzarella Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-9-will@kernel.org> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vsock.c | 2 +- include/linux/virtio_vsock.h | 2 +- net/vmw_vsock/virtio_transport.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 0679a706ebc0..ae01457ea2cd 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -376,7 +376,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, return NULL; } - virtio_vsock_skb_rx_put(skb, payload_len); + virtio_vsock_skb_put(skb, payload_len); if (skb_copy_datagram_from_iter(skb, 0, &iov_iter, payload_len)) { vq_err(vq, "Failed to copy %zu byte payload\n", payload_len); diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index fa5934ea9c81..0c67543a45c8 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -47,7 +47,7 @@ static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb) VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false; } -static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb, u32 len) +static inline void virtio_vsock_skb_put(struct sk_buff *skb, u32 len) { DEBUG_NET_WARN_ON_ONCE(skb->len); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 80dcf6ac1e72..b6569b0ca2bb 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -657,7 +657,7 @@ static void virtio_transport_rx_work(struct work_struct *work) } if (payload_len) - virtio_vsock_skb_rx_put(skb, payload_len); + virtio_vsock_skb_put(skb, payload_len); virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&virtio_transport, skb); -- cgit v1.2.3 From 6693731487a8145a9b039bc983d77edc47693855 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Jul 2025 10:01:16 +0100 Subject: vsock/virtio: Allocate nonlinear SKBs for handling large transmit buffers When transmitting a vsock packet, virtio_transport_send_pkt_info() calls virtio_transport_alloc_linear_skb() to allocate and fill SKBs with the transmit data. Unfortunately, these are always linear allocations and can therefore result in significant pressure on kmalloc() considering that the maximum packet size (VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM) is a little over 64KiB, resulting in a 128KiB allocation for each packet. Rework the vsock SKB allocation so that, for sizes with page order greater than PAGE_ALLOC_COSTLY_ORDER, a nonlinear SKB is allocated instead with the packet header in the SKB and the transmit data in the fragments. Note that this affects both the vhost and virtio transports. Reviewed-by: Stefano Garzarella Signed-off-by: Will Deacon Message-Id: <20250717090116.11987-10-will@kernel.org> Signed-off-by: Michael S. Tsirkin --- net/vmw_vsock/virtio_transport_common.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index c9eb7f7ac00d..fe92e5fa95b4 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -109,7 +109,8 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, return __zerocopy_sg_from_iter(info->msg, NULL, skb, &info->msg->msg_iter, len, NULL); - return memcpy_from_msg(skb_put(skb, len), info->msg, len); + virtio_vsock_skb_put(skb, len); + return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len); } static void virtio_transport_init_hdr(struct sk_buff *skb, @@ -261,7 +262,7 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info * if (!zcopy) skb_len += payload_len; - skb = virtio_vsock_alloc_linear_skb(skb_len, GFP_KERNEL); + skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); if (!skb) return NULL; -- cgit v1.2.3