diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-15 13:12:15 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-15 13:12:15 -0800 |
commit | 785d21ba2f447fb26df4b22f45653763beb767ea (patch) | |
tree | 6c447d85c3359198921807a1d121499d47c80233 /drivers/vfio/pci/mlx5 | |
parent | 8fa590bf344816c925810331eea8387627bbeb40 (diff) | |
parent | 70be6f322860d322ebcd120cf0c05402ead5c6de (diff) | |
download | lwn-785d21ba2f447fb26df4b22f45653763beb767ea.tar.gz lwn-785d21ba2f447fb26df4b22f45653763beb767ea.zip |
Merge tag 'vfio-v6.2-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Replace deprecated git://github.com link in MAINTAINERS (Palmer
Dabbelt)
- Simplify vfio/mlx5 with module_pci_driver() helper (Shang XiaoJing)
- Drop unnecessary buffer from ACPI call (Rafael Mendonca)
- Correct latent missing include issue in iova-bitmap and fix support
for unaligned bitmaps. Follow-up with better fix through refactor
(Joao Martins)
- Rework ccw mdev driver to split private data from parent structure,
better aligning with the mdev lifecycle and allowing us to remove a
temporary workaround (Eric Farman)
- Add an interface to get an estimated migration data size for a
device, allowing userspace to make informed decisions, ex. more
accurately predicting VM downtime (Yishai Hadas)
- Fix minor typo in vfio/mlx5 array declaration (Yishai Hadas)
- Simplify module and Kconfig through consolidating SPAPR/EEH code and
config options and folding virqfd module into main vfio module (Jason
Gunthorpe)
- Fix error path from device_register() across all vfio mdev and sample
drivers (Alex Williamson)
- Define migration pre-copy interface and implement for vfio/mlx5
devices, allowing portions of the device state to be saved while the
device continues operation, towards reducing the stop-copy state size
(Jason Gunthorpe, Yishai Hadas, Shay Drory)
- Implement pre-copy for hisi_acc devices (Shameer Kolothum)
- Fixes to mdpy mdev driver remove path and error path on probe (Shang
XiaoJing)
- vfio/mlx5 fixes for incorrect return after copy_to_user() fault and
incorrect buffer freeing (Dan Carpenter)
* tag 'vfio-v6.2-rc1' of https://github.com/awilliam/linux-vfio: (42 commits)
vfio/mlx5: error pointer dereference in error handling
vfio/mlx5: fix error code in mlx5vf_precopy_ioctl()
samples: vfio-mdev: Fix missing pci_disable_device() in mdpy_fb_probe()
hisi_acc_vfio_pci: Enable PRE_COPY flag
hisi_acc_vfio_pci: Move the dev compatibility tests for early check
hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions
hisi_acc_vfio_pci: Add support for precopy IOCTL
vfio/mlx5: Enable MIGRATION_PRE_COPY flag
vfio/mlx5: Fallback to STOP_COPY upon specific PRE_COPY error
vfio/mlx5: Introduce multiple loads
vfio/mlx5: Consider temporary end of stream as part of PRE_COPY
vfio/mlx5: Introduce vfio precopy ioctl implementation
vfio/mlx5: Introduce SW headers for migration states
vfio/mlx5: Introduce device transitions of PRE_COPY
vfio/mlx5: Refactor to use queue based data chunks
vfio/mlx5: Refactor migration file state
vfio/mlx5: Refactor MKEY usage
vfio/mlx5: Refactor PD usage
vfio/mlx5: Enforce a single SAVE command at a time
vfio: Extend the device migration protocol with PRE_COPY
...
Diffstat (limited to 'drivers/vfio/pci/mlx5')
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.c | 413 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.h | 96 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/main.c | 784 |
3 files changed, 1069 insertions, 224 deletions
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c604b70437a5..64e68d13cb98 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; + int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while the device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the save + * command once it will try to turn on 'tracking' on a suspended device. + */ + if (migf) { + err = wait_for_completion_interruptible(&migf->save_comp); + if (err) + return err; + } + MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); - return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); + if (migf) + complete(&migf->save_comp); + + return err; } int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) @@ -45,23 +63,54 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size) + size_t *state_size, u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; + bool inc = query_flags & MLX5VF_QUERY_INC; int ret; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; + /* + * In case PRE_COPY is used, saving_migf is exposed while device is + * running. Make sure to run only once there is no active save command. + * Running both in parallel, might end-up with a failure in the + * incremental query command on un-tracked vhca. + */ + if (inc) { + ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp); + if (ret) + return ret; + if (mvdev->saving_migf->state == + MLX5_MIGF_STATE_PRE_COPY_ERROR) { + /* + * In case we had a PRE_COPY error, only query full + * image for final image + */ + if (!(query_flags & MLX5VF_QUERY_FINAL)) { + *state_size = 0; + complete(&mvdev->saving_migf->save_comp); + return 0; + } + query_flags &= ~MLX5VF_QUERY_INC; + } + } + MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); + MLX5_SET(query_vhca_migration_state_in, in, incremental, + query_flags & MLX5VF_QUERY_INC); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); + if (inc) + complete(&mvdev->saving_migf->save_comp); + if (ret) return ret; @@ -173,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) mvdev->core_device.vdev.log_ops = log_ops; + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) && + MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)) + mvdev->core_device.vdev.migration_flags |= + VFIO_MIGRATION_PRE_COPY; + end: mlx5_vf_put_core_dev(mvdev->mdev); } @@ -210,11 +264,11 @@ err_exec: } static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : recv_buf->npages; int err = 0, inlen; __be64 *mtt; @@ -232,10 +286,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - if (migf) { + if (buf) { struct sg_dma_page_iter dma_iter; - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); } else { int i; @@ -255,35 +309,195 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, - migf ? migf->total_length : (npages * PAGE_SIZE)); + MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; } +static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; + int ret; + + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) + return -ENOTCONN; + + if (buf->dmaed || !buf->allocated_length) + return -EINVAL; + + ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + if (ret) + return ret; + + ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + if (ret) + goto err; + + buf->dmaed = true; + + return 0; +err: + dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + return ret; +} + +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + struct mlx5_vf_migration_file *migf = buf->migf; + struct sg_page_iter sg_iter; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (buf->dmaed) { + mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, + buf->dma_dir, 0); + } + + /* Undo alloc_pages_bulk_array() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, + enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return ERR_PTR(-ENOMEM); + + buf->dma_dir = dma_dir; + buf->migf = migf; + if (length) { + ret = mlx5vf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + if (dma_dir != DMA_NONE) { + ret = mlx5vf_dma_data_buffer(buf); + if (ret) + goto end; + } + } + + return buf; +end: + mlx5vf_free_data_buffer(buf); + return ERR_PTR(ret); +} + +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir) +{ + struct mlx5_vhca_data_buffer *buf, *temp_buf; + struct list_head free_list; + + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return ERR_PTR(-ENOTCONN); + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + if (buf->dma_dir == dma_dir) { + list_del_init(&buf->buf_elm); + if (buf->allocated_length >= length) { + spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers. Put in a free + * list and call at the end not under the spin lock + * (&migf->list_lock) to mlx5vf_free_data_buffer which + * might sleep. + */ + list_add(&buf->buf_elm, &free_list); + } + } + spin_unlock_irq(&migf->list_lock); + buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + mlx5vf_free_data_buffer(temp_buf); + } + + return buf; +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, struct mlx5vf_async_data, work); struct mlx5_vf_migration_file *migf = container_of(async_data, struct mlx5_vf_migration_file, async_data); - struct mlx5_core_dev *mdev = migf->mvdev->mdev; mutex_lock(&migf->lock); if (async_data->status) { - migf->is_err = true; + mlx5vf_put_data_buffer(async_data->buf); + if (async_data->header_buf) + mlx5vf_put_data_buffer(async_data->header_buf); + if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; + else + migf->state = MLX5_MIGF_STATE_ERROR; wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - - mlx5_core_destroy_mkey(mdev, async_data->mkey); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_core_dealloc_pd(mdev, async_data->pdn); kvfree(async_data->out); + complete(&migf->save_comp); fput(migf->filp); } +static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, + size_t image_size) +{ + struct mlx5_vf_migration_file *migf = header_buf->migf; + struct mlx5_vf_migration_header header = {}; + unsigned long flags; + struct page *page; + u8 *to_buff; + + header.image_size = cpu_to_le64(image_size); + page = mlx5vf_get_migration_page(header_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + kunmap_local(to_buff); + header_buf->length = sizeof(header); + header_buf->header_image_size = image_size; + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + return 0; +} + static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) { struct mlx5vf_async_data *async_data = container_of(context, @@ -292,67 +506,96 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { - WRITE_ONCE(migf->total_length, - MLX5_GET(save_vhca_state_out, async_data->out, - actual_image_size)); + size_t image_size; + unsigned long flags; + + image_size = MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size); + if (async_data->header_buf) { + status = add_buf_header(async_data->header_buf, image_size); + if (status) + goto err; + } + async_data->buf->length = image_size; + async_data->buf->start_pos = migf->max_pos; + migf->max_pos += async_data->buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + spin_unlock_irqrestore(&migf->list_lock, flags); + migf->state = async_data->last_chunk ? + MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); } +err: /* * The error and the cleanup flows can't run from an * interrupt context */ + if (status == -EREMOTEIO) + status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; queue_work(migf->mvdev->cb_wq, &async_data->work); } int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track) { u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5_vhca_data_buffer *header_buf = NULL; struct mlx5vf_async_data *async_data; - struct mlx5_core_dev *mdev; - u32 pdn, mkey; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); + err = wait_for_completion_interruptible(&migf->save_comp); if (err) return err; - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, - 0); - if (err) - goto err_dma_map; - - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); - if (err) - goto err_create_mkey; + if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) + /* + * In case we had a PRE_COPY error, SAVE is triggered only for + * the final image, read device full image. + */ + inc = false; MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(save_vhca_state_in, in, mkey, mkey); - MLX5_SET(save_vhca_state_in, in, size, migf->total_length); + MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, incremental, inc); + MLX5_SET(save_vhca_state_in, in, set_track, track); async_data = &migf->async_data; + async_data->buf = buf; + async_data->last_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; goto err_out; } - /* no data exists till the callback comes back */ - migf->total_length = 0; + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } + } + + if (async_data->last_chunk) + migf->state = MLX5_MIGF_STATE_SAVE_LAST; + + async_data->header_buf = header_buf; get_file(migf->filp); - async_data->mkey = mkey; - async_data->pdn = pdn; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -363,68 +606,92 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return 0; err_exec: + if (header_buf) + mlx5vf_put_data_buffer(header_buf); fput(migf->filp); +err_free: kvfree(async_data->out); err_out: - mlx5_core_destroy_mkey(mdev, mkey); -err_create_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); -err_dma_map: - mlx5_core_dealloc_pd(mdev, pdn); + complete(&migf->save_comp); return err; } int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf) + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf) { - struct mlx5_core_dev *mdev; - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; - u32 pdn, mkey; + u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {}; int err; lockdep_assert_held(&mvdev->state_mutex); if (mvdev->mdev_detach) return -ENOTCONN; - mutex_lock(&migf->lock); - if (!migf->total_length) { - err = -EINVAL; - goto end; + if (!buf->dmaed) { + err = mlx5vf_dma_data_buffer(buf); + if (err) + return err; } - mdev = mvdev->mdev; - err = mlx5_core_alloc_pd(mdev, &pdn); - if (err) - goto end; - - err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); - if (err) - goto err_reg; - - err = _create_mkey(mdev, pdn, migf, NULL, &mkey); - if (err) - goto err_mkey; - MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); - MLX5_SET(load_vhca_state_in, in, mkey, mkey); - MLX5_SET(load_vhca_state_in, in, size, migf->total_length); + MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey); + MLX5_SET(load_vhca_state_in, in, size, buf->length); + return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out); +} - err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf) +{ + int err; - mlx5_core_destroy_mkey(mdev, mkey); -err_mkey: - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0); -err_reg: - mlx5_core_dealloc_pd(mdev, pdn); -end: - mutex_unlock(&migf->lock); + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return -ENOTCONN; + + err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn); return err; } +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) +{ + lockdep_assert_held(&migf->mvdev->state_mutex); + if (migf->mvdev->mdev_detach) + return; + + mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn); +} + +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) +{ + struct mlx5_vhca_data_buffer *entry; + + lockdep_assert_held(&migf->mvdev->state_mutex); + WARN_ON(migf->mvdev->mdev_detach); + + if (migf->buf) { + mlx5vf_free_data_buffer(migf->buf); + migf->buf = NULL; + } + + if (migf->buf_header) { + mlx5vf_free_data_buffer(migf->buf_header); + migf->buf_header = NULL; + } + + list_splice(&migf->avail_list, &migf->buf_list); + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct mlx5_vhca_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + mlx5vf_free_data_buffer(entry); + } + + mlx5vf_cmd_dealloc_pd(migf); +} + static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, u32 req_nodes) { diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 921d5720a1e5..5483171d57ad 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -12,31 +12,74 @@ #include <linux/mlx5/cq.h> #include <linux/mlx5/qp.h> +#define MLX5VF_PRE_COPY_SUPP(mvdev) \ + ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY) + +enum mlx5_vf_migf_state { + MLX5_MIGF_STATE_ERROR = 1, + MLX5_MIGF_STATE_PRE_COPY_ERROR, + MLX5_MIGF_STATE_PRE_COPY, + MLX5_MIGF_STATE_SAVE_LAST, + MLX5_MIGF_STATE_COMPLETE, +}; + +enum mlx5_vf_load_state { + MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, + MLX5_VF_LOAD_STATE_READ_HEADER, + MLX5_VF_LOAD_STATE_PREP_IMAGE, + MLX5_VF_LOAD_STATE_READ_IMAGE, + MLX5_VF_LOAD_STATE_LOAD_IMAGE, +}; + +struct mlx5_vf_migration_header { + __le64 image_size; + /* For future use in case we may need to change the kernel protocol */ + __le64 flags; +}; + +struct mlx5_vhca_data_buffer { + struct sg_append_table table; + loff_t start_pos; + u64 length; + u64 allocated_length; + u64 header_image_size; + u32 mkey; + enum dma_data_direction dma_dir; + u8 dmaed:1; + struct list_head buf_elm; + struct mlx5_vf_migration_file *migf; + /* Optimize mlx5vf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned int sg_last_entry; + unsigned long last_offset; +}; + struct mlx5vf_async_data { struct mlx5_async_work cb_work; struct work_struct work; + struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *header_buf; int status; - u32 pdn; - u32 mkey; + u8 last_chunk:1; void *out; }; struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - u8 disabled:1; - u8 is_err:1; + enum mlx5_vf_migf_state state; - struct sg_append_table table; - size_t total_length; - size_t allocated_length; - - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; + enum mlx5_vf_load_state load_state; + u32 pdn; + loff_t max_pos; + struct mlx5_vhca_data_buffer *buf; + struct mlx5_vhca_data_buffer *buf_header; + spinlock_t list_lock; + struct list_head buf_list; + struct list_head avail_list; struct mlx5vf_pci_core_device *mvdev; wait_queue_head_t poll_wait; + struct completion save_comp; struct mlx5_async_ctx async_ctx; struct mlx5vf_async_data async_data; }; @@ -113,19 +156,42 @@ struct mlx5vf_pci_core_device { struct mlx5_core_dev *mdev; }; +enum { + MLX5VF_QUERY_INC = (1UL << 0), + MLX5VF_QUERY_FINAL = (1UL << 1), +}; + int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size); + size_t *state_size, u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf, bool inc, + bool track); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, - struct mlx5_vf_migration_file *migf); + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *buf); +int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); +void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); +struct mlx5_vhca_data_buffer * +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); +struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, + size_t length, enum dma_data_direction dma_dir); +void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages); +struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 32d1f38d351e..9feb89c6d939 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -32,8 +32,8 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -static struct page * -mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, +struct page * +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, unsigned long offset) { unsigned long cur_offset = 0; @@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned int i; /* All accesses are sequential */ - if (offset < migf->last_offset || !migf->last_offset_sg) { - migf->last_offset = 0; - migf->last_offset_sg = migf->table.sgt.sgl; - migf->sg_last_entry = 0; + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; } - cur_offset = migf->last_offset; + cur_offset = buf->last_offset; - for_each_sg(migf->last_offset_sg, sg, - migf->table.sgt.orig_nents - migf->sg_last_entry, i) { + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { if (offset < sg->length + cur_offset) { - migf->last_offset_sg = sg; - migf->sg_last_entry += i; - migf->last_offset = cur_offset; + buf->last_offset_sg = sg; + buf->sg_last_entry += i; + buf->last_offset = cur_offset; return nth_page(sg_page(sg), (offset - cur_offset) / PAGE_SIZE); } @@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, return NULL; } -static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, - unsigned int npages) +int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, + unsigned int npages) { unsigned int to_alloc = npages; struct page **page_list; @@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf, } to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &migf->table, page_list, filled, 0, + &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL); if (ret) goto err; - migf->allocated_length += filled * PAGE_SIZE; + buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -108,16 +108,8 @@ err: static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { - struct sg_page_iter sg_iter; - mutex_lock(&migf->lock); - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&migf->table); - migf->disabled = true; - migf->total_length = 0; - migf->allocated_length = 0; + migf->state = MLX5_MIGF_STATE_ERROR; migf->filp->f_pos = 0; mutex_unlock(&migf->lock); } @@ -132,10 +124,91 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp) return 0; } +static struct mlx5_vhca_data_buffer * +mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos, + bool *end_of_data) +{ + struct mlx5_vhca_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream based FD we may expect having the data always + * on first chunk + */ + migf->state = MLX5_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = mlx5vf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; ssize_t done = 0; if (pos) @@ -144,52 +217,56 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, if (!(filp->f_flags & O_NONBLOCK)) { if (wait_event_interruptible(migf->poll_wait, - READ_ONCE(migf->total_length) || migf->is_err)) + !list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_ERROR || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR || + migf->state == MLX5_MIGF_STATE_PRE_COPY || + migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } mutex_lock(&migf->lock); - if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { - done = -EAGAIN; - goto out_unlock; - } - if (*pos > migf->total_length) { - done = -EINVAL; - goto out_unlock; - } - if (migf->disabled || migf->is_err) { + if (migf->state == MLX5_MIGF_STATE_ERROR) { done = -ENODEV; goto out_unlock; } - len = min_t(size_t, migf->total_length - *pos, len); while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *from_buff; - int ret; + ssize_t count; + + vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos, + &end_of_data); + if (first_loop_call) { + first_loop_call = false; + /* Temporary end of file as part of PRE_COPY */ + if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY || + migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) { + done = -ENOMSG; + goto out_unlock; + } + + if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) { + if (filp->f_flags & O_NONBLOCK) { + done = -EAGAIN; + goto out_unlock; + } + } + } + + if (end_of_data) + goto out_unlock; - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; + if (!vhca_buf) { + done = -EINVAL; goto out_unlock; } - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - from_buff = kmap_local_page(page); - ret = copy_to_user(buf, from_buff + page_offset, page_len); - kunmap_local(from_buff); - if (ret) { - done = -EFAULT; + count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; goto out_unlock; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; + done += count; } out_unlock: @@ -206,27 +283,188 @@ static __poll_t mlx5vf_save_poll(struct file *filp, poll_wait(filp, &migf->poll_wait, wait); mutex_lock(&migf->lock); - if (migf->disabled || migf->is_err) + if (migf->state == MLX5_MIGF_STATE_ERROR) pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - else if (READ_ONCE(migf->total_length)) + else if (!list_empty(&migf->buf_list) || + migf->state == MLX5_MIGF_STATE_COMPLETE) pollflags = EPOLLIN | EPOLLRDNORM; mutex_unlock(&migf->lock); return pollflags; } +/* + * FD is exposed and user can use it after receiving an error. + * Mark migf in error, and wake the user. + */ +static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) +{ + migf->state = MLX5_MIGF_STATE_ERROR; + wake_up_interruptible(&migf->poll_wait); +} + +static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + struct vfio_precopy_info info = {}; + loff_t *pos = &filp->f_pos; + unsigned long minsz; + size_t inc_length = 0; + bool end_of_data; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&mvdev->state_mutex); + if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto err_state_unlock; + } + + /* + * We can't issue a SAVE command when the device is suspended, so as + * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra + * bytes that can't be read. + */ + if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) { + /* + * Once the query returns it's guaranteed that there is no + * active SAVE command. + * As so, the other code below is safe with the proper locks. + */ + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, + MLX5VF_QUERY_INC); + if (ret) + goto err_state_unlock; + } + + mutex_lock(&migf->lock); + if (migf->state == MLX5_MIGF_STATE_ERROR) { + ret = -ENODEV; + goto err_migf_unlock; + } + + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (buf) { + if (buf->start_pos == 0) { + info.initial_bytes = buf->header_image_size - *pos; + } else if (buf->start_pos == + sizeof(struct mlx5_vf_migration_header)) { + /* First data buffer following the header */ + info.initial_bytes = buf->start_pos + + buf->length - *pos; + } else { + info.dirty_bytes = buf->start_pos + buf->length - *pos; + } + } else { + if (!end_of_data) { + ret = -EINVAL; + goto err_migf_unlock; + } + + info.dirty_bytes = inc_length; + } + + if (!end_of_data || !inc_length) { + mutex_unlock(&migf->lock); + goto done; + } + + mutex_unlock(&migf->lock); + /* + * We finished transferring the current state and the device has a + * dirty state, save a new state to be ready for. + */ + buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + mlx5vf_mark_err(migf); + goto err_state_unlock; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); + if (ret) { + mlx5vf_mark_err(migf); + mlx5vf_put_data_buffer(buf); + goto err_state_unlock; + } + +done: + mlx5vf_state_mutex_unlock(mvdev); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + return 0; + +err_migf_unlock: + mutex_unlock(&migf->lock); +err_state_unlock: + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, .poll = mlx5vf_save_poll, + .unlocked_ioctl = mlx5vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, .release = mlx5vf_release_file, .llseek = no_llseek, }; +static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vf_migration_file *migf = mvdev->saving_migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; + int ret; + + if (migf->state == MLX5_MIGF_STATE_ERROR) + return -ENODEV; + + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); + if (ret) + goto err; + + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); + if (ret) + goto err_save; + + return 0; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); + return ret; +} + static struct mlx5_vf_migration_file * -mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) +mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + size_t length; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); @@ -236,43 +474,211 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf, O_RDONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); - - kfree(migf); - return ERR_PTR(err); + ret = PTR_ERR(migf->filp); + goto end; } + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); + init_completion(&migf->save_comp); + /* + * save_comp is being used as a binary semaphore built from + * a completion. A normal mutex cannot be used because the lock is + * passed between kernel threads and lockdep can't model this. + */ + complete(&migf->save_comp); mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &migf->total_length); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); if (ret) - goto out_free; + goto out_pd; - ret = mlx5vf_add_migration_pages( - migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE)); - if (ret) - goto out_free; + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } - migf->mvdev = mvdev; - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); if (ret) - goto out_free; + goto out_save; return migf; +out_save: + mlx5vf_free_data_buffer(buf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); out_free: fput(migf->filp); +end: + kfree(migf); return ERR_PTR(ret); } +static int +mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + unsigned long offset; + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + + page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + page_offset, *buf, page_len); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + *pos += page_len; + *done += page_len; + *buf += page_len; + *len -= page_len; + vhca_buf->length += page_len; + return 0; +} + +static int +mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, + loff_t requested_length, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + int ret; + + if (requested_length > MAX_MIGRATION_SIZE) + return -ENOMEM; + + if (vhca_buf->allocated_length < requested_length) { + ret = mlx5vf_add_migration_pages( + vhca_buf, + DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, + PAGE_SIZE)); + if (ret) + return ret; + } + + while (*len) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos, + done); + if (ret) + return ret; + } + + return 0; +} + +static ssize_t +mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + size_t image_size, const char __user **buf, + size_t *len, loff_t *pos, ssize_t *done, + bool *has_work) +{ + size_t copy_len, to_copy; + int ret; + + to_copy = min_t(size_t, *len, image_size - vhca_buf->length); + copy_len = to_copy; + while (to_copy) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, + done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == image_size) { + migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE; + migf->max_pos += image_size; + *has_work = true; + } + + return 0; +} + +static int +mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, + size_t *len, loff_t *pos, + ssize_t *done, bool *has_work) +{ + struct page *page; + size_t copy_len; + u8 *to_buff; + int ret; + + copy_len = min_t(size_t, *len, + sizeof(struct mlx5_vf_migration_header) - vhca_buf->length); + page = mlx5vf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); + if (ret) { + ret = -EFAULT; + goto end; + } + + *buf += copy_len; + *pos += copy_len; + *done += copy_len; + *len -= copy_len; + vhca_buf->length += copy_len; + if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { + u64 flags; + + vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); + if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { + ret = -ENOMEM; + goto end; + } + + flags = le64_to_cpup((__le64 *)(to_buff + + offsetof(struct mlx5_vf_migration_header, flags))); + if (flags) { + ret = -EOPNOTSUPP; + goto end; + } + + migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; + migf->max_pos += vhca_buf->length; + *has_work = true; + } +end: + kunmap_local(to_buff); + return ret; +} + static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; loff_t requested_length; + bool has_work = false; ssize_t done = 0; + int ret = 0; if (pos) return -ESPIPE; @@ -282,56 +688,83 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, check_add_overflow((loff_t)len, *pos, &requested_length)) return -EINVAL; - if (requested_length > MAX_MIGRATION_SIZE) - return -ENOMEM; - + mutex_lock(&migf->mvdev->state_mutex); mutex_lock(&migf->lock); - if (migf->disabled) { - done = -ENODEV; + if (migf->state == MLX5_MIGF_STATE_ERROR) { + ret = -ENODEV; goto out_unlock; } - if (migf->allocated_length < requested_length) { - done = mlx5vf_add_migration_pages( - migf, - DIV_ROUND_UP(requested_length - migf->allocated_length, - PAGE_SIZE)); - if (done) - goto out_unlock; - } - - while (len) { - size_t page_offset; - struct page *page; - size_t page_len; - u8 *to_buff; - int ret; - - page_offset = (*pos) % PAGE_SIZE; - page = mlx5vf_get_migration_page(migf, *pos - page_offset); - if (!page) { - if (done == 0) - done = -EINVAL; - goto out_unlock; + while (len || has_work) { + has_work = false; + switch (migf->load_state) { + case MLX5_VF_LOAD_STATE_READ_HEADER: + ret = mlx5vf_resume_read_header(migf, vhca_buf_header, + &buf, &len, pos, + &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_PREP_IMAGE: + { + u64 size = vhca_buf_header->header_image_size; + + if (vhca_buf->allocated_length < size) { + mlx5vf_free_data_buffer(vhca_buf); + + migf->buf = mlx5vf_alloc_data_buffer(migf, + size, DMA_TO_DEVICE); + if (IS_ERR(migf->buf)) { + ret = PTR_ERR(migf->buf); + migf->buf = NULL; + goto out_unlock; + } + + vhca_buf = migf->buf; + } + + vhca_buf->start_pos = migf->max_pos; + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE; + break; } + case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER: + ret = mlx5vf_resume_read_image_no_header(vhca_buf, + requested_length, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_READ_IMAGE: + ret = mlx5vf_resume_read_image(migf, vhca_buf, + vhca_buf_header->header_image_size, + &buf, &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case MLX5_VF_LOAD_STATE_LOAD_IMAGE: + ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf); + if (ret) + goto out_unlock; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + + /* prep header buf for next image */ + vhca_buf_header->length = 0; + vhca_buf_header->header_image_size = 0; + /* prep data buf for next image */ + vhca_buf->length = 0; - page_len = min_t(size_t, len, PAGE_SIZE - page_offset); - to_buff = kmap_local_page(page); - ret = copy_from_user(to_buff + page_offset, buf, page_len); - kunmap_local(to_buff); - if (ret) { - done = -EFAULT; - goto out_unlock; + break; + default: + break; } - *pos += page_len; - len -= page_len; - done += page_len; - buf += page_len; - migf->total_length += page_len; } + out_unlock: + if (ret) + migf->state = MLX5_MIGF_STATE_ERROR; mutex_unlock(&migf->lock); - return done; + mlx5vf_state_mutex_unlock(migf->mvdev); + return ret ? ret : done; } static const struct file_operations mlx5vf_resume_fops = { @@ -345,6 +778,8 @@ static struct mlx5_vf_migration_file * mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) { struct mlx5_vf_migration_file *migf; + struct mlx5_vhca_data_buffer *buf; + int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL); if (!migf) @@ -353,20 +788,59 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf, O_WRONLY); if (IS_ERR(migf->filp)) { - int err = PTR_ERR(migf->filp); + ret = PTR_ERR(migf->filp); + goto end; + } - kfree(migf); - return ERR_PTR(err); + migf->mvdev = mvdev; + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out_free; + + buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_pd; + } + + migf->buf = buf; + if (MLX5VF_PRE_COPY_SUPP(mvdev)) { + buf = mlx5vf_alloc_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_buf; + } + + migf->buf_header = buf; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + } else { + /* Initial state will be to read the image */ + migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER; } + stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); return migf; +out_buf: + mlx5vf_free_data_buffer(migf->buf); +out_pd: + mlx5vf_cmd_dealloc_pd(migf); +out_free: + fput(migf->filp); +end: + kfree(migf); + return ERR_PTR(ret); } void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf); fput(mvdev->resuming_migf->filp); mvdev->resuming_migf = NULL; } @@ -374,6 +848,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; } @@ -402,7 +877,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); if (ret) @@ -410,7 +886,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return NULL; } - if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); if (ret) @@ -421,7 +898,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { struct mlx5_vf_migration_file *migf; - migf = mlx5vf_pci_save_device_data(mvdev); + migf = mlx5vf_pci_save_device_data(mvdev, false); if (IS_ERR(migf)) return ERR_CAST(migf); get_file(migf->filp); @@ -429,7 +906,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, return migf->filp; } - if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) { + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) { mlx5vf_disable_fds(mvdev); return NULL; } @@ -446,14 +926,39 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { - ret = mlx5vf_cmd_load_vhca_state(mvdev, - mvdev->resuming_migf); - if (ret) - return ERR_PTR(ret); + if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { + ret = mlx5vf_cmd_load_vhca_state(mvdev, + mvdev->resuming_migf, + mvdev->resuming_migf->buf); + if (ret) + return ERR_PTR(ret); + } mlx5vf_disable_fds(mvdev); return NULL; } + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct mlx5_vf_migration_file *migf; + + migf = mlx5vf_pci_save_device_data(mvdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + mvdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + ret = mlx5vf_cmd_suspend_vhca(mvdev, + MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); + if (ret) + return ERR_PTR(ret); + ret = mlx5vf_pci_save_device_inc_data(mvdev); + return ret ? ERR_PTR(ret) : NULL; + } + /* * vfio_mig_get_next_state() does not use arcs other than the above */ @@ -512,6 +1017,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev, return res; } +static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + size_t state_size; + int ret; + + mutex_lock(&mvdev->state_mutex); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &state_size, 0); + if (!ret) + *stop_copy_length = state_size; + mlx5vf_state_mutex_unlock(mvdev); + return ret; +} + static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { @@ -577,6 +1099,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_set_state = mlx5vf_pci_set_device_state, .migration_get_state = mlx5vf_pci_get_device_state, + .migration_get_data_size = mlx5vf_pci_get_data_size, }; static const struct vfio_log_ops mlx5vf_pci_log_ops = { @@ -679,18 +1202,7 @@ static struct pci_driver mlx5vf_pci_driver = { .driver_managed_dma = true, }; -static void __exit mlx5vf_pci_cleanup(void) -{ - pci_unregister_driver(&mlx5vf_pci_driver); -} - -static int __init mlx5vf_pci_init(void) -{ - return pci_register_driver(&mlx5vf_pci_driver); -} - -module_init(mlx5vf_pci_init); -module_exit(mlx5vf_pci_cleanup); +module_pci_driver(mlx5vf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); |