author    Linus Torvalds <torvalds@linux-foundation.org>  2022-12-15 13:12:15 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2022-12-15 13:12:15 -0800
commit    785d21ba2f447fb26df4b22f45653763beb767ea
tree      6c447d85c3359198921807a1d121499d47c80233
parent    8fa590bf344816c925810331eea8387627bbeb40
parent    70be6f322860d322ebcd120cf0c05402ead5c6de
Merge tag 'vfio-v6.2-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:

 - Replace deprecated git://github.com link in MAINTAINERS (Palmer Dabbelt)

 - Simplify vfio/mlx5 with module_pci_driver() helper (Shang XiaoJing)

 - Drop unnecessary buffer from ACPI call (Rafael Mendonca)

 - Correct latent missing include issue in iova-bitmap and fix support
   for unaligned bitmaps. Follow-up with better fix through refactor
   (Joao Martins)

 - Rework ccw mdev driver to split private data from parent structure,
   better aligning with the mdev lifecycle and allowing us to remove a
   temporary workaround (Eric Farman)

 - Add an interface to get an estimated migration data size for a
   device, allowing userspace to make informed decisions, e.g. more
   accurately predicting VM downtime (Yishai Hadas)

 - Fix minor typo in vfio/mlx5 array declaration (Yishai Hadas)

 - Simplify module and Kconfig through consolidating SPAPR/EEH code and
   config options and folding virqfd module into main vfio module
   (Jason Gunthorpe)

 - Fix error path from device_register() across all vfio mdev and
   sample drivers (Alex Williamson)

 - Define migration pre-copy interface and implement for vfio/mlx5
   devices, allowing portions of the device state to be saved while the
   device continues operation, towards reducing the stop-copy state
   size (Jason Gunthorpe, Yishai Hadas, Shay Drory)

 - Implement pre-copy for hisi_acc devices (Shameer Kolothum)

 - Fixes to mdpy mdev driver remove path and error path on probe
   (Shang XiaoJing)

 - vfio/mlx5 fixes for incorrect return after copy_to_user() fault and
   incorrect buffer freeing (Dan Carpenter)

* tag 'vfio-v6.2-rc1' of https://github.com/awilliam/linux-vfio: (42 commits)
  vfio/mlx5: error pointer dereference in error handling
  vfio/mlx5: fix error code in mlx5vf_precopy_ioctl()
  samples: vfio-mdev: Fix missing pci_disable_device() in mdpy_fb_probe()
  hisi_acc_vfio_pci: Enable PRE_COPY flag
  hisi_acc_vfio_pci: Move the dev compatibility tests for early check
  hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions
  hisi_acc_vfio_pci: Add support for precopy IOCTL
  vfio/mlx5: Enable MIGRATION_PRE_COPY flag
  vfio/mlx5: Fallback to STOP_COPY upon specific PRE_COPY error
  vfio/mlx5: Introduce multiple loads
  vfio/mlx5: Consider temporary end of stream as part of PRE_COPY
  vfio/mlx5: Introduce vfio precopy ioctl implementation
  vfio/mlx5: Introduce SW headers for migration states
  vfio/mlx5: Introduce device transitions of PRE_COPY
  vfio/mlx5: Refactor to use queue based data chunks
  vfio/mlx5: Refactor migration file state
  vfio/mlx5: Refactor MKEY usage
  vfio/mlx5: Refactor PD usage
  vfio/mlx5: Enforce a single SAVE command at a time
  vfio: Extend the device migration protocol with PRE_COPY
  ...
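For orientation, below is a minimal userspace sketch of how the new PRE_COPY flow is meant to be driven. The function name precopy_drain and the data-forwarding step are hypothetical; VFIO_MIG_GET_PRECOPY_INFO, struct vfio_precopy_info, and the -ENOMSG "temporary end of stream" convention come from this series.

#include <errno.h>
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Drain pre-copy data from a saving migration FD while the device runs.
 * Assumes the device was already moved to PRE_COPY; error paths elided. */
static int precopy_drain(int migf_fd, char *buf, size_t buflen)
{
	struct vfio_precopy_info info = { .argsz = sizeof(info) };

	/* How much initial/dirty data is currently available? */
	if (ioctl(migf_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
		return -1;

	while (info.initial_bytes || info.dirty_bytes) {
		ssize_t n = read(migf_fd, buf, buflen);

		if (n < 0 && errno == ENOMSG)
			break;		/* temporary end of stream in PRE_COPY */
		if (n < 0)
			return -1;
		/* ... forward n bytes to the migration target ... */
		if (ioctl(migf_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
			return -1;
	}
	return 0;
}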
Diffstat (limited to 'drivers/vfio/pci/mlx5')
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.c  | 413
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.h  |  96
-rw-r--r--  drivers/vfio/pci/mlx5/main.c | 784
3 files changed, 1069 insertions(+), 224 deletions(-)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index c604b70437a5..64e68d13cb98 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
+ int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
+ /*
+ * In case PRE_COPY is used, saving_migf is exposed while the device is
+ * running. Make sure to run only when there is no active save command.
+ * Running both in parallel might end up with a failure in the save
+ * command once it tries to turn on 'tracking' on a suspended device.
+ */
+ if (migf) {
+ err = wait_for_completion_interruptible(&migf->save_comp);
+ if (err)
+ return err;
+ }
+
MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
- return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+ err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+ if (migf)
+ complete(&migf->save_comp);
+
+ return err;
}
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
@@ -45,23 +63,54 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
}
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size)
+ size_t *state_size, u8 query_flags)
{
u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
+ bool inc = query_flags & MLX5VF_QUERY_INC;
int ret;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
+ /*
+ * In case PRE_COPY is used, saving_migf is exposed while the device is
+ * running. Make sure to run only when there is no active save command.
+ * Running both in parallel might end up with a failure in the
+ * incremental query command on an untracked vhca.
+ */
+ if (inc) {
+ ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
+ if (ret)
+ return ret;
+ if (mvdev->saving_migf->state ==
+ MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+ /*
+ * In case we had a PRE_COPY error, query the full
+ * image only for the final image
+ */
+ if (!(query_flags & MLX5VF_QUERY_FINAL)) {
+ *state_size = 0;
+ complete(&mvdev->saving_migf->save_comp);
+ return 0;
+ }
+ query_flags &= ~MLX5VF_QUERY_INC;
+ }
+ }
+
MLX5_SET(query_vhca_migration_state_in, in, opcode,
MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
+ MLX5_SET(query_vhca_migration_state_in, in, incremental,
+ query_flags & MLX5VF_QUERY_INC);
ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
out);
+ if (inc)
+ complete(&mvdev->saving_migf->save_comp);
+
if (ret)
return ret;
@@ -173,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
mvdev->core_device.vdev.log_ops = log_ops;
+ if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+ MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
+ mvdev->core_device.vdev.migration_flags |=
+ VFIO_MIGRATION_PRE_COPY;
+
end:
mlx5_vf_put_core_dev(mvdev->mdev);
}
@@ -210,11 +264,11 @@ err_exec:
}
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
- struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf,
struct mlx5_vhca_recv_buf *recv_buf,
u32 *mkey)
{
- size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) :
+ size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
recv_buf->npages;
int err = 0, inlen;
__be64 *mtt;
@@ -232,10 +286,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
DIV_ROUND_UP(npages, 2));
mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
- if (migf) {
+ if (buf) {
struct sg_dma_page_iter dma_iter;
- for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0)
+ for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
} else {
int i;
@@ -255,35 +309,195 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
MLX5_SET(mkc, mkc, qpn, 0xffffff);
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
- MLX5_SET64(mkc, mkc, len,
- migf ? migf->total_length : (npages * PAGE_SIZE));
+ MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
kvfree(in);
return err;
}
+static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+ struct mlx5_core_dev *mdev = mvdev->mdev;
+ int ret;
+
+ lockdep_assert_held(&mvdev->state_mutex);
+ if (mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ if (buf->dmaed || !buf->allocated_length)
+ return -EINVAL;
+
+ ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+ if (ret)
+ return ret;
+
+ ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
+ if (ret)
+ goto err;
+
+ buf->dmaed = true;
+
+ return 0;
+err:
+ dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+ return ret;
+}
+
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ struct mlx5_vf_migration_file *migf = buf->migf;
+ struct sg_page_iter sg_iter;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ WARN_ON(migf->mvdev->mdev_detach);
+
+ if (buf->dmaed) {
+ mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+ dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
+ buf->dma_dir, 0);
+ }
+
+ /* Undo alloc_pages_bulk_array() */
+ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
+ __free_page(sg_page_iter_page(&sg_iter));
+ sg_free_append_table(&buf->table);
+ kfree(buf);
+}
+
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length,
+ enum dma_data_direction dma_dir)
+{
+ struct mlx5_vhca_data_buffer *buf;
+ int ret;
+
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ buf->dma_dir = dma_dir;
+ buf->migf = migf;
+ if (length) {
+ ret = mlx5vf_add_migration_pages(buf,
+ DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (ret)
+ goto end;
+
+ if (dma_dir != DMA_NONE) {
+ ret = mlx5vf_dma_data_buffer(buf);
+ if (ret)
+ goto end;
+ }
+ }
+
+ return buf;
+end:
+ mlx5vf_free_data_buffer(buf);
+ return ERR_PTR(ret);
+}
+
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+ spin_lock_irq(&buf->migf->list_lock);
+ list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
+ spin_unlock_irq(&buf->migf->list_lock);
+}
+
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir)
+{
+ struct mlx5_vhca_data_buffer *buf, *temp_buf;
+ struct list_head free_list;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return ERR_PTR(-ENOTCONN);
+
+ INIT_LIST_HEAD(&free_list);
+
+ spin_lock_irq(&migf->list_lock);
+ list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+ if (buf->dma_dir == dma_dir) {
+ list_del_init(&buf->buf_elm);
+ if (buf->allocated_length >= length) {
+ spin_unlock_irq(&migf->list_lock);
+ goto found;
+ }
+ /*
+ * Prevent holding redundant buffers. Put them on a free
+ * list and, at the end, outside the spin lock
+ * (&migf->list_lock), call mlx5vf_free_data_buffer(),
+ * which might sleep.
+ */
+ list_add(&buf->buf_elm, &free_list);
+ }
+ }
+ spin_unlock_irq(&migf->list_lock);
+ buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+
+found:
+ while ((temp_buf = list_first_entry_or_null(&free_list,
+ struct mlx5_vhca_data_buffer, buf_elm))) {
+ list_del(&temp_buf->buf_elm);
+ mlx5vf_free_data_buffer(temp_buf);
+ }
+
+ return buf;
+}
+
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
struct mlx5vf_async_data *async_data = container_of(_work,
struct mlx5vf_async_data, work);
struct mlx5_vf_migration_file *migf = container_of(async_data,
struct mlx5_vf_migration_file, async_data);
- struct mlx5_core_dev *mdev = migf->mvdev->mdev;
mutex_lock(&migf->lock);
if (async_data->status) {
- migf->is_err = true;
+ mlx5vf_put_data_buffer(async_data->buf);
+ if (async_data->header_buf)
+ mlx5vf_put_data_buffer(async_data->header_buf);
+ if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
+ migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
+ else
+ migf->state = MLX5_MIGF_STATE_ERROR;
wake_up_interruptible(&migf->poll_wait);
}
mutex_unlock(&migf->lock);
-
- mlx5_core_destroy_mkey(mdev, async_data->mkey);
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
- mlx5_core_dealloc_pd(mdev, async_data->pdn);
kvfree(async_data->out);
+ complete(&migf->save_comp);
fput(migf->filp);
}
+static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
+ size_t image_size)
+{
+ struct mlx5_vf_migration_file *migf = header_buf->migf;
+ struct mlx5_vf_migration_header header = {};
+ unsigned long flags;
+ struct page *page;
+ u8 *to_buff;
+
+ header.image_size = cpu_to_le64(image_size);
+ page = mlx5vf_get_migration_page(header_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &header, sizeof(header));
+ kunmap_local(to_buff);
+ header_buf->length = sizeof(header);
+ header_buf->header_image_size = image_size;
+ header_buf->start_pos = header_buf->migf->max_pos;
+ migf->max_pos += header_buf->length;
+ spin_lock_irqsave(&migf->list_lock, flags);
+ list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+ spin_unlock_irqrestore(&migf->list_lock, flags);
+ return 0;
+}
+
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
struct mlx5vf_async_data *async_data = container_of(context,
@@ -292,67 +506,96 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
struct mlx5_vf_migration_file, async_data);
if (!status) {
- WRITE_ONCE(migf->total_length,
- MLX5_GET(save_vhca_state_out, async_data->out,
- actual_image_size));
+ size_t image_size;
+ unsigned long flags;
+
+ image_size = MLX5_GET(save_vhca_state_out, async_data->out,
+ actual_image_size);
+ if (async_data->header_buf) {
+ status = add_buf_header(async_data->header_buf, image_size);
+ if (status)
+ goto err;
+ }
+ async_data->buf->length = image_size;
+ async_data->buf->start_pos = migf->max_pos;
+ migf->max_pos += async_data->buf->length;
+ spin_lock_irqsave(&migf->list_lock, flags);
+ list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
+ spin_unlock_irqrestore(&migf->list_lock, flags);
+ migf->state = async_data->last_chunk ?
+ MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
wake_up_interruptible(&migf->poll_wait);
}
+err:
/*
* The error and the cleanup flows can't run from an
* interrupt context
*/
+ if (status == -EREMOTEIO)
+ status = MLX5_GET(save_vhca_state_out, async_data->out, status);
async_data->status = status;
queue_work(migf->mvdev->cb_wq, &async_data->work);
}
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf)
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf, bool inc,
+ bool track)
{
u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
+ struct mlx5_vhca_data_buffer *header_buf = NULL;
struct mlx5vf_async_data *async_data;
- struct mlx5_core_dev *mdev;
- u32 pdn, mkey;
int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
- mdev = mvdev->mdev;
- err = mlx5_core_alloc_pd(mdev, &pdn);
+ err = wait_for_completion_interruptible(&migf->save_comp);
if (err)
return err;
- err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE,
- 0);
- if (err)
- goto err_dma_map;
-
- err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
- if (err)
- goto err_create_mkey;
+ if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
+ /*
+ * In case we had a PRE_COPY error, SAVE is triggered only for
+ * the final image; read the device's full image.
+ */
+ inc = false;
MLX5_SET(save_vhca_state_in, in, opcode,
MLX5_CMD_OP_SAVE_VHCA_STATE);
MLX5_SET(save_vhca_state_in, in, op_mod, 0);
MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
- MLX5_SET(save_vhca_state_in, in, mkey, mkey);
- MLX5_SET(save_vhca_state_in, in, size, migf->total_length);
+ MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
+ MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+ MLX5_SET(save_vhca_state_in, in, incremental, inc);
+ MLX5_SET(save_vhca_state_in, in, set_track, track);
async_data = &migf->async_data;
+ async_data->buf = buf;
+ async_data->last_chunk = !track;
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
goto err_out;
}
- /* no data exists till the callback comes back */
- migf->total_length = 0;
+ if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ header_buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(header_buf)) {
+ err = PTR_ERR(header_buf);
+ goto err_free;
+ }
+ }
+
+ if (async_data->last_chunk)
+ migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+
+ async_data->header_buf = header_buf;
get_file(migf->filp);
- async_data->mkey = mkey;
- async_data->pdn = pdn;
err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
async_data->out,
out_size, mlx5vf_save_callback,
@@ -363,68 +606,92 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
return 0;
err_exec:
+ if (header_buf)
+ mlx5vf_put_data_buffer(header_buf);
fput(migf->filp);
+err_free:
kvfree(async_data->out);
err_out:
- mlx5_core_destroy_mkey(mdev, mkey);
-err_create_mkey:
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
-err_dma_map:
- mlx5_core_dealloc_pd(mdev, pdn);
+ complete(&migf->save_comp);
return err;
}
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf)
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf)
{
- struct mlx5_core_dev *mdev;
- u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {};
- u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
- u32 pdn, mkey;
+ u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
- mutex_lock(&migf->lock);
- if (!migf->total_length) {
- err = -EINVAL;
- goto end;
+ if (!buf->dmaed) {
+ err = mlx5vf_dma_data_buffer(buf);
+ if (err)
+ return err;
}
- mdev = mvdev->mdev;
- err = mlx5_core_alloc_pd(mdev, &pdn);
- if (err)
- goto end;
-
- err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
- if (err)
- goto err_reg;
-
- err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
- if (err)
- goto err_mkey;
-
MLX5_SET(load_vhca_state_in, in, opcode,
MLX5_CMD_OP_LOAD_VHCA_STATE);
MLX5_SET(load_vhca_state_in, in, op_mod, 0);
MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
- MLX5_SET(load_vhca_state_in, in, mkey, mkey);
- MLX5_SET(load_vhca_state_in, in, size, migf->total_length);
+ MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
+ MLX5_SET(load_vhca_state_in, in, size, buf->length);
+ return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
+}
- err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out);
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
+{
+ int err;
- mlx5_core_destroy_mkey(mdev, mkey);
-err_mkey:
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
-err_reg:
- mlx5_core_dealloc_pd(mdev, pdn);
-end:
- mutex_unlock(&migf->lock);
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
return err;
}
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
+{
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return;
+
+ mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
+}
+
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
+{
+ struct mlx5_vhca_data_buffer *entry;
+
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ WARN_ON(migf->mvdev->mdev_detach);
+
+ if (migf->buf) {
+ mlx5vf_free_data_buffer(migf->buf);
+ migf->buf = NULL;
+ }
+
+ if (migf->buf_header) {
+ mlx5vf_free_data_buffer(migf->buf_header);
+ migf->buf_header = NULL;
+ }
+
+ list_splice(&migf->avail_list, &migf->buf_list);
+
+ while ((entry = list_first_entry_or_null(&migf->buf_list,
+ struct mlx5_vhca_data_buffer, buf_elm))) {
+ list_del(&entry->buf_elm);
+ mlx5vf_free_data_buffer(entry);
+ }
+
+ mlx5vf_cmd_dealloc_pd(migf);
+}
+
static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
u32 req_nodes)
{
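Editorial note on the save_comp handling above (wait_for_completion_interruptible() before a command, complete() when it finishes): it is a completion used as a binary semaphore. Per the comment added in main.c below, a regular mutex cannot be used because the "lock" is passed between kernel threads (ioctl caller vs. async callback), which lockdep can't model. A stripped-down sketch of the pattern follows; dev_ctx, ctx_init, issue_cmd and run_hw_command are hypothetical names, not part of the driver.

#include <linux/completion.h>

struct dev_ctx {
	struct completion cmd_done;	/* binary semaphore */
};

int run_hw_command(struct dev_ctx *ctx);	/* hypothetical HW call */

static void ctx_init(struct dev_ctx *ctx)
{
	init_completion(&ctx->cmd_done);
	complete(&ctx->cmd_done);	/* start in the "unlocked" state */
}

static int issue_cmd(struct dev_ctx *ctx)
{
	int err;

	/* "down": wait until no other command is in flight */
	err = wait_for_completion_interruptible(&ctx->cmd_done);
	if (err)
		return err;
	err = run_hw_command(ctx);
	/* "up": in the driver this may instead happen later,
	 * from the async callback's workqueue */
	complete(&ctx->cmd_done);
	return err;
}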
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 921d5720a1e5..5483171d57ad 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -12,31 +12,74 @@
#include <linux/mlx5/cq.h>
#include <linux/mlx5/qp.h>
+#define MLX5VF_PRE_COPY_SUPP(mvdev) \
+ ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY)
+
+enum mlx5_vf_migf_state {
+ MLX5_MIGF_STATE_ERROR = 1,
+ MLX5_MIGF_STATE_PRE_COPY_ERROR,
+ MLX5_MIGF_STATE_PRE_COPY,
+ MLX5_MIGF_STATE_SAVE_LAST,
+ MLX5_MIGF_STATE_COMPLETE,
+};
+
+enum mlx5_vf_load_state {
+ MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER,
+ MLX5_VF_LOAD_STATE_READ_HEADER,
+ MLX5_VF_LOAD_STATE_PREP_IMAGE,
+ MLX5_VF_LOAD_STATE_READ_IMAGE,
+ MLX5_VF_LOAD_STATE_LOAD_IMAGE,
+};
+
+struct mlx5_vf_migration_header {
+ __le64 image_size;
+ /* For future use in case we need to change the kernel protocol */
+ __le64 flags;
+};
+
+struct mlx5_vhca_data_buffer {
+ struct sg_append_table table;
+ loff_t start_pos;
+ u64 length;
+ u64 allocated_length;
+ u64 header_image_size;
+ u32 mkey;
+ enum dma_data_direction dma_dir;
+ u8 dmaed:1;
+ struct list_head buf_elm;
+ struct mlx5_vf_migration_file *migf;
+ /* Optimize mlx5vf_get_migration_page() for sequential access */
+ struct scatterlist *last_offset_sg;
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
struct mlx5vf_async_data {
struct mlx5_async_work cb_work;
struct work_struct work;
+ struct mlx5_vhca_data_buffer *buf;
+ struct mlx5_vhca_data_buffer *header_buf;
int status;
- u32 pdn;
- u32 mkey;
+ u8 last_chunk:1;
void *out;
};
struct mlx5_vf_migration_file {
struct file *filp;
struct mutex lock;
- u8 disabled:1;
- u8 is_err:1;
+ enum mlx5_vf_migf_state state;
- struct sg_append_table table;
- size_t total_length;
- size_t allocated_length;
-
- /* Optimize mlx5vf_get_migration_page() for sequential access */
- struct scatterlist *last_offset_sg;
- unsigned int sg_last_entry;
- unsigned long last_offset;
+ enum mlx5_vf_load_state load_state;
+ u32 pdn;
+ loff_t max_pos;
+ struct mlx5_vhca_data_buffer *buf;
+ struct mlx5_vhca_data_buffer *buf_header;
+ spinlock_t list_lock;
+ struct list_head buf_list;
+ struct list_head avail_list;
struct mlx5vf_pci_core_device *mvdev;
wait_queue_head_t poll_wait;
+ struct completion save_comp;
struct mlx5_async_ctx async_ctx;
struct mlx5vf_async_data async_data;
};
@@ -113,19 +156,42 @@ struct mlx5vf_pci_core_device {
struct mlx5_core_dev *mdev;
};
+enum {
+ MLX5VF_QUERY_INC = (1UL << 0),
+ MLX5VF_QUERY_FINAL = (1UL << 1),
+};
+
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size);
+ size_t *state_size, u8 query_flags);
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
const struct vfio_migration_ops *mig_ops,
const struct vfio_log_ops *log_ops);
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev);
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf);
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf, bool inc,
+ bool track);
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf);
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf);
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf);
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf);
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf);
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir);
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf);
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir);
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages);
+struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+ unsigned long offset);
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
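With PRE_COPY support, the saved stream read from the FD becomes a sequence of (header, image) chunks, where struct mlx5_vf_migration_header above is a 16-byte little-endian header and a non-zero flags field is rejected by the loader with -EOPNOTSUPP. A hedged sketch of that framing from the userspace side; parse_chunk and the in-memory blob are illustrative only, since the in-kernel loader consumes the stream incrementally.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Userspace mirror of struct mlx5_vf_migration_header; both fields are
 * little endian on the wire. Sketch assumes a little-endian host. */
struct mig_header {
	uint64_t image_size;
	uint64_t flags;		/* non-zero is rejected by the loader */
};

/* Walk one (header, image) chunk and return the start of the next
 * chunk's header, or NULL on malformed input. */
static const uint8_t *parse_chunk(const uint8_t *p, const uint8_t *end,
				  struct mig_header *hdr)
{
	if (end - p < (ptrdiff_t)sizeof(*hdr))
		return NULL;
	memcpy(hdr, p, sizeof(*hdr));
	if (hdr->flags)
		return NULL;	/* unknown protocol extension */
	p += sizeof(*hdr);
	if ((uint64_t)(end - p) < hdr->image_size)
		return NULL;
	return p + hdr->image_size;
}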
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 32d1f38d351e..9feb89c6d939 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -32,8 +32,8 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
core_device);
}
-static struct page *
-mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
+struct page *
+mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
unsigned long offset)
{
unsigned long cur_offset = 0;
@@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
unsigned int i;
/* All accesses are sequential */
- if (offset < migf->last_offset || !migf->last_offset_sg) {
- migf->last_offset = 0;
- migf->last_offset_sg = migf->table.sgt.sgl;
- migf->sg_last_entry = 0;
+ if (offset < buf->last_offset || !buf->last_offset_sg) {
+ buf->last_offset = 0;
+ buf->last_offset_sg = buf->table.sgt.sgl;
+ buf->sg_last_entry = 0;
}
- cur_offset = migf->last_offset;
+ cur_offset = buf->last_offset;
- for_each_sg(migf->last_offset_sg, sg,
- migf->table.sgt.orig_nents - migf->sg_last_entry, i) {
+ for_each_sg(buf->last_offset_sg, sg,
+ buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
if (offset < sg->length + cur_offset) {
- migf->last_offset_sg = sg;
- migf->sg_last_entry += i;
- migf->last_offset = cur_offset;
+ buf->last_offset_sg = sg;
+ buf->sg_last_entry += i;
+ buf->last_offset = cur_offset;
return nth_page(sg_page(sg),
(offset - cur_offset) / PAGE_SIZE);
}
@@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
return NULL;
}
-static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
- unsigned int npages)
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages)
{
unsigned int to_alloc = npages;
struct page **page_list;
@@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
}
to_alloc -= filled;
ret = sg_alloc_append_table_from_pages(
- &migf->table, page_list, filled, 0,
+ &buf->table, page_list, filled, 0,
filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
GFP_KERNEL);
if (ret)
goto err;
- migf->allocated_length += filled * PAGE_SIZE;
+ buf->allocated_length += filled * PAGE_SIZE;
/* clean input for another bulk allocation */
memset(page_list, 0, filled * sizeof(*page_list));
to_fill = min_t(unsigned int, to_alloc,
@@ -108,16 +108,8 @@ err:
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
- struct sg_page_iter sg_iter;
-
mutex_lock(&migf->lock);
- /* Undo alloc_pages_bulk_array() */
- for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0)
- __free_page(sg_page_iter_page(&sg_iter));
- sg_free_append_table(&migf->table);
- migf->disabled = true;
- migf->total_length = 0;
- migf->allocated_length = 0;
+ migf->state = MLX5_MIGF_STATE_ERROR;
migf->filp->f_pos = 0;
mutex_unlock(&migf->lock);
}
@@ -132,10 +124,91 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp)
return 0;
}
+static struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
+ bool *end_of_data)
+{
+ struct mlx5_vhca_data_buffer *buf;
+ bool found = false;
+
+ *end_of_data = false;
+ spin_lock_irq(&migf->list_lock);
+ if (list_empty(&migf->buf_list)) {
+ *end_of_data = true;
+ goto end;
+ }
+
+ buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
+ buf_elm);
+ if (pos >= buf->start_pos &&
+ pos < buf->start_pos + buf->length) {
+ found = true;
+ goto end;
+ }
+
+ /*
+ * As we use a stream-based FD, we expect the data to always be
+ * in the first chunk
+ */
+ migf->state = MLX5_MIGF_STATE_ERROR;
+
+end:
+ spin_unlock_irq(&migf->list_lock);
+ return found ? buf : NULL;
+}
+
+static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
+ char __user **buf, size_t *len, loff_t *pos)
+{
+ unsigned long offset;
+ ssize_t done = 0;
+ size_t copy_len;
+
+ copy_len = min_t(size_t,
+ vhca_buf->start_pos + vhca_buf->length - *pos, *len);
+ while (copy_len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+ offset -= page_offset;
+ page = mlx5vf_get_migration_page(vhca_buf, offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ ret = copy_to_user(*buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (ret)
+ return -EFAULT;
+ *pos += page_len;
+ *len -= page_len;
+ *buf += page_len;
+ done += page_len;
+ copy_len -= page_len;
+ }
+
+ if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
+ spin_lock_irq(&vhca_buf->migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+ spin_unlock_irq(&vhca_buf->migf->list_lock);
+ }
+
+ return done;
+}
+
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
loff_t *pos)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5_vhca_data_buffer *vhca_buf;
+ bool first_loop_call = true;
+ bool end_of_data;
ssize_t done = 0;
if (pos)
@@ -144,52 +217,56 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
if (!(filp->f_flags & O_NONBLOCK)) {
if (wait_event_interruptible(migf->poll_wait,
- READ_ONCE(migf->total_length) || migf->is_err))
+ !list_empty(&migf->buf_list) ||
+ migf->state == MLX5_MIGF_STATE_ERROR ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_COMPLETE))
return -ERESTARTSYS;
}
mutex_lock(&migf->lock);
- if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) {
- done = -EAGAIN;
- goto out_unlock;
- }
- if (*pos > migf->total_length) {
- done = -EINVAL;
- goto out_unlock;
- }
- if (migf->disabled || migf->is_err) {
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
done = -ENODEV;
goto out_unlock;
}
- len = min_t(size_t, migf->total_length - *pos, len);
while (len) {
- size_t page_offset;
- struct page *page;
- size_t page_len;
- u8 *from_buff;
- int ret;
+ ssize_t count;
+
+ vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
+ &end_of_data);
+ if (first_loop_call) {
+ first_loop_call = false;
+ /* Temporary end of file as part of PRE_COPY */
+ if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
+ done = -ENOMSG;
+ goto out_unlock;
+ }
+
+ if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
+ if (filp->f_flags & O_NONBLOCK) {
+ done = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+ }
+
+ if (end_of_data)
+ goto out_unlock;
- page_offset = (*pos) % PAGE_SIZE;
- page = mlx5vf_get_migration_page(migf, *pos - page_offset);
- if (!page) {
- if (done == 0)
- done = -EINVAL;
+ if (!vhca_buf) {
+ done = -EINVAL;
goto out_unlock;
}
- page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
- from_buff = kmap_local_page(page);
- ret = copy_to_user(buf, from_buff + page_offset, page_len);
- kunmap_local(from_buff);
- if (ret) {
- done = -EFAULT;
+ count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
+ if (count < 0) {
+ done = count;
goto out_unlock;
}
- *pos += page_len;
- len -= page_len;
- done += page_len;
- buf += page_len;
+ done += count;
}
out_unlock:
@@ -206,27 +283,188 @@ static __poll_t mlx5vf_save_poll(struct file *filp,
poll_wait(filp, &migf->poll_wait, wait);
mutex_lock(&migf->lock);
- if (migf->disabled || migf->is_err)
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
- else if (READ_ONCE(migf->total_length))
+ else if (!list_empty(&migf->buf_list) ||
+ migf->state == MLX5_MIGF_STATE_COMPLETE)
pollflags = EPOLLIN | EPOLLRDNORM;
mutex_unlock(&migf->lock);
return pollflags;
}
+/*
+ * The FD is exposed and the user can use it even after receiving an error.
+ * Mark migf in error, and wake the user.
+ */
+static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
+{
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ wake_up_interruptible(&migf->poll_wait);
+}
+
+static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+ struct mlx5_vhca_data_buffer *buf;
+ struct vfio_precopy_info info = {};
+ loff_t *pos = &filp->f_pos;
+ unsigned long minsz;
+ size_t inc_length = 0;
+ bool end_of_data;
+ int ret;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&mvdev->state_mutex);
+ if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+ mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ ret = -EINVAL;
+ goto err_state_unlock;
+ }
+
+ /*
+ * We can't issue a SAVE command when the device is suspended, so as
+ * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
+ * for extra bytes that can't be read.
+ */
+ if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
+ /*
+ * Once the query returns it's guaranteed that there is no
+ * active SAVE command.
+ * Thus, the code below is safe with the proper locks.
+ */
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
+ MLX5VF_QUERY_INC);
+ if (ret)
+ goto err_state_unlock;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
+ goto err_migf_unlock;
+ }
+
+ buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+ if (buf) {
+ if (buf->start_pos == 0) {
+ info.initial_bytes = buf->header_image_size - *pos;
+ } else if (buf->start_pos ==
+ sizeof(struct mlx5_vf_migration_header)) {
+ /* First data buffer following the header */
+ info.initial_bytes = buf->start_pos +
+ buf->length - *pos;
+ } else {
+ info.dirty_bytes = buf->start_pos + buf->length - *pos;
+ }
+ } else {
+ if (!end_of_data) {
+ ret = -EINVAL;
+ goto err_migf_unlock;
+ }
+
+ info.dirty_bytes = inc_length;
+ }
+
+ if (!end_of_data || !inc_length) {
+ mutex_unlock(&migf->lock);
+ goto done;
+ }
+
+ mutex_unlock(&migf->lock);
+ /*
+ * We finished transferring the current state and the device has a
+ * dirty state; save a new state so it is ready to be read.
+ */
+ buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ mlx5vf_mark_err(migf);
+ goto err_state_unlock;
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
+ if (ret) {
+ mlx5vf_mark_err(migf);
+ mlx5vf_put_data_buffer(buf);
+ goto err_state_unlock;
+ }
+
+done:
+ mlx5vf_state_mutex_unlock(mvdev);
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+ return 0;
+
+err_migf_unlock:
+ mutex_unlock(&migf->lock);
+err_state_unlock:
+ mlx5vf_state_mutex_unlock(mvdev);
+ return ret;
+}
+
static const struct file_operations mlx5vf_save_fops = {
.owner = THIS_MODULE,
.read = mlx5vf_save_read,
.poll = mlx5vf_save_poll,
+ .unlocked_ioctl = mlx5vf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.release = mlx5vf_release_file,
.llseek = no_llseek,
};
+static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
+{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+ struct mlx5_vhca_data_buffer *buf;
+ size_t length;
+ int ret;
+
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
+ return -ENODEV;
+
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
+ MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
+ if (ret)
+ goto err;
+
+ buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto err;
+ }
+
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
+ if (ret)
+ goto err_save;
+
+ return 0;
+
+err_save:
+ mlx5vf_put_data_buffer(buf);
+err:
+ mlx5vf_mark_err(migf);
+ return ret;
+}
+
static struct mlx5_vf_migration_file *
-mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
+mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
struct mlx5_vf_migration_file *migf;
+ struct mlx5_vhca_data_buffer *buf;
+ size_t length;
int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL);
@@ -236,43 +474,211 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
O_RDONLY);
if (IS_ERR(migf->filp)) {
- int err = PTR_ERR(migf->filp);
-
- kfree(migf);
- return ERR_PTR(err);
+ ret = PTR_ERR(migf->filp);
+ goto end;
}
+ migf->mvdev = mvdev;
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out_free;
+
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
init_waitqueue_head(&migf->poll_wait);
+ init_completion(&migf->save_comp);
+ /*
+ * save_comp is being used as a binary semaphore built from
+ * a completion. A normal mutex cannot be used because the lock is
+ * passed between kernel threads and lockdep can't model this.
+ */
+ complete(&migf->save_comp);
mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
- &migf->total_length);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
if (ret)
- goto out_free;
+ goto out_pd;
- ret = mlx5vf_add_migration_pages(
- migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE));
- if (ret)
- goto out_free;
+ buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_pd;
+ }
- migf->mvdev = mvdev;
- ret = mlx5vf_cmd_save_vhca_state(mvdev, migf);
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
if (ret)
- goto out_free;
+ goto out_save;
return migf;
+out_save:
+ mlx5vf_free_data_buffer(buf);
+out_pd:
+ mlx5vf_cmd_dealloc_pd(migf);
out_free:
fput(migf->filp);
+end:
+ kfree(migf);
return ERR_PTR(ret);
}
+static int
+mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ unsigned long offset;
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+
+ page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + page_offset, *buf, page_len);
+ kunmap_local(to_buff);
+ if (ret)
+ return -EFAULT;
+
+ *pos += page_len;
+ *done += page_len;
+ *buf += page_len;
+ *len -= page_len;
+ vhca_buf->length += page_len;
+ return 0;
+}
+
+static int
+mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
+ loff_t requested_length,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ int ret;
+
+ if (requested_length > MAX_MIGRATION_SIZE)
+ return -ENOMEM;
+
+ if (vhca_buf->allocated_length < requested_length) {
+ ret = mlx5vf_add_migration_pages(
+ vhca_buf,
+ DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
+ PAGE_SIZE));
+ if (ret)
+ return ret;
+ }
+
+ while (*len) {
+ ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
+ done);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static ssize_t
+mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *vhca_buf,
+ size_t image_size, const char __user **buf,
+ size_t *len, loff_t *pos, ssize_t *done,
+ bool *has_work)
+{
+ size_t copy_len, to_copy;
+ int ret;
+
+ to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
+ done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == image_size) {
+ migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
+ migf->max_pos += image_size;
+ *has_work = true;
+ }
+
+ return 0;
+}
+
+static int
+mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *vhca_buf,
+ const char __user **buf,
+ size_t *len, loff_t *pos,
+ ssize_t *done, bool *has_work)
+{
+ struct page *page;
+ size_t copy_len;
+ u8 *to_buff;
+ int ret;
+
+ copy_len = min_t(size_t, *len,
+ sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
+ page = mlx5vf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto end;
+ }
+
+ *buf += copy_len;
+ *pos += copy_len;
+ *done += copy_len;
+ *len -= copy_len;
+ vhca_buf->length += copy_len;
+ if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
+ u64 flags;
+
+ vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff);
+ if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ flags = le64_to_cpup((__le64 *)(to_buff +
+ offsetof(struct mlx5_vf_migration_header, flags)));
+ if (flags) {
+ ret = -EOPNOTSUPP;
+ goto end;
+ }
+
+ migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
+ migf->max_pos += vhca_buf->length;
+ *has_work = true;
+ }
+end:
+ kunmap_local(to_buff);
+ return ret;
+}
+
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
+ struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
loff_t requested_length;
+ bool has_work = false;
ssize_t done = 0;
+ int ret = 0;
if (pos)
return -ESPIPE;
@@ -282,56 +688,83 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
check_add_overflow((loff_t)len, *pos, &requested_length))
return -EINVAL;
- if (requested_length > MAX_MIGRATION_SIZE)
- return -ENOMEM;
-
+ mutex_lock(&migf->mvdev->state_mutex);
mutex_lock(&migf->lock);
- if (migf->disabled) {
- done = -ENODEV;
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
goto out_unlock;
}
- if (migf->allocated_length < requested_length) {
- done = mlx5vf_add_migration_pages(
- migf,
- DIV_ROUND_UP(requested_length - migf->allocated_length,
- PAGE_SIZE));
- if (done)
- goto out_unlock;
- }
-
- while (len) {
- size_t page_offset;
- struct page *page;
- size_t page_len;
- u8 *to_buff;
- int ret;
-
- page_offset = (*pos) % PAGE_SIZE;
- page = mlx5vf_get_migration_page(migf, *pos - page_offset);
- if (!page) {
- if (done == 0)
- done = -EINVAL;
- goto out_unlock;
+ while (len || has_work) {
+ has_work = false;
+ switch (migf->load_state) {
+ case MLX5_VF_LOAD_STATE_READ_HEADER:
+ ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
+ &buf, &len, pos,
+ &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_PREP_IMAGE:
+ {
+ u64 size = vhca_buf_header->header_image_size;
+
+ if (vhca_buf->allocated_length < size) {
+ mlx5vf_free_data_buffer(vhca_buf);
+
+ migf->buf = mlx5vf_alloc_data_buffer(migf,
+ size, DMA_TO_DEVICE);
+ if (IS_ERR(migf->buf)) {
+ ret = PTR_ERR(migf->buf);
+ migf->buf = NULL;
+ goto out_unlock;
+ }
+
+ vhca_buf = migf->buf;
+ }
+
+ vhca_buf->start_pos = migf->max_pos;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
+ break;
}
+ case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
+ ret = mlx5vf_resume_read_image_no_header(vhca_buf,
+ requested_length,
+ &buf, &len, pos, &done);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_READ_IMAGE:
+ ret = mlx5vf_resume_read_image(migf, vhca_buf,
+ vhca_buf_header->header_image_size,
+ &buf, &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
+ ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
+ if (ret)
+ goto out_unlock;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+
+ /* prep header buf for next image */
+ vhca_buf_header->length = 0;
+ vhca_buf_header->header_image_size = 0;
+ /* prep data buf for next image */
+ vhca_buf->length = 0;
- page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
- to_buff = kmap_local_page(page);
- ret = copy_from_user(to_buff + page_offset, buf, page_len);
- kunmap_local(to_buff);
- if (ret) {
- done = -EFAULT;
- goto out_unlock;
+ break;
+ default:
+ break;
}
- *pos += page_len;
- len -= page_len;
- done += page_len;
- buf += page_len;
- migf->total_length += page_len;
}
+
out_unlock:
+ if (ret)
+ migf->state = MLX5_MIGF_STATE_ERROR;
mutex_unlock(&migf->lock);
- return done;
+ mlx5vf_state_mutex_unlock(migf->mvdev);
+ return ret ? ret : done;
}
static const struct file_operations mlx5vf_resume_fops = {
@@ -345,6 +778,8 @@ static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
struct mlx5_vf_migration_file *migf;
+ struct mlx5_vhca_data_buffer *buf;
+ int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL);
if (!migf)
@@ -353,20 +788,59 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
O_WRONLY);
if (IS_ERR(migf->filp)) {
- int err = PTR_ERR(migf->filp);
+ ret = PTR_ERR(migf->filp);
+ goto end;
+ }
- kfree(migf);
- return ERR_PTR(err);
+ migf->mvdev = mvdev;
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out_free;
+
+ buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_pd;
+ }
+
+ migf->buf = buf;
+ if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ buf = mlx5vf_alloc_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_buf;
+ }
+
+ migf->buf_header = buf;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+ } else {
+ /* Initial state will be to read the image */
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
}
+
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
return migf;
+out_buf:
+ mlx5vf_free_data_buffer(migf->buf);
+out_pd:
+ mlx5vf_cmd_dealloc_pd(migf);
+out_free:
+ fput(migf->filp);
+end:
+ kfree(migf);
+ return ERR_PTR(ret);
}
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
if (mvdev->resuming_migf) {
mlx5vf_disable_fd(mvdev->resuming_migf);
+ mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
fput(mvdev->resuming_migf->filp);
mvdev->resuming_migf = NULL;
}
@@ -374,6 +848,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
cancel_work_sync(&mvdev->saving_migf->async_data.work);
mlx5vf_disable_fd(mvdev->saving_migf);
+ mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
fput(mvdev->saving_migf->filp);
mvdev->saving_migf = NULL;
}
@@ -402,7 +877,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return NULL;
}
- if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
ret = mlx5vf_cmd_suspend_vhca(mvdev,
MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
if (ret)
@@ -410,7 +886,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return NULL;
}
- if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
ret = mlx5vf_cmd_resume_vhca(mvdev,
MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
if (ret)
@@ -421,7 +898,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
struct mlx5_vf_migration_file *migf;
- migf = mlx5vf_pci_save_device_data(mvdev);
+ migf = mlx5vf_pci_save_device_data(mvdev, false);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
@@ -429,7 +906,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return migf->filp;
}
- if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) {
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
mlx5vf_disable_fds(mvdev);
return NULL;
}
@@ -446,14 +926,39 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
}
if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
- ret = mlx5vf_cmd_load_vhca_state(mvdev,
- mvdev->resuming_migf);
- if (ret)
- return ERR_PTR(ret);
+ if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ ret = mlx5vf_cmd_load_vhca_state(mvdev,
+ mvdev->resuming_migf,
+ mvdev->resuming_migf->buf);
+ if (ret)
+ return ERR_PTR(ret);
+ }
mlx5vf_disable_fds(mvdev);
return NULL;
}
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_save_device_data(mvdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ ret = mlx5vf_pci_save_device_inc_data(mvdev);
+ return ret ? ERR_PTR(ret) : NULL;
+ }
+
/*
* vfio_mig_get_next_state() does not use arcs other than the above
*/
@@ -512,6 +1017,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev,
return res;
}
+static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct mlx5vf_pci_core_device *mvdev = container_of(
+ vdev, struct mlx5vf_pci_core_device, core_device.vdev);
+ size_t state_size;
+ int ret;
+
+ mutex_lock(&mvdev->state_mutex);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
+ &state_size, 0);
+ if (!ret)
+ *stop_copy_length = state_size;
+ mlx5vf_state_mutex_unlock(mvdev);
+ return ret;
+}
+
static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state *curr_state)
{
@@ -577,6 +1099,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
.migration_set_state = mlx5vf_pci_set_device_state,
.migration_get_state = mlx5vf_pci_get_device_state,
+ .migration_get_data_size = mlx5vf_pci_get_data_size,
};
static const struct vfio_log_ops mlx5vf_pci_log_ops = {
@@ -679,18 +1202,7 @@ static struct pci_driver mlx5vf_pci_driver = {
.driver_managed_dma = true,
};
-static void __exit mlx5vf_pci_cleanup(void)
-{
- pci_unregister_driver(&mlx5vf_pci_driver);
-}
-
-static int __init mlx5vf_pci_init(void)
-{
- return pci_register_driver(&mlx5vf_pci_driver);
-}
-
-module_init(mlx5vf_pci_init);
-module_exit(mlx5vf_pci_cleanup);
+module_pci_driver(mlx5vf_pci_driver);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
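
The new migration_get_data_size op wired up above backs the VFIO_DEVICE_FEATURE_MIG_DATA_SIZE feature added by this series. A sketch of how a VMM might query the estimated stop-copy size before committing to a migration; query_mig_data_size is hypothetical, the uapi structures are not, and error handling is elided.

#include <linux/vfio.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Ask the driver for its estimated stop-copy data size. */
static int query_mig_data_size(int device_fd, uint64_t *bytes)
{
	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_mig_data_size)] = {0};
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_mig_data_size *size =
		(void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_GET |
			 VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;
	*bytes = size->stop_copy_length;
	return 0;
}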