author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-13 13:04:41 -0700
---|---|---
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-13 13:04:41 -0700
commit | 7cd4ecd9177b94af783b8e21de7c65b41a871342 | (patch)
tree | 3ca393f3eaeeaad56d4ab60f87e28d7197b0ba21 | /drivers/nvme/host
parent | 79ec6d9cac46d59db9b006bc9cde2811ef365292 | (diff)
parent | 79cd16681acccffcf5521f6e3d8c7c50aaffca0a | (diff)
Merge tag 'drivers-5.10-2020-10-12' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"Here are the driver updates for 5.10.
A few SCSI updates in here too, in coordination with Martin as they
depend on core block changes for the shared tag bitmap.
This contains:
- NVMe pull requests via Christoph:
- fix keep alive timer modification (Amit Engel)
- order the PCI ID list more sensibly (Andy Shevchenko)
- cleanup the open by controller helper (Chaitanya Kulkarni)
- use an xarray for the CSE log lookup (Chaitanya Kulkarni)
- support ZNS in nvmet passthrough mode (Chaitanya Kulkarni)
- fix nvme_ns_report_zones (Christoph Hellwig)
- add a sanity check to nvmet-fc (James Smart)
- fix interrupt allocation when too many polled queues are
specified (Jeffle Xu)
- small nvmet-tcp optimization (Mark Wunderlich)
- fix a controller refcount leak on init failure (Chaitanya
Kulkarni)
- misc cleanups (Chaitanya Kulkarni)
- major refactoring of the scanning code (Christoph Hellwig)
- MD updates via Song:
- Bug fixes in bitmap code, from Zhao Heming
- Fix a work queue check, from Guoqing Jiang
- Fix raid5 oops with reshape, from Song Liu
- Clean up unused code, from Jason Yan
- Discard improvements, from Xiao Ni
- raid5/6 page offset support, from Yufen Yu
- Shared tag bitmap for SCSI/hisi_sas/null_blk (John, Kashyap,
Hannes)
- null_blk open/active zone limit support (Niklas)
- Set of bcache updates (Coly, Dongsheng, Qinglang)"
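One change worth pausing on from the NVMe list above is the conversion of the CSE (Commands Supported and Effects) log lookup to an xarray. The pattern, condensed from the core.c hunks further down (error handling and the actual log-page fetch are elided, so this is a sketch rather than the complete function):

```c
/* Condensed from nvme_get_effects_log() in the core.c diff below.
 * The cels xarray is keyed by the 8-bit Command Set Identifier, so
 * lookup is a lockless O(1) load and the spinlock-protected list
 * walk (nvme_find_cel) goes away entirely. */
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
				struct nvme_effects_log **log)
{
	struct nvme_cel *cel = xa_load(&ctrl->cels, csi); /* no lock needed */

	if (cel)
		goto out;

	cel = kzalloc(sizeof(*cel), GFP_KERNEL);
	if (!cel)
		return -ENOMEM;

	/* ... fetching the log page from the device into &cel->log elided ... */

	cel->csi = csi;
	xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL); /* cache it */
out:
	*log = &cel->log;
	return 0;
}
```

The setup and teardown pair accordingly: `xa_init(&ctrl->cels)` replaces `INIT_LIST_HEAD()` in nvme_init_ctrl() and a single `xa_destroy()` replaces the list_for_each_entry_safe() free loop in nvme_free_ctrl(), both visible in the diff below.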
* tag 'drivers-5.10-2020-10-12' of git://git.kernel.dk/linux-block: (78 commits)
md/raid5: fix oops during stripe resizing
md/bitmap: fix memory leak of temporary bitmap
md: fix the checking of wrong work queue
md/bitmap: md_bitmap_get_counter returns wrong blocks
md/bitmap: md_bitmap_read_sb uses wrong bitmap blocks
md/raid0: remove unused function is_io_in_chunk_boundary()
nvme-core: remove extra condition for vwc
nvme-core: remove extra variable
nvme: remove nvme_identify_ns_list
nvme: refactor nvme_validate_ns
nvme: move nvme_validate_ns
nvme: query namespace identifiers before adding the namespace
nvme: revalidate zone bitmaps in nvme_update_ns_info
nvme: remove nvme_update_formats
nvme: update the known admin effects
nvme: set the queue limits in nvme_update_ns_info
nvme: remove the 0 lba_shift check in nvme_update_ns_info
nvme: clean up the check for too large logic block sizes
nvme: freeze the queue over ->lba_shift updates
nvme: factor out a nvme_configure_metadata helper
...
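The shared tag bitmap work mentioned in the changelog lives in the block core and SCSI trees, so it does not appear in the nvme-only diffstat below. For orientation only, a minimal sketch of how a blk-mq driver opts in on 5.10, assuming a conventional tag-set setup (`my_mq_ops` and the sizes are illustrative, not taken from this diff):

```c
#include <linux/blk-mq.h>

/* Illustrative sketch: with BLK_MQ_F_TAG_HCTX_SHARED set, all hardware
 * queues allocate tags from one shared sbitmap instead of one bitmap
 * per queue -- the mechanism hisi_sas and null_blk build on here. */
static struct blk_mq_tag_set tag_set;

static int my_driver_init_tags(void)	/* hypothetical helper */
{
	tag_set.ops = &my_mq_ops;	/* driver's blk_mq_ops (assumed) */
	tag_set.nr_hw_queues = 16;
	tag_set.queue_depth = 1024;	/* one shared pool of 1024 tags */
	tag_set.numa_node = NUMA_NO_NODE;
	tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_TAG_HCTX_SHARED;

	return blk_mq_alloc_tag_set(&tag_set);
}
```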
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/core.c | 511
-rw-r--r-- | drivers/nvme/host/nvme.h | 13
-rw-r--r-- | drivers/nvme/host/pci.c | 35
-rw-r--r-- | drivers/nvme/host/zns.c | 57
4 files changed, 259 insertions, 357 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d2397cfe178f..56e2a22e8a02 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,7 +89,6 @@ static dev_t nvme_chr_devt;
 static struct class *nvme_class;
 static struct class *nvme_subsys_class;
 
-static int _nvme_revalidate_disk(struct gendisk *disk);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
                                            unsigned nsid);
@@ -968,10 +967,10 @@ static u32 nvme_known_admin_effects(u8 opcode)
 {
         switch (opcode) {
         case nvme_admin_format_nvm:
-                return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+                return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
                         NVME_CMD_EFFECTS_CSE_MASK;
         case nvme_admin_sanitize_nvm:
-                return NVME_CMD_EFFECTS_CSE_MASK;
+                return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
         default:
                 break;
         }
@@ -1009,7 +1008,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
          * For simplicity, IO to all namespaces is quiesced even if the command
          * effects say only one namespace is affected.
          */
-        if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+        if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
                 mutex_lock(&ctrl->scan_lock);
                 mutex_lock(&ctrl->subsys->lock);
                 nvme_mpath_start_freeze(ctrl->subsys);
@@ -1020,36 +1019,9 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
         return effects;
 }
 
-static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
-{
-        struct nvme_ns *ns;
-
-        down_read(&ctrl->namespaces_rwsem);
-        list_for_each_entry(ns, &ctrl->namespaces, list)
-                if (_nvme_revalidate_disk(ns->disk))
-                        nvme_set_queue_dying(ns);
-                else if (blk_queue_is_zoned(ns->disk->queue)) {
-                        /*
-                         * IO commands are required to fully revalidate a zoned
-                         * device. Force the command effects to trigger rescan
-                         * work so report zones can run in a context with
-                         * unfrozen IO queues.
-                         */
-                        *effects |= NVME_CMD_EFFECTS_NCC;
-                }
-        up_read(&ctrl->namespaces_rwsem);
-}
-
 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
 {
-        /*
-         * Revalidate LBA changes prior to unfreezing. This is necessary to
-         * prevent memory corruption if a logical block size was changed by
-         * this command.
-         */
-        if (effects & NVME_CMD_EFFECTS_LBCC)
-                nvme_update_formats(ctrl, &effects);
-        if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+        if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
                 nvme_unfreeze(ctrl);
                 nvme_mpath_unfreeze(ctrl->subsys);
                 mutex_unlock(&ctrl->subsys->lock);
@@ -1309,6 +1281,8 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
         int status, pos, len;
         void *data;
 
+        if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
+                return 0;
         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
                 return 0;
 
@@ -1352,19 +1326,8 @@ free_data:
         return status;
 }
 
-static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
-{
-        struct nvme_command c = { };
-
-        c.identify.opcode = nvme_admin_identify;
-        c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
-        c.identify.nsid = cpu_to_le32(nsid);
-        return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
-                                    NVME_IDENTIFY_DATA_SIZE);
-}
-
-static int nvme_identify_ns(struct nvme_ctrl *ctrl,
-                unsigned nsid, struct nvme_id_ns **id)
+static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+                struct nvme_ns_ids *ids, struct nvme_id_ns **id)
 {
         struct nvme_command c = { };
         int error;
@@ -1381,9 +1344,24 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl,
         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
         if (error) {
                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
-                kfree(*id);
+                goto out_free_id;
         }
 
+        error = -ENODEV;
+        if ((*id)->ncap == 0) /* namespace not allocated or attached */
+                goto out_free_id;
+
+        if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+            !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+                memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
+        if (ctrl->vs >= NVME_VS(1, 2, 0) &&
+            !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+                memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
+
+        return 0;
+
+out_free_id:
+        kfree(*id);
         return error;
 }
@@ -1905,20 +1883,6 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
                                            nvme_lba_to_sect(ns, max_blocks));
 }
 
-static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
-                struct nvme_id_ns *id, struct nvme_ns_ids *ids)
-{
-        memset(ids, 0, sizeof(*ids));
-
-        if (ctrl->vs >= NVME_VS(1, 1, 0))
-                memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
-        if (ctrl->vs >= NVME_VS(1, 2, 0))
-                memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
-        if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
-                return nvme_identify_ns_descs(ctrl, nsid, ids);
-        return 0;
-}
-
 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
 {
         return !uuid_is_null(&ids->uuid) ||
@@ -1959,6 +1923,68 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
         return 0;
 }
 
+static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+        struct nvme_ctrl *ctrl = ns->ctrl;
+
+        /*
+         * The PI implementation requires the metadata size to be equal to the
+         * t10 pi tuple size.
+         */
+        ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+        if (ns->ms == sizeof(struct t10_pi_tuple))
+                ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+        else
+                ns->pi_type = 0;
+
+        ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+        if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+                return 0;
+        if (ctrl->ops->flags & NVME_F_FABRICS) {
+                /*
+                 * The NVMe over Fabrics specification only supports metadata as
+                 * part of the extended data LBA.  We rely on HCA/HBA support to
+                 * remap the separate metadata buffer from the block layer.
+                 */
+                if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
+                        return -EINVAL;
+                if (ctrl->max_integrity_segments)
+                        ns->features |=
+                                (NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+        } else {
+                /*
+                 * For PCIe controllers, we can't easily remap the separate
+                 * metadata buffer from the block layer and thus require a
+                 * separate metadata buffer for block layer metadata/PI support.
+                 * We allow extended LBAs for the passthrough interface, though.
+                 */
+                if (id->flbas & NVME_NS_FLBAS_META_EXT)
+                        ns->features |= NVME_NS_EXT_LBAS;
+                else
+                        ns->features |= NVME_NS_METADATA_SUPPORTED;
+        }
+
+        return 0;
+}
+
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+                struct request_queue *q)
+{
+        bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
+
+        if (ctrl->max_hw_sectors) {
+                u32 max_segments =
+                        (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
+
+                max_segments = min_not_zero(max_segments, ctrl->max_segments);
+                blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+                blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+        }
+        blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
+        blk_queue_dma_alignment(q, 7);
+        blk_queue_write_cache(q, vwc, vwc);
+}
+
 static void nvme_update_disk_info(struct gendisk *disk,
                 struct nvme_ns *ns, struct nvme_id_ns *id)
 {
@@ -1966,11 +1992,15 @@ static void nvme_update_disk_info(struct gendisk *disk,
         unsigned short bs = 1 << ns->lba_shift;
         u32 atomic_bs, phys_bs, io_opt = 0;
 
+        /*
+         * The block layer can't support LBA sizes larger than the page size
+         * yet, so catch this early and don't allow block I/O.
+         */
         if (ns->lba_shift > PAGE_SHIFT) {
-                /* unsupported block size, set capacity to 0 later */
+                capacity = 0;
                 bs = (1 << 9);
         }
-        blk_mq_freeze_queue(disk->queue);
+
         blk_integrity_unregister(disk);
 
         atomic_bs = phys_bs = bs;
@@ -2005,13 +2035,6 @@ static void nvme_update_disk_info(struct gendisk *disk,
         blk_queue_io_opt(disk->queue, io_opt);
 
         /*
-         * The block layer can't support LBA sizes larger than the page size
-         * yet, so catch this early and don't allow block I/O.
-         */
-        if (ns->lba_shift > PAGE_SHIFT)
-                capacity = 0;
-
-        /*
          * Register a metadata profile for PI, or the plain non-integrity NVMe
          * metadata masquerading as Type 0 if supported, otherwise reject block
          * I/O to namespaces with metadata except when the namespace supports
@@ -2035,8 +2058,6 @@ static void nvme_update_disk_info(struct gendisk *disk,
                 set_disk_ro(disk, true);
         else
                 set_disk_ro(disk, false);
-
-        blk_mq_unfreeze_queue(disk->queue);
 }
 
 static inline bool nvme_first_scan(struct gendisk *disk)
@@ -2076,151 +2097,49 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
                 blk_queue_chunk_sectors(ns->queue, iob);
 }
 
-static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
 {
         unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
-        struct nvme_ns *ns = disk->private_data;
-        struct nvme_ctrl *ctrl = ns->ctrl;
         int ret;
 
-        /*
-         * If identify namespace failed, use default 512 byte block size so
-         * block layer can use before failing read/write for 0 capacity.
-         */
+        blk_mq_freeze_queue(ns->disk->queue);
         ns->lba_shift = id->lbaf[lbaf].ds;
-        if (ns->lba_shift == 0)
-                ns->lba_shift = 9;
+        nvme_set_queue_limits(ns->ctrl, ns->queue);
 
-        switch (ns->head->ids.csi) {
-        case NVME_CSI_NVM:
-                break;
-        case NVME_CSI_ZNS:
-                ret = nvme_update_zone_info(disk, ns, lbaf);
-                if (ret) {
-                        dev_warn(ctrl->device,
-                                "failed to add zoned namespace:%u ret:%d\n",
-                                ns->head->ns_id, ret);
-                        return ret;
-                }
-                break;
-        default:
-                dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
-                        ns->head->ids.csi, ns->head->ns_id);
-                return -ENODEV;
+        if (ns->head->ids.csi == NVME_CSI_ZNS) {
+                ret = nvme_update_zone_info(ns, lbaf);
+                if (ret)
+                        goto out_unfreeze;
         }
 
-        ns->features = 0;
-        ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
-        /* the PI implementation requires metadata equal t10 pi tuple size */
-        if (ns->ms == sizeof(struct t10_pi_tuple))
-                ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
-        else
-                ns->pi_type = 0;
+        ret = nvme_configure_metadata(ns, id);
+        if (ret)
+                goto out_unfreeze;
+        nvme_set_chunk_sectors(ns, id);
+        nvme_update_disk_info(ns->disk, ns, id);
+        blk_mq_unfreeze_queue(ns->disk->queue);
 
-        if (ns->ms) {
-                /*
-                 * For PCIe only the separate metadata pointer is supported,
-                 * as the block layer supplies metadata in a separate bio_vec
-                 * chain. For Fabrics, only metadata as part of extended data
-                 * LBA is supported on the wire per the Fabrics specification,
-                 * but the HBA/HCA will do the remapping from the separate
-                 * metadata buffers for us.
-                 */
-                if (id->flbas & NVME_NS_FLBAS_META_EXT) {
-                        ns->features |= NVME_NS_EXT_LBAS;
-                        if ((ctrl->ops->flags & NVME_F_FABRICS) &&
-                            (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) &&
-                            ctrl->max_integrity_segments)
-                                ns->features |= NVME_NS_METADATA_SUPPORTED;
-                } else {
-                        if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS))
-                                return -EINVAL;
-                        if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
-                                ns->features |= NVME_NS_METADATA_SUPPORTED;
-                }
+        if (blk_queue_is_zoned(ns->queue)) {
+                ret = nvme_revalidate_zones(ns);
+                if (ret)
+                        return ret;
         }
 
-        nvme_set_chunk_sectors(ns, id);
-        nvme_update_disk_info(disk, ns, id);
 #ifdef CONFIG_NVME_MULTIPATH
         if (ns->head->disk) {
+                blk_mq_freeze_queue(ns->head->disk->queue);
                 nvme_update_disk_info(ns->head->disk, ns, id);
                 blk_stack_limits(&ns->head->disk->queue->limits,
                                  &ns->queue->limits, 0);
                 blk_queue_update_readahead(ns->head->disk->queue);
                 nvme_update_bdev_size(ns->head->disk);
+                blk_mq_unfreeze_queue(ns->head->disk->queue);
         }
 #endif
         return 0;
-}
-
-static int _nvme_revalidate_disk(struct gendisk *disk)
-{
-        struct nvme_ns *ns = disk->private_data;
-        struct nvme_ctrl *ctrl = ns->ctrl;
-        struct nvme_id_ns *id;
-        struct nvme_ns_ids ids;
-        int ret = 0;
-
-        if (test_bit(NVME_NS_DEAD, &ns->flags)) {
-                set_capacity(disk, 0);
-                return -ENODEV;
-        }
-
-        ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
-        if (ret)
-                goto out;
-
-        if (id->ncap == 0) {
-                ret = -ENODEV;
-                goto free_id;
-        }
-
-        ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
-        if (ret)
-                goto free_id;
-
-        if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
-                dev_err(ctrl->device,
-                        "identifiers changed for nsid %d\n", ns->head->ns_id);
-                ret = -ENODEV;
-                goto free_id;
-        }
-
-        ret = __nvme_revalidate_disk(disk, id);
-free_id:
-        kfree(id);
-out:
-        /*
-         * Only fail the function if we got a fatal error back from the
-         * device, otherwise ignore the error and just move on.
-         */
-        if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
-                ret = 0;
-        else if (ret > 0)
-                ret = blk_status_to_errno(nvme_error_status(ret));
-        return ret;
-}
-
-static int nvme_revalidate_disk(struct gendisk *disk)
-{
-        int ret;
-
-        ret = _nvme_revalidate_disk(disk);
-        if (ret)
-                return ret;
-
-#ifdef CONFIG_BLK_DEV_ZONED
-        if (blk_queue_is_zoned(disk->queue)) {
-                struct nvme_ns *ns = disk->private_data;
-                struct nvme_ctrl *ctrl = ns->ctrl;
-
-                ret = blk_revalidate_disk_zones(disk, NULL);
-                if (!ret)
-                        blk_queue_max_zone_append_sectors(disk->queue,
-                                                          ctrl->max_zone_append);
-        }
-#endif
+out_unfreeze:
+        blk_mq_unfreeze_queue(ns->disk->queue);
         return ret;
 }
@@ -2502,26 +2421,6 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
 
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
-                struct request_queue *q)
-{
-        bool vwc = false;
-
-        if (ctrl->max_hw_sectors) {
-                u32 max_segments =
-                        (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
-
-                max_segments = min_not_zero(max_segments, ctrl->max_segments);
-                blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
-                blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
-        }
-        blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
-        blk_queue_dma_alignment(q, 7);
-        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
-                vwc = true;
-        blk_queue_write_cache(q, vwc, vwc);
-}
-
 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
 {
         __le64 ts;
@@ -3025,26 +2924,10 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
 }
 
-static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
-{
-        struct nvme_cel *cel, *ret = NULL;
-
-        spin_lock_irq(&ctrl->lock);
-        list_for_each_entry(cel, &ctrl->cels, entry) {
-                if (cel->csi == csi) {
-                        ret = cel;
-                        break;
-                }
-        }
-        spin_unlock_irq(&ctrl->lock);
-
-        return ret;
-}
-
 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
                                 struct nvme_effects_log **log)
 {
-        struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
+        struct nvme_cel *cel = xa_load(&ctrl->cels, csi);
         int ret;
 
         if (cel)
@@ -3062,10 +2945,7 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
         }
 
         cel->csi = csi;
-
-        spin_lock_irq(&ctrl->lock);
-        list_add_tail(&cel->entry, &ctrl->cels);
-        spin_unlock_irq(&ctrl->lock);
+        xa_store(&ctrl->cels, cel->csi, cel, GFP_KERNEL);
 out:
         *log = &cel->log;
         return 0;
@@ -3846,25 +3726,16 @@ out:
 }
 
 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
-                struct nvme_id_ns *id)
+                struct nvme_ns_ids *ids, bool is_shared)
 {
         struct nvme_ctrl *ctrl = ns->ctrl;
-        bool is_shared = id->nmic & NVME_NS_NMIC_SHARED;
         struct nvme_ns_head *head = NULL;
-        struct nvme_ns_ids ids;
         int ret = 0;
 
-        ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
-        if (ret) {
-                if (ret < 0)
-                        return ret;
-                return blk_status_to_errno(nvme_error_status(ret));
-        }
-
         mutex_lock(&ctrl->subsys->lock);
         head = nvme_find_ns_head(ctrl->subsys, nsid);
         if (!head) {
-                head = nvme_alloc_ns_head(ctrl, nsid, &ids);
+                head = nvme_alloc_ns_head(ctrl, nsid, ids);
                 if (IS_ERR(head)) {
                         ret = PTR_ERR(head);
                         goto out_unlock;
@@ -3877,7 +3748,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
                                 "Duplicate unshared namespace %d\n", nsid);
                         goto out_put_ns_head;
                 }
-                if (!nvme_ns_ids_equal(&head->ids, &ids)) {
+                if (!nvme_ns_ids_equal(&head->ids, ids)) {
                         dev_err(ctrl->device,
                                 "IDs don't match for shared namespace %d\n",
                                         nsid);
@@ -3925,7 +3796,8 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
 
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+                struct nvme_ns_ids *ids)
 {
         struct nvme_ns *ns;
         struct gendisk *disk;
@@ -3933,9 +3805,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
         char disk_name[DISK_NAME_LEN];
         int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
 
+        if (nvme_identify_ns(ctrl, nsid, ids, &id))
+                return;
+
         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
         if (!ns)
-                return;
+                goto out_free_id;
 
         ns->queue = blk_mq_init_queue(ctrl->tagset);
         if (IS_ERR(ns->queue))
@@ -3950,23 +3825,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
         ns->queue->queuedata = ns;
         ns->ctrl = ctrl;
-
         kref_init(&ns->kref);
-        ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
-
-        blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
-        nvme_set_queue_limits(ctrl, ns->queue);
 
-        ret = nvme_identify_ns(ctrl, nsid, &id);
+        ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED);
         if (ret)
                 goto out_free_queue;
-
-        if (id->ncap == 0)      /* no namespace (legacy quirk) */
-                goto out_free_id;
-
-        ret = nvme_init_ns_head(ns, nsid, id);
-        if (ret)
-                goto out_free_id;
 
         nvme_set_disk_name(disk_name, ns, ctrl, &flags);
@@ -3980,7 +3843,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
         memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
         ns->disk = disk;
 
-        if (__nvme_revalidate_disk(disk, id))
+        if (nvme_update_ns_info(ns, id))
                 goto out_put_disk;
 
         if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
@@ -4015,12 +3878,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
         list_del_init(&ns->head->entry);
         mutex_unlock(&ctrl->subsys->lock);
         nvme_put_ns_head(ns->head);
- out_free_id:
-        kfree(id);
  out_free_queue:
         blk_cleanup_queue(ns->queue);
  out_free_ns:
         kfree(ns);
+ out_free_id:
+        kfree(id);
 }
 
 static void nvme_ns_remove(struct nvme_ns *ns)
@@ -4028,6 +3891,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
                 return;
 
+        set_capacity(ns->disk, 0);
         nvme_fault_inject_fini(&ns->fault_inject);
 
         mutex_lock(&ns->ctrl->subsys->lock);
@@ -4065,22 +3929,75 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
         }
 }
 
-static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
+{
+        struct nvme_id_ns *id;
+        int ret = -ENODEV;
+
+        if (test_bit(NVME_NS_DEAD, &ns->flags))
+                goto out;
+
+        ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
+        if (ret)
+                goto out;
+
+        ret = -ENODEV;
+        if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
+                dev_err(ns->ctrl->device,
+                        "identifiers changed for nsid %d\n", ns->head->ns_id);
+                goto out_free_id;
+        }
+
+        ret = nvme_update_ns_info(ns, id);
+
+out_free_id:
+        kfree(id);
+out:
+        /*
+         * Only remove the namespace if we got a fatal error back from the
+         * device, otherwise ignore the error and just move on.
+         *
+         * TODO: we should probably schedule a delayed retry here.
+         */
+        if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR)))
+                nvme_ns_remove(ns);
+        else
+                revalidate_disk_size(ns->disk, true);
+}
+
+static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
+        struct nvme_ns_ids ids = { };
         struct nvme_ns *ns;
-        int ret;
+
+        if (nvme_identify_ns_descs(ctrl, nsid, &ids))
+                return;
 
         ns = nvme_find_get_ns(ctrl, nsid);
-        if (!ns) {
-                nvme_alloc_ns(ctrl, nsid);
+        if (ns) {
+                nvme_validate_ns(ns, &ids);
+                nvme_put_ns(ns);
                 return;
         }
 
-        ret = nvme_revalidate_disk(ns->disk);
-        revalidate_disk_size(ns->disk, ret == 0);
-        if (ret)
-                nvme_ns_remove(ns);
-        nvme_put_ns(ns);
+        switch (ids.csi) {
+        case NVME_CSI_NVM:
+                nvme_alloc_ns(ctrl, nsid, &ids);
+                break;
+        case NVME_CSI_ZNS:
+                if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+                        dev_warn(ctrl->device,
+                                "nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
+                                nsid);
+                        break;
+                }
+                nvme_alloc_ns(ctrl, nsid, &ids);
+                break;
+        default:
+                dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
+                        ids.csi, nsid);
+                break;
+        }
 }
 
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -4116,7 +4033,14 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
                 return -ENOMEM;
 
         for (;;) {
-                ret = nvme_identify_ns_list(ctrl, prev, ns_list);
+                struct nvme_command cmd = {
+                        .identify.opcode        = nvme_admin_identify,
+                        .identify.cns           = NVME_ID_CNS_NS_ACTIVE_LIST,
+                        .identify.nsid          = cpu_to_le32(prev),
+                };
+
+                ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
+                                           NVME_IDENTIFY_DATA_SIZE);
                 if (ret)
                         goto free;
@@ -4125,7 +4049,7 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
                         if (!nsid)      /* end of the list? */
                                 goto out;
 
-                        nvme_validate_ns(ctrl, nsid);
+                        nvme_validate_or_alloc_ns(ctrl, nsid);
                         while (++prev < nsid)
                                 nvme_ns_remove_by_nsid(ctrl, prev);
                 }
@@ -4148,7 +4072,7 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
         kfree(id);
 
         for (i = 1; i <= nn; i++)
-                nvme_validate_ns(ctrl, i);
+                nvme_validate_or_alloc_ns(ctrl, i);
 
         nvme_remove_invalid_namespaces(ctrl, nn);
 }
@@ -4453,15 +4377,11 @@ static void nvme_free_ctrl(struct device *dev)
         struct nvme_ctrl *ctrl =
                 container_of(dev, struct nvme_ctrl, ctrl_device);
         struct nvme_subsystem *subsys = ctrl->subsys;
-        struct nvme_cel *cel, *next;
 
         if (!subsys || ctrl->instance != subsys->instance)
                 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 
-        list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
-                list_del(&cel->entry);
-                kfree(cel);
-        }
+        xa_destroy(&ctrl->cels);
 
         nvme_mpath_uninit(ctrl);
         __free_page(ctrl->discard_page);
@@ -4493,7 +4413,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
         spin_lock_init(&ctrl->lock);
         mutex_init(&ctrl->scan_lock);
         INIT_LIST_HEAD(&ctrl->namespaces);
-        INIT_LIST_HEAD(&ctrl->cels);
+        xa_init(&ctrl->cels);
         init_rwsem(&ctrl->namespaces_rwsem);
         ctrl->dev = dev;
         ctrl->ops = ops;
@@ -4673,28 +4593,13 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_sync_queues);
 
-struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
 {
-        struct nvme_ctrl *ctrl;
-        struct file *f;
-
-        f = filp_open(path, O_RDWR, 0);
-        if (IS_ERR(f))
-                return ERR_CAST(f);
-
-        if (f->f_op != &nvme_dev_fops) {
-                ctrl = ERR_PTR(-EINVAL);
-                goto out_close;
-        }
-
-        ctrl = f->private_data;
-        nvme_get_ctrl(ctrl);
-
-out_close:
-        filp_close(f, NULL);
-        return ctrl;
+        if (file->f_op != &nvme_dev_fops)
+                return NULL;
+        return file->private_data;
 }
-EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
 
 /*
  * Check we didn't inadvertently grow the command structure sizes:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8ac37430347c..e7c88b40f5bb 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -300,7 +300,7 @@ struct nvme_ctrl {
         unsigned long quirks;
         struct nvme_id_power_state psd[32];
         struct nvme_effects_log *effects;
-        struct list_head cels;
+        struct xarray cels;
         struct work_struct scan_work;
         struct work_struct async_event_work;
         struct delayed_work ka_work;
@@ -758,10 +758,9 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 }
 #endif /* CONFIG_NVME_MULTIPATH */
 
+int nvme_revalidate_zones(struct nvme_ns *ns);
 #ifdef CONFIG_BLK_DEV_ZONED
-int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
-                          unsigned lbaf);
-
+int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
 int nvme_report_zones(struct gendisk *disk, sector_t sector,
                       unsigned int nr_zones, report_zones_cb cb, void *data);
@@ -778,9 +777,7 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
         return BLK_STS_NOTSUPP;
 }
 
-static inline int nvme_update_zone_info(struct gendisk *disk,
-                                        struct nvme_ns *ns,
-                                        unsigned lbaf)
+static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 {
         dev_warn(ns->ctrl->device,
                  "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
@@ -825,7 +822,7 @@ static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl)
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
                          u8 opcode);
 void nvme_execute_passthru_rq(struct request *rq);
-struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path);
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
 void nvme_put_ns(struct nvme_ns *ns);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8984796db0c8..e5b02242f3ca 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2038,32 +2038,30 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
                 .calc_sets      = nvme_calc_irq_sets,
                 .priv           = dev,
         };
-        unsigned int irq_queues, this_p_queues;
+        unsigned int irq_queues, poll_queues;
 
         /*
-         * Poll queues don't need interrupts, but we need at least one IO
-         * queue left over for non-polled IO.
+         * Poll queues don't need interrupts, but we need at least one I/O queue
+         * left over for non-polled I/O.
          */
-        this_p_queues = dev->nr_poll_queues;
-        if (this_p_queues >= nr_io_queues) {
-                this_p_queues = nr_io_queues - 1;
-                irq_queues = 1;
-        } else {
-                irq_queues = nr_io_queues - this_p_queues + 1;
-        }
-        dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
+        poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
+        dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
 
-        /* Initialize for the single interrupt case */
+        /*
+         * Initialize for the single interrupt case, will be updated in
+         * nvme_calc_irq_sets().
+         */
         dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
         dev->io_queues[HCTX_TYPE_READ] = 0;
 
         /*
-         * Some Apple controllers require all queues to use the
-         * first vector.
+         * We need interrupts for the admin queue and each non-polled I/O queue,
+         * but some Apple controllers require all queues to use the first
+         * vector.
          */
-        if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
-                irq_queues = 1;
-
+        irq_queues = 1;
+        if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
+                irq_queues += (nr_io_queues - poll_queues);
         return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
                         PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 }
@@ -3187,7 +3185,6 @@ static const struct pci_device_id nvme_id_table[] = {
                                 NVME_QUIRK_IGNORE_DEV_SUBNQN, },
         { PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
                 .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
-        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
                 .driver_data = NVME_QUIRK_SINGLE_VECTOR },
         { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
@@ -3195,6 +3192,8 @@ static const struct pci_device_id nvme_id_table[] = {
                 .driver_data = NVME_QUIRK_SINGLE_VECTOR |
                                 NVME_QUIRK_128_BYTES_SQES |
                                 NVME_QUIRK_SHARED_TAGS },
+
+        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
         { 0, }
 };
 MODULE_DEVICE_TABLE(pci, nvme_id_table);
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 57cfd78731fb..67e87e9f306f 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -7,6 +7,17 @@
 #include <linux/vmalloc.h>
 #include "nvme.h"
 
+int nvme_revalidate_zones(struct nvme_ns *ns)
+{
+        struct request_queue *q = ns->queue;
+        int ret;
+
+        ret = blk_revalidate_disk_zones(ns->disk, NULL);
+        if (!ret)
+                blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
+        return ret;
+}
+
 static int nvme_set_max_append(struct nvme_ctrl *ctrl)
 {
         struct nvme_command c = { };
@@ -35,11 +46,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
         return 0;
 }
 
-int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
-                          unsigned lbaf)
+int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 {
         struct nvme_effects_log *log = ns->head->effects;
-        struct request_queue *q = disk->queue;
+        struct request_queue *q = ns->queue;
         struct nvme_command c = { };
         struct nvme_id_ns_zns *id;
         int status;
@@ -133,28 +143,6 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
         return NULL;
 }
 
-static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
-                                  struct nvme_zone_report *report,
-                                  size_t buflen)
-{
-        struct nvme_command c = { };
-        int ret;
-
-        c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
-        c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
-        c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
-        c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
-        c.zmr.zra = NVME_ZRA_ZONE_REPORT;
-        c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
-        c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
-
-        ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
-        if (ret)
-                return ret;
-
-        return le64_to_cpu(report->nr_zones);
-}
-
 static int nvme_zone_parse_entry(struct nvme_ns *ns,
                                  struct nvme_zone_descriptor *entry,
                                  unsigned int idx, report_zones_cb cb,
@@ -182,6 +170,7 @@ static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
                         unsigned int nr_zones, report_zones_cb cb, void *data)
 {
         struct nvme_zone_report *report;
+        struct nvme_command c = { };
         int ret, zone_idx = 0;
         unsigned int nz, i;
         size_t buflen;
@@ -190,14 +179,26 @@ static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
         if (!report)
                 return -ENOMEM;
 
+        c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
+        c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
+        c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
+        c.zmr.zra = NVME_ZRA_ZONE_REPORT;
+        c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
+        c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
+
         sector &= ~(ns->zsze - 1);
         while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
                 memset(report, 0, buflen);
-                ret = __nvme_ns_report_zones(ns, sector, report, buflen);
-                if (ret < 0)
+
+                c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
+                ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
+                if (ret) {
+                        if (ret > 0)
+                                ret = -EIO;
                         goto out_free;
+                }
 
-                nz = min_t(unsigned int, ret, nr_zones);
+                nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones);
                 if (!nz)
                         break;
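The interrupt-allocation fix in the pci.c hunk above reduces to simple arithmetic: poll queues consume no vectors, the admin queue always takes one, and the Apple single-vector quirk collapses everything to one. A standalone worked example of that calculation (userspace C mirroring the diff's logic, not driver code):

```c
#include <stdio.h>

/* Worked example of the nvme_setup_irqs() vector math from the pci.c
 * hunk above.  Assumes nr_io_queues >= 1, as the driver guarantees. */
static unsigned int irq_vectors(unsigned int nr_io_queues,
				unsigned int nr_poll_queues,
				int single_vector_quirk)
{
	/* Clamp: leave at least one I/O queue for non-polled I/O. */
	unsigned int poll_queues =
		nr_poll_queues < nr_io_queues - 1 ? nr_poll_queues
						  : nr_io_queues - 1;
	unsigned int irq_queues = 1;	/* the admin queue */

	if (!single_vector_quirk)
		irq_queues += nr_io_queues - poll_queues;
	return irq_queues;
}

int main(void)
{
	/* 8 I/O queues, user asked for 10 poll queues: poll is clamped
	 * to 7, so we request 1 + (8 - 7) = 2 vectors -- the case the
	 * old code got wrong. */
	printf("%u\n", irq_vectors(8, 10, 0));	/* prints 2 */
	printf("%u\n", irq_vectors(8, 3, 0));	/* 1 + 5 = 6 */
	printf("%u\n", irq_vectors(8, 3, 1));	/* quirk: 1 */
	return 0;
}
```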