diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-30 15:02:49 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-10-30 15:02:49 -0700 |
commit | 5fc6b075e165f641fbc366b58b578055762d5f8c (patch) | |
tree | a80f92cd8ab95232f162ff1d57cf2c9f7e52c5af | |
parent | cf9446cc8e6d85355642209538dde619f53770dc (diff) | |
parent | 65ff5cd04551daf2c11c7928e48fc3483391c900 (diff) | |
download | lwn-5fc6b075e165f641fbc366b58b578055762d5f8c.tar.gz lwn-5fc6b075e165f641fbc366b58b578055762d5f8c.zip |
Merge tag 'block-5.10-2020-10-30' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
- null_blk zone fixes (Damien, Kanchan)
- NVMe pull request from Christoph:
- improve zone revalidation (Keith Busch)
- gracefully handle zero length messages in nvme-rdma (zhenwei pi)
- nvme-fc error handling fixes (James Smart)
- nvmet tracing NULL pointer dereference fix (Chaitanya Kulkarni)
- xsysace platform fixes (Andy)
- scatterlist type cleanup (David)
- blk-cgroup memory fixes (Gabriel)
- nbd block size update fix (Ming)
- Flush completion state fix (Ming)
- bio_add_hw_page() iteration fix (Naohiro)
* tag 'block-5.10-2020-10-30' of git://git.kernel.dk/linux-block:
blk-mq: mark flush request as IDLE in flush_end_io()
lib/scatterlist: use consistent sg_copy_buffer() return type
xsysace: use platform_get_resource() and platform_get_irq_optional()
null_blk: Fix locking in zoned mode
null_blk: Fix zone reset all tracing
nbd: don't update block size after device is started
block: advance iov_iter on bio_add_hw_page failure
null_blk: synchronization fix for zoned device
nvmet: fix a NULL pointer dereference when tracing the flush command
nvme-fc: remove nvme_fc_terminate_io()
nvme-fc: eliminate terminate_io use by nvme_fc_error_recovery
nvme-fc: remove err_work work item
nvme-fc: track error_recovery while connecting
nvme-rdma: handle unexpected nvme completion data length
nvme: ignore zone validate errors on subsequent scans
blk-cgroup: Pre-allocate tree node on blkg_conf_prep
blk-cgroup: Fix memleak on error path
-rw-r--r-- | block/bio.c | 11 | ||||
-rw-r--r-- | block/blk-cgroup.c | 15 | ||||
-rw-r--r-- | block/blk-flush.c | 1 | ||||
-rw-r--r-- | drivers/block/nbd.c | 9 | ||||
-rw-r--r-- | drivers/block/null_blk.h | 2 | ||||
-rw-r--r-- | drivers/block/null_blk_zoned.c | 123 | ||||
-rw-r--r-- | drivers/block/xsysace.c | 49 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 270 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 8 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 4 | ||||
-rw-r--r-- | drivers/nvme/target/trace.h | 21 | ||||
-rw-r--r-- | lib/scatterlist.c | 2 |
13 files changed, 281 insertions, 236 deletions
diff --git a/block/bio.c b/block/bio.c index e6e26d7a1ffb..fa01bef35bb1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1044,6 +1044,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) ssize_t size, left; unsigned len, i; size_t offset; + int ret = 0; if (WARN_ON_ONCE(!max_append_sectors)) return 0; @@ -1066,15 +1067,17 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_add_hw_page(q, bio, page, len, offset, - max_append_sectors, &same_page) != len) - return -EINVAL; + max_append_sectors, &same_page) != len) { + ret = -EINVAL; + break; + } if (same_page) put_page(page); offset = 0; } - iov_iter_advance(iter, size); - return 0; + iov_iter_advance(iter, size - left); + return ret; } /** diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f9b55614d67d..c68bdf58c9a6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -657,13 +657,20 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto fail; } + if (radix_tree_preload(GFP_KERNEL)) { + blkg_free(new_blkg); + ret = -ENOMEM; + goto fail; + } + rcu_read_lock(); spin_lock_irq(&q->queue_lock); blkg = blkg_lookup_check(pos, pol, q); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); - goto fail_unlock; + blkg_free(new_blkg); + goto fail_preloaded; } if (blkg) { @@ -672,10 +679,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, blkg = blkg_create(pos, q, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); - goto fail_unlock; + goto fail_preloaded; } } + radix_tree_preload_end(); + if (pos == blkcg) goto success; } @@ -685,6 +694,8 @@ success: ctx->body = input; return 0; +fail_preloaded: + radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); diff --git a/block/blk-flush.c b/block/blk-flush.c index 53abb5c73d99..e32958f0b687 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -225,6 +225,7 @@ static void 
flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); if (!refcount_dec_and_test(&flush_rq->ref)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 0bed21c0c81b..c4f9ccf5cc2a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -296,7 +296,7 @@ static void nbd_size_clear(struct nbd_device *nbd) } } -static void nbd_size_update(struct nbd_device *nbd) +static void nbd_size_update(struct nbd_device *nbd, bool start) { struct nbd_config *config = nbd->config; struct block_device *bdev = bdget_disk(nbd->disk, 0); @@ -313,7 +313,8 @@ static void nbd_size_update(struct nbd_device *nbd) if (bdev) { if (bdev->bd_disk) { bd_set_nr_sectors(bdev, nr_sectors); - set_blocksize(bdev, config->blksize); + if (start) + set_blocksize(bdev, config->blksize); } else set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); bdput(bdev); @@ -328,7 +329,7 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, config->blksize = blocksize; config->bytesize = blocksize * nr_blocks; if (nbd->task_recv != NULL) - nbd_size_update(nbd); + nbd_size_update(nbd, false); } static void nbd_complete_rq(struct request *req) @@ -1308,7 +1309,7 @@ static int nbd_start_device(struct nbd_device *nbd) args->index = i; queue_work(nbd->recv_workq, &args->work); } - nbd_size_update(nbd); + nbd_size_update(nbd, true); return error; } diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index d2e7db43a52a..cfd00ad40355 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -47,6 +47,8 @@ struct nullb_device { unsigned int nr_zones_closed; struct blk_zone *zones; sector_t zone_size_sects; + spinlock_t zone_dev_lock; + unsigned long *zone_locks; unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to 
complete a request */ diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 7d94f2d47a6a..8775acbb4f8f 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/vmalloc.h> +#include <linux/bitmap.h> #include "null_blk.h" #define CREATE_TRACE_POINTS @@ -45,6 +46,13 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) if (!dev->zones) return -ENOMEM; + spin_lock_init(&dev->zone_dev_lock); + dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); + if (!dev->zone_locks) { + kvfree(dev->zones); + return -ENOMEM; + } + if (dev->zone_nr_conv >= dev->nr_zones) { dev->zone_nr_conv = dev->nr_zones - 1; pr_info("changed the number of conventional zones to %u", @@ -123,15 +131,26 @@ int null_register_zoned_dev(struct nullb *nullb) void null_free_zoned_dev(struct nullb_device *dev) { + bitmap_free(dev->zone_locks); kvfree(dev->zones); } +static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) +{ + wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); +} + +static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) +{ + clear_and_wake_up_bit(zno, dev->zone_locks); +} + int null_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; - unsigned int first_zone, i; + unsigned int first_zone, i, zno; struct blk_zone zone; int error; @@ -142,15 +161,18 @@ int null_report_zones(struct gendisk *disk, sector_t sector, nr_zones = min(nr_zones, dev->nr_zones - first_zone); trace_nullb_report_zones(nullb, nr_zones); - for (i = 0; i < nr_zones; i++) { + zno = first_zone; + for (i = 0; i < nr_zones; i++, zno++) { /* * Stacked DM target drivers will remap the zone information by * modifying the zone information passed to the report callback. 
* So use a local copy to avoid corruption of the device zone * array. */ - memcpy(&zone, &dev->zones[first_zone + i], - sizeof(struct blk_zone)); + null_lock_zone(dev, zno); + memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); + null_unlock_zone(dev, zno); + error = cb(&zone, i, data); if (error) return error; @@ -159,6 +181,10 @@ int null_report_zones(struct gendisk *disk, sector_t sector, return nr_zones; } +/* + * This is called in the case of memory backing from null_process_cmd() + * with the target zone already locked. + */ size_t null_zone_valid_read_len(struct nullb *nullb, sector_t sector, unsigned int len) { @@ -295,22 +321,27 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + null_lock_zone(dev, zno); + spin_lock(&dev->zone_dev_lock); + switch (zone->cond) { case BLK_ZONE_COND_FULL: /* Cannot write to a full zone */ - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_CLOSED: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; break; case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: break; default: /* Invalid zone condition */ - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; } /* @@ -326,11 +357,14 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, else cmd->rq->__sector = sector; } else if (sector != zone->wp) { - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; } - if (zone->wp + nr_sectors > zone->start + zone->capacity) - return BLK_STS_IOERR; + if (zone->wp + nr_sectors > zone->start + zone->capacity) { + ret = BLK_STS_IOERR; + goto unlock; + } if (zone->cond == BLK_ZONE_COND_CLOSED) { dev->nr_zones_closed--; @@ -341,9 +375,11 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, if (zone->cond != BLK_ZONE_COND_EXP_OPEN) 
zone->cond = BLK_ZONE_COND_IMP_OPEN; + spin_unlock(&dev->zone_dev_lock); ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + spin_lock(&dev->zone_dev_lock); if (ret != BLK_STS_OK) - return ret; + goto unlock; zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { @@ -353,7 +389,13 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, dev->nr_zones_imp_open--; zone->cond = BLK_ZONE_COND_FULL; } - return BLK_STS_OK; + ret = BLK_STS_OK; + +unlock: + spin_unlock(&dev->zone_dev_lock); + null_unlock_zone(dev, zno); + + return ret; } static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) @@ -464,16 +506,33 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, sector_t sector) { struct nullb_device *dev = cmd->nq->dev; - unsigned int zone_no = null_zone_no(dev, sector); - struct blk_zone *zone = &dev->zones[zone_no]; - blk_status_t ret = BLK_STS_OK; + unsigned int zone_no; + struct blk_zone *zone; + blk_status_t ret; size_t i; + if (op == REQ_OP_ZONE_RESET_ALL) { + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { + null_lock_zone(dev, i); + zone = &dev->zones[i]; + if (zone->cond != BLK_ZONE_COND_EMPTY) { + spin_lock(&dev->zone_dev_lock); + null_reset_zone(dev, zone); + spin_unlock(&dev->zone_dev_lock); + trace_nullb_zone_op(cmd, i, zone->cond); + } + null_unlock_zone(dev, i); + } + return BLK_STS_OK; + } + + zone_no = null_zone_no(dev, sector); + zone = &dev->zones[zone_no]; + + null_lock_zone(dev, zone_no); + spin_lock(&dev->zone_dev_lock); + switch (op) { - case REQ_OP_ZONE_RESET_ALL: - for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) - null_reset_zone(dev, &dev->zones[i]); - break; case REQ_OP_ZONE_RESET: ret = null_reset_zone(dev, zone); break; @@ -487,30 +546,46 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, ret = null_finish_zone(dev, zone); break; default: - return BLK_STS_NOTSUPP; + ret = BLK_STS_NOTSUPP; + break; } + 
spin_unlock(&dev->zone_dev_lock); + if (ret == BLK_STS_OK) trace_nullb_zone_op(cmd, zone_no, zone->cond); + null_unlock_zone(dev, zone_no); + return ret; } blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, sector_t sector, sector_t nr_sectors) { + struct nullb_device *dev = cmd->nq->dev; + unsigned int zno = null_zone_no(dev, sector); + blk_status_t sts; + switch (op) { case REQ_OP_WRITE: - return null_zone_write(cmd, sector, nr_sectors, false); + sts = null_zone_write(cmd, sector, nr_sectors, false); + break; case REQ_OP_ZONE_APPEND: - return null_zone_write(cmd, sector, nr_sectors, true); + sts = null_zone_write(cmd, sector, nr_sectors, true); + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - return null_zone_mgmt(cmd, op, sector); + sts = null_zone_mgmt(cmd, op, sector); + break; default: - return null_process_cmd(cmd, op, sector, nr_sectors); + null_lock_zone(dev, zno); + sts = null_process_cmd(cmd, op, sector, nr_sectors); + null_unlock_zone(dev, zno); } + + return sts; } diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 8d581c7536fb..eb8ef65778c3 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -443,22 +443,27 @@ static void ace_fix_driveid(u16 *id) #define ACE_FSM_NUM_STATES 11 /* Set flag to exit FSM loop and reschedule tasklet */ -static inline void ace_fsm_yield(struct ace_device *ace) +static inline void ace_fsm_yieldpoll(struct ace_device *ace) { - dev_dbg(ace->dev, "ace_fsm_yield()\n"); tasklet_schedule(&ace->fsm_tasklet); ace->fsm_continue_flag = 0; } +static inline void ace_fsm_yield(struct ace_device *ace) +{ + dev_dbg(ace->dev, "%s()\n", __func__); + ace_fsm_yieldpoll(ace); +} + /* Set flag to exit FSM loop and wait for IRQ to reschedule tasklet */ static inline void ace_fsm_yieldirq(struct ace_device *ace) { dev_dbg(ace->dev, "ace_fsm_yieldirq()\n"); - if (!ace->irq) - /* No IRQ assigned, so need to 
poll */ - tasklet_schedule(&ace->fsm_tasklet); - ace->fsm_continue_flag = 0; + if (ace->irq > 0) + ace->fsm_continue_flag = 0; + else + ace_fsm_yieldpoll(ace); } static bool ace_has_next_request(struct request_queue *q) @@ -1053,12 +1058,12 @@ static int ace_setup(struct ace_device *ace) ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ); /* Now we can hook up the irq handler */ - if (ace->irq) { + if (ace->irq > 0) { rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace); if (rc) { /* Failure - fall back to polled mode */ dev_err(ace->dev, "request_irq failed\n"); - ace->irq = 0; + ace->irq = rc; } } @@ -1110,7 +1115,7 @@ static void ace_teardown(struct ace_device *ace) tasklet_kill(&ace->fsm_tasklet); - if (ace->irq) + if (ace->irq > 0) free_irq(ace->irq, ace); iounmap(ace->baseaddr); @@ -1123,11 +1128,6 @@ static int ace_alloc(struct device *dev, int id, resource_size_t physaddr, int rc; dev_dbg(dev, "ace_alloc(%p)\n", dev); - if (!physaddr) { - rc = -ENODEV; - goto err_noreg; - } - /* Allocate and initialize the ace device structure */ ace = kzalloc(sizeof(struct ace_device), GFP_KERNEL); if (!ace) { @@ -1153,7 +1153,6 @@ err_setup: dev_set_drvdata(dev, NULL); kfree(ace); err_alloc: -err_noreg: dev_err(dev, "could not initialize device, err=%i\n", rc); return rc; } @@ -1176,10 +1175,11 @@ static void ace_free(struct device *dev) static int ace_probe(struct platform_device *dev) { - resource_size_t physaddr = 0; int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */ + resource_size_t physaddr; + struct resource *res; u32 id = dev->id; - int irq = 0; + int irq; int i; dev_dbg(&dev->dev, "ace_probe(%p)\n", dev); @@ -1190,12 +1190,15 @@ static int ace_probe(struct platform_device *dev) if (of_find_property(dev->dev.of_node, "8-bit", NULL)) bus_width = ACE_BUS_WIDTH_8; - for (i = 0; i < dev->num_resources; i++) { - if (dev->resource[i].flags & IORESOURCE_MEM) - physaddr = dev->resource[i].start; - if (dev->resource[i].flags & IORESOURCE_IRQ) - irq 
= dev->resource[i].start; - } + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res) + return -EINVAL; + + physaddr = res->start; + if (!physaddr) + return -ENODEV; + + irq = platform_get_irq_optional(dev, 0); /* Call the bus-independent setup code */ return ace_alloc(&dev->dev, id, physaddr, irq, bus_width); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 95ef4943d8bd..376096bfc54a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2125,7 +2125,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (blk_queue_is_zoned(ns->queue)) { ret = nvme_revalidate_zones(ns); - if (ret) + if (ret && !nvme_first_scan(ns->disk)) return ret; } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 3c002bdcace3..f4c246462658 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -146,7 +146,8 @@ struct nvme_fc_rport { /* fc_ctrl flags values - specified as bit positions */ #define ASSOC_ACTIVE 0 -#define FCCTRL_TERMIO 1 +#define ASSOC_FAILED 1 +#define FCCTRL_TERMIO 2 struct nvme_fc_ctrl { spinlock_t lock; @@ -157,7 +158,6 @@ struct nvme_fc_ctrl { u32 cnum; bool ioq_live; - atomic_t err_work_active; u64 association_id; struct nvmefc_ls_rcv_op *rcv_disconn; @@ -167,7 +167,6 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set tag_set; struct delayed_work connect_work; - struct work_struct err_work; struct kref ref; unsigned long flags; @@ -2414,24 +2413,97 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) nvme_fc_ctrl_put(ctrl); } +/* + * This routine is used by the transport when it needs to find active + * io on a queue that is to be terminated. The transport uses + * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke + * this routine to kill them on a 1 by 1 basis. + * + * As FC allocates FC exchange for each io, the transport must contact + * the LLDD to terminate the exchange, thus releasing the FC exchange. 
+ * After terminating the exchange the LLDD will call the transport's + * normal io done path for the request, but it will have an aborted + * status. The done path will return the io request back to the block + * layer with an error status. + */ +static bool +nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) +{ + struct nvme_ctrl *nctrl = data; + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); + + __nvme_fc_abort_op(ctrl, op); + return true; +} + +/* + * This routine runs through all outstanding commands on the association + * and aborts them. This routine is typically be called by the + * delete_association routine. It is also called due to an error during + * reconnect. In that scenario, it is most likely a command that initializes + * the controller, including fabric Connect commands on io queues, that + * may have timed out or failed thus the io must be killed for the connect + * thread to see the error. + */ static void -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) +__nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) { - int active; + /* + * If io queues are present, stop them and terminate all outstanding + * ios on them. As FC allocates FC exchange for each io, the + * transport must contact the LLDD to terminate the exchange, + * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr() + * to tell us what io's are busy and invoke a transport routine + * to kill them with the LLDD. After terminating the exchange + * the LLDD will call the transport's normal io done path, but it + * will have an aborted status. The done path will return the + * io requests back to the block layer as part of normal completions + * (but with error status). 
+ */ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); + if (start_queues) + nvme_start_queues(&ctrl->ctrl); + } + + /* + * Other transports, which don't have link-level contexts bound + * to sqe's, would try to gracefully shutdown the controller by + * writing the registers for shutdown and polling (call + * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially + * just aborted and we will wait on those contexts, and given + * there was no indication of how live the controlelr is on the + * link, don't send more io to create more contexts for the + * shutdown. Let the controller fail via keepalive failure if + * its still present. + */ /* - * if an error (io timeout, etc) while (re)connecting, - * it's an error on creating the new association. - * Start the error recovery thread if it hasn't already - * been started. It is expected there could be multiple - * ios hitting this path before things are cleaned up. + * clean up the admin queue. Same thing as above. + */ + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); +} + +static void +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) +{ + /* + * if an error (io timeout, etc) while (re)connecting, the remote + * port requested terminating of the association (disconnect_ls) + * or an error (timeout or abort) occurred on an io while creating + * the controller. Abort any ios on the association and let the + * create_association error path resolve things. 
*/ if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { - active = atomic_xchg(&ctrl->err_work_active, 1); - if (!active && !queue_work(nvme_fc_wq, &ctrl->err_work)) { - atomic_set(&ctrl->err_work_active, 0); - WARN_ON(1); - } + __nvme_fc_abort_outstanding_ios(ctrl, true); + set_bit(ASSOC_FAILED, &ctrl->flags); return; } @@ -2745,30 +2817,6 @@ nvme_fc_complete_rq(struct request *rq) nvme_fc_ctrl_put(ctrl); } -/* - * This routine is used by the transport when it needs to find active - * io on a queue that is to be terminated. The transport uses - * blk_mq_tagset_busy_itr() to find the busy requests, which then invoke - * this routine to kill them on a 1 by 1 basis. - * - * As FC allocates FC exchange for each io, the transport must contact - * the LLDD to terminate the exchange, thus releasing the FC exchange. - * After terminating the exchange the LLDD will call the transport's - * normal io done path for the request, but it will have an aborted - * status. The done path will return the io request back to the block - * layer with an error status. 
- */ -static bool -nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) -{ - struct nvme_ctrl *nctrl = data; - struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); - - __nvme_fc_abort_op(ctrl, op); - return true; -} - static const struct blk_mq_ops nvme_fc_mq_ops = { .queue_rq = nvme_fc_queue_rq, @@ -2988,6 +3036,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->cnum, ctrl->lport->localport.port_name, ctrl->rport->remoteport.port_name, ctrl->ctrl.opts->subsysnqn); + clear_bit(ASSOC_FAILED, &ctrl->flags); + /* * Create the admin queue */ @@ -3016,7 +3066,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) */ ret = nvme_enable_ctrl(&ctrl->ctrl); - if (ret) + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; @@ -3026,7 +3076,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); ret = nvme_init_identify(&ctrl->ctrl); - if (ret) + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; /* sanity checks */ @@ -3071,9 +3121,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ret = nvme_fc_create_io_queues(ctrl); else ret = nvme_fc_recreate_io_queues(ctrl); - if (ret) - goto out_term_aen_ops; } + if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) + goto out_term_aen_ops; changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -3108,60 +3158,6 @@ out_free_queue: /* - * This routine runs through all outstanding commands on the association - * and aborts them. This routine is typically be called by the - * delete_association routine. It is also called due to an error during - * reconnect. 
In that scenario, it is most likely a command that initializes - * the controller, including fabric Connect commands on io queues, that - * may have timed out or failed thus the io must be killed for the connect - * thread to see the error. - */ -static void -__nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) -{ - /* - * If io queues are present, stop them and terminate all outstanding - * ios on them. As FC allocates FC exchange for each io, the - * transport must contact the LLDD to terminate the exchange, - * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr() - * to tell us what io's are busy and invoke a transport routine - * to kill them with the LLDD. After terminating the exchange - * the LLDD will call the transport's normal io done path, but it - * will have an aborted status. The done path will return the - * io requests back to the block layer as part of normal completions - * (but with error status). - */ - if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); - blk_mq_tagset_busy_iter(&ctrl->tag_set, - nvme_fc_terminate_exchange, &ctrl->ctrl); - blk_mq_tagset_wait_completed_request(&ctrl->tag_set); - if (start_queues) - nvme_start_queues(&ctrl->ctrl); - } - - /* - * Other transports, which don't have link-level contexts bound - * to sqe's, would try to gracefully shutdown the controller by - * writing the registers for shutdown and polling (call - * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially - * just aborted and we will wait on those contexts, and given - * there was no indication of how live the controlelr is on the - * link, don't send more io to create more contexts for the - * shutdown. Let the controller fail via keepalive failure if - * its still present. - */ - - /* - * clean up the admin queue. Same thing as above. 
- */ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, - nvme_fc_terminate_exchange, &ctrl->ctrl); - blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); -} - -/* * This routine stops operation of the controller on the host side. * On the host os stack side: Admin and IO queues are stopped, * outstanding ios on them terminated via FC ABTS. @@ -3237,7 +3233,6 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->connect_work); /* * kill the association on the link side. this will block @@ -3292,78 +3287,34 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) } static void -__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl) +nvme_fc_reset_ctrl_work(struct work_struct *work) { - /* - * if state is CONNECTING - the error occurred as part of a - * reconnect attempt. Abort any ios on the association and - * let the create_association error paths resolve things. - */ - if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { - __nvme_fc_abort_outstanding_ios(ctrl, true); - return; - } - - /* - * For any other state, kill the association. As this routine - * is a common io abort routine for resetting and such, after - * the association is terminated, ensure that the state is set - * to CONNECTING. 
- */ + struct nvme_fc_ctrl *ctrl = + container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); - nvme_stop_keep_alive(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); - if (ctrl->ctrl.state != NVME_CTRL_CONNECTING && - !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) dev_err(ctrl->ctrl.device, "NVME-FC{%d}: error_recovery: Couldn't change state " "to CONNECTING\n", ctrl->cnum); -} - -static void -nvme_fc_reset_ctrl_work(struct work_struct *work) -{ - struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); - int ret; - __nvme_fc_terminate_io(ctrl); - - nvme_stop_ctrl(&ctrl->ctrl); - - if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) - ret = nvme_fc_create_association(ctrl); - else - ret = -ENOTCONN; - - if (ret) - nvme_fc_reconnect_or_delete(ctrl, ret); - else - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reset complete\n", - ctrl->cnum); + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: failed to schedule connect " + "after reset\n", ctrl->cnum); + } else { + flush_delayed_work(&ctrl->connect_work); + } + } else { + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN); + } } -static void -nvme_fc_connect_err_work(struct work_struct *work) -{ - struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, err_work); - - __nvme_fc_terminate_io(ctrl); - - atomic_set(&ctrl->err_work_active, 0); - - /* - * Rescheduling the connection after recovering - * from the io error is left to the reconnect work - * item, which is what should have stalled waiting on - * the io that had the error that scheduled this work. 
- */ -} static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .name = "fc", @@ -3491,7 +3442,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->dev = lport->dev; ctrl->cnum = idx; ctrl->ioq_live = false; - atomic_set(&ctrl->err_work_active, 0); init_waitqueue_head(&ctrl->ioabort_wait); get_device(ctrl->dev); @@ -3499,7 +3449,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); - INIT_WORK(&ctrl->err_work, nvme_fc_connect_err_work); spin_lock_init(&ctrl->lock); /* io queue count */ @@ -3592,7 +3541,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, fail_ctrl: nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); cancel_work_sync(&ctrl->ctrl.reset_work); - cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->connect_work); ctrl->ctrl.opts = NULL; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 8bbc48cc45dc..541b0cba6d80 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1768,6 +1768,14 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) return; } + /* sanity checking for received data length */ + if (unlikely(wc->byte_len < len)) { + dev_err(queue->ctrl->ctrl.device, + "Unexpected nvme completion length(%d)\n", wc->byte_len); + nvme_rdma_error_recovery(queue->ctrl); + return; + } + ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE); /* * AEN requests are special as they don't time out and can diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index aafcbc424b7a..957b39a82431 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -907,8 +907,6 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->error_loc = NVMET_NO_ERROR_LOC; req->error_slba = 0; - trace_nvmet_req_init(req, req->cmd); - /* no support for fused 
commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { req->error_loc = offsetof(struct nvme_common_command, flags); @@ -938,6 +936,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, if (status) goto fail; + trace_nvmet_req_init(req, req->cmd); + if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto fail; diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h index 0458046d6501..c14e3249a14d 100644 --- a/drivers/nvme/target/trace.h +++ b/drivers/nvme/target/trace.h @@ -46,19 +46,12 @@ static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req) return req->sq->ctrl; } -static inline void __assign_disk_name(char *name, struct nvmet_req *req, - bool init) +static inline void __assign_req_name(char *name, struct nvmet_req *req) { - struct nvmet_ctrl *ctrl = nvmet_req_to_ctrl(req); - struct nvmet_ns *ns; - - if ((init && req->sq->qid) || (!init && req->cq->qid)) { - ns = nvmet_find_namespace(ctrl, req->cmd->rw.nsid); - strncpy(name, ns->device_path, DISK_NAME_LEN); - return; - } - - memset(name, 0, DISK_NAME_LEN); + if (req->ns) + strncpy(name, req->ns->device_path, DISK_NAME_LEN); + else + memset(name, 0, DISK_NAME_LEN); } #endif @@ -81,7 +74,7 @@ TRACE_EVENT(nvmet_req_init, TP_fast_assign( __entry->cmd = cmd; __entry->ctrl = nvmet_req_to_ctrl(req); - __assign_disk_name(__entry->disk, req, true); + __assign_req_name(__entry->disk, req); __entry->qid = req->sq->qid; __entry->cid = cmd->common.command_id; __entry->opcode = cmd->common.opcode; @@ -121,7 +114,7 @@ TRACE_EVENT(nvmet_req_complete, __entry->cid = req->cqe->command_id; __entry->result = le64_to_cpu(req->cqe->result.u64); __entry->status = le16_to_cpu(req->cqe->status) >> 1; - __assign_disk_name(__entry->disk, req, false); + __assign_req_name(__entry->disk, req); ), TP_printk("nvmet%s: %sqid=%d, cmdid=%u, res=%#llx, status=%#x", __print_ctrl_name(__entry->ctrl), diff --git 
a/lib/scatterlist.c b/lib/scatterlist.c index 0a482ef988e5..a59778946404 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -933,7 +933,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, sg_miter_start(&miter, sgl, nents, sg_flags); if (!sg_miter_skip(&miter, skip)) - return false; + return 0; while ((offset < buflen) && sg_miter_next(&miter)) { unsigned int len; |