diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-26 10:32:12 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-26 10:32:12 -0700 |
commit | 04412819652fe30f900d11e96c67b4adfdf17f6b (patch) | |
tree | aed86baef3fd65e6990484a00514f0594d1fdd6c | |
parent | 750c930b085ba56cfac3649e8e0dff72a8c5f8a5 (diff) | |
parent | 9c0b2596f2ac30967af0b8bb9f038b65926a6f00 (diff) | |
download | lwn-04412819652fe30f900d11e96c67b4adfdf17f6b.tar.gz lwn-04412819652fe30f900d11e96c67b4adfdf17f6b.zip |
Merge tag 'for-linus-20190726' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
- Several io_uring fixes/improvements:
- Blocking fix for O_DIRECT (me)
- Latter page slowness for registered buffers (me)
- Fix poll hang under certain conditions (me)
- Defer sequence check fix for wrapped rings (Zhengyuan)
- Mismatch in async inc/dec accounting (Zhengyuan)
- Memory ordering issue that could cause stall (Zhengyuan)
- Track sequential defer in bytes, not pages (Zhengyuan)
- NVMe pull request from Christoph
- Set of hang fixes for wbt (Josef)
- Redundant error message kill for libahci (Ding)
- Remove unused blk_mq_sched_started_request() and related ops (Marcos)
- drbd dynamic alloc shash descriptor to reduce stack use (Arnd)
- blkcg ->pd_stat() non-debug print (Tejun)
- bcache memory leak fix (Wei)
- Comment fix (Akinobu)
- BFQ perf regression fix (Paolo)
* tag 'for-linus-20190726' of git://git.kernel.dk/linux-block: (24 commits)
io_uring: ensure ->list is initialized for poll commands
Revert "nvme-pci: don't create a read hctx mapping without read queues"
nvme: fix multipath crash when ANA is deactivated
nvme: fix memory leak caused by incorrect subsystem free
nvme: ignore subnqn for ADATA SX6000LNP
drbd: dynamically allocate shash descriptor
block: blk-mq: Remove blk_mq_sched_started_request and started_request
bcache: fix possible memory leak in bch_cached_dev_run()
io_uring: track io length in async_list based on bytes
io_uring: don't use iov_iter_advance() for fixed buffers
block: properly handle IOCB_NOWAIT for async O_DIRECT IO
blk-mq: allow REQ_NOWAIT to return an error inline
io_uring: add a memory barrier before atomic_read
rq-qos: use a mb for got_token
rq-qos: set ourself TASK_UNINTERRUPTIBLE after we schedule
rq-qos: don't reset has_sleepers on spurious wakeups
rq-qos: fix missed wake-ups in rq_qos_throttle
wait: add wq_has_single_sleeper helper
block, bfq: check also in-flight I/O in dispatch plugging
block: fix sysfs module parameters directory path in comment
...
-rw-r--r-- | block/bfq-iosched.c | 67 | ||||
-rw-r--r-- | block/blk-cgroup.c | 9 | ||||
-rw-r--r-- | block/blk-iolatency.c | 3 | ||||
-rw-r--r-- | block/blk-mq-sched.h | 9 | ||||
-rw-r--r-- | block/blk-mq.c | 10 | ||||
-rw-r--r-- | block/blk-rq-qos.c | 7 | ||||
-rw-r--r-- | block/genhd.c | 2 | ||||
-rw-r--r-- | drivers/ata/libahci_platform.c | 1 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 14 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 3 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 12 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 8 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 6 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 6 | ||||
-rw-r--r-- | fs/block_dev.c | 58 | ||||
-rw-r--r-- | fs/io_uring.c | 81 | ||||
-rw-r--r-- | include/linux/blk-cgroup.h | 1 | ||||
-rw-r--r-- | include/linux/blk_types.h | 5 | ||||
-rw-r--r-- | include/linux/elevator.h | 1 | ||||
-rw-r--r-- | include/linux/wait.h | 13 |
20 files changed, 224 insertions, 92 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 72860325245a..586fcfe227ea 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3354,38 +3354,57 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) * there is no active group, then the primary expectation for * this device is probably a high throughput. * - * We are now left only with explaining the additional - * compound condition that is checked below for deciding - * whether the scenario is asymmetric. To explain this - * compound condition, we need to add that the function + * We are now left only with explaining the two sub-conditions in the + * additional compound condition that is checked below for deciding + * whether the scenario is asymmetric. To explain the first + * sub-condition, we need to add that the function * bfq_asymmetric_scenario checks the weights of only - * non-weight-raised queues, for efficiency reasons (see - * comments on bfq_weights_tree_add()). Then the fact that - * bfqq is weight-raised is checked explicitly here. More - * precisely, the compound condition below takes into account - * also the fact that, even if bfqq is being weight-raised, - * the scenario is still symmetric if all queues with requests - * waiting for completion happen to be - * weight-raised. Actually, we should be even more precise - * here, and differentiate between interactive weight raising - * and soft real-time weight raising. + * non-weight-raised queues, for efficiency reasons (see comments on + * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised + * is checked explicitly here. More precisely, the compound condition + * below takes into account also the fact that, even if bfqq is being + * weight-raised, the scenario is still symmetric if all queues with + * requests waiting for completion happen to be + * weight-raised. Actually, we should be even more precise here, and + * differentiate between interactive weight raising and soft real-time + * weight raising. + * + * The second sub-condition checked in the compound condition is + * whether there is a fair amount of already in-flight I/O not + * belonging to bfqq. If so, I/O dispatching is to be plugged, for the + * following reason. The drive may decide to serve in-flight + * non-bfqq's I/O requests before bfqq's ones, thereby delaying the + * arrival of new I/O requests for bfqq (recall that bfqq is sync). If + * I/O-dispatching is not plugged, then, while bfqq remains empty, a + * basically uncontrolled amount of I/O from other queues may be + * dispatched too, possibly causing the service of bfqq's I/O to be + * delayed even longer in the drive. This problem gets more and more + * serious as the speed and the queue depth of the drive grow, + * because, as these two quantities grow, the probability to find no + * queue busy but many requests in flight grows too. By contrast, + * plugging I/O dispatching minimizes the delay induced by already + * in-flight I/O, and enables bfqq to recover the bandwidth it may + * lose because of this delay. * * As a side note, it is worth considering that the above - * device-idling countermeasures may however fail in the - * following unlucky scenario: if idling is (correctly) - * disabled in a time period during which all symmetry - * sub-conditions hold, and hence the device is allowed to - * enqueue many requests, but at some later point in time some - * sub-condition stops to hold, then it may become impossible - * to let requests be served in the desired order until all - * the requests already queued in the device have been served. + * device-idling countermeasures may however fail in the following + * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled + * in a time period during which all symmetry sub-conditions hold, and + * therefore the device is allowed to enqueue many requests, but at + * some later point in time some sub-condition stops to hold, then it + * may become impossible to make requests be served in the desired + * order until all the requests already queued in the device have been + * served. The last sub-condition commented above somewhat mitigates + * this problem for weight-raised queues. */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { return (bfqq->wr_coeff > 1 && - bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd)) || + (bfqd->wr_busy_queues < + bfq_tot_busy_queues(bfqd) || + bfqd->rq_in_driver >= + bfqq->dispatched + 4)) || bfq_asymmetric_scenario(bfqd, bfqq); } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 24ed26957367..55a7dc227dfb 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -54,7 +54,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ -static bool blkcg_debug_stats = false; +bool blkcg_debug_stats = false; static struct workqueue_struct *blkcg_punt_bio_wq; static bool blkcg_policy_enabled(struct request_queue *q, @@ -944,10 +944,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) dbytes, dios); } - if (!blkcg_debug_stats) - goto next; - - if (atomic_read(&blkg->use_delay)) { + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { has_stats = true; off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", @@ -967,7 +964,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) has_stats = true; off += written; } -next: + if (has_stats) { if (off < size - 1) { off += scnprintf(buf+off, size-off, "\n"); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index d973c38ee4fd..0fff7b56df0e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -917,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, unsigned long long avg_lat; unsigned long long cur_win; + if (!blkcg_debug_stats) + return 0; + if (iolat->ssd) return iolatency_ssd_stat(iolat, buf, size); diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index cf22ab00fefb..126021fc3a11 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -61,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) e->type->ops.completed_request(rq, now); } -static inline void blk_mq_sched_started_request(struct request *rq) -{ - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.started_request) - e->type->ops.started_request(rq); -} - static inline void blk_mq_sched_requeue_request(struct request *rq) { struct request_queue *q = rq->q; diff --git a/block/blk-mq.c b/block/blk-mq.c index b038ec680e84..f78d3287dd82 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -669,8 +669,6 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - blk_mq_sched_started_request(rq); - trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { @@ -1960,9 +1958,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) + + cookie = BLK_QC_T_NONE; + if (bio->bi_opf & REQ_NOWAIT_INLINE) + cookie = BLK_QC_T_EAGAIN; + else if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - return BLK_QC_T_NONE; + return cookie; } trace_block_getrq(q, bio, bio->bi_opf); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 659ccb8b693f..3954c0dc1443 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -202,6 +202,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, return -1; data->got_token = true; + smp_wmb(); list_del_init(&curr->entry); wake_up_process(data->task); return 1; @@ -244,7 +245,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, return; prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); + has_sleeper = !wq_has_single_sleeper(&rqw->wait); do { + /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { @@ -255,12 +258,14 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. */ + smp_rmb(); if (data.got_token) cleanup_cb(rqw, private_data); break; } io_schedule(); - has_sleeper = false; + has_sleeper = true; + set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); } diff --git a/block/genhd.c b/block/genhd.c index 97887e59f3b2..54f1f0d381f4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1969,7 +1969,7 @@ static const struct attribute *disk_events_attrs[] = { * The default polling interval can be specified by the kernel * parameter block.events_dfl_poll_msecs which defaults to 0 * (disable). This can also be modified runtime by writing to - * /sys/module/block/events_dfl_poll_msecs. + * /sys/module/block/parameters/events_dfl_poll_msecs. */ static int disk_events_set_dfl_poll_msecs(const char *val, const struct kernel_param *kp) diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 72312ad2e142..3a36e76eca83 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -408,7 +408,6 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, hpriv->mmio = devm_ioremap_resource(dev, platform_get_resource(pdev, IORESOURCE_MEM, 0)); if (IS_ERR(hpriv->mmio)) { - dev_err(dev, "no mmio space\n"); rc = PTR_ERR(hpriv->mmio); goto err_out; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 90ebfcae0ce6..2b3103c30857 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -5417,7 +5417,7 @@ static int drbd_do_auth(struct drbd_connection *connection) unsigned int key_len; char secret[SHARED_SECRET_MAX]; /* 64 byte */ unsigned int resp_size; - SHASH_DESC_ON_STACK(desc, connection->cram_hmac_tfm); + struct shash_desc *desc; struct packet_info pi; struct net_conf *nc; int err, rv; @@ -5430,6 +5430,13 @@ static int drbd_do_auth(struct drbd_connection *connection) memcpy(secret, nc->shared_secret, key_len); rcu_read_unlock(); + desc = kmalloc(sizeof(struct shash_desc) + + crypto_shash_descsize(connection->cram_hmac_tfm), + GFP_KERNEL); + if (!desc) { + rv = -1; + goto fail; + } desc->tfm = connection->cram_hmac_tfm; rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len); @@ -5571,7 +5578,10 @@ static int drbd_do_auth(struct drbd_connection *connection) kfree(peers_ch); kfree(response); kfree(right_response); - shash_desc_zero(desc); + if (desc) { + shash_desc_zero(desc); + kfree(desc); + } return rv; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 26e374fbf57c..20ed838e9413 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -931,6 +931,9 @@ int bch_cached_dev_run(struct cached_dev *dc) if (dc->io_disable) { pr_err("I/O disabled on cached dev %s", dc->backing_dev_name); + kfree(env[1]); + kfree(env[2]); + kfree(buf); return -EIO; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cc09b81fc7f4..8f3fbe5ca937 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2311,17 +2311,15 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); } -static void __nvme_release_subsystem(struct nvme_subsystem *subsys) +static void nvme_release_subsystem(struct device *dev) { + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + ida_simple_remove(&nvme_subsystems_ida, subsys->instance); kfree(subsys); } -static void nvme_release_subsystem(struct device *dev) -{ - __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev)); -} - static void nvme_destroy_subsystem(struct kref *ref) { struct nvme_subsystem *subsys = @@ -2477,7 +2475,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) mutex_lock(&nvme_subsystems_lock); found = __nvme_find_get_subsystem(subsys->subnqn); if (found) { - __nvme_release_subsystem(subsys); + put_device(&subsys->dev); subsys = found; if (!nvme_validate_cntlid(subsys, ctrl, id)) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a9a927677970..4f0d0d12744e 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -12,11 +12,6 @@ module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); -inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) -{ - return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3)); -} - /* * If multipathing is enabled we need to always use the subsystem instance * number for numbering our devices to avoid conflicts between subsystems that @@ -622,7 +617,8 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { int error; - if (!nvme_ctrl_use_ana(ctrl)) + /* check if multipath is enabled and we have the capability */ + if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3))) return 0; ctrl->anacap = id->anacap; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 716a876119c8..26b563f9985b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -485,7 +485,11 @@ extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH -bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl); +static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) +{ + return ctrl->ana_log_buf != NULL; +} + void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); void nvme_failover_req(struct request *req); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bb970ca82517..db160cee42ad 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2254,9 +2254,7 @@ static int nvme_dev_add(struct nvme_dev *dev) if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; - dev->tagset.nr_maps = 1; /* default */ - if (dev->io_queues[HCTX_TYPE_READ]) - dev->tagset.nr_maps++; + dev->tagset.nr_maps = 2; /* default + read */ if (dev->io_queues[HCTX_TYPE_POLL]) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; @@ -3029,6 +3027,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ .driver_data = NVME_QUIRK_LIGHTNVM, }, + { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ + .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, diff --git a/fs/block_dev.c b/fs/block_dev.c index 4707dfff991b..c2a85b587922 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -345,15 +345,24 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct bio *bio; bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; + bool nowait = (iocb->ki_flags & IOCB_NOWAIT) != 0; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; - int ret = 0; + gfp_t gfp; + ssize_t ret; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); + if (nowait) + gfp = GFP_NOWAIT; + else + gfp = GFP_KERNEL; + + bio = bio_alloc_bioset(gfp, nr_pages, &blkdev_dio_pool); + if (!bio) + return -EAGAIN; dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); @@ -375,7 +384,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (!is_poll) blk_start_plug(&plug); + ret = 0; for (;;) { + int err; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; bio->bi_write_hint = iocb->ki_hint; @@ -383,8 +395,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { + err = bio_iov_iter_get_pages(bio, iter); + if (unlikely(err)) { + if (!ret) + ret = err; bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; @@ -399,6 +413,14 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) task_io_account_write(bio->bi_iter.bi_size); } + /* + * Tell underlying layer to not block for resource shortage. + * And if we would have blocked, return error inline instead + * of through the bio->bi_end_io() callback. + */ + if (nowait) + bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE); + dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; @@ -412,6 +434,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } qc = submit_bio(bio); + if (qc == BLK_QC_T_EAGAIN) { + if (!ret) + ret = -EAGAIN; + goto error; + } if (polled) WRITE_ONCE(iocb->ki_cookie, qc); @@ -432,8 +459,20 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) atomic_inc(&dio->ref); } - submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, nr_pages); + qc = submit_bio(bio); + if (qc == BLK_QC_T_EAGAIN) { + if (!ret) + ret = -EAGAIN; + goto error; + } + ret += bio->bi_iter.bi_size; + + bio = bio_alloc(gfp, nr_pages); + if (!bio) { + if (!ret) + ret = -EAGAIN; + goto error; + } } if (!is_poll) @@ -453,13 +492,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } __set_current_state(TASK_RUNNING); +out: if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); - if (likely(!ret)) - ret = dio->size; bio_put(&dio->bio); return ret; +error: + if (!is_poll) + blk_finish_plug(&plug); + goto out; } static ssize_t diff --git a/fs/io_uring.c b/fs/io_uring.c index e2a66e12fbc6..012bc0efb9d3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -202,7 +202,7 @@ struct async_list { struct file *file; off_t io_end; - size_t io_pages; + size_t io_len; }; struct io_ring_ctx { @@ -333,7 +333,8 @@ struct io_kiocb { #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_FAIL_LINK 128 /* fail rest of links */ +#define REQ_F_LINK_DONE 128 /* linked sqes done */ +#define REQ_F_FAIL_LINK 256 /* fail rest of links */ u64 user_data; u32 result; u32 sequence; @@ -429,7 +430,7 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx, if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped; + return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped; } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) @@ -632,6 +633,7 @@ static void io_req_link_next(struct io_kiocb *req) nxt->flags |= REQ_F_LINK; } + nxt->flags |= REQ_F_LINK_DONE; INIT_WORK(&nxt->work, io_sq_wq_submit_work); queue_work(req->ctx->sqo_wq, &nxt->work); } @@ -1064,8 +1066,44 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, */ offset = buf_addr - imu->ubuf; iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); - if (offset) - iov_iter_advance(iter, offset); + + if (offset) { + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here, because + * we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are PAGE_SIZE in size, except potentially the + * first and last bvec + * + * So just find our index, and adjust the iterator afterwards. + * If the offset is within the first bvec (or the whole first + * bvec, just use iov_iter_advance(). This makes it easier + * since we can just skip the first segment, which may not + * be PAGE_SIZE aligned. + */ + const struct bio_vec *bvec = imu->bvec; + + if (offset <= bvec->bv_len) { + iov_iter_advance(iter, offset); + } else { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> PAGE_SHIFT); + + iter->bvec = bvec + seg_skip; + iter->nr_segs -= seg_skip; + iter->count -= (seg_skip << PAGE_SHIFT); + iter->iov_offset = offset & ~PAGE_MASK; + if (iter->iov_offset) + iter->count -= iter->iov_offset; + } + } + return 0; } @@ -1120,28 +1158,26 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) off_t io_end = kiocb->ki_pos + len; if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) { - unsigned long max_pages; + unsigned long max_bytes; /* Use 8x RA size as a decent limiter for both reads/writes */ - max_pages = filp->f_ra.ra_pages; - if (!max_pages) - max_pages = VM_READAHEAD_PAGES; - max_pages *= 8; - - /* If max pages are exceeded, reset the state */ - len >>= PAGE_SHIFT; - if (async_list->io_pages + len <= max_pages) { + max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3); + if (!max_bytes) + max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3); + + /* If max len are exceeded, reset the state */ + if (async_list->io_len + len <= max_bytes) { req->flags |= REQ_F_SEQ_PREV; - async_list->io_pages += len; + async_list->io_len += len; } else { io_end = 0; - async_list->io_pages = 0; + async_list->io_len = 0; } } /* New file? Reset state. */ if (async_list->file != filp) { - async_list->io_pages = 0; + async_list->io_len = 0; async_list->file = filp; } async_list->io_end = io_end; @@ -1630,6 +1666,8 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) INIT_LIST_HEAD(&poll->wait.entry); init_waitqueue_func_entry(&poll->wait, io_poll_wake); + INIT_LIST_HEAD(&req->list); + mask = vfs_poll(poll->file, &ipt.pt) & poll->events; spin_lock_irq(&ctx->completion_lock); @@ -1844,6 +1882,10 @@ restart: /* async context always use a copy of the sqe */ kfree(sqe); + /* req from defer and link list needn't decrease async cnt */ + if (req->flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE)) + goto out; + if (!async_list) break; if (!list_empty(&req_list)) { @@ -1891,6 +1933,7 @@ restart: } } +out: if (cur_mm) { set_fs(old_fs); unuse_mm(cur_mm); @@ -1917,6 +1960,10 @@ static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req) ret = true; spin_lock(&list->lock); list_add_tail(&req->list, &list->list); + /* + * Ensure we see a simultaneous modification from io_sq_wq_submit_work() + */ + smp_mb(); if (!atomic_read(&list->cnt)) { list_del_init(&req->list); ret = false; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 689a58231288..12811091fd50 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -181,6 +181,7 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; +extern bool blkcg_debug_stats; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index feff3fe4467e..1b1fa1557e68 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,6 +311,7 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ + __REQ_NOWAIT_INLINE, /* Return would-block error inline */ /* * When a shared kthread needs to issue a bio for a cgroup, doing * so synchronously can lead to priority inversions as the kthread @@ -345,6 +346,7 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) +#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE) #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) @@ -418,12 +420,13 @@ static inline int op_stat_group(unsigned int op) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U +#define BLK_QC_T_EAGAIN -2U #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { - return cookie != BLK_QC_T_NONE; + return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 17cd0078377c..1dd014c9c87b 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -45,7 +45,6 @@ struct elevator_mq_ops { struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct request *, u64); - void (*started_request)(struct request *); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); diff --git a/include/linux/wait.h b/include/linux/wait.h index b6f77cf60dd7..30c515520fb2 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -127,6 +127,19 @@ static inline int waitqueue_active(struct wait_queue_head *wq_head) } /** + * wq_has_single_sleeper - check if there is only one sleeper + * @wq_head: wait queue head + * + * Returns true of wq_head has only one sleeper on the list. + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head) +{ + return list_is_singular(&wq_head->head); +} + +/** * wq_has_sleeper - check if there are any waiting processes * @wq_head: wait queue head * |