From b9151e7bca82a17ff7aa442a168e0f924a07443c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 20 Apr 2020 09:24:52 -0700 Subject: blk-mq: Add blk_mq_delay_run_hw_queues() API call We have: * blk_mq_run_hw_queue() * blk_mq_delay_run_hw_queue() * blk_mq_run_hw_queues() ...but not blk_mq_delay_run_hw_queues(), presumably because nobody needed it before now. Since we need it for a later patch in this series, add it. Signed-off-by: Douglas Anderson Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b45148ba3291..51fbf6f76593 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -508,6 +508,7 @@ void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); +void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); -- cgit v1.2.3 From d46430bf5a2298f55e20f59a90ebe3545d273b2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:28:57 +0200 Subject: block: remove the disk argument from blk_drop_partitions The gendisk can be trivially deducted from the block_device. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/partitions/core.c | 10 +++++----- fs/block_dev.c | 2 +- include/linux/genhd.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/partitions/core.c b/block/partitions/core.c index 7d062599aa33..deccc3fbcd37 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -603,23 +603,23 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) } } -int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) +int blk_drop_partitions(struct block_device *bdev) { struct disk_part_iter piter; struct hd_struct *part; int res; - if (!disk_part_scan_enabled(disk)) + if (!disk_part_scan_enabled(bdev->bd_disk)) return 0; if (bdev->bd_part_count || bdev->bd_openers > 1) return -EBUSY; - res = invalidate_partition(disk, 0); + res = invalidate_partition(bdev->bd_disk, 0); if (res) return res; - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(disk, part); + delete_partition(bdev->bd_disk, part); disk_part_iter_exit(&piter); return 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 52b6f646cdbd..9c8de54fa0c9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1517,7 +1517,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); rescan: - ret = blk_drop_partitions(disk, bdev); + ret = blk_drop_partitions(bdev); if (ret) return ret; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9b3fffdf4011..058d895544c7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -339,7 +339,7 @@ extern dev_t blk_lookup_devt(const char *name, int partno); int bdev_disk_changed(struct block_device *bdev, bool invalidate); int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); -int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev); +int blk_drop_partitions(struct block_device *bdev); extern void printk_all_partitions(void); extern struct gendisk *__alloc_disk_node(int minors, int node_id); -- cgit v1.2.3 From 02d33b6771fcc63c98cb48cad0cd8b8fb033837a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:29:01 +0200 Subject: block: mark invalidate_partition static invalidate_partition is only used in genhd.c, so mark it static. Also drop the return value given that is is always ignored. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/genhd.c | 27 +++++++++++++-------------- include/linux/fs.h | 1 - 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 1cc50ad5b191..980a4609d4a5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -878,6 +878,19 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) } EXPORT_SYMBOL(device_add_disk_no_queue_reg); +static void invalidate_partition(struct gendisk *disk, int partno) +{ + struct block_device *bdev; + + bdev = bdget_disk(disk, partno); + if (!bdev) + return; + + fsync_bdev(bdev); + __invalidate_device(bdev, true); + bdput(bdev); +} + void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; @@ -1806,20 +1819,6 @@ int bdev_read_only(struct block_device *bdev) EXPORT_SYMBOL(bdev_read_only); -int invalidate_partition(struct gendisk *disk, int partno) -{ - int res = 0; - struct block_device *bdev = bdget_disk(disk, partno); - if (bdev) { - fsync_bdev(bdev); - res = __invalidate_device(bdev, true); - bdput(bdev); - } - return res; -} - -EXPORT_SYMBOL(invalidate_partition); - /* * Disk events - monitor disk events like media change and eject request. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..2b4e9f86b151 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2723,7 +2723,6 @@ extern bool is_bad_inode(struct inode *); extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *, bool); -extern int invalidate_partition(struct gendisk *, int); #endif unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); -- cgit v1.2.3 From 9bc5c397d8384b50c8202f4400bf2f87fe8291d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:29:02 +0200 Subject: block: fold bdev_unhash_inode into invalidate_partition invalidate_partition and bdev_unhash_inode are always paired, and invalidate_partition already does an icache lookup for the block device inode. Piggy back on that to remove the inode from the hash. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++-- fs/block_dev.c | 15 --------------- include/linux/fs.h | 1 - 3 files changed, 6 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 980a4609d4a5..c05d509877fa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -888,6 +888,12 @@ static void invalidate_partition(struct gendisk *disk, int partno) fsync_bdev(bdev); __invalidate_device(bdev, true); + + /* + * Unhash the bdev inode for this device so that it gets evicted as soon + * as last inode reference is dropped. + */ + remove_inode_hash(bdev->bd_inode); bdput(bdev); } @@ -909,13 +915,11 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(disk, part->partno); - bdev_unhash_inode(part_devt(part)); delete_partition(disk, part); } disk_part_iter_exit(&piter); invalidate_partition(disk, 0); - bdev_unhash_inode(disk_devt(disk)); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&disk->lookup_sem); diff --git a/fs/block_dev.c b/fs/block_dev.c index 9c8de54fa0c9..998820174d3e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -883,21 +883,6 @@ static int bdev_set(struct inode *inode, void *data) static LIST_HEAD(all_bdevs); -/* - * If there is a bdev inode for this device, unhash it so that it gets evicted - * as soon as last inode reference is dropped. - */ -void bdev_unhash_inode(dev_t dev) -{ - struct inode *inode; - - inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev); - if (inode) { - remove_inode_hash(inode); - iput(inode); - } -} - struct block_device *bdget(dev_t dev) { struct block_device *bdev; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2b4e9f86b151..1a95e5158811 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2581,7 +2581,6 @@ extern struct kmem_cache *names_cachep; #ifdef CONFIG_BLOCK extern int register_blkdev(unsigned int, const char *); extern void unregister_blkdev(unsigned int, const char *); -extern void bdev_unhash_inode(dev_t dev); extern struct block_device *bdget(dev_t); extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); -- cgit v1.2.3 From e64a0e16928415648d53d721b3d6fc3635eddf92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:21 +0200 Subject: block: remove RQF_COPY_USER The RQF_COPY_USER is set for bio where the passthrough request mapping helpers decided that bounce buffering is required. It is then used to pad scatterlist for drivers that required it. But given that non-passthrough requests are per definition aligned, and directly mapped pass-through request must be aligned it is not actually required at all. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-map.c | 9 +-------- block/blk-merge.c | 3 +-- block/blk-mq-debugfs.c | 1 - include/linux/blkdev.h | 2 -- 4 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/blk-map.c b/block/blk-map.c index b72c361911a4..b6fa343fea9f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -654,8 +654,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, bio = rq->bio; } while (iov_iter_count(&i)); - if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->rq_flags |= RQF_COPY_USER; return 0; unmap_rq: @@ -731,7 +729,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; - int do_copy = 0; struct bio *bio, *orig_bio; int ret; @@ -740,8 +737,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf); - if (do_copy) + if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf)) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else bio = bio_map_kern(q, kbuf, len, gfp_mask); @@ -752,9 +748,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= req_op(rq); - if (do_copy) - rq->rq_flags |= RQF_COPY_USER; - orig_bio = bio; ret = blk_rq_append_bio(rq, &bio); if (unlikely(ret)) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 1534ed736363..99c9759f3a8a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -532,8 +532,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); - if (unlikely(rq->rq_flags & RQF_COPY_USER) && - (blk_rq_bytes(rq) & q->dma_pad_mask)) { + if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { unsigned int pad_len = (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b3f2ba483992..96b7a35c898a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -292,7 +292,6 @@ static const char *const rqf_name[] = { RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), RQF_NAME(PREEMPT), - RQF_NAME(COPY_USER), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(ELVPRIV), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 32868fbedc9e..76da162b6ae9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -82,8 +82,6 @@ typedef __u32 __bitwise req_flags_t; /* set for "ide_preempt" requests and also for requests for which the SCSI "quiesce" state must be ignored. */ #define RQF_PREEMPT ((__force req_flags_t)(1 << 8)) -/* contains copies of user pages */ -#define RQF_COPY_USER ((__force req_flags_t)(1 << 9)) /* vaguely specified driver internal error. Ignored by the block layer */ #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ -- cgit v1.2.3 From 89de1504d53b59b12bfff227328ee3e63dd3a112 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:22 +0200 Subject: block: provide a blk_rq_map_sg variant that returns the last element To be able to move some of the special purpose hacks in blk_rq_map_sg into the callers we need a variant that returns the last mapped S/G list element to the caller. Add that variant as __blk_rq_map_sg and make blk_rq_map_sg a trivial inline wrapper around it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 25 ++++++++++++------------- include/linux/blkdev.h | 10 +++++++++- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 99c9759f3a8a..ee618cdb141e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -519,24 +519,23 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, * map a request to scatterlist, return number of sg entries setup. Caller * must make sure sg can hold rq->nr_phys_segments entries */ -int blk_rq_map_sg(struct request_queue *q, struct request *rq, - struct scatterlist *sglist) +int __blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist, struct scatterlist **last_sg) { - struct scatterlist *sg = NULL; int nsegs = 0; if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, &sg); + nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) - nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, &sg); + nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg); else if (rq->bio) - nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); + nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { unsigned int pad_len = (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; - sg->length += pad_len; + (*last_sg)->length += pad_len; rq->extra_len += pad_len; } @@ -544,9 +543,9 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (op_is_write(req_op(rq))) memset(q->dma_drain_buffer, 0, q->dma_drain_size); - sg_unmark_end(sg); - sg = sg_next(sg); - sg_set_page(sg, virt_to_page(q->dma_drain_buffer), + sg_unmark_end(*last_sg); + *last_sg = sg_next(*last_sg); + sg_set_page(*last_sg, virt_to_page(q->dma_drain_buffer), q->dma_drain_size, ((unsigned long)q->dma_drain_buffer) & (PAGE_SIZE - 1)); @@ -554,8 +553,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, rq->extra_len += q->dma_drain_size; } - if (sg) - sg_mark_end(sg); + if (*last_sg) + sg_mark_end(*last_sg); /* * Something must have been wrong if the figured number of @@ -565,7 +564,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, return nsegs; } -EXPORT_SYMBOL(blk_rq_map_sg); +EXPORT_SYMBOL(__blk_rq_map_sg); static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 76da162b6ae9..496dc9491026 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1136,7 +1136,15 @@ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) return max_t(unsigned short, rq->nr_phys_segments, 1); } -extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); +int __blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist, struct scatterlist **last_sg); +static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist) +{ + struct scatterlist *last_sg = NULL; + + return __blk_rq_map_sg(q, rq, sglist, &last_sg); +} extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); -- cgit v1.2.3 From cc97923a5bccc776851c242b61015faf288d5c22 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:24 +0200 Subject: block: move dma drain handling to scsi Don't burden the common block code with with specifics of the libata DMA draining mechanism. Instead move most of the code to the scsi midlayer. That also means the nr_phys_segments adjustments in the blk-mq fast path can go away entirely, given that SCSI never looks at nr_phys_segments after mapping the request to a scatterlist. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 14 -------------- block/blk-mq.c | 11 ----------- block/blk-settings.c | 37 ------------------------------------- drivers/ata/libata-scsi.c | 28 ++++++++++------------------ drivers/scsi/scsi_lib.c | 39 ++++++++++++++++++++++++++++++++++----- include/linux/blkdev.h | 7 ------- include/linux/libata.h | 2 ++ include/scsi/scsi_device.h | 3 +++ include/scsi/scsi_host.h | 7 +++++++ 9 files changed, 56 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index ee618cdb141e..25f5a5e00ee6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -539,20 +539,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, rq->extra_len += pad_len; } - if (q->dma_drain_size && q->dma_drain_needed(rq)) { - if (op_is_write(req_op(rq))) - memset(q->dma_drain_buffer, 0, q->dma_drain_size); - - sg_unmark_end(*last_sg); - *last_sg = sg_next(*last_sg); - sg_set_page(*last_sg, virt_to_page(q->dma_drain_buffer), - q->dma_drain_size, - ((unsigned long)q->dma_drain_buffer) & - (PAGE_SIZE - 1)); - nsegs++; - rq->extra_len += q->dma_drain_size; - } - if (*last_sg) sg_mark_end(*last_sg); diff --git a/block/blk-mq.c b/block/blk-mq.c index cf95e8e0881a..2c105cb2a75b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -667,15 +667,6 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); - if (q->dma_drain_size && blk_rq_bytes(rq)) { - /* - * Make sure space for the drain appears. We know we can do - * this because max_hw_segments has been adjusted to be one - * fewer than the device can handle. - */ - rq->nr_phys_segments++; - } - #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); @@ -695,8 +686,6 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; - if (q->dma_drain_size && blk_rq_bytes(rq)) - rq->nr_phys_segments--; } } diff --git a/block/blk-settings.c b/block/blk-settings.c index 14397b4c4b53..2ab1967b9716 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -651,43 +651,6 @@ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) } EXPORT_SYMBOL(blk_queue_update_dma_pad); -/** - * blk_queue_dma_drain - Set up a drain buffer for excess dma. - * @q: the request queue for the device - * @dma_drain_needed: fn which returns non-zero if drain is necessary - * @buf: physically contiguous buffer - * @size: size of the buffer in bytes - * - * Some devices have excess DMA problems and can't simply discard (or - * zero fill) the unwanted piece of the transfer. They have to have a - * real area of memory to transfer it into. The use case for this is - * ATAPI devices in DMA mode. If the packet command causes a transfer - * bigger than the transfer size some HBAs will lock up if there - * aren't DMA elements to contain the excess transfer. What this API - * does is adjust the queue so that the buf is always appended - * silently to the scatterlist. - * - * Note: This routine adjusts max_hw_segments to make room for appending - * the drain buffer. If you call blk_queue_max_segments() after calling - * this routine, you must set the limit to one fewer than your device - * can support otherwise there won't be room for the drain buffer. - */ -int blk_queue_dma_drain(struct request_queue *q, - dma_drain_needed_fn *dma_drain_needed, - void *buf, unsigned int size) -{ - if (queue_max_segments(q) < 2) - return -EINVAL; - /* make room for appending the drain */ - blk_queue_max_segments(q, queue_max_segments(q) - 1); - q->dma_drain_needed = dma_drain_needed; - q->dma_drain_buffer = buf; - q->dma_drain_size = size; - - return 0; -} -EXPORT_SYMBOL_GPL(blk_queue_dma_drain); - /** * blk_queue_segment_boundary - set boundary rules for segment merging * @q: the request queue for the device diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 36e588d88b95..feb13b8f93d7 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1017,16 +1017,11 @@ void ata_scsi_sdev_config(struct scsi_device *sdev) * RETURNS: * 1 if ; otherwise, 0. */ -static int atapi_drain_needed(struct request *rq) +bool ata_scsi_dma_need_drain(struct request *rq) { - if (likely(!blk_rq_is_passthrough(rq))) - return 0; - - if (!blk_rq_bytes(rq) || op_is_write(req_op(rq))) - return 0; - return atapi_cmd_type(scsi_req(rq)->cmd[0]) == ATAPI_MISC; } +EXPORT_SYMBOL_GPL(ata_scsi_dma_need_drain); int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) { @@ -1039,21 +1034,21 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) blk_queue_max_hw_sectors(q, dev->max_sectors); if (dev->class == ATA_DEV_ATAPI) { - void *buf; - sdev->sector_size = ATA_SECT_SIZE; /* set DMA padding */ blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1); - /* configure draining */ - buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); - if (!buf) { + /* make room for appending the drain */ + blk_queue_max_segments(q, queue_max_segments(q) - 1); + + sdev->dma_drain_len = ATAPI_MAX_DRAIN; + sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len, + q->bounce_gfp | GFP_KERNEL); + if (!sdev->dma_drain_buf) { ata_dev_err(dev, "drain buffer allocation failed\n"); return -ENOMEM; } - - blk_queue_dma_drain(q, atapi_drain_needed, buf, ATAPI_MAX_DRAIN); } else { sdev->sector_size = ata_id_logical_sector_size(dev->id); sdev->manage_start_stop = 1; @@ -1135,7 +1130,6 @@ EXPORT_SYMBOL_GPL(ata_scsi_slave_config); void ata_scsi_slave_destroy(struct scsi_device *sdev) { struct ata_port *ap = ata_shost_to_port(sdev->host); - struct request_queue *q = sdev->request_queue; unsigned long flags; struct ata_device *dev; @@ -1152,9 +1146,7 @@ void ata_scsi_slave_destroy(struct scsi_device *sdev) } spin_unlock_irqrestore(ap->lock, flags); - kfree(q->dma_drain_buffer); - q->dma_drain_buffer = NULL; - q->dma_drain_size = 0; + kfree(sdev->dma_drain_buf); } EXPORT_SYMBOL_GPL(ata_scsi_slave_destroy); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 4e42acbb3f32..88cac92fc153 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -978,6 +978,14 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_io_completion_action(cmd, result); } +static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev, + struct request *rq) +{ + return sdev->dma_drain_len && blk_rq_is_passthrough(rq) && + !op_is_write(req_op(rq)) && + sdev->host->hostt->dma_need_drain(rq); +} + /* * Function: scsi_init_io() * @@ -991,26 +999,47 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) */ blk_status_t scsi_init_io(struct scsi_cmnd *cmd) { + struct scsi_device *sdev = cmd->device; struct request *rq = cmd->request; + unsigned short nr_segs = blk_rq_nr_phys_segments(rq); + struct scatterlist *last_sg = NULL; blk_status_t ret; + bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq); int count; - if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq))) + if (WARN_ON_ONCE(!nr_segs)) return BLK_STS_IOERR; + /* + * Make sure there is space for the drain. The driver must adjust + * max_hw_segments to be prepared for this. + */ + if (need_drain) + nr_segs++; + /* * If sg table allocation fails, requeue request later. */ - if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, - blk_rq_nr_phys_segments(rq), cmd->sdb.table.sgl, - SCSI_INLINE_SG_CNT))) + if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs, + cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT))) return BLK_STS_RESOURCE; /* * Next, walk the list, and fill in the addresses and sizes of * each segment. */ - count = blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl); + count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + + if (need_drain) { + sg_unmark_end(last_sg); + last_sg = sg_next(last_sg); + sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len); + sg_mark_end(last_sg); + + rq->extra_len += sdev->dma_drain_len; + count++; + } + BUG_ON(count > cmd->sdb.table.nents); cmd->sdb.table.nents = count; cmd->sdb.length = blk_rq_payload_bytes(rq); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 496dc9491026..8e4726bce498 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -288,7 +288,6 @@ struct blk_queue_ctx; typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); struct bio_vec; -typedef int (dma_drain_needed_fn)(struct request *); enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ @@ -397,7 +396,6 @@ struct request_queue { struct rq_qos *rq_qos; make_request_fn *make_request_fn; - dma_drain_needed_fn *dma_drain_needed; const struct blk_mq_ops *mq_ops; @@ -467,8 +465,6 @@ struct request_queue { */ unsigned long nr_requests; /* Max # of requests */ - unsigned int dma_drain_size; - void *dma_drain_buffer; unsigned int dma_pad_mask; unsigned int dma_alignment; @@ -1097,9 +1093,6 @@ extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -extern int blk_queue_dma_drain(struct request_queue *q, - dma_drain_needed_fn *dma_drain_needed, - void *buf, unsigned int size); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); extern void blk_queue_dma_alignment(struct request_queue *, int); diff --git a/include/linux/libata.h b/include/linux/libata.h index cffa4714bfa8..af832852e620 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1092,6 +1092,7 @@ extern int ata_scsi_ioctl(struct scsi_device *dev, unsigned int cmd, #define ATA_SCSI_COMPAT_IOCTL /* empty */ #endif extern int ata_scsi_queuecmd(struct Scsi_Host *h, struct scsi_cmnd *cmd); +bool ata_scsi_dma_need_drain(struct request *rq); extern int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *dev, unsigned int cmd, void __user *arg); extern bool ata_link_online(struct ata_link *link); @@ -1387,6 +1388,7 @@ extern struct device_attribute *ata_common_sdev_attrs[]; .ioctl = ata_scsi_ioctl, \ ATA_SCSI_COMPAT_IOCTL \ .queuecommand = ata_scsi_queuecmd, \ + .dma_need_drain = ata_scsi_dma_need_drain, \ .can_queue = ATA_DEF_QUEUE, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index c3cba2aaf934..bc5909033d13 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -229,6 +229,9 @@ struct scsi_device { struct scsi_device_handler *handler; void *handler_data; + size_t dma_drain_len; + void *dma_drain_buf; + unsigned char access_state; struct mutex state_mutex; enum scsi_device_state sdev_state; diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 822e8cda8d9b..46ef8cccc982 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -270,6 +270,13 @@ struct scsi_host_template { */ int (* map_queues)(struct Scsi_Host *shost); + /* + * Check if scatterlists need to be padded for DMA draining. + * + * Status: OPTIONAL + */ + bool (* dma_need_drain)(struct request *rq); + /* * This function determines the BIOS parameters for a given * harddisk. These tend to be numbers that are made up by -- cgit v1.2.3 From bdf8710d69f82ee6fd41b0166300c3306898b3c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 14 Apr 2020 09:42:25 +0200 Subject: block: move dma_pad handling from blk_rq_map_sg into the callers There are only two callers of blk_rq_map_sg/__blk_rq_map_sg that set the dma_pad value in the queue. Move the handling into those callers instead of burdening the common code, and move the ->extra_len field from struct request to struct scsi_cmnd. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk-merge.c | 8 -------- block/blk-mq.c | 1 - drivers/ata/libata-scsi.c | 2 +- drivers/ide/ide-io.c | 7 +++++-- drivers/scsi/scsi_lib.c | 10 +++++++++- include/linux/blkdev.h | 2 -- include/scsi/scsi_cmnd.h | 1 + 8 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 7e4a1da0715e..311596d5dbc4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1638,7 +1638,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; - rq->extra_len = rq_src->extra_len; return 0; diff --git a/block/blk-merge.c b/block/blk-merge.c index 25f5a5e00ee6..c49eb3bdd0be 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -531,14 +531,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); - if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { - unsigned int pad_len = - (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; - - (*last_sg)->length += pad_len; - rq->extra_len += pad_len; - } - if (*last_sg) sg_mark_end(*last_sg); diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c105cb2a75b..71d0894ce1c5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -318,7 +318,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->nr_integrity_segments = 0; #endif /* tag was already set */ - rq->extra_len = 0; WRITE_ONCE(rq->deadline, 0); rq->timeout = 0; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index feb13b8f93d7..435781a16875 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -649,7 +649,7 @@ static void ata_qc_set_pc_nbytes(struct ata_queued_cmd *qc) { struct scsi_cmnd *scmd = qc->scsicmd; - qc->extrabytes = scmd->request->extra_len; + qc->extrabytes = scmd->extra_len; qc->nbytes = scsi_bufflen(scmd) + qc->extrabytes; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index b137f27a34d5..c31f1d2b3b07 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -233,10 +233,13 @@ static ide_startstop_t do_special(ide_drive_t *drive) void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) { ide_hwif_t *hwif = drive->hwif; - struct scatterlist *sg = hwif->sg_table; + struct scatterlist *sg = hwif->sg_table, *last_sg = NULL; struct request *rq = cmd->rq; - cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); + cmd->sg_nents = __blk_rq_map_sg(drive->queue, rq, sg, &last_sg); + if (blk_rq_bytes(rq) && (blk_rq_bytes(rq) & rq->q->dma_pad_mask)) + last_sg->length += + (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; } EXPORT_SYMBOL_GPL(ide_map_sg); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 88cac92fc153..0a73230a8f16 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1030,13 +1030,21 @@ blk_status_t scsi_init_io(struct scsi_cmnd *cmd) */ count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); + if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) { + unsigned int pad_len = + (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; + + last_sg->length += pad_len; + cmd->extra_len += pad_len; + } + if (need_drain) { sg_unmark_end(last_sg); last_sg = sg_next(last_sg); sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len); sg_mark_end(last_sg); - rq->extra_len += sdev->dma_drain_len; + cmd->extra_len += sdev->dma_drain_len; count++; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8e4726bce498..f00bd4042295 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -224,8 +224,6 @@ struct request { unsigned short write_hint; unsigned short ioprio; - unsigned int extra_len; /* length of alignment and padding */ - enum mq_rq_state state; refcount_t ref; diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 80ac89e47b47..f93c0b800790 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -142,6 +142,7 @@ struct scsi_cmnd { unsigned long state; /* Command completion state */ unsigned char tag; /* SCSI-II queued command tag */ + unsigned int extra_len; /* length of alignment and padding */ }; /* -- cgit v1.2.3 From 8cf7961dab42c9177a556b719c15f5b9449c24d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 25 Apr 2020 09:53:36 +0200 Subject: block: bypass ->make_request_fn for blk-mq drivers Call blk_mq_make_request when no ->make_request_fn is set. This is safe now that blk_alloc_queue always sets up the pointer for make_request based drivers. This avoids an indirect call in the blk-mq driver I/O fast path, which is rather expensive due to spectre mitigations. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 26 +++++++++++++++++--------- block/blk-mq.c | 4 ++-- drivers/md/dm.c | 3 +++ include/linux/blk-mq.h | 2 ++ 4 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 38e984d95e84..dffff2100888 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1073,7 +1073,10 @@ blk_qc_t generic_make_request(struct bio *bio) /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = q->make_request_fn(q, bio); + if (q->make_request_fn) + ret = q->make_request_fn(q, bio); + else + ret = blk_mq_make_request(q, bio); blk_queue_exit(q); @@ -1113,9 +1116,7 @@ EXPORT_SYMBOL(generic_make_request); * * This function behaves like generic_make_request(), but does not protect * against recursion. Must only be used if the called driver is known - * to not call generic_make_request (or direct_make_request) again from - * its make_request function. (Calling direct_make_request again from - * a workqueue is perfectly fine as that doesn't recurse). + * to be blk-mq based. */ blk_qc_t direct_make_request(struct bio *bio) { @@ -1123,20 +1124,27 @@ blk_qc_t direct_make_request(struct bio *bio) bool nowait = bio->bi_opf & REQ_NOWAIT; blk_qc_t ret; + if (WARN_ON_ONCE(q->make_request_fn)) + goto io_error; if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) - bio_wouldblock_error(bio); - else - bio_io_error(bio); - return BLK_QC_T_NONE; + goto would_block; + goto io_error; } - ret = q->make_request_fn(q, bio); + ret = blk_mq_make_request(q, bio); blk_queue_exit(q); return ret; + +would_block: + bio_wouldblock_error(bio); + return BLK_QC_T_NONE; +io_error: + bio_io_error(bio); + return BLK_QC_T_NONE; } EXPORT_SYMBOL_GPL(direct_make_request); diff --git a/block/blk-mq.c b/block/blk-mq.c index 71d0894ce1c5..bcc3a2397d4a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1984,7 +1984,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) * * Returns: Request queue cookie. */ -static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); @@ -2096,6 +2096,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return cookie; } +EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) @@ -2955,7 +2956,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); - q->make_request_fn = blk_mq_make_request; q->nr_requests = set->queue_depth; /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index db9e46114653..0eb93da44ea2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1788,6 +1788,9 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) int srcu_idx; struct dm_table *map; + if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) + return blk_mq_make_request(q, bio); + map = dm_get_live_table(md, &srcu_idx); /* if we're suspended, we have to queue this io for later */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 51fbf6f76593..d7307795439a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -578,4 +578,6 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } +blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio); + #endif -- cgit v1.2.3 From 0376e9efe18388bd486a65edbc16d34b84bddc8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Apr 2020 13:27:55 +0200 Subject: block: replace BIO_QUEUE_ENTERED with BIO_CGROUP_ACCT BIO_QUEUE_ENTERED is only used for cgroup accounting now, so rename the flag and move setting it into the cgroup code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 10 ---------- include/linux/blk-cgroup.h | 10 ++++++---- include/linux/blk_types.h | 2 +- 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index c49eb3bdd0be..a04e991b5ded 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -336,16 +336,6 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, /* there isn't chance to merge the splitted bio */ split->bi_opf |= REQ_NOMERGE; - /* - * Since we're recursing into make_request here, ensure - * that we mark this bio as already having entered the queue. - * If not, and the queue is going away, we can get stuck - * forever on waiting for the queue reference to drop. But - * that will never happen, as we're already holding a - * reference to it. - */ - bio_set_flag(*bio, BIO_QUEUE_ENTERED); - bio_chain(split, *bio); trace_block_split(q, split, (*bio)->bi_iter.bi_sector); generic_make_request(*bio); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 35f8ffe92b70..4deb8bb7b6af 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -607,12 +607,14 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, u64_stats_update_begin(&bis->sync); /* - * If the bio is flagged with BIO_QUEUE_ENTERED it means this - * is a split bio and we would have already accounted for the - * size of the bio. + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a + * split bio and we would have already accounted for the size of + * the bio. */ - if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + } bis->cur.ios[rwd]++; u64_stats_update_end(&bis->sync); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 31eb92876be7..90895d594e64 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -220,7 +220,7 @@ enum { * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ - BIO_QUEUE_ENTERED, /* can use blk_queue_enter_live() */ + BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ BIO_TRACKED, /* set if bio goes through the rq_qos path */ BIO_FLAG_LAST }; -- cgit v1.2.3 From 54c52e10dc9b939084a7e6e3d32ce8fd8dee7898 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Apr 2020 12:27:55 -0400 Subject: blk-iocost: switch to fixed non-auto-decaying use_delay The use_delay mechanism was introduced by blk-iolatency to hold memory allocators accountable for the reclaim and other shared IOs they cause. The duration of the delay is dynamically balanced between iolatency increasing the value on each target miss and it auto-decaying as time passes and threads get delayed on it. While this works well for iolatency, iocost's control model isn't compatible with it. There is no repeated "violation" events which can be balanced against auto-decaying. iocost instead knows how much a given cgroup is over budget and wants to prevent that cgroup from issuing IOs while over budget. Until now, iocost has been adding the cost of force-issued IOs. However, this doesn't reflect the amount which is already over budget and is simply not enough to counter the auto-decaying allowing anon-memory leaking low priority cgroup to go over its alloted share of IOs. As auto-decaying doesn't make much sense for iocost, this patch introduces a different mode of operation for use_delay - when blkcg_set_delay() are used insted of blkcg_add/use_delay(), the delay duration is not auto-decayed until it is explicitly cleared with blkcg_clear_delay(). iocost is updated to keep the delay duration synchronized to the budget overage amount. With this change, iocost can effectively police cgroups which generate significant amount of force-issued IOs. Signed-off-by: Tejun Heo Cc: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 ++++++ block/blk-iocost.c | 23 +++++++++-------------- include/linux/blk-cgroup.h | 43 +++++++++++++++++++++++++++++++++---------- 3 files changed, 48 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c5dc833212e1..0a63c6cbbcb1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1530,6 +1530,10 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); + /* negative use_delay means no scaling, see blkcg_set_delay() */ + if (atomic_read(&blkg->use_delay) < 0) + return; + /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain @@ -1717,6 +1721,8 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index db35ee682294..a8e99ef76a08 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1209,14 +1209,14 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); u64 vtime = atomic64_read(&iocg->vtime); u64 vmargin = ioc->margin_us * now->vrate; u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; - u64 expires, oexpires; + u64 delta_ns, expires, oexpires; u32 hw_inuse; /* debt-adjust vtime */ @@ -1233,15 +1233,10 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) return false; /* use delay */ - if (cost) { - u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC, - now->vrate); - blkcg_add_delay(blkg, now->now_ns, cost_ns); - } - blkcg_use_delay(blkg); - - expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; + delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, + now->vrate) * NSEC_PER_USEC; + blkcg_set_delay(blkg, delta_ns); + expires = now->now_ns + delta_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); @@ -1260,7 +1255,7 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) struct ioc_now now; ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now, 0); + iocg_kick_delay(iocg, &now); return HRTIMER_NORESTART; } @@ -1378,7 +1373,7 @@ static void ioc_timer_fn(struct timer_list *timer) atomic64_read(&iocg->abs_vdebt)) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); - iocg_kick_delay(iocg, &now, 0); + iocg_kick_delay(iocg, &now); } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ iocg->last_inuse = iocg->inuse; @@ -1737,7 +1732,7 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { atomic64_add(abs_cost, &iocg->abs_vdebt); - if (iocg_kick_delay(iocg, &now, cost)) + if (iocg_kick_delay(iocg, &now)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); return; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4deb8bb7b6af..a57ebe2f00ab 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -631,6 +631,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, static inline void blkcg_use_delay(struct blkcg_gq *blkg) { + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); } @@ -639,6 +641,8 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); + if (WARN_ON_ONCE(old < 0)) + return 0; if (old == 0) return 0; @@ -663,20 +667,39 @@ static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) return 1; } +/** + * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount + * @blkg: target blkg + * @delay: delay duration in nsecs + * + * When enabled with this function, the delay is not decayed and must be + * explicitly cleared with blkcg_clear_delay(). Must not be mixed with + * blkcg_[un]use_delay() and blkcg_add_delay() usages. + */ +static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person setting the congestion count for this blkg. */ + if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + + atomic64_set(&blkg->delay_nsec, delay); +} + +/** + * blkcg_clear_delay - Disable allocator delay mechanism + * @blkg: target blkg + * + * Disable use_delay mechanism. See blkcg_set_delay(). + */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); - if (!old) - return; + /* We only want 1 person clearing the congestion count for this blkg. */ - while (old) { - int cur = atomic_cmpxchg(&blkg->use_delay, old, 0); - if (cur == old) { - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); - break; - } - old = cur; - } + if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); -- cgit v1.2.3 From a711d91cd97e6c9a554ccd1652527a7f36661857 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 25 Apr 2020 09:57:00 +0200 Subject: block: add a cdrom_device_info pointer to struct gendisk Add a pointer to the CDROM information structure to struct gendisk. This will allow various removable media file systems to call directly into the CDROM layer instead of abusing ioctls with kernel pointers. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/paride/pcd.c | 2 +- drivers/cdrom/cdrom.c | 5 ++++- drivers/cdrom/gdrom.c | 2 +- drivers/ide/ide-cd.c | 3 +-- drivers/scsi/sr.c | 3 +-- include/linux/cdrom.h | 2 +- include/linux/genhd.h | 9 +++++++++ 7 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index cda5cf917e9a..5124eca90e83 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -1032,7 +1032,7 @@ static int __init pcd_init(void) for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { if (cd->present) { - register_cdrom(&cd->info); + register_cdrom(cd->disk, &cd->info); cd->disk->private_data = cd; add_disk(cd->disk); } diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index faca0f346fff..a1d2112fd283 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -586,7 +586,7 @@ static int cdrom_mrw_set_lba_space(struct cdrom_device_info *cdi, int space) return 0; } -int register_cdrom(struct cdrom_device_info *cdi) +int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi) { static char banner_printed; const struct cdrom_device_ops *cdo = cdi->ops; @@ -601,6 +601,9 @@ int register_cdrom(struct cdrom_device_info *cdi) cdrom_sysctl_register(); } + cdi->disk = disk; + disk->cdi = cdi; + ENSURE(cdo, drive_status, CDC_DRIVE_STATUS); if (cdo->check_events == NULL && cdo->media_changed == NULL) WARN_ON_ONCE(cdo->capability & (CDC_MEDIA_CHANGED | CDC_SELECT_DISC)); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index c51292c2a131..09b0cd292720 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -770,7 +770,7 @@ static int probe_gdrom(struct platform_device *devptr) goto probe_fail_no_disk; } probe_gdrom_setupdisk(); - if (register_cdrom(gd.cd_info)) { + if (register_cdrom(gd.disk, gd.cd_info)) { err = -ENODEV; goto probe_fail_cdrom_register; } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index dcf8b51b47fd..40e124eb918a 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1305,8 +1305,7 @@ static int ide_cdrom_register(ide_drive_t *drive, int nslots) if (drive->atapi_flags & IDE_AFLAG_NO_SPEED_SELECT) devinfo->mask |= CDC_SELECT_SPEED; - devinfo->disk = info->disk; - return register_cdrom(devinfo); + return register_cdrom(info->disk, devinfo); } static int ide_cdrom_probe_capabilities(ide_drive_t *drive) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index d2fe3fa470f9..f9b589d60a46 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -794,9 +794,8 @@ static int sr_probe(struct device *dev) set_capacity(disk, cd->capacity); disk->private_data = &cd->driver; disk->queue = sdev->request_queue; - cd->cdi.disk = disk; - if (register_cdrom(&cd->cdi)) + if (register_cdrom(disk, &cd->cdi)) goto fail_put; /* diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 528271c60018..4f74ce050253 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -104,7 +104,7 @@ extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, unsigned int clearing); extern int cdrom_media_changed(struct cdrom_device_info *); -extern int register_cdrom(struct cdrom_device_info *cdi); +extern int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi); extern void unregister_cdrom(struct cdrom_device_info *cdi); typedef struct { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 058d895544c7..f9c226f9546a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -217,11 +217,20 @@ struct gendisk { #ifdef CONFIG_BLK_DEV_INTEGRITY struct kobject integrity_kobj; #endif /* CONFIG_BLK_DEV_INTEGRITY */ +#if IS_ENABLED(CONFIG_CDROM) + struct cdrom_device_info *cdi; +#endif int node_id; struct badblocks *bb; struct lockdep_map lockdep_map; }; +#if IS_REACHABLE(CONFIG_CDROM) +#define disk_to_cdi(disk) ((disk)->cdi) +#else +#define disk_to_cdi(disk) NULL +#endif + static inline struct gendisk *part_to_disk(struct hd_struct *part) { if (likely(part)) { -- cgit v1.2.3 From 4c3cfcce45152309c2031b3b87bcf10e4f3899e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 25 Apr 2020 09:57:02 +0200 Subject: cdrom: factor out a cdrom_read_tocentry helper Factor out a version of the CDROMREADTOCENTRY ioctl handler that can be called directly from kernel space. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 39 ++++++++++++++++++++++----------------- include/linux/cdrom.h | 3 +++ 2 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index a1d2112fd283..c91d1e138214 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2666,32 +2666,37 @@ static int cdrom_ioctl_read_tochdr(struct cdrom_device_info *cdi, return 0; } +int cdrom_read_tocentry(struct cdrom_device_info *cdi, + struct cdrom_tocentry *entry) +{ + u8 requested_format = entry->cdte_format; + int ret; + + if (requested_format != CDROM_MSF && requested_format != CDROM_LBA) + return -EINVAL; + + /* make interface to low-level uniform */ + entry->cdte_format = CDROM_MSF; + ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, entry); + if (!ret) + sanitize_format(&entry->cdte_addr, &entry->cdte_format, + requested_format); + return ret; +} +EXPORT_SYMBOL_GPL(cdrom_read_tocentry); + static int cdrom_ioctl_read_tocentry(struct cdrom_device_info *cdi, void __user *argp) { struct cdrom_tocentry entry; - u8 requested_format; int ret; - /* cd_dbg(CD_DO_IOCTL, "entering CDROMREADTOCENTRY\n"); */ - if (copy_from_user(&entry, argp, sizeof(entry))) return -EFAULT; - - requested_format = entry.cdte_format; - if (requested_format != CDROM_MSF && requested_format != CDROM_LBA) - return -EINVAL; - /* make interface to low-level uniform */ - entry.cdte_format = CDROM_MSF; - ret = cdi->ops->audio_ioctl(cdi, CDROMREADTOCENTRY, &entry); - if (ret) - return ret; - sanitize_format(&entry.cdte_addr, &entry.cdte_format, requested_format); - - if (copy_to_user(argp, &entry, sizeof(entry))) + ret = cdrom_read_tocentry(cdi, &entry); + if (!ret && copy_to_user(argp, &entry, sizeof(entry))) return -EFAULT; - /* cd_dbg(CD_DO_IOCTL, "CDROMREADTOCENTRY successful\n"); */ - return 0; + return ret; } static int cdrom_ioctl_play_msf(struct cdrom_device_info *cdi, diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 4f74ce050253..008c4d79fa33 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -94,6 +94,9 @@ struct cdrom_device_ops { struct packet_command *); }; +int cdrom_read_tocentry(struct cdrom_device_info *cdi, + struct cdrom_tocentry *entry); + /* the general block_device operations structure: */ extern int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode); -- cgit v1.2.3 From eaf8e3e4b54a8b778257fc3921ad8f793c941882 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 25 Apr 2020 09:57:03 +0200 Subject: cdrom: factor out a cdrom_multisession helper Factor out a version of the CDROMMULTISESSION ioctl handler that can be called directly from kernel space. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 41 +++++++++++++++++++++++++---------------- include/linux/cdrom.h | 2 ++ 2 files changed, 27 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index c91d1e138214..06896c07b133 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2295,37 +2295,46 @@ retry: return cdrom_read_cdda_old(cdi, ubuf, lba, nframes); } -static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi, - void __user *argp) +int cdrom_multisession(struct cdrom_device_info *cdi, + struct cdrom_multisession *info) { - struct cdrom_multisession ms_info; u8 requested_format; int ret; - cd_dbg(CD_DO_IOCTL, "entering CDROMMULTISESSION\n"); - if (!(cdi->ops->capability & CDC_MULTI_SESSION)) return -ENOSYS; - if (copy_from_user(&ms_info, argp, sizeof(ms_info))) - return -EFAULT; - - requested_format = ms_info.addr_format; + requested_format = info->addr_format; if (requested_format != CDROM_MSF && requested_format != CDROM_LBA) return -EINVAL; - ms_info.addr_format = CDROM_LBA; + info->addr_format = CDROM_LBA; - ret = cdi->ops->get_last_session(cdi, &ms_info); - if (ret) - return ret; + ret = cdi->ops->get_last_session(cdi, info); + if (!ret) + sanitize_format(&info->addr, &info->addr_format, + requested_format); + return ret; +} +EXPORT_SYMBOL_GPL(cdrom_multisession); - sanitize_format(&ms_info.addr, &ms_info.addr_format, requested_format); +static int cdrom_ioctl_multisession(struct cdrom_device_info *cdi, + void __user *argp) +{ + struct cdrom_multisession info; + int ret; + + cd_dbg(CD_DO_IOCTL, "entering CDROMMULTISESSION\n"); - if (copy_to_user(argp, &ms_info, sizeof(ms_info))) + if (copy_from_user(&info, argp, sizeof(info))) + return -EFAULT; + ret = cdrom_multisession(cdi, &info); + if (ret) + return ret; + if (copy_to_user(argp, &info, sizeof(info))) return -EFAULT; cd_dbg(CD_DO_IOCTL, "CDROMMULTISESSION successful\n"); - return 0; + return ret; } static int cdrom_ioctl_eject(struct cdrom_device_info *cdi) diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 008c4d79fa33..8543fa59da72 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -94,6 +94,8 @@ struct cdrom_device_ops { struct packet_command *); }; +int cdrom_multisession(struct cdrom_device_info *cdi, + struct cdrom_multisession *info); int cdrom_read_tocentry(struct cdrom_device_info *cdi, struct cdrom_tocentry *entry); -- cgit v1.2.3 From 4c74746625dedb62ce00ac86ac4faedb640536d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:57 +0200 Subject: driver core: remove device_create_vargs All external users of device_create_vargs are gone, so remove it and open code it in the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/base/core.c | 37 ++----------------------------------- include/linux/device.h | 4 ---- 2 files changed, 2 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 139cdf7e7327..fb8ae248e5aa 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3188,40 +3188,6 @@ error: return ERR_PTR(retval); } -/** - * device_create_vargs - creates a device and registers it with sysfs - * @class: pointer to the struct class that this device should be registered to - * @parent: pointer to the parent struct device of this new device, if any - * @devt: the dev_t for the char device to be added - * @drvdata: the data to be added to the device for callbacks - * @fmt: string for the device's name - * @args: va_list for the device's name - * - * This function can be used by char device classes. A struct device - * will be created in sysfs, registered to the specified class. - * - * A "dev" file will be created, showing the dev_t for the device, if - * the dev_t is not 0,0. - * If a pointer to a parent struct device is passed in, the newly created - * struct device will be a child of that device in sysfs. - * The pointer to the struct device will be returned from the call. - * Any further sysfs files that might be required can be created using this - * pointer. - * - * Returns &struct device pointer on success, or ERR_PTR() on error. - * - * Note: the struct class passed to this function must have previously - * been created with a call to class_create(). - */ -struct device *device_create_vargs(struct class *class, struct device *parent, - dev_t devt, void *drvdata, const char *fmt, - va_list args) -{ - return device_create_groups_vargs(class, parent, devt, drvdata, NULL, - fmt, args); -} -EXPORT_SYMBOL_GPL(device_create_vargs); - /** * device_create - creates a device and registers it with sysfs * @class: pointer to the struct class that this device should be registered to @@ -3253,7 +3219,8 @@ struct device *device_create(struct class *class, struct device *parent, struct device *dev; va_start(vargs, fmt); - dev = device_create_vargs(class, parent, devt, drvdata, fmt, vargs); + dev = device_create_groups_vargs(class, parent, devt, drvdata, NULL, + fmt, vargs); va_end(vargs); return dev; } diff --git a/include/linux/device.h b/include/linux/device.h index ac8e37cd716a..15460a5ac024 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -884,10 +884,6 @@ extern bool device_is_bound(struct device *dev); /* * Easy functions for dynamically creating devices on the fly */ -extern __printf(5, 0) -struct device *device_create_vargs(struct class *cls, struct device *parent, - dev_t devt, void *drvdata, - const char *fmt, va_list vargs); extern __printf(5, 6) struct device *device_create(struct class *cls, struct device *parent, dev_t devt, void *drvdata, -- cgit v1.2.3 From 3c5d202b55d3fa9106607f99cdd5044b02b5929b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:59 +0200 Subject: bdi: remove bdi_register_owner Split out a new bdi_set_owner helper to set the owner, and move the policy for creating the bdi name back into genhd.c, where it belongs. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/genhd.c | 8 +++++--- include/linux/backing-dev.h | 2 +- mm/backing-dev.c | 12 ++---------- 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index c05d509877fa..27511b3d164d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -840,13 +840,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { + struct backing_dev_info *bdi = disk->queue->backing_dev_info; + struct device *dev = disk_to_dev(disk); int ret; /* Register BDI before referencing it from bdev */ - disk_to_dev(disk)->devt = devt; - ret = bdi_register_owner(disk->queue->backing_dev_info, - disk_to_dev(disk)); + dev->devt = devt; + ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); + bdi_set_owner(bdi, dev); blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c9ad5c3b7b4b..4098ed6ba6b4 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -33,7 +33,7 @@ int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); __printf(2, 0) int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args); -int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); +void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eb6b51e49d11..bb993f99d424 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -977,20 +977,12 @@ int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) } EXPORT_SYMBOL(bdi_register); -int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) +void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { - int rc; - - rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt)); - if (rc) - return rc; - /* Leaking owner reference... */ - WARN_ON(bdi->owner); + WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); - return 0; } -EXPORT_SYMBOL(bdi_register_owner); /* * Remove bdi from bdi_list, and ensure that it is no longer visible -- cgit v1.2.3 From aef33c2ff8aa5e24f3f7d93806aa84ca1c2b6832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:48:00 +0200 Subject: bdi: simplify bdi_alloc Merge the _node vs normal version and drop the superflous gfp_t argument. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/mtd/mtdcore.c | 2 +- fs/super.c | 2 +- include/linux/backing-dev.h | 6 +----- mm/backing-dev.c | 7 +++---- 5 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 7f11560bfddb..285a2f8ee8d3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -501,7 +501,7 @@ struct request_queue *__blk_alloc_queue(int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id); + q->backing_dev_info = bdi_alloc(node_id); if (!q->backing_dev_info) goto fail_split; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 2916674208b3..39ec563d9a14 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -2036,7 +2036,7 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name) struct backing_dev_info *bdi; int ret; - bdi = bdi_alloc(GFP_KERNEL); + bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return ERR_PTR(-ENOMEM); diff --git a/fs/super.c b/fs/super.c index cd352530eca9..dd28fcd706ff 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1598,7 +1598,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) int err; va_list args; - bdi = bdi_alloc(GFP_KERNEL); + bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 4098ed6ba6b4..6b3504bf7a42 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,11 +36,7 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); -struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); -static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) -{ - return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); -} +struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bb993f99d424..1f55d5b8269f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -865,12 +865,11 @@ static int bdi_init(struct backing_dev_info *bdi) return ret; } -struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) +struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; - bdi = kmalloc_node(sizeof(struct backing_dev_info), - gfp_mask | __GFP_ZERO, node_id); + bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; @@ -880,7 +879,7 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) } return bdi; } -EXPORT_SYMBOL(bdi_alloc_node); +EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { -- cgit v1.2.3 From 1cd925d583857ee3ead6cfbf1e4b1cd067d28591 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:48:01 +0200 Subject: bdi: remove the name field in struct backing_dev_info The name is only printed for a not registered bdi in writeback. Use the device name there as is more useful anyway for the unlike case that the warning triggers. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - drivers/mtd/mtdcore.c | 1 - fs/fs-writeback.c | 2 +- fs/super.c | 2 -- include/linux/backing-dev-defs.h | 2 -- mm/backing-dev.c | 1 - 6 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 285a2f8ee8d3..538cbc725620 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -511,7 +511,6 @@ struct request_queue *__blk_alloc_queue(int node_id) q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES; q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; - q->backing_dev_info->name = "block"; q->node = node_id; timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 39ec563d9a14..fcb018ce17c3 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -2040,7 +2040,6 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name) if (!bdi) return ERR_PTR(-ENOMEM); - bdi->name = name; /* * We put '-0' suffix to the name to get the same name format as we * used to get. Since this is called only once, we get a unique name. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..d85323607b49 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2320,7 +2320,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) WARN(bdi_cap_writeback_dirty(wb->bdi) && !test_bit(WB_registered, &wb->state), - "bdi-%s not registered\n", wb->bdi->name); + "bdi-%s not registered\n", bdi_dev_name(wb->bdi)); inode->dirtied_when = jiffies; if (dirtytime) diff --git a/fs/super.c b/fs/super.c index dd28fcd706ff..4991f441988e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1602,8 +1602,6 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) if (!bdi) return -ENOMEM; - bdi->name = sb->s_type->name; - va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 7367150f962a..90a7e844a098 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -193,8 +193,6 @@ struct backing_dev_info { congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ - const char *name; - struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1f55d5b8269f..d382272bcc31 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -15,7 +15,6 @@ #include struct backing_dev_info noop_backing_dev_info = { - .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; EXPORT_SYMBOL_GPL(noop_backing_dev_info); -- cgit v1.2.3 From 07c4e1e834f8e7c991aa6dcb5ee0b7f2842e495d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 8 May 2020 16:17:56 +0800 Subject: block: only define 'nr_sects_seq' in hd_part for 32bit SMP The seqcount of 'nr_sects_seq' is only needed in case of 32bit SMP, so define it just for 32bit SMP. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Yufen Yu Cc: Christoph Hellwig Cc: Hou Tao Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- block/partitions/core.c | 2 +- include/linux/genhd.h | 9 +++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index af6cba2a9332..1a51a9ad1035 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1731,7 +1731,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) * TODO: Ideally set_capacity() and get_capacity() should be * converted to make use of bd_mutex and sequence counters. */ - seqcount_init(&disk->part0.nr_sects_seq); + hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) { hd_free_part(&disk->part0); kfree(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 8d8c87207fc2..297004fd2264 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -392,7 +392,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_free; } - seqcount_init(&p->nr_sects_seq); + hd_sects_seq_init(p); pdev = part_to_dev(p); p->start_sect = start; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f9c226f9546a..b4744035ae58 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -68,7 +68,9 @@ struct hd_struct { * can be non-atomic on 32bit machines with 64bit sector_t. */ sector_t nr_sects; +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) seqcount_t nr_sects_seq; +#endif sector_t alignment_offset; unsigned int discard_alignment; struct device __dev; @@ -274,6 +276,13 @@ static inline void disk_put_part(struct hd_struct *part) put_device(part_to_dev(part)); } +static inline void hd_sects_seq_init(struct hd_struct *p) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + seqcount_init(&p->nr_sects_seq); +#endif +} + /* * Smarter partition iterator without context limits. */ -- cgit v1.2.3 From 520138c3b9425c615d1417687947d86821003319 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 8 May 2020 16:17:57 +0800 Subject: block: re-organize fields of 'struct hd_part' Put all fields accessed in IO path together at the beginning of the struct, so that all can be fetched in single cacheline. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Yufen Yu Cc: Christoph Hellwig Cc: Hou Tao Signed-off-by: Jens Axboe --- include/linux/genhd.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b4744035ae58..a9384449465a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -71,6 +71,14 @@ struct hd_struct { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) seqcount_t nr_sects_seq; #endif + unsigned long stamp; +#ifdef CONFIG_SMP + struct disk_stats __percpu *dkstats; +#else + struct disk_stats dkstats; +#endif + struct percpu_ref ref; + sector_t alignment_offset; unsigned int discard_alignment; struct device __dev; @@ -80,13 +88,6 @@ struct hd_struct { #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; #endif - unsigned long stamp; -#ifdef CONFIG_SMP - struct disk_stats __percpu *dkstats; -#else - struct disk_stats dkstats; -#endif - struct percpu_ref ref; struct rcu_work rcu_work; }; -- cgit v1.2.3 From e6249cdd46e43a7d3bdb8cce5fe24565d6c11e94 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 3 May 2020 09:54:22 +0800 Subject: block: add blk_io_schedule() for avoiding task hung in sync dio Sync dio could be big, or may take long time in discard or in case of IO failure. We have prevented task hung in submit_bio_wait() and blk_execute_rq(), so apply the same trick for prevent task hung from happening in sync dio. Add helper of blk_io_schedule() and use io_schedule_timeout() to prevent task hung warning. Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Cc: Salman Qazi Cc: Jesse Barnes Cc: Christoph Hellwig Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- fs/direct-io.c | 2 +- fs/iomap/direct-io.c | 2 +- include/linux/blkdev.h | 12 ++++++++++++ 4 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 7cbb7b79935e..ebd1507789d2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -255,7 +255,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, break; if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -449,7 +449,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/direct-io.c b/fs/direct-io.c index 00b4d15bb811..6d5370eac2a8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -500,7 +500,7 @@ static struct bio *dio_await_one(struct dio *dio) spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) - io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 20dde5aadcdd..fd3bd06fabb6 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -561,7 +561,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, dio->submit.cookie, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f00bd4042295..222eb5f32279 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -1827,4 +1828,15 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } +static inline void blk_io_schedule(void) +{ + /* Prevent hang_check timer from firing at us during very long I/O */ + unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; + + if (timeout) + io_schedule_timeout(timeout); + else + io_schedule(); +} + #endif -- cgit v1.2.3 From 02992df822e7e36685593aad10721a5a9f8d3402 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 12 May 2020 17:55:45 +0900 Subject: block: provide fallbacks for blk_queue_zone_is_seq and blk_queue_zone_no blk_queue_zone_is_seq() and blk_queue_zone_no() have not been called with CONFIG_BLK_DEV_ZONED disabled until now. The introduction of REQ_OP_ZONE_APPEND will change this, so we need to provide noop fallbacks for the !CONFIG_BLK_DEV_ZONED case. Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 222eb5f32279..d736acf7f564 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -722,6 +722,16 @@ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { return 0; } +static inline bool blk_queue_zone_is_seq(struct request_queue *q, + sector_t sector) +{ + return false; +} +static inline unsigned int blk_queue_zone_no(struct request_queue *q, + sector_t sector) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) -- cgit v1.2.3 From 0512a75b98f847c2e9a4b664013424e603e202f7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 May 2020 17:55:47 +0900 Subject: block: Introduce REQ_OP_ZONE_APPEND Define REQ_OP_ZONE_APPEND to append-write sectors to a zone of a zoned block device. This is a no-merge write operation. A zone append write BIO must: * Target a zoned block device * Have a sector position indicating the start sector of the target zone * The target zone must be a sequential write zone * The BIO must not cross a zone boundary * The BIO size must not be split to ensure that a single range of LBAs is written with a single command. Implement these checks in generic_make_request_checks() using the helper function blk_check_zone_append(). To avoid write append BIO splitting, introduce the new max_zone_append_sectors queue limit attribute and ensure that a BIO size is always lower than this limit. Export this new limit through sysfs and check these limits in bio_full(). Also when a LLDD can't dispatch a request to a specific zone, it will return BLK_STS_ZONE_RESOURCE indicating this request needs to be delayed, e.g. because the zone it will be dispatched to is still write-locked. If this happens set the request aside in a local list to continue trying dispatching requests such as READ requests or a WRITE/ZONE_APPEND requests targetting other zones. This way we can still keep a high queue depth without starving other requests even if one request can't be served due to zone write-locking. Finally, make sure that the bio sector position indicates the actual write position as indicated by the device on completion. Signed-off-by: Keith Busch [ jth: added zone-append specific add_page and merge_page helpers ] Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/bio.c | 62 ++++++++++++++++++++++++++++++++++++++++++++--- block/blk-core.c | 52 +++++++++++++++++++++++++++++++++++++++ block/blk-mq.c | 27 +++++++++++++++++++++ block/blk-settings.c | 31 ++++++++++++++++++++++++ block/blk-sysfs.c | 13 ++++++++++ drivers/scsi/scsi_lib.c | 1 + include/linux/blk_types.h | 14 +++++++++++ include/linux/blkdev.h | 11 +++++++++ 8 files changed, 207 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index aad0a6dad4f9..3aa3c4ce2e5e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1025,6 +1025,50 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } +static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; + struct request_queue *q = bio->bi_disk->queue; + unsigned int max_append_sectors = queue_max_zone_append_sectors(q); + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + ssize_t size, left; + unsigned len, i; + size_t offset; + + if (WARN_ON_ONCE(!max_append_sectors)) + return 0; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + + for (left = size, i = 0; left > 0; left -= len, i++) { + struct page *page = pages[i]; + bool same_page = false; + + len = min_t(size_t, PAGE_SIZE - offset, left); + if (bio_add_hw_page(q, bio, page, len, offset, + max_append_sectors, &same_page) != len) + return -EINVAL; + if (same_page) + put_page(page); + offset = 0; + } + + iov_iter_advance(iter, size); + return 0; +} + /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to @@ -1054,10 +1098,16 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return -EINVAL; do { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (WARN_ON_ONCE(is_bvec)) + return -EINVAL; + ret = __bio_iov_append_get_pages(bio, iter); + } else { + if (is_bvec) + ret = __bio_iov_bvec_add_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); if (is_bvec) @@ -1460,6 +1510,10 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); + /* Zone append commands cannot be split */ + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return NULL; + split = bio_clone_fast(bio, gfp, bs); if (!split) return NULL; diff --git a/block/blk-core.c b/block/blk-core.c index 409e1a6b73b0..cf5b2163edfe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -135,6 +135,7 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), + REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), @@ -240,6 +241,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); + if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { + /* + * Partial zone append completions cannot be supported as the + * BIO fragments may end up not being written sequentially. + */ + if (bio->bi_iter.bi_size) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_iter.bi_sector = rq->__sector; + } + /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); @@ -887,6 +899,41 @@ out: return ret; } +/* + * Check write append to a zoned block device. + */ +static inline blk_status_t blk_check_zone_append(struct request_queue *q, + struct bio *bio) +{ + sector_t pos = bio->bi_iter.bi_sector; + int nr_sectors = bio_sectors(bio); + + /* Only applicable to zoned block devices */ + if (!blk_queue_is_zoned(q)) + return BLK_STS_NOTSUPP; + + /* The bio sector must point to the start of a sequential zone */ + if (pos & (blk_queue_zone_sectors(q) - 1) || + !blk_queue_zone_is_seq(q, pos)) + return BLK_STS_IOERR; + + /* + * Not allowed to cross zone boundaries. Otherwise, the BIO will be + * split and could result in non-contiguous sectors being written in + * different zones. + */ + if (nr_sectors > q->limits.chunk_sectors) + return BLK_STS_IOERR; + + /* Make sure the BIO is small enough and will not get split */ + if (nr_sectors > q->limits.max_zone_append_sectors) + return BLK_STS_IOERR; + + bio->bi_opf |= REQ_NOMERGE; + + return BLK_STS_OK; +} + static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { @@ -959,6 +1006,11 @@ generic_make_request_checks(struct bio *bio) if (!q->limits.max_write_same_sectors) goto not_supported; break; + case REQ_OP_ZONE_APPEND: + status = blk_check_zone_append(q, bio); + if (status != BLK_STS_OK) + goto end_io; + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: diff --git a/block/blk-mq.c b/block/blk-mq.c index d82cefb0474f..9ee695bdf873 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1183,6 +1183,19 @@ static void blk_mq_handle_dev_resource(struct request *rq, __blk_mq_requeue_request(rq); } +static void blk_mq_handle_zone_resource(struct request *rq, + struct list_head *zone_list) +{ + /* + * If we end up here it is because we cannot dispatch a request to a + * specific zone due to LLD level zone-write locking or other zone + * related resource not being available. In this case, set the request + * aside in zone_list for retrying it later. + */ + list_add(&rq->queuelist, zone_list); + __blk_mq_requeue_request(rq); +} + /* * Returns true if we did some work AND can potentially do more. */ @@ -1195,6 +1208,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, int errors, queued; blk_status_t ret = BLK_STS_OK; bool no_budget_avail = false; + LIST_HEAD(zone_list); if (list_empty(list)) return false; @@ -1256,6 +1270,16 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_mq_handle_dev_resource(rq, list); break; + } else if (ret == BLK_STS_ZONE_RESOURCE) { + /* + * Move the request to zone_list and keep going through + * the dispatch list to find more requests the drive can + * accept. + */ + blk_mq_handle_zone_resource(rq, &zone_list); + if (list_empty(list)) + break; + continue; } if (unlikely(ret != BLK_STS_OK)) { @@ -1267,6 +1291,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, queued++; } while (!list_empty(list)); + if (!list_empty(&zone_list)) + list_splice_tail_init(&zone_list, list); + hctx->dispatched[queued_to_index(queued)]++; /* diff --git a/block/blk-settings.c b/block/blk-settings.c index 2ab1967b9716..9a2c23cd9700 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -48,6 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_write_zeroes_sectors = 0; + lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; @@ -83,6 +84,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; + lim->max_zone_append_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -221,6 +223,33 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); +/** + * blk_queue_max_zone_append_sectors - set max sectors for a single zone append + * @q: the request queue for the device + * @max_zone_append_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_zone_append_sectors(struct request_queue *q, + unsigned int max_zone_append_sectors) +{ + unsigned int max_sectors; + + if (WARN_ON(!blk_queue_is_zoned(q))) + return; + + max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); + max_sectors = min(q->limits.chunk_sectors, max_sectors); + + /* + * Signal eventual driver bugs resulting in the max_zone_append sectors limit + * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, + * or the max_hw_sectors limit not set. + */ + WARN_ON(!max_sectors); + + q->limits.max_zone_append_sectors = max_sectors; +} +EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); + /** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device @@ -470,6 +499,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_write_same_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); + t->max_zone_append_sectors = min(t->max_zone_append_sectors, + b->max_zone_append_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index fca9b158f4a0..02643e149d5e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -218,6 +218,13 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_zeroes_sectors << 9); } +static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) +{ + unsigned long long max_sectors = q->limits.max_zone_append_sectors; + + return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -639,6 +646,11 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = { .show = queue_write_zeroes_max_show, }; +static struct queue_sysfs_entry queue_zone_append_max_entry = { + .attr = {.name = "zone_append_max_bytes", .mode = 0444 }, + .show = queue_zone_append_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = 0644 }, .show = queue_show_nonrot, @@ -749,6 +761,7 @@ static struct attribute *queue_attrs[] = { &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, + &queue_zone_append_max_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0a73230a8f16..82ad0244b3d0 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1706,6 +1706,7 @@ out_put_budget: case BLK_STS_OK: break; case BLK_STS_RESOURCE: + case BLK_STS_ZONE_RESOURCE: if (atomic_read(&sdev->device_busy) || scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 90895d594e64..b90dca1fa430 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -63,6 +63,18 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) +/* + * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone + * related resources are unavailable, but the driver can guarantee the queue + * will be rerun in the future once the resources become available again. + * + * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references + * a zone specific resource and IO to a different zone on the same device could + * still be served. Examples of that are zones that are write-locked, but a read + * to the same zone could be served. + */ +#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -296,6 +308,8 @@ enum req_opf { REQ_OP_ZONE_CLOSE = 11, /* Transition a zone to full */ REQ_OP_ZONE_FINISH = 12, + /* write data at the current zone write pointer */ + REQ_OP_ZONE_APPEND = 13, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d736acf7f564..5647c78bb876 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -332,6 +332,7 @@ struct queue_limits { unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; unsigned int max_write_zeroes_sectors; + unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -750,6 +751,9 @@ static inline bool rq_mergeable(struct request *rq) if (req_op(rq) == REQ_OP_WRITE_ZEROES) return false; + if (req_op(rq) == REQ_OP_ZONE_APPEND) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) @@ -1084,6 +1088,8 @@ extern void blk_queue_max_write_same_sectors(struct request_queue *q, extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); +extern void blk_queue_max_zone_append_sectors(struct request_queue *q, + unsigned int max_zone_append_sectors); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); @@ -1301,6 +1307,11 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } +static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) +{ + return q->limits.max_zone_append_sectors; +} + static inline unsigned queue_logical_block_size(const struct request_queue *q) { int retval = 512; -- cgit v1.2.3 From 1392d37018d4f68c5bb2c98dae9a018b73926865 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 12 May 2020 17:55:48 +0900 Subject: block: introduce blk_req_zone_write_trylock Introduce blk_req_zone_write_trylock(), which either grabs the write-lock for a sequential zone or returns false, if the zone is already locked. Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 14 ++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index f87956e0dcaf..c822cfa7a102 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -82,6 +82,20 @@ bool blk_req_needs_zone_write_lock(struct request *rq) } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); +bool blk_req_zone_write_trylock(struct request *rq) +{ + unsigned int zno = blk_rq_zone_no(rq); + + if (test_and_set_bit(zno, rq->q->seq_zones_wlock)) + return false; + + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; + + return true; +} +EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); + void __blk_req_zone_write_lock(struct request *rq) { if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5647c78bb876..73f4f4f1df92 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1738,6 +1738,7 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, #ifdef CONFIG_BLK_DEV_ZONED bool blk_req_needs_zone_write_lock(struct request *rq); +bool blk_req_zone_write_trylock(struct request *rq); void __blk_req_zone_write_lock(struct request *rq); void __blk_req_zone_write_unlock(struct request *rq); -- cgit v1.2.3 From e732671aa5f67232cf760666a15242dead003362 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 12 May 2020 17:55:49 +0900 Subject: block: Modify revalidate zones Modify the interface of blk_revalidate_disk_zones() to add an optional driver callback function that a driver can use to extend processing done during zone revalidation. The callback, if defined, is executed with the device request queue frozen, after all zones have been inspected. Signed-off-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 9 ++++++++- drivers/block/null_blk_zoned.c | 2 +- include/linux/blkdev.h | 3 ++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index c822cfa7a102..23831fa8701d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -471,14 +471,19 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, /** * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps * @disk: Target disk + * @update_driver_data: Callback to update driver data on the frozen disk * * Helper function for low-level device drivers to (re) allocate and initialize * a disk request queue zone bitmaps. This functions should normally be called * within the disk ->revalidate method for blk-mq based drivers. For BIO based * drivers only q->nr_zones needs to be updated so that the sysfs exposed value * is correct. + * If the @update_driver_data callback function is not NULL, the callback is + * executed with the device request queue frozen after all zones have been + * checked. */ -int blk_revalidate_disk_zones(struct gendisk *disk) +int blk_revalidate_disk_zones(struct gendisk *disk, + void (*update_driver_data)(struct gendisk *disk)) { struct request_queue *q = disk->queue; struct blk_revalidate_zone_args args = { @@ -512,6 +517,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk) q->nr_zones = args.nr_zones; swap(q->seq_zones_wlock, args.seq_zones_wlock); swap(q->conv_zones_bitmap, args.conv_zones_bitmap); + if (update_driver_data) + update_driver_data(disk); ret = 0; } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 9e4bcdad1a80..46641df2e58e 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -73,7 +73,7 @@ int null_register_zoned_dev(struct nullb *nullb) struct request_queue *q = nullb->q; if (queue_is_mq(q)) - return blk_revalidate_disk_zones(nullb->disk); + return blk_revalidate_disk_zones(nullb->disk, NULL); blk_queue_chunk_sectors(q, nullb->dev->zone_size_sects); q->nr_zones = blkdev_nr_zones(nullb->disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 73f4f4f1df92..5360696d85ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -358,7 +358,8 @@ unsigned int blkdev_nr_zones(struct gendisk *disk); extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); -extern int blk_revalidate_disk_zones(struct gendisk *disk); +int blk_revalidate_disk_zones(struct gendisk *disk, + void (*update_driver_data)(struct gendisk *disk)); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); -- cgit v1.2.3 From 71ac860af824ce9ebbbe8de20044e777c0fc33b9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 14 May 2020 16:45:09 +0800 Subject: block: move blk_io_schedule() out of header file blk_io_schedule() isn't called from performance sensitive code path, and it is easier to maintain by exporting it as symbol. Also blk_io_schedule() is only called by CONFIG_BLOCK code, so it is safe to do this way. Meantime fixes build failure when CONFIG_BLOCK is off. Cc: Christoph Hellwig Fixes: e6249cdd46e4 ("block: add blk_io_schedule() for avoiding task hung in sync dio") Reported-by: Satya Tangirala Tested-by: Satya Tangirala Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +++++++++++++ include/linux/blkdev.h | 14 ++------------ 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index cf5b2163edfe..7c1587b45427 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -38,6 +38,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -1812,6 +1813,18 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); +void blk_io_schedule(void) +{ + /* Prevent hang_check timer from firing at us during very long I/O */ + unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; + + if (timeout) + io_schedule_timeout(timeout); + else + io_schedule(); +} +EXPORT_SYMBOL_GPL(blk_io_schedule); + int __init blk_dev_init(void) { BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5360696d85ff..f9e4b21b051b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,7 +27,6 @@ #include #include #include -#include struct module; struct scsi_ioctl_command; @@ -1221,6 +1220,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) !list_empty(&plug->cb_list)); } +extern void blk_io_schedule(void); + extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); @@ -1851,15 +1852,4 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } -static inline void blk_io_schedule(void) -{ - /* Prevent hang_check timer from firing at us during very long I/O */ - unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; - - if (timeout) - io_schedule_timeout(timeout); - else - io_schedule(); -} - #endif -- cgit v1.2.3 From 1b2628397058ebce7277480960b29c788138de90 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 14 May 2020 00:37:17 +0000 Subject: block: Keyslot Manager for Inline Encryption Inline Encryption hardware allows software to specify an encryption context (an encryption key, crypto algorithm, data unit num, data unit size) along with a data transfer request to a storage device, and the inline encryption hardware will use that context to en/decrypt the data. The inline encryption hardware is part of the storage device, and it conceptually sits on the data path between system memory and the storage device. Inline Encryption hardware implementations often function around the concept of "keyslots". These implementations often have a limited number of "keyslots", each of which can hold a key (we say that a key can be "programmed" into a keyslot). Requests made to the storage device may have a keyslot and a data unit number associated with them, and the inline encryption hardware will en/decrypt the data in the requests using the key programmed into that associated keyslot and the data unit number specified with the request. As keyslots are limited, and programming keys may be expensive in many implementations, and multiple requests may use exactly the same encryption contexts, we introduce a Keyslot Manager to efficiently manage keyslots. We also introduce a blk_crypto_key, which will represent the key that's programmed into keyslots managed by keyslot managers. The keyslot manager also functions as the interface that upper layers will use to program keys into inline encryption hardware. For more information on the Keyslot Manager, refer to documentation found in block/keyslot-manager.c and linux/keyslot-manager.h. Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Kconfig | 7 + block/Makefile | 1 + block/keyslot-manager.c | 378 ++++++++++++++++++++++++++++++++++++++++ include/linux/blk-crypto.h | 52 ++++++ include/linux/blkdev.h | 6 + include/linux/keyslot-manager.h | 106 +++++++++++ 6 files changed, 550 insertions(+) create mode 100644 block/keyslot-manager.c create mode 100644 include/linux/blk-crypto.h create mode 100644 include/linux/keyslot-manager.h (limited to 'include/linux') diff --git a/block/Kconfig b/block/Kconfig index 41cb34b0fcd1..f8870c316a03 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -186,6 +186,13 @@ config BLK_SED_OPAL Enabling this option enables users to setup/unlock/lock Locking ranges for SED devices using the Opal protocol. +config BLK_INLINE_ENCRYPTION + bool "Enable inline encryption support in block layer" + help + Build the blk-crypto subsystem. Enabling this lets the + block layer handle encryption, so users can take + advantage of inline encryption hardware if present. + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 206b96e9387f..fc963e4676b0 100644 --- a/block/Makefile +++ b/block/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c new file mode 100644 index 000000000000..fcd3fd469d7c --- /dev/null +++ b/block/keyslot-manager.c @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/** + * DOC: The Keyslot Manager + * + * Many devices with inline encryption support have a limited number of "slots" + * into which encryption contexts may be programmed, and requests can be tagged + * with a slot number to specify the key to use for en/decryption. + * + * As the number of slots is limited, and programming keys is expensive on + * many inline encryption hardware, we don't want to program the same key into + * multiple slots - if multiple requests are using the same key, we want to + * program just one slot with that key and use that slot for all requests. + * + * The keyslot manager manages these keyslots appropriately, and also acts as + * an abstraction between the inline encryption hardware and the upper layers. + * + * Lower layer devices will set up a keyslot manager in their request queue + * and tell it how to perform device specific operations like programming/ + * evicting keys from keyslots. + * + * Upper layers will call blk_ksm_get_slot_for_key() to program a + * key into some slot in the inline encryption hardware. + */ +#include +#include +#include +#include +#include +#include + +struct blk_ksm_keyslot { + atomic_t slot_refs; + struct list_head idle_slot_node; + struct hlist_node hash_node; + const struct blk_crypto_key *key; + struct blk_keyslot_manager *ksm; +}; + +static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm) +{ + /* + * Calling into the driver requires ksm->lock held and the device + * resumed. But we must resume the device first, since that can acquire + * and release ksm->lock via blk_ksm_reprogram_all_keys(). + */ + if (ksm->dev) + pm_runtime_get_sync(ksm->dev); + down_write(&ksm->lock); +} + +static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) +{ + up_write(&ksm->lock); + if (ksm->dev) + pm_runtime_put_sync(ksm->dev); +} + +/** + * blk_ksm_init() - Initialize a keyslot manager + * @ksm: The keyslot_manager to initialize. + * @num_slots: The number of key slots to manage. + * + * Allocate memory for keyslots and initialize a keyslot manager. Called by + * e.g. storage drivers to set up a keyslot manager in their request_queue. + * + * Return: 0 on success, or else a negative error code. + */ +int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots) +{ + unsigned int slot; + unsigned int i; + unsigned int slot_hashtable_size; + + memset(ksm, 0, sizeof(*ksm)); + + if (num_slots == 0) + return -EINVAL; + + ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL); + if (!ksm->slots) + return -ENOMEM; + + ksm->num_slots = num_slots; + + init_rwsem(&ksm->lock); + + init_waitqueue_head(&ksm->idle_slots_wait_queue); + INIT_LIST_HEAD(&ksm->idle_slots); + + for (slot = 0; slot < num_slots; slot++) { + ksm->slots[slot].ksm = ksm; + list_add_tail(&ksm->slots[slot].idle_slot_node, + &ksm->idle_slots); + } + + spin_lock_init(&ksm->idle_slots_lock); + + slot_hashtable_size = roundup_pow_of_two(num_slots); + ksm->log_slot_ht_size = ilog2(slot_hashtable_size); + ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size, + sizeof(ksm->slot_hashtable[0]), + GFP_KERNEL); + if (!ksm->slot_hashtable) + goto err_destroy_ksm; + for (i = 0; i < slot_hashtable_size; i++) + INIT_HLIST_HEAD(&ksm->slot_hashtable[i]); + + return 0; + +err_destroy_ksm: + blk_ksm_destroy(ksm); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_ksm_init); + +static inline struct hlist_head * +blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)]; +} + +static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot) +{ + struct blk_keyslot_manager *ksm = slot->ksm; + unsigned long flags; + + spin_lock_irqsave(&ksm->idle_slots_lock, flags); + list_del(&slot->idle_slot_node); + spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); +} + +static struct blk_ksm_keyslot *blk_ksm_find_keyslot( + struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key); + struct blk_ksm_keyslot *slotp; + + hlist_for_each_entry(slotp, head, hash_node) { + if (slotp->key == key) + return slotp; + } + return NULL; +} + +static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot( + struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + struct blk_ksm_keyslot *slot; + + slot = blk_ksm_find_keyslot(ksm, key); + if (!slot) + return NULL; + if (atomic_inc_return(&slot->slot_refs) == 1) { + /* Took first reference to this slot; remove it from LRU list */ + blk_ksm_remove_slot_from_lru_list(slot); + } + return slot; +} + +unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot) +{ + return slot - slot->ksm->slots; +} +EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx); + +/** + * blk_ksm_get_slot_for_key() - Program a key into a keyslot. + * @ksm: The keyslot manager to program the key into. + * @key: Pointer to the key object to program, including the raw key, crypto + * mode, and data unit size. + * @slot_ptr: A pointer to return the pointer of the allocated keyslot. + * + * Get a keyslot that's been programmed with the specified key. If one already + * exists, return it with incremented refcount. Otherwise, wait for a keyslot + * to become idle and program it. + * + * Context: Process context. Takes and releases ksm->lock. + * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the + * allocated keyslot), or some other blk_status_t otherwise (and + * keyslot is set to NULL). + */ +blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + struct blk_ksm_keyslot **slot_ptr) +{ + struct blk_ksm_keyslot *slot; + int slot_idx; + int err; + + *slot_ptr = NULL; + down_read(&ksm->lock); + slot = blk_ksm_find_and_grab_keyslot(ksm, key); + up_read(&ksm->lock); + if (slot) + goto success; + + for (;;) { + blk_ksm_hw_enter(ksm); + slot = blk_ksm_find_and_grab_keyslot(ksm, key); + if (slot) { + blk_ksm_hw_exit(ksm); + goto success; + } + + /* + * If we're here, that means there wasn't a slot that was + * already programmed with the key. So try to program it. + */ + if (!list_empty(&ksm->idle_slots)) + break; + + blk_ksm_hw_exit(ksm); + wait_event(ksm->idle_slots_wait_queue, + !list_empty(&ksm->idle_slots)); + } + + slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot, + idle_slot_node); + slot_idx = blk_ksm_get_slot_idx(slot); + + err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx); + if (err) { + wake_up(&ksm->idle_slots_wait_queue); + blk_ksm_hw_exit(ksm); + return errno_to_blk_status(err); + } + + /* Move this slot to the hash list for the new key. */ + if (slot->key) + hlist_del(&slot->hash_node); + slot->key = key; + hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key)); + + atomic_set(&slot->slot_refs, 1); + + blk_ksm_remove_slot_from_lru_list(slot); + + blk_ksm_hw_exit(ksm); +success: + *slot_ptr = slot; + return BLK_STS_OK; +} + +/** + * blk_ksm_put_slot() - Release a reference to a slot + * @slot: The keyslot to release the reference of. + * + * Context: Any context. + */ +void blk_ksm_put_slot(struct blk_ksm_keyslot *slot) +{ + struct blk_keyslot_manager *ksm; + unsigned long flags; + + if (!slot) + return; + + ksm = slot->ksm; + + if (atomic_dec_and_lock_irqsave(&slot->slot_refs, + &ksm->idle_slots_lock, flags)) { + list_add_tail(&slot->idle_slot_node, &ksm->idle_slots); + spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); + wake_up(&ksm->idle_slots_wait_queue); + } +} + +/** + * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is + * supported by a ksm. + * @ksm: The keyslot manager to check + * @cfg: The crypto configuration to check for. + * + * Checks for crypto_mode/data unit size/dun bytes support. + * + * Return: Whether or not this ksm supports the specified crypto config. + */ +bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, + const struct blk_crypto_config *cfg) +{ + if (!ksm) + return false; + if (!(ksm->crypto_modes_supported[cfg->crypto_mode] & + cfg->data_unit_size)) + return false; + if (ksm->max_dun_bytes_supported < cfg->dun_bytes) + return false; + return true; +} + +/** + * blk_ksm_evict_key() - Evict a key from the lower layer device. + * @ksm: The keyslot manager to evict from + * @key: The key to evict + * + * Find the keyslot that the specified key was programmed into, and evict that + * slot from the lower layer device. The slot must not be in use by any + * in-flight IO when this function is called. + * + * Context: Process context. Takes and releases ksm->lock. + * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY + * if the keyslot is still in use, or another -errno value on other + * error. + */ +int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + struct blk_ksm_keyslot *slot; + int err = 0; + + blk_ksm_hw_enter(ksm); + slot = blk_ksm_find_keyslot(ksm, key); + if (!slot) + goto out_unlock; + + if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + err = -EBUSY; + goto out_unlock; + } + err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, + blk_ksm_get_slot_idx(slot)); + if (err) + goto out_unlock; + + hlist_del(&slot->hash_node); + slot->key = NULL; + err = 0; +out_unlock: + blk_ksm_hw_exit(ksm); + return err; +} + +/** + * blk_ksm_reprogram_all_keys() - Re-program all keyslots. + * @ksm: The keyslot manager + * + * Re-program all keyslots that are supposed to have a key programmed. This is + * intended only for use by drivers for hardware that loses its keys on reset. + * + * Context: Process context. Takes and releases ksm->lock. + */ +void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) +{ + unsigned int slot; + + /* This is for device initialization, so don't resume the device */ + down_write(&ksm->lock); + for (slot = 0; slot < ksm->num_slots; slot++) { + const struct blk_crypto_key *key = ksm->slots[slot].key; + int err; + + if (!key) + continue; + + err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot); + WARN_ON(err); + } + up_write(&ksm->lock); +} +EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys); + +void blk_ksm_destroy(struct blk_keyslot_manager *ksm) +{ + if (!ksm) + return; + kvfree(ksm->slot_hashtable); + memzero_explicit(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots); + kvfree(ksm->slots); + memzero_explicit(ksm, sizeof(*ksm)); +} +EXPORT_SYMBOL_GPL(blk_ksm_destroy); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h new file mode 100644 index 000000000000..4e77938c3d0e --- /dev/null +++ b/include/linux/blk-crypto.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_BLK_CRYPTO_H +#define __LINUX_BLK_CRYPTO_H + +enum blk_crypto_mode_num { + BLK_ENCRYPTION_MODE_INVALID, + BLK_ENCRYPTION_MODE_AES_256_XTS, + BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, + BLK_ENCRYPTION_MODE_ADIANTUM, + BLK_ENCRYPTION_MODE_MAX, +}; + +#define BLK_CRYPTO_MAX_KEY_SIZE 64 +/** + * struct blk_crypto_config - an inline encryption key's crypto configuration + * @crypto_mode: encryption algorithm this key is for + * @data_unit_size: the data unit size for all encryption/decryptions with this + * key. This is the size in bytes of each individual plaintext and + * ciphertext. This is always a power of 2. It might be e.g. the + * filesystem block size or the disk sector size. + * @dun_bytes: the maximum number of bytes of DUN used when using this key + */ +struct blk_crypto_config { + enum blk_crypto_mode_num crypto_mode; + unsigned int data_unit_size; + unsigned int dun_bytes; +}; + +/** + * struct blk_crypto_key - an inline encryption key + * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this + * key + * @data_unit_size_bits: log2 of data_unit_size + * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) + * @raw: the raw bytes of this key. Only the first @size bytes are used. + * + * A blk_crypto_key is immutable once created, and many bios can reference it at + * the same time. It must not be freed until all bios using it have completed + * and it has been evicted from all devices on which it may have been used. + */ +struct blk_crypto_key { + struct blk_crypto_config crypto_cfg; + unsigned int data_unit_size_bits; + unsigned int size; + u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; +}; + +#endif /* __LINUX_BLK_CRYPTO_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f9e4b21b051b..354e44eebef9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -43,6 +43,7 @@ struct pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; +struct blk_keyslot_manager; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -468,6 +469,11 @@ struct request_queue { unsigned int dma_pad_mask; unsigned int dma_alignment; +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + /* Inline crypto capabilities */ + struct blk_keyslot_manager *ksm; +#endif + unsigned int rq_timeout; int poll_nsec; diff --git a/include/linux/keyslot-manager.h b/include/linux/keyslot-manager.h new file mode 100644 index 000000000000..18f3f5346843 --- /dev/null +++ b/include/linux/keyslot-manager.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_KEYSLOT_MANAGER_H +#define __LINUX_KEYSLOT_MANAGER_H + +#include +#include + +struct blk_keyslot_manager; + +/** + * struct blk_ksm_ll_ops - functions to manage keyslots in hardware + * @keyslot_program: Program the specified key into the specified slot in the + * inline encryption hardware. + * @keyslot_evict: Evict key from the specified keyslot in the hardware. + * The key is provided so that e.g. dm layers can evict + * keys from the devices that they map over. + * Returns 0 on success, -errno otherwise. + * + * This structure should be provided by storage device drivers when they set up + * a keyslot manager - this structure holds the function ptrs that the keyslot + * manager will use to manipulate keyslots in the hardware. + */ +struct blk_ksm_ll_ops { + int (*keyslot_program)(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot); + int (*keyslot_evict)(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot); +}; + +struct blk_keyslot_manager { + /* + * The struct blk_ksm_ll_ops that this keyslot manager will use + * to perform operations like programming and evicting keys on the + * device + */ + struct blk_ksm_ll_ops ksm_ll_ops; + + /* + * The maximum number of bytes supported for specifying the data unit + * number. + */ + unsigned int max_dun_bytes_supported; + + /* + * Array of size BLK_ENCRYPTION_MODE_MAX of bitmasks that represents + * whether a crypto mode and data unit size are supported. The i'th + * bit of crypto_mode_supported[crypto_mode] is set iff a data unit + * size of (1 << i) is supported. We only support data unit sizes + * that are powers of 2. + */ + unsigned int crypto_modes_supported[BLK_ENCRYPTION_MODE_MAX]; + + /* Device for runtime power management (NULL if none) */ + struct device *dev; + + /* Here onwards are *private* fields for internal keyslot manager use */ + + unsigned int num_slots; + + /* Protects programming and evicting keys from the device */ + struct rw_semaphore lock; + + /* List of idle slots, with least recently used slot at front */ + wait_queue_head_t idle_slots_wait_queue; + struct list_head idle_slots; + spinlock_t idle_slots_lock; + + /* + * Hash table which maps struct *blk_crypto_key to keyslots, so that we + * can find a key's keyslot in O(1) time rather than O(num_slots). + * Protected by 'lock'. + */ + struct hlist_head *slot_hashtable; + unsigned int log_slot_ht_size; + + /* Per-keyslot data */ + struct blk_ksm_keyslot *slots; +}; + +int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots); + +blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + struct blk_ksm_keyslot **slot_ptr); + +unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot); + +void blk_ksm_put_slot(struct blk_ksm_keyslot *slot); + +bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, + const struct blk_crypto_config *cfg); + +int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key); + +void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm); + +void blk_ksm_destroy(struct blk_keyslot_manager *ksm); + +#endif /* __LINUX_KEYSLOT_MANAGER_H */ -- cgit v1.2.3 From a892c8d52c02284076fbbacae6692aa5c5807d11 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 14 May 2020 00:37:18 +0000 Subject: block: Inline encryption support for blk-mq We must have some way of letting a storage device driver know what encryption context it should use for en/decrypting a request. However, it's the upper layers (like the filesystem/fscrypt) that know about and manages encryption contexts. As such, when the upper layer submits a bio to the block layer, and this bio eventually reaches a device driver with support for inline encryption, the device driver will need to have been told the encryption context for that bio. We want to communicate the encryption context from the upper layer to the storage device along with the bio, when the bio is submitted to the block layer. To do this, we add a struct bio_crypt_ctx to struct bio, which can represent an encryption context (note that we can't use the bi_private field in struct bio to do this because that field does not function to pass information across layers in the storage stack). We also introduce various functions to manipulate the bio_crypt_ctx and make the bio/request merging logic aware of the bio_crypt_ctx. We also make changes to blk-mq to make it handle bios with encryption contexts. blk-mq can merge many bios into the same request. These bios need to have contiguous data unit numbers (the necessary changes to blk-merge are also made to ensure this) - as such, it suffices to keep the data unit number of just the first bio, since that's all a storage driver needs to infer the data unit number to use for each data block in each bio in a request. blk-mq keeps track of the encryption context to be used for all the bios in a request with the request's rq_crypt_ctx. When the first bio is added to an empty request, blk-mq will program the encryption context of that bio into the request_queue's keyslot manager, and store the returned keyslot in the request's rq_crypt_ctx. All the functions to operate on encryption contexts are in blk-crypto.c. Upper layers only need to call bio_crypt_set_ctx with the encryption key, algorithm and data_unit_num; they don't have to worry about getting a keyslot for each encryption context, as blk-mq/blk-crypto handles that. Blk-crypto also makes it possible for request-based layered devices like dm-rq to make use of inline encryption hardware by cloning the rq_crypt_ctx and programming a keyslot in the new request_queue when necessary. Note that any user of the block layer can submit bios with an encryption context, such as filesystems, device-mapper targets, etc. Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/bio.c | 6 + block/blk-core.c | 27 +++- block/blk-crypto-internal.h | 166 +++++++++++++++++++ block/blk-crypto.c | 376 ++++++++++++++++++++++++++++++++++++++++++++ block/blk-map.c | 1 + block/blk-merge.c | 11 ++ block/blk-mq.c | 13 ++ block/blk.h | 2 + block/bounce.c | 2 + drivers/md/dm.c | 3 + include/linux/blk-crypto.h | 71 +++++++++ include/linux/blk_types.h | 6 + include/linux/blkdev.h | 5 + 14 files changed, 684 insertions(+), 7 deletions(-) create mode 100644 block/blk-crypto-internal.h create mode 100644 block/blk-crypto.c (limited to 'include/linux') diff --git a/block/Makefile b/block/Makefile index fc963e4676b0..3ee88d3e807d 100644 --- a/block/Makefile +++ b/block/Makefile @@ -36,4 +36,4 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o diff --git a/block/bio.c b/block/bio.c index e4c46e2bd5ba..1594804fe8bc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "blk.h" @@ -237,6 +238,8 @@ void bio_uninit(struct bio *bio) if (bio_integrity(bio)) bio_integrity_free(bio); + + bio_crypt_free_ctx(bio); } EXPORT_SYMBOL(bio_uninit); @@ -708,6 +711,8 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) __bio_clone_fast(b, bio); + bio_crypt_clone(b, bio, gfp_mask); + if (bio_integrity(bio)) { int ret; @@ -1172,6 +1177,7 @@ void bio_advance(struct bio *bio, unsigned bytes) if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); + bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(bio_advance); diff --git a/block/blk-core.c b/block/blk-core.c index 7c1587b45427..1e97f9973523 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -121,6 +122,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->start_time_ns = ktime_get_ns(); rq->part = NULL; refcount_set(&rq->ref, 1); + blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); @@ -652,6 +654,8 @@ bool bio_attempt_back_merge(struct request *req, struct bio *bio, req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; + bio_crypt_free_ctx(bio); + blk_account_io_start(req, false); return true; } @@ -676,6 +680,8 @@ bool bio_attempt_front_merge(struct request *req, struct bio *bio, req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; + bio_crypt_do_front_merge(req, bio); + blk_account_io_start(req, false); return true; } @@ -1125,10 +1131,12 @@ blk_qc_t generic_make_request(struct bio *bio) /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - if (q->make_request_fn) - ret = q->make_request_fn(q, bio); - else - ret = blk_mq_make_request(q, bio); + if (blk_crypto_bio_prep(&bio)) { + if (q->make_request_fn) + ret = q->make_request_fn(q, bio); + else + ret = blk_mq_make_request(q, bio); + } blk_queue_exit(q); @@ -1167,7 +1175,7 @@ EXPORT_SYMBOL(generic_make_request); blk_qc_t direct_make_request(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; - blk_qc_t ret; + blk_qc_t ret = BLK_QC_T_NONE; if (WARN_ON_ONCE(q->make_request_fn)) { bio_io_error(bio); @@ -1177,7 +1185,8 @@ blk_qc_t direct_make_request(struct bio *bio) return BLK_QC_T_NONE; if (unlikely(bio_queue_enter(bio))) return BLK_QC_T_NONE; - ret = blk_mq_make_request(q, bio); + if (blk_crypto_bio_prep(&bio)) + ret = blk_mq_make_request(q, bio); blk_queue_exit(q); return ret; } @@ -1309,6 +1318,9 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; + if (blk_crypto_insert_cloned_request(rq)) + return BLK_STS_IOERR; + if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); @@ -1673,6 +1685,9 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; + if (rq->bio) + blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask); + return 0; free_and_out: diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h new file mode 100644 index 000000000000..796f757fe8e9 --- /dev/null +++ b/block/blk-crypto-internal.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_BLK_CRYPTO_INTERNAL_H +#define __LINUX_BLK_CRYPTO_INTERNAL_H + +#include +#include + +/* Represents a crypto mode supported by blk-crypto */ +struct blk_crypto_mode { + unsigned int keysize; /* key size in bytes */ + unsigned int ivsize; /* iv size in bytes */ +}; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int inc); + +bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); + +bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, + struct bio_crypt_ctx *bc2); + +static inline bool bio_crypt_ctx_back_mergeable(struct request *req, + struct bio *bio) +{ + return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), + bio->bi_crypt_context); +} + +static inline bool bio_crypt_ctx_front_mergeable(struct request *req, + struct bio *bio) +{ + return bio_crypt_ctx_mergeable(bio->bi_crypt_context, + bio->bi_iter.bi_size, req->crypt_ctx); +} + +static inline bool bio_crypt_ctx_merge_rq(struct request *req, + struct request *next) +{ + return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), + next->crypt_ctx); +} + +static inline void blk_crypto_rq_set_defaults(struct request *rq) +{ + rq->crypt_ctx = NULL; + rq->crypt_keyslot = NULL; +} + +static inline bool blk_crypto_rq_is_encrypted(struct request *rq) +{ + return rq->crypt_ctx; +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_front_mergeable(struct request *req, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_back_mergeable(struct request *req, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_merge_rq(struct request *req, + struct request *next) +{ + return true; +} + +static inline void blk_crypto_rq_set_defaults(struct request *rq) { } + +static inline bool blk_crypto_rq_is_encrypted(struct request *rq) +{ + return false; +} + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION */ + +void __bio_crypt_advance(struct bio *bio, unsigned int bytes); +static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) +{ + if (bio_has_crypt_ctx(bio)) + __bio_crypt_advance(bio, bytes); +} + +void __bio_crypt_free_ctx(struct bio *bio); +static inline void bio_crypt_free_ctx(struct bio *bio) +{ + if (bio_has_crypt_ctx(bio)) + __bio_crypt_free_ctx(bio); +} + +static inline void bio_crypt_do_front_merge(struct request *rq, + struct bio *bio) +{ +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (bio_has_crypt_ctx(bio)) + memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, + sizeof(rq->crypt_ctx->bc_dun)); +#endif +} + +bool __blk_crypto_bio_prep(struct bio **bio_ptr); +static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) +{ + if (bio_has_crypt_ctx(*bio_ptr)) + return __blk_crypto_bio_prep(bio_ptr); + return true; +} + +blk_status_t __blk_crypto_init_request(struct request *rq); +static inline blk_status_t blk_crypto_init_request(struct request *rq) +{ + if (blk_crypto_rq_is_encrypted(rq)) + return __blk_crypto_init_request(rq); + return BLK_STS_OK; +} + +void __blk_crypto_free_request(struct request *rq); +static inline void blk_crypto_free_request(struct request *rq) +{ + if (blk_crypto_rq_is_encrypted(rq)) + __blk_crypto_free_request(rq); +} + +void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask); +static inline void blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask) +{ + if (bio_has_crypt_ctx(bio)) + __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); +} + +/** + * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted + * into a request queue. + * @rq: the request being queued + * + * Return: BLK_STS_OK on success, nonzero on error. + */ +static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq) +{ + + if (blk_crypto_rq_is_encrypted(rq)) + return blk_crypto_init_request(rq); + return BLK_STS_OK; +} + +#endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */ diff --git a/block/blk-crypto.c b/block/blk-crypto.c new file mode 100644 index 000000000000..25b981257f5f --- /dev/null +++ b/block/blk-crypto.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/* + * Refer to Documentation/block/inline-encryption.rst for detailed explanation. + */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + +#include +#include +#include +#include +#include + +#include "blk-crypto-internal.h" + +const struct blk_crypto_mode blk_crypto_modes[] = { + [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .keysize = 64, + .ivsize = 16, + }, + [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .keysize = 16, + .ivsize = 16, + }, + [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .keysize = 32, + .ivsize = 32, + }, +}; + +/* + * This number needs to be at least (the number of threads doing IO + * concurrently) * (maximum recursive depth of a bio), so that we don't + * deadlock on crypt_ctx allocations. The default is chosen to be the same + * as the default number of post read contexts in both EXT4 and F2FS. + */ +static int num_prealloc_crypt_ctxs = 128; + +module_param(num_prealloc_crypt_ctxs, int, 0444); +MODULE_PARM_DESC(num_prealloc_crypt_ctxs, + "Number of bio crypto contexts to preallocate"); + +static struct kmem_cache *bio_crypt_ctx_cache; +static mempool_t *bio_crypt_ctx_pool; + +static int __init bio_crypt_ctx_init(void) +{ + size_t i; + + bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0); + if (!bio_crypt_ctx_cache) + goto out_no_mem; + + bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs, + bio_crypt_ctx_cache); + if (!bio_crypt_ctx_pool) + goto out_no_mem; + + /* This is assumed in various places. */ + BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); + + /* Sanity check that no algorithm exceeds the defined limits. */ + for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { + BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); + } + + return 0; +out_no_mem: + panic("Failed to allocate mem for bio crypt ctxs\n"); +} +subsys_initcall(bio_crypt_ctx_init); + +void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, + const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask) +{ + struct bio_crypt_ctx *bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + + bc->bc_key = key; + memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun)); + + bio->bi_crypt_context = bc; +} + +void __bio_crypt_free_ctx(struct bio *bio) +{ + mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool); + bio->bi_crypt_context = NULL; +} + +void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) +{ + dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + *dst->bi_crypt_context = *src->bi_crypt_context; +} +EXPORT_SYMBOL_GPL(__bio_crypt_clone); + +/* Increments @dun by @inc, treating @dun as a multi-limb integer. */ +void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int inc) +{ + int i; + + for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + dun[i] += inc; + /* + * If the addition in this limb overflowed, then we need to + * carry 1 into the next limb. Else the carry is 0. + */ + if (dun[i] < inc) + inc = 1; + else + inc = 0; + } +} + +void __bio_crypt_advance(struct bio *bio, unsigned int bytes) +{ + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + + bio_crypt_dun_increment(bc->bc_dun, + bytes >> bc->bc_key->data_unit_size_bits); +} + +/* + * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to + * @next_dun, treating the DUNs as multi-limb integers. + */ +bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, + unsigned int bytes, + const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) +{ + int i; + unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits; + + for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + if (bc->bc_dun[i] + carry != next_dun[i]) + return false; + /* + * If the addition in this limb overflowed, then we need to + * carry 1 into the next limb. Else the carry is 0. + */ + if ((bc->bc_dun[i] + carry) < carry) + carry = 1; + else + carry = 0; + } + + /* If the DUN wrapped through 0, don't treat it as contiguous. */ + return carry == 0; +} + +/* + * Checks that two bio crypt contexts are compatible - i.e. that + * they are mergeable except for data_unit_num continuity. + */ +static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1, + struct bio_crypt_ctx *bc2) +{ + if (!bc1) + return !bc2; + + return bc2 && bc1->bc_key == bc2->bc_key; +} + +bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) +{ + return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context); +} + +/* + * Checks that two bio crypt contexts are compatible, and also + * that their data_unit_nums are continuous (and can hence be merged) + * in the order @bc1 followed by @bc2. + */ +bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, + struct bio_crypt_ctx *bc2) +{ + if (!bio_crypt_ctx_compatible(bc1, bc2)) + return false; + + return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun); +} + +/* Check that all I/O segments are data unit aligned. */ +static bool bio_crypt_check_alignment(struct bio *bio) +{ + const unsigned int data_unit_size = + bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size; + struct bvec_iter iter; + struct bio_vec bv; + + bio_for_each_segment(bv, bio, iter) { + if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) + return false; + } + + return true; +} + +blk_status_t __blk_crypto_init_request(struct request *rq) +{ + return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, + &rq->crypt_keyslot); +} + +/** + * __blk_crypto_free_request - Uninitialize the crypto fields of a request. + * + * @rq: The request whose crypto fields to uninitialize. + * + * Completely uninitializes the crypto fields of a request. If a keyslot has + * been programmed into some inline encryption hardware, that keyslot is + * released. The rq->crypt_ctx is also freed. + */ +void __blk_crypto_free_request(struct request *rq) +{ + blk_ksm_put_slot(rq->crypt_keyslot); + mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); + blk_crypto_rq_set_defaults(rq); +} + +/** + * __blk_crypto_bio_prep - Prepare bio for inline encryption + * + * @bio_ptr: pointer to original bio pointer + * + * Succeeds if the bio doesn't have inline encryption enabled or if the bio + * crypt context provided for the bio is supported by the underlying device's + * inline encryption hardware. Ends the bio with error otherwise. + * + * Caller must ensure bio has bio_crypt_ctx. + * + * Return: true on success; false on error (and bio->bi_status will be set + * appropriately, and bio_endio() will have been called so bio + * submission should abort). + */ +bool __blk_crypto_bio_prep(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; + blk_status_t blk_st = BLK_STS_IOERR; + + /* Error if bio has no data. */ + if (WARN_ON_ONCE(!bio_has_data(bio))) + goto fail; + + if (!bio_crypt_check_alignment(bio)) + goto fail; + + /* + * Success if device supports the encryption context. + */ + if (!blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + &bc_key->crypto_cfg)) { + blk_st = BLK_STS_NOTSUPP; + goto fail; + } + + return true; +fail: + (*bio_ptr)->bi_status = blk_st; + bio_endio(*bio_ptr); + return false; +} + +/** + * __blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio + * is inserted + * + * @rq: The request to prepare + * @bio: The first bio being inserted into the request + * @gfp_mask: gfp mask + */ +void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask) +{ + if (!rq->crypt_ctx) + rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + *rq->crypt_ctx = *bio->bi_crypt_context; +} + +/** + * blk_crypto_init_key() - Prepare a key for use with blk-crypto + * @blk_key: Pointer to the blk_crypto_key to initialize. + * @raw_key: Pointer to the raw key. Must be the correct length for the chosen + * @crypto_mode; see blk_crypto_modes[]. + * @crypto_mode: identifier for the encryption algorithm to use + * @dun_bytes: number of bytes that will be used to specify the DUN when this + * key is used + * @data_unit_size: the data unit size to use for en/decryption + * + * Return: 0 on success, -errno on failure. The caller is responsible for + * zeroizing both blk_key and raw_key when done with them. + */ +int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, + enum blk_crypto_mode_num crypto_mode, + unsigned int dun_bytes, + unsigned int data_unit_size) +{ + const struct blk_crypto_mode *mode; + + memset(blk_key, 0, sizeof(*blk_key)); + + if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes)) + return -EINVAL; + + mode = &blk_crypto_modes[crypto_mode]; + if (mode->keysize == 0) + return -EINVAL; + + if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE) + return -EINVAL; + + if (!is_power_of_2(data_unit_size)) + return -EINVAL; + + blk_key->crypto_cfg.crypto_mode = crypto_mode; + blk_key->crypto_cfg.dun_bytes = dun_bytes; + blk_key->crypto_cfg.data_unit_size = data_unit_size; + blk_key->data_unit_size_bits = ilog2(data_unit_size); + blk_key->size = mode->keysize; + memcpy(blk_key->raw, raw_key, mode->keysize); + + return 0; +} + +bool blk_crypto_config_supported(struct request_queue *q, + const struct blk_crypto_config *cfg) +{ + return blk_ksm_crypto_cfg_supported(q->ksm, cfg); +} + +/** + * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device + * @key: A key to use on the device + * @q: the request queue for the device + * + * Upper layers must call this function to ensure that the hardware supports + * the key's crypto settings. + * + * Return: 0 on success; -ENOPKG if the hardware doesn't support the key + */ +int blk_crypto_start_using_key(const struct blk_crypto_key *key, + struct request_queue *q) +{ + if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + return 0; + return -ENOPKG; +} + +/** + * blk_crypto_evict_key() - Evict a key from any inline encryption hardware + * it may have been programmed into + * @q: The request queue who's associated inline encryption hardware this key + * might have been programmed into + * @key: The key to evict + * + * Upper layers (filesystems) must call this function to ensure that a key is + * evicted from any hardware that it might have been programmed into. The key + * must not be in use by any in-flight IO when this function is called. + * + * Return: 0 on success or if key is not present in the q's ksm, -err on error. + */ +int blk_crypto_evict_key(struct request_queue *q, + const struct blk_crypto_key *key) +{ + if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + return blk_ksm_evict_key(q->ksm, key); + + return 0; +} diff --git a/block/blk-map.c b/block/blk-map.c index e3e4ac48db45..6e804892d5ec 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -550,6 +550,7 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio) rq->biotail->bi_next = *bio; rq->biotail = *bio; rq->__data_len += (*bio)->bi_iter.bi_size; + bio_crypt_free_ctx(*bio); } return 0; diff --git a/block/blk-merge.c b/block/blk-merge.c index a04e991b5ded..1a9036398214 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -562,6 +562,8 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) if (blk_integrity_rq(req) && integrity_req_gap_back_merge(req, bio)) return 0; + if (!bio_crypt_ctx_back_mergeable(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) { req_set_nomerge(req->q, req); @@ -578,6 +580,8 @@ int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs if (blk_integrity_rq(req) && integrity_req_gap_front_merge(req, bio)) return 0; + if (!bio_crypt_ctx_front_mergeable(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { req_set_nomerge(req->q, req); @@ -627,6 +631,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (blk_integrity_merge_rq(q, req, next) == false) return 0; + if (!bio_crypt_ctx_merge_rq(req, next)) + return 0; + /* Merge is OK... */ req->nr_phys_segments = total_phys_segments; return 1; @@ -851,6 +858,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; + /* Only merge if the crypt contexts are compatible */ + if (!bio_crypt_rq_ctx_compatible(rq, bio)) + return false; + /* must be using the same buffer */ if (req_op(rq) == REQ_OP_WRITE_SAME && !blk_write_same_mergeable(rq->bio, bio)) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9ee695bdf873..4f8adef7fd0d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -317,6 +318,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif + blk_crypto_rq_set_defaults(rq); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); @@ -473,6 +475,7 @@ static void __blk_mq_free_request(struct request *rq) struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; + blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != -1) @@ -1820,6 +1823,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->__sector = bio->bi_iter.bi_sector; rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); + blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); blk_account_io_start(rq, true); } @@ -2021,6 +2025,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct request *same_queue_rq = NULL; unsigned int nr_segs; blk_qc_t cookie; + blk_status_t ret; blk_queue_bounce(q, &bio); __blk_queue_split(q, &bio, &nr_segs); @@ -2054,6 +2059,14 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); + ret = blk_crypto_init_request(rq); + if (ret != BLK_STS_OK) { + bio->bi_status = ret; + bio_endio(bio); + blk_mq_free_request(rq); + return BLK_QC_T_NONE; + } + plug = blk_mq_plug(q, bio); if (unlikely(is_flush_fua)) { /* Bypass scheduler for flush requests */ diff --git a/block/blk.h b/block/blk.h index e5cd350ca379..fc00537026a0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include "blk-crypto-internal.h" #include "blk-mq.h" #include "blk-mq-sched.h" diff --git a/block/bounce.c b/block/bounce.c index f8ed677a1bf7..c3aaed070124 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -267,6 +267,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, break; } + bio_crypt_clone(bio, bio_src, gfp_mask); + if (bio_integrity(bio_src)) { int ret; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 0eb93da44ea2..8921cd79422c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -26,6 +26,7 @@ #include #include #include +#include #define DM_MSG_PREFIX "core" @@ -1334,6 +1335,8 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, __bio_clone_fast(clone, bio); + bio_crypt_clone(clone, bio, GFP_NOIO); + if (bio_integrity(bio)) { int r; diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 4e77938c3d0e..76095b07dd90 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -6,6 +6,8 @@ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H +#include + enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, BLK_ENCRYPTION_MODE_AES_256_XTS, @@ -49,4 +51,73 @@ struct blk_crypto_key { u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; }; +#define BLK_CRYPTO_MAX_IV_SIZE 32 +#define BLK_CRYPTO_DUN_ARRAY_SIZE (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64)) + +/** + * struct bio_crypt_ctx - an inline encryption context + * @bc_key: the key, algorithm, and data unit size to use + * @bc_dun: the data unit number (starting IV) to use + * + * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for + * write requests) or decrypted (for read requests) inline by the storage device + * or controller. + */ +struct bio_crypt_ctx { + const struct blk_crypto_key *bc_key; + u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; +}; + +#include +#include + +struct request; +struct request_queue; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +static inline bool bio_has_crypt_ctx(struct bio *bio) +{ + return bio->bi_crypt_context; +} + +void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, + const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + gfp_t gfp_mask); + +bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, + unsigned int bytes, + const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); + +int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, + enum blk_crypto_mode_num crypto_mode, + unsigned int dun_bytes, + unsigned int data_unit_size); + +int blk_crypto_start_using_key(const struct blk_crypto_key *key, + struct request_queue *q); + +int blk_crypto_evict_key(struct request_queue *q, + const struct blk_crypto_key *key); + +bool blk_crypto_config_supported(struct request_queue *q, + const struct blk_crypto_config *cfg); + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline bool bio_has_crypt_ctx(struct bio *bio) +{ + return false; +} + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION */ + +void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); +static inline void bio_crypt_clone(struct bio *dst, struct bio *src, + gfp_t gfp_mask) +{ + if (bio_has_crypt_ctx(src)) + __bio_crypt_clone(dst, src, gfp_mask); +} + #endif /* __LINUX_BLK_CRYPTO_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b90dca1fa430..9322261fb316 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -18,6 +18,7 @@ struct block_device; struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +struct bio_crypt_ctx; /* * Block error status values. See block/blk-core:blk_errors for the details. @@ -185,6 +186,11 @@ struct bio { u64 bi_iocost_cost; #endif #endif + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct bio_crypt_ctx *bi_crypt_context; +#endif + union { #if defined(CONFIG_BLK_DEV_INTEGRITY) struct bio_integrity_payload *bi_integrity; /* data integrity */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 354e44eebef9..52a9f456cadf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -222,6 +222,11 @@ struct request { unsigned short nr_integrity_segments; #endif +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct bio_crypt_ctx *crypt_ctx; + struct blk_ksm_keyslot *crypt_keyslot; +#endif + unsigned short write_hint; unsigned short ioprio; -- cgit v1.2.3 From d145dc23030bbf2de3a8ca5e0c29c2e568f69737 Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 14 May 2020 00:37:19 +0000 Subject: block: Make blk-integrity preclude hardware inline encryption Whenever a device supports blk-integrity, make the kernel pretend that the device doesn't support inline encryption (essentially by setting the keyslot manager in the request queue to NULL). There's no hardware currently that supports both integrity and inline encryption. However, it seems possible that there will be such hardware in the near future (like the NVMe key per I/O support that might support both inline encryption and PI). But properly integrating both features is not trivial, and without real hardware that implements both, it is difficult to tell if it will be done correctly by the majority of hardware that support both. So it seems best not to support both features together right now, and to decide what to do at probe time. Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 3 +++ block/blk-integrity.c | 7 +++++++ block/keyslot-manager.c | 19 +++++++++++++++++++ include/linux/blkdev.h | 30 ++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bf62c25cde8f..3579ac0f6ec1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -42,6 +42,9 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, struct bio_set *bs = bio->bi_pool; unsigned inline_vecs; + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return ERR_PTR(-EOPNOTSUPP); + if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); inline_vecs = nr_vecs; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ff1070edbb40..c03705cbb9c9 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -409,6 +409,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tag_size = template->tag_size; disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (disk->queue->ksm) { + pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); + blk_ksm_unregister(disk->queue); + } +#endif } EXPORT_SYMBOL(blk_integrity_register); diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c index fcd3fd469d7c..c2ef41b3147b 100644 --- a/block/keyslot-manager.c +++ b/block/keyslot-manager.c @@ -25,6 +25,9 @@ * Upper layers will call blk_ksm_get_slot_for_key() to program a * key into some slot in the inline encryption hardware. */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + #include #include #include @@ -376,3 +379,19 @@ void blk_ksm_destroy(struct blk_keyslot_manager *ksm) memzero_explicit(ksm, sizeof(*ksm)); } EXPORT_SYMBOL_GPL(blk_ksm_destroy); + +bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q) +{ + if (blk_integrity_queue_supports_integrity(q)) { + pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); + return false; + } + q->ksm = ksm; + return true; +} +EXPORT_SYMBOL_GPL(blk_ksm_register); + +void blk_ksm_unregister(struct request_queue *q) +{ + q->ksm = NULL; +} diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 52a9f456cadf..2b33166b9daf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1583,6 +1583,12 @@ struct blk_integrity *bdev_get_integrity(struct block_device *bdev) return blk_get_integrity(bdev->bd_disk); } +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return q->integrity.profile; +} + static inline bool blk_integrity_rq(struct request *rq) { return rq->cmd_flags & REQ_INTEGRITY; @@ -1663,6 +1669,11 @@ static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { return NULL; } +static inline bool +blk_integrity_queue_supports_integrity(struct request_queue *q) +{ + return false; +} static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) { return 0; @@ -1714,6 +1725,25 @@ static inline struct bio_vec *rq_integrity_vec(struct request *rq) #endif /* CONFIG_BLK_DEV_INTEGRITY */ +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); + +void blk_ksm_unregister(struct request_queue *q); + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm, + struct request_queue *q) +{ + return true; +} + +static inline void blk_ksm_unregister(struct request_queue *q) { } + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION */ + + struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); -- cgit v1.2.3 From 488f6682c832e9549d28b30075f00c76328eb1be Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 14 May 2020 00:37:20 +0000 Subject: block: blk-crypto-fallback for Inline Encryption Blk-crypto delegates crypto operations to inline encryption hardware when available. The separately configurable blk-crypto-fallback contains a software fallback to the kernel crypto API - when enabled, blk-crypto will use this fallback for en/decryption when inline encryption hardware is not available. This lets upper layers not have to worry about whether or not the underlying device has support for inline encryption before deciding to specify an encryption context for a bio. It also allows for testing without actual inline encryption hardware - in particular, it makes it possible to test the inline encryption code in ext4 and f2fs simply by running xfstests with the inlinecrypt mount option, which in turn allows for things like the regular upstream regression testing of ext4 to cover the inline encryption code paths. For more details, refer to Documentation/block/inline-encryption.rst. Signed-off-by: Satya Tangirala Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- block/Kconfig | 10 + block/Makefile | 1 + block/blk-crypto-fallback.c | 657 ++++++++++++++++++++++++++++++++++++++++++++ block/blk-crypto-internal.h | 35 +++ block/blk-crypto.c | 68 +++-- include/linux/blk-crypto.h | 2 +- 6 files changed, 752 insertions(+), 21 deletions(-) create mode 100644 block/blk-crypto-fallback.c (limited to 'include/linux') diff --git a/block/Kconfig b/block/Kconfig index f8870c316a03..9382a4acefc3 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -193,6 +193,16 @@ config BLK_INLINE_ENCRYPTION block layer handle encryption, so users can take advantage of inline encryption hardware if present. +config BLK_INLINE_ENCRYPTION_FALLBACK + bool "Enable crypto API fallback for blk-crypto" + depends on BLK_INLINE_ENCRYPTION + select CRYPTO + select CRYPTO_SKCIPHER + help + Enabling this lets the block layer handle inline encryption + by falling back to the kernel crypto API when inline + encryption hardware is not present. + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 3ee88d3e807d..78719169fb2a 100644 --- a/block/Makefile +++ b/block/Makefile @@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c new file mode 100644 index 000000000000..74ab137ae3ba --- /dev/null +++ b/block/blk-crypto-fallback.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/* + * Refer to Documentation/block/inline-encryption.rst for detailed explanation. + */ + +#define pr_fmt(fmt) "blk-crypto-fallback: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk-crypto-internal.h" + +static unsigned int num_prealloc_bounce_pg = 32; +module_param(num_prealloc_bounce_pg, uint, 0); +MODULE_PARM_DESC(num_prealloc_bounce_pg, + "Number of preallocated bounce pages for the blk-crypto crypto API fallback"); + +static unsigned int blk_crypto_num_keyslots = 100; +module_param_named(num_keyslots, blk_crypto_num_keyslots, uint, 0); +MODULE_PARM_DESC(num_keyslots, + "Number of keyslots for the blk-crypto crypto API fallback"); + +static unsigned int num_prealloc_fallback_crypt_ctxs = 128; +module_param(num_prealloc_fallback_crypt_ctxs, uint, 0); +MODULE_PARM_DESC(num_prealloc_crypt_fallback_ctxs, + "Number of preallocated bio fallback crypto contexts for blk-crypto to use during crypto API fallback"); + +struct bio_fallback_crypt_ctx { + struct bio_crypt_ctx crypt_ctx; + /* + * Copy of the bvec_iter when this bio was submitted. + * We only want to en/decrypt the part of the bio as described by the + * bvec_iter upon submission because bio might be split before being + * resubmitted + */ + struct bvec_iter crypt_iter; + union { + struct { + struct work_struct work; + struct bio *bio; + }; + struct { + void *bi_private_orig; + bio_end_io_t *bi_end_io_orig; + }; + }; +}; + +static struct kmem_cache *bio_fallback_crypt_ctx_cache; +static mempool_t *bio_fallback_crypt_ctx_pool; + +/* + * Allocating a crypto tfm during I/O can deadlock, so we have to preallocate + * all of a mode's tfms when that mode starts being used. Since each mode may + * need all the keyslots at some point, each mode needs its own tfm for each + * keyslot; thus, a keyslot may contain tfms for multiple modes. However, to + * match the behavior of real inline encryption hardware (which only supports a + * single encryption context per keyslot), we only allow one tfm per keyslot to + * be used at a time - the rest of the unused tfms have their keys cleared. + */ +static DEFINE_MUTEX(tfms_init_lock); +static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; + +static struct blk_crypto_keyslot { + enum blk_crypto_mode_num crypto_mode; + struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; +} *blk_crypto_keyslots; + +static struct blk_keyslot_manager blk_crypto_ksm; +static struct workqueue_struct *blk_crypto_wq; +static mempool_t *blk_crypto_bounce_page_pool; + +/* + * This is the key we set when evicting a keyslot. This *should* be the all 0's + * key, but AES-XTS rejects that key, so we use some random bytes instead. + */ +static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; + +static void blk_crypto_evict_keyslot(unsigned int slot) +{ + struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; + int err; + + WARN_ON(slotp->crypto_mode == BLK_ENCRYPTION_MODE_INVALID); + + /* Clear the key in the skcipher */ + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], blank_key, + blk_crypto_modes[crypto_mode].keysize); + WARN_ON(err); + slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; +} + +static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot) +{ + struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + const enum blk_crypto_mode_num crypto_mode = + key->crypto_cfg.crypto_mode; + int err; + + if (crypto_mode != slotp->crypto_mode && + slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) + blk_crypto_evict_keyslot(slot); + + slotp->crypto_mode = crypto_mode; + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, + key->size); + if (err) { + blk_crypto_evict_keyslot(slot); + return err; + } + return 0; +} + +static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot) +{ + blk_crypto_evict_keyslot(slot); + return 0; +} + +/* + * The crypto API fallback KSM ops - only used for a bio when it specifies a + * blk_crypto_key that was not supported by the device's inline encryption + * hardware. + */ +static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = { + .keyslot_program = blk_crypto_keyslot_program, + .keyslot_evict = blk_crypto_keyslot_evict, +}; + +static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) +{ + struct bio *src_bio = enc_bio->bi_private; + int i; + + for (i = 0; i < enc_bio->bi_vcnt; i++) + mempool_free(enc_bio->bi_io_vec[i].bv_page, + blk_crypto_bounce_page_pool); + + src_bio->bi_status = enc_bio->bi_status; + + bio_put(enc_bio); + bio_endio(src_bio); +} + +static struct bio *blk_crypto_clone_bio(struct bio *bio_src) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); + if (!bio) + return NULL; + bio->bi_disk = bio_src->bi_disk; + bio->bi_opf = bio_src->bi_opf; + bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); + + return bio; +} + +static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, + struct skcipher_request **ciph_req_ret, + struct crypto_wait *wait) +{ + struct skcipher_request *ciph_req; + const struct blk_crypto_keyslot *slotp; + int keyslot_idx = blk_ksm_get_slot_idx(slot); + + slotp = &blk_crypto_keyslots[keyslot_idx]; + ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], + GFP_NOIO); + if (!ciph_req) + return false; + + skcipher_request_set_callback(ciph_req, + CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, wait); + *ciph_req_ret = ciph_req; + + return true; +} + +static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + unsigned int i = 0; + unsigned int num_sectors = 0; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio, iter) { + num_sectors += bv.bv_len >> SECTOR_SHIFT; + if (++i == BIO_MAX_PAGES) + break; + } + if (num_sectors < bio_sectors(bio)) { + struct bio *split_bio; + + split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL); + if (!split_bio) { + bio->bi_status = BLK_STS_RESOURCE; + return false; + } + bio_chain(split_bio, bio); + generic_make_request(bio); + *bio_ptr = split_bio; + } + + return true; +} + +union blk_crypto_iv { + __le64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + u8 bytes[BLK_CRYPTO_MAX_IV_SIZE]; +}; + +static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + union blk_crypto_iv *iv) +{ + int i; + + for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) + iv->dun[i] = cpu_to_le64(dun[i]); +} + +/* + * The crypto API fallback's encryption routine. + * Allocate a bounce bio for encryption, encrypt the input bio using crypto API, + * and replace *bio_ptr with the bounce bio. May split input bio if it's too + * large. Returns true on success. Returns false and sets bio->bi_status on + * error. + */ +static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) +{ + struct bio *src_bio, *enc_bio; + struct bio_crypt_ctx *bc; + struct blk_ksm_keyslot *slot; + int data_unit_size; + struct skcipher_request *ciph_req = NULL; + DECLARE_CRYPTO_WAIT(wait); + u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + struct scatterlist src, dst; + union blk_crypto_iv iv; + unsigned int i, j; + bool ret = false; + blk_status_t blk_st; + + /* Split the bio if it's too big for single page bvec */ + if (!blk_crypto_split_bio_if_needed(bio_ptr)) + return false; + + src_bio = *bio_ptr; + bc = src_bio->bi_crypt_context; + data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; + + /* Allocate bounce bio for encryption */ + enc_bio = blk_crypto_clone_bio(src_bio); + if (!enc_bio) { + src_bio->bi_status = BLK_STS_RESOURCE; + return false; + } + + /* + * Use the crypto API fallback keyslot manager to get a crypto_skcipher + * for the algorithm and key specified for this bio. + */ + blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + if (blk_st != BLK_STS_OK) { + src_bio->bi_status = blk_st; + goto out_put_enc_bio; + } + + /* and then allocate an skcipher_request for it */ + if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + src_bio->bi_status = BLK_STS_RESOURCE; + goto out_release_keyslot; + } + + memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); + sg_init_table(&src, 1); + sg_init_table(&dst, 1); + + skcipher_request_set_crypt(ciph_req, &src, &dst, data_unit_size, + iv.bytes); + + /* Encrypt each page in the bounce bio */ + for (i = 0; i < enc_bio->bi_vcnt; i++) { + struct bio_vec *enc_bvec = &enc_bio->bi_io_vec[i]; + struct page *plaintext_page = enc_bvec->bv_page; + struct page *ciphertext_page = + mempool_alloc(blk_crypto_bounce_page_pool, GFP_NOIO); + + enc_bvec->bv_page = ciphertext_page; + + if (!ciphertext_page) { + src_bio->bi_status = BLK_STS_RESOURCE; + goto out_free_bounce_pages; + } + + sg_set_page(&src, plaintext_page, data_unit_size, + enc_bvec->bv_offset); + sg_set_page(&dst, ciphertext_page, data_unit_size, + enc_bvec->bv_offset); + + /* Encrypt each data unit in this page */ + for (j = 0; j < enc_bvec->bv_len; j += data_unit_size) { + blk_crypto_dun_to_iv(curr_dun, &iv); + if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req), + &wait)) { + i++; + src_bio->bi_status = BLK_STS_IOERR; + goto out_free_bounce_pages; + } + bio_crypt_dun_increment(curr_dun, 1); + src.offset += data_unit_size; + dst.offset += data_unit_size; + } + } + + enc_bio->bi_private = src_bio; + enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio; + *bio_ptr = enc_bio; + ret = true; + + enc_bio = NULL; + goto out_free_ciph_req; + +out_free_bounce_pages: + while (i > 0) + mempool_free(enc_bio->bi_io_vec[--i].bv_page, + blk_crypto_bounce_page_pool); +out_free_ciph_req: + skcipher_request_free(ciph_req); +out_release_keyslot: + blk_ksm_put_slot(slot); +out_put_enc_bio: + if (enc_bio) + bio_put(enc_bio); + + return ret; +} + +/* + * The crypto API fallback's main decryption routine. + * Decrypts input bio in place, and calls bio_endio on the bio. + */ +static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) +{ + struct bio_fallback_crypt_ctx *f_ctx = + container_of(work, struct bio_fallback_crypt_ctx, work); + struct bio *bio = f_ctx->bio; + struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; + struct blk_ksm_keyslot *slot; + struct skcipher_request *ciph_req = NULL; + DECLARE_CRYPTO_WAIT(wait); + u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + union blk_crypto_iv iv; + struct scatterlist sg; + struct bio_vec bv; + struct bvec_iter iter; + const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; + unsigned int i; + blk_status_t blk_st; + + /* + * Use the crypto API fallback keyslot manager to get a crypto_skcipher + * for the algorithm and key specified for this bio. + */ + blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + if (blk_st != BLK_STS_OK) { + bio->bi_status = blk_st; + goto out_no_keyslot; + } + + /* and then allocate an skcipher_request for it */ + if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + bio->bi_status = BLK_STS_RESOURCE; + goto out; + } + + memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); + sg_init_table(&sg, 1); + skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size, + iv.bytes); + + /* Decrypt each segment in the bio */ + __bio_for_each_segment(bv, bio, iter, f_ctx->crypt_iter) { + struct page *page = bv.bv_page; + + sg_set_page(&sg, page, data_unit_size, bv.bv_offset); + + /* Decrypt each data unit in the segment */ + for (i = 0; i < bv.bv_len; i += data_unit_size) { + blk_crypto_dun_to_iv(curr_dun, &iv); + if (crypto_wait_req(crypto_skcipher_decrypt(ciph_req), + &wait)) { + bio->bi_status = BLK_STS_IOERR; + goto out; + } + bio_crypt_dun_increment(curr_dun, 1); + sg.offset += data_unit_size; + } + } + +out: + skcipher_request_free(ciph_req); + blk_ksm_put_slot(slot); +out_no_keyslot: + mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); + bio_endio(bio); +} + +/** + * blk_crypto_fallback_decrypt_endio - queue bio for fallback decryption + * + * @bio: the bio to queue + * + * Restore bi_private and bi_end_io, and queue the bio for decryption into a + * workqueue, since this function will be called from an atomic context. + */ +static void blk_crypto_fallback_decrypt_endio(struct bio *bio) +{ + struct bio_fallback_crypt_ctx *f_ctx = bio->bi_private; + + bio->bi_private = f_ctx->bi_private_orig; + bio->bi_end_io = f_ctx->bi_end_io_orig; + + /* If there was an IO error, don't queue for decrypt. */ + if (bio->bi_status) { + mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); + bio_endio(bio); + return; + } + + INIT_WORK(&f_ctx->work, blk_crypto_fallback_decrypt_bio); + f_ctx->bio = bio; + queue_work(blk_crypto_wq, &f_ctx->work); +} + +/** + * blk_crypto_fallback_bio_prep - Prepare a bio to use fallback en/decryption + * + * @bio_ptr: pointer to the bio to prepare + * + * If bio is doing a WRITE operation, this splits the bio into two parts if it's + * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio + * for the first part, encrypts it, and update bio_ptr to point to the bounce + * bio. + * + * For a READ operation, we mark the bio for decryption by using bi_private and + * bi_end_io. + * + * In either case, this function will make the bio look like a regular bio (i.e. + * as if no encryption context was ever specified) for the purposes of the rest + * of the stack except for blk-integrity (blk-integrity and blk-crypto are not + * currently supported together). + * + * Return: true on success. Sets bio->bi_status and returns false on error. + */ +bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + struct bio_fallback_crypt_ctx *f_ctx; + + if (WARN_ON_ONCE(!tfms_inited[bc->bc_key->crypto_cfg.crypto_mode])) { + /* User didn't call blk_crypto_start_using_key() first */ + bio->bi_status = BLK_STS_IOERR; + return false; + } + + if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, + &bc->bc_key->crypto_cfg)) { + bio->bi_status = BLK_STS_NOTSUPP; + return false; + } + + if (bio_data_dir(bio) == WRITE) + return blk_crypto_fallback_encrypt_bio(bio_ptr); + + /* + * bio READ case: Set up a f_ctx in the bio's bi_private and set the + * bi_end_io appropriately to trigger decryption when the bio is ended. + */ + f_ctx = mempool_alloc(bio_fallback_crypt_ctx_pool, GFP_NOIO); + f_ctx->crypt_ctx = *bc; + f_ctx->crypt_iter = bio->bi_iter; + f_ctx->bi_private_orig = bio->bi_private; + f_ctx->bi_end_io_orig = bio->bi_end_io; + bio->bi_private = (void *)f_ctx; + bio->bi_end_io = blk_crypto_fallback_decrypt_endio; + bio_crypt_free_ctx(bio); + + return true; +} + +int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) +{ + return blk_ksm_evict_key(&blk_crypto_ksm, key); +} + +static bool blk_crypto_fallback_inited; +static int blk_crypto_fallback_init(void) +{ + int i; + int err = -ENOMEM; + + if (blk_crypto_fallback_inited) + return 0; + + prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + + err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + if (err) + goto out; + err = -ENOMEM; + + blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; + blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + + /* All blk-crypto modes have a crypto API fallback. */ + for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) + blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; + blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + + blk_crypto_wq = alloc_workqueue("blk_crypto_wq", + WQ_UNBOUND | WQ_HIGHPRI | + WQ_MEM_RECLAIM, num_online_cpus()); + if (!blk_crypto_wq) + goto fail_free_ksm; + + blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, + sizeof(blk_crypto_keyslots[0]), + GFP_KERNEL); + if (!blk_crypto_keyslots) + goto fail_free_wq; + + blk_crypto_bounce_page_pool = + mempool_create_page_pool(num_prealloc_bounce_pg, 0); + if (!blk_crypto_bounce_page_pool) + goto fail_free_keyslots; + + bio_fallback_crypt_ctx_cache = KMEM_CACHE(bio_fallback_crypt_ctx, 0); + if (!bio_fallback_crypt_ctx_cache) + goto fail_free_bounce_page_pool; + + bio_fallback_crypt_ctx_pool = + mempool_create_slab_pool(num_prealloc_fallback_crypt_ctxs, + bio_fallback_crypt_ctx_cache); + if (!bio_fallback_crypt_ctx_pool) + goto fail_free_crypt_ctx_cache; + + blk_crypto_fallback_inited = true; + + return 0; +fail_free_crypt_ctx_cache: + kmem_cache_destroy(bio_fallback_crypt_ctx_cache); +fail_free_bounce_page_pool: + mempool_destroy(blk_crypto_bounce_page_pool); +fail_free_keyslots: + kfree(blk_crypto_keyslots); +fail_free_wq: + destroy_workqueue(blk_crypto_wq); +fail_free_ksm: + blk_ksm_destroy(&blk_crypto_ksm); +out: + return err; +} + +/* + * Prepare blk-crypto-fallback for the specified crypto mode. + * Returns -ENOPKG if the needed crypto API support is missing. + */ +int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) +{ + const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; + struct blk_crypto_keyslot *slotp; + unsigned int i; + int err = 0; + + /* + * Fast path + * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num] + * for each i are visible before we try to access them. + */ + if (likely(smp_load_acquire(&tfms_inited[mode_num]))) + return 0; + + mutex_lock(&tfms_init_lock); + if (tfms_inited[mode_num]) + goto out; + + err = blk_crypto_fallback_init(); + if (err) + goto out; + + for (i = 0; i < blk_crypto_num_keyslots; i++) { + slotp = &blk_crypto_keyslots[i]; + slotp->tfms[mode_num] = crypto_alloc_skcipher(cipher_str, 0, 0); + if (IS_ERR(slotp->tfms[mode_num])) { + err = PTR_ERR(slotp->tfms[mode_num]); + if (err == -ENOENT) { + pr_warn_once("Missing crypto API support for \"%s\"\n", + cipher_str); + err = -ENOPKG; + } + slotp->tfms[mode_num] = NULL; + goto out_free_tfms; + } + + crypto_skcipher_set_flags(slotp->tfms[mode_num], + CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + } + + /* + * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num] + * for each i are visible before we set tfms_inited[mode_num]. + */ + smp_store_release(&tfms_inited[mode_num], true); + goto out; + +out_free_tfms: + for (i = 0; i < blk_crypto_num_keyslots; i++) { + slotp = &blk_crypto_keyslots[i]; + crypto_free_skcipher(slotp->tfms[mode_num]); + slotp->tfms[mode_num] = NULL; + } +out: + mutex_unlock(&tfms_init_lock); + return err; +} diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 796f757fe8e9..d2b0f565d83c 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -11,10 +11,13 @@ /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { + const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ }; +extern const struct blk_crypto_mode blk_crypto_modes[]; + #ifdef CONFIG_BLK_INLINE_ENCRYPTION void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], @@ -163,4 +166,36 @@ static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq) return BLK_STS_OK; } +#ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK + +int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); + +bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); + +int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); + +#else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ + +static inline int +blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) +{ + pr_warn_once("crypto API fallback is disabled\n"); + return -ENOPKG; +} + +static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) +{ + pr_warn_once("crypto API fallback disabled; failing request.\n"); + (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; + return false; +} + +static inline int +blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) +{ + return 0; +} + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ + #endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */ diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 25b981257f5f..6533c9b36ab8 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -19,14 +19,17 @@ const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, }, @@ -229,9 +232,16 @@ void __blk_crypto_free_request(struct request *rq) * * @bio_ptr: pointer to original bio pointer * - * Succeeds if the bio doesn't have inline encryption enabled or if the bio - * crypt context provided for the bio is supported by the underlying device's - * inline encryption hardware. Ends the bio with error otherwise. + * If the bio crypt context provided for the bio is supported by the underlying + * device's inline encryption hardware, do nothing. + * + * Otherwise, try to perform en/decryption for this bio by falling back to the + * kernel crypto API. When the crypto API fallback is used for encryption, + * blk-crypto may choose to split the bio into 2 - the first one that will + * continue to be processed and the second one that will be resubmitted via + * generic_make_request. A bounce bio will be allocated to encrypt the contents + * of the aforementioned "first one", and *bio_ptr will be updated to this + * bounce bio. * * Caller must ensure bio has bio_crypt_ctx. * @@ -243,27 +253,29 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; - blk_status_t blk_st = BLK_STS_IOERR; /* Error if bio has no data. */ - if (WARN_ON_ONCE(!bio_has_data(bio))) + if (WARN_ON_ONCE(!bio_has_data(bio))) { + bio->bi_status = BLK_STS_IOERR; goto fail; + } - if (!bio_crypt_check_alignment(bio)) + if (!bio_crypt_check_alignment(bio)) { + bio->bi_status = BLK_STS_IOERR; goto fail; + } /* - * Success if device supports the encryption context. + * Success if device supports the encryption context, or if we succeeded + * in falling back to the crypto API. */ - if (!blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, - &bc_key->crypto_cfg)) { - blk_st = BLK_STS_NOTSUPP; - goto fail; - } + if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + &bc_key->crypto_cfg)) + return true; - return true; + if (blk_crypto_fallback_bio_prep(bio_ptr)) + return true; fail: - (*bio_ptr)->bi_status = blk_st; bio_endio(*bio_ptr); return false; } @@ -329,10 +341,16 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return 0; } +/* + * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the + * request queue it's submitted to supports inline crypto, or the + * blk-crypto-fallback is enabled and supports the cfg). + */ bool blk_crypto_config_supported(struct request_queue *q, const struct blk_crypto_config *cfg) { - return blk_ksm_crypto_cfg_supported(q->ksm, cfg); + return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || + blk_ksm_crypto_cfg_supported(q->ksm, cfg); } /** @@ -340,17 +358,22 @@ bool blk_crypto_config_supported(struct request_queue *q, * @key: A key to use on the device * @q: the request queue for the device * - * Upper layers must call this function to ensure that the hardware supports - * the key's crypto settings. + * Upper layers must call this function to ensure that either the hardware + * supports the key's crypto settings, or the crypto API fallback has transforms + * for the needed mode allocated and ready to go. This function may allocate + * an skcipher, and *should not* be called from the data path, since that might + * cause a deadlock * - * Return: 0 on success; -ENOPKG if the hardware doesn't support the key + * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and + * blk-crypto-fallback is either disabled or the needed algorithm + * is disabled in the crypto API; or another -errno code. */ int blk_crypto_start_using_key(const struct blk_crypto_key *key, struct request_queue *q) { if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) return 0; - return -ENOPKG; + return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } /** @@ -372,5 +395,10 @@ int blk_crypto_evict_key(struct request_queue *q, if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) return blk_ksm_evict_key(q->ksm, key); - return 0; + /* + * If the request queue's associated inline encryption hardware didn't + * have support for the key, then the key might have been programmed + * into the fallback keyslot manager, so try to evict from there. + */ + return blk_crypto_fallback_evict_key(key); } diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 76095b07dd90..e82342907f2b 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -61,7 +61,7 @@ struct blk_crypto_key { * * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for * write requests) or decrypted (for read requests) inline by the storage device - * or controller. + * or controller, or by the crypto API fallback. */ struct bio_crypt_ctx { const struct blk_crypto_key *bc_key; -- cgit v1.2.3 From 2771cefeac499d68ca2fca3861ba9e46d15bd4ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 18:10:05 +0200 Subject: block: remove the REQ_NOWAIT_INLINE flag Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 9322261fb316..ccb895f911b1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -343,7 +343,6 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ - __REQ_NOWAIT_INLINE, /* Return would-block error inline */ /* * When a shared kthread needs to issue a bio for a cgroup, doing * so synchronously can lead to priority inversions as the kthread @@ -378,7 +377,6 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) -#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE) #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From c1527c0e12d461ff4c603996f77983f37b85c286 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 May 2020 21:07:35 -0700 Subject: bio.h: Declare the arguments of the bio iteration functions const This change makes it possible to pass 'const struct bio *' arguments to these functions. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Cc: Ming Lei Cc: Damien Le Moal Cc: Chaitanya Kulkarni Cc: Alexander Potapenko Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index a0ee494a6329..950c9dc44c4f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -70,7 +70,7 @@ static inline bool bio_has_data(struct bio *bio) return false; } -static inline bool bio_no_advance_iter(struct bio *bio) +static inline bool bio_no_advance_iter(const struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || @@ -138,8 +138,8 @@ static inline bool bio_next_segment(const struct bio *bio, #define bio_for_each_segment_all(bvl, bio, iter) \ for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); ) -static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, - unsigned bytes) +static inline void bio_advance_iter(const struct bio *bio, + struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector += bytes >> 9; -- cgit v1.2.3 From 854b5f01dc6a7b59f0dddd646a80b1fcd767a8db Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 May 2020 21:07:36 -0700 Subject: block: Document the bio_vec properties Since it is nontrivial that nth_page() does not have to be used for a bio_vec, document this. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig CC: Christoph Hellwig Cc: Ming Lei Signed-off-by: Jens Axboe --- include/linux/bvec.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index a81c13ac1972..ac0c7299d5b8 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -12,8 +12,17 @@ #include #include -/* - * was unsigned short, but we might as well be ready for > 64kB I/O pages +/** + * struct bio_vec - a contiguous range of physical memory addresses + * @bv_page: First page associated with the address range. + * @bv_len: Number of bytes in the address range. + * @bv_offset: Start of the address range relative to the start of @bv_page. + * + * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len: + * + * nth_page(@bv_page, n) == @bv_page + n + * + * This holds because page_is_mergeable() checks the above property. */ struct bio_vec { struct page *bv_page; -- cgit v1.2.3 From 9398554fb3979852512ff4f1405e759889b45c16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 May 2020 14:36:00 +0200 Subject: block: remove the error_sector argument to blkdev_issue_flush The argument isn't used by any caller, and drivers don't fill out bi_sector for flush requests either. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++--------------- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-zoned-metadata.c | 6 +++--- drivers/md/raid5-ppl.c | 2 +- drivers/nvme/target/io-cmd-bdev.c | 2 +- fs/block_dev.c | 2 +- fs/ext4/fsync.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/super.c | 2 +- fs/fat/file.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/hfsplus/super.c | 2 +- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 4 ++-- fs/jbd2/recovery.c | 2 +- fs/libfs.c | 2 +- fs/nilfs2/the_nilfs.h | 2 +- fs/ocfs2/file.c | 2 +- fs/reiserfs/file.c | 2 +- fs/xfs/xfs_super.c | 2 +- fs/zonefs/super.c | 2 +- include/linux/blkdev.h | 5 ++--- 22 files changed, 27 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/block/blk-flush.c b/block/blk-flush.c index b733f7ac75c7..d47a579964fb 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -432,15 +432,11 @@ void blk_insert_flush(struct request *rq) * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for * @gfp_mask: memory allocation flags (for bio_alloc) - * @error_sector: error sector * * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. + * Issue a flush for the block device in question. */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector) +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) { struct request_queue *q; struct bio *bio; @@ -458,15 +454,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); - - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be - * copied from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_iter.bi_sector; - bio_put(bio); return ret; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 4094c47eca7f..84cb04904fab 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2657,7 +2657,7 @@ static void bitmap_flush_work(struct work_struct *work) dm_integrity_flush_buffers(ic); if (ic->meta_dev) - blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL); + blkdev_issue_flush(ic->dev->bdev, GFP_NOIO); limit = ic->provided_data_sectors; if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 369de15c4e80..bf2245370305 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -661,7 +661,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); return ret; } @@ -703,7 +703,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, /* Flush drive cache (this will also sync data) */ if (ret == 0) - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); return ret; } @@ -772,7 +772,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { - ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); + ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO); goto err; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index d50238d0a85d..a750f4bbb5d9 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1037,7 +1037,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, } /* flush the disk cache after recovery if necessary */ - ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL); out: __free_page(page); return ret; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index ea0e596be15d..26f50c23b82e 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -226,7 +226,7 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) u16 nvmet_bdev_flush(struct nvmet_req *req) { - if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL, NULL)) + if (blkdev_issue_flush(req->ns->bdev, GFP_KERNEL)) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index ebd1507789d2..d1e08bba925a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -672,7 +672,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ - error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); + error = blkdev_issue_flush(bdev, GFP_KERNEL); if (error == -EOPNOTSUPP) error = 0; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e10206e7f4bb..35ff9a56db67 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -176,7 +176,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = ext4_fsync_journal(inode, datasync, &needs_barrier); if (needs_barrier) { - err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4b8c9a9bdf0c..499f08d8522e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1440,7 +1440,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (ret < 0) goto err_out; if (barrier) - blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + blkdev_issue_flush(sb->s_bdev, GFP_NOFS); skip_zeroout: ext4_lock_group(sb, group); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..629a56b5c859 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5256,7 +5256,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) needs_barrier = true; if (needs_barrier) { int err; - err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } diff --git a/fs/fat/file.c b/fs/fat/file.c index bdc4503c00a3..42134c58c87e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 94bd83b36644..e3da9e96b835 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); inode_unlock(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 2b9e5743105e..129dca3f4b78 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -239,7 +239,7 @@ out: mutex_unlock(&sbi->vh_mutex); if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); return error; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 96bf33986d03..263f02ad8ebf 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -414,7 +414,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e855d8260433..6d2da8ad0e6f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -775,7 +775,7 @@ start_journal_io: if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); /* Done it all: now write the commit record asynchronously. */ if (jbd2_has_feature_async_commit(journal)) { @@ -882,7 +882,7 @@ start_journal_io: stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { - blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_dev, GFP_NOFS); } if (err) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index a4967b27ffb6..2ed278f0dced 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -286,7 +286,7 @@ int jbd2_journal_recover(journal_t *journal) err = err2; /* Make sure all replayed data is on permanent storage */ if (journal->j_flags & JBD2_BARRIER) { - err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL); if (!err) err = err2; } diff --git a/fs/libfs.c b/fs/libfs.c index 3759fbacf522..4d08edf19c78 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1113,7 +1113,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = __generic_file_fsync(file, start, end, datasync); if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } EXPORT_SYMBOL(generic_file_fsync); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 380a543c5b19..b55cdeb4d169 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs) */ smp_wmb(); - err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL); if (err != -EIO) err = 0; return err; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6cd5e4924e4d..85979e2214b3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, needs_barrier = true; err = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (!err) err = ret; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 84cf8bdbec9c..0b641ae694f1 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); inode_unlock(inode); if (barrier_done < 0) return barrier_done; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 424bb9a2d532..a123cd8267d9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -305,7 +305,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS); } STATIC void diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 0bf7009f50a2..25afcf55aa41 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -479,7 +479,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ret = file_write_and_wait_range(file, start, end); if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (ret) zonefs_io_error(inode, true); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2b33166b9daf..7d10f4e63232 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1233,7 +1233,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) extern void blk_io_schedule(void); -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); +int blkdev_issue_flush(struct block_device *, gfp_t); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); @@ -1872,8 +1872,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return false; } -static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, - sector_t *error_sector) +static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) { return 0; } -- cgit v1.2.3 From 956d510ee78caebc83c0eaeb892db5b239a36a06 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 07:24:04 +0200 Subject: block: add disk/bio-based accounting helpers Add two new helpers to simplify I/O accounting for bio based drivers. Currently these drivers use the generic_start_io_acct and generic_end_io_acct helpers which have very cumbersome calling conventions, don't actually return the time they started accounting, and try to deal with accounting for partitions, which can't happen for bio based drivers. The new helpers will be used to subsequently replace uses of the old helpers. The main API is the bio based wrappes in blkdev.h, but for zram which wants to account rw_page based I/O lower level routines are provided as well. Signed-off-by: Christoph Hellwig Reviewed-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/blk-core.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 77e57c2e8d60..8973104f88d9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1432,6 +1432,40 @@ void blk_account_io_start(struct request *rq, bool new_io) part_stat_unlock(); } +unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op) +{ + struct hd_struct *part = &disk->part0; + const int sgrp = op_stat_group(op); + unsigned long now = READ_ONCE(jiffies); + + part_stat_lock(); + update_io_ticks(part, now, false); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); + part_stat_local_inc(part, in_flight[op_is_write(op)]); + part_stat_unlock(); + + return now; +} +EXPORT_SYMBOL(disk_start_io_acct); + +void disk_end_io_acct(struct gendisk *disk, unsigned int op, + unsigned long start_time) +{ + struct hd_struct *part = &disk->part0; + const int sgrp = op_stat_group(op); + unsigned long now = READ_ONCE(jiffies); + unsigned long duration = now - start_time; + + part_stat_lock(); + update_io_ticks(part, now, true); + part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_local_dec(part, in_flight[op_is_write(op)]); + part_stat_unlock(); +} +EXPORT_SYMBOL(disk_end_io_acct); + /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7d10f4e63232..6f7ff0fa8fcf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1892,4 +1892,32 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } +unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op); +void disk_end_io_acct(struct gendisk *disk, unsigned int op, + unsigned long start_time); + +#ifdef CONFIG_BLOCK +/** + * bio_start_io_acct - start I/O accounting for bio based drivers + * @bio: bio to start account for + * + * Returns the start time that should be passed back to bio_end_io_acct(). + */ +static inline unsigned long bio_start_io_acct(struct bio *bio) +{ + return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio)); +} + +/** + * bio_end_io_acct - end I/O accounting for bio based drivers + * @bio: bio to end account for + * @start: start time returned by bio_start_io_acct() + */ +static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) +{ + return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time); +} +#endif /* CONFIG_BLOCK */ + #endif -- cgit v1.2.3 From e722fff238bbfe6308d7778a8c2163c181bf998a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 07:24:12 +0200 Subject: block: remove generic_{start,end}_io_acct Remove these now unused functions. Signed-off-by: Christoph Hellwig Reviewed-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/bio.c | 39 --------------------------------------- include/linux/bio.h | 6 ------ 2 files changed, 45 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 9c101a0572ca..3e89c7b37855 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1392,45 +1392,6 @@ again: } } -void generic_start_io_acct(struct request_queue *q, int op, - unsigned long sectors, struct hd_struct *part) -{ - const int sgrp = op_stat_group(op); - int rw = op_is_write(op); - - part_stat_lock(); - - update_io_ticks(part, jiffies, false); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, sectors[sgrp], sectors); - part_stat_local_inc(part, in_flight[rw]); - if (part->partno) - part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]); - - part_stat_unlock(); -} -EXPORT_SYMBOL(generic_start_io_acct); - -void generic_end_io_acct(struct request_queue *q, int req_op, - struct hd_struct *part, unsigned long start_time) -{ - unsigned long now = jiffies; - unsigned long duration = now - start_time; - const int sgrp = op_stat_group(req_op); - int rw = op_is_write(req_op); - - part_stat_lock(); - - update_io_ticks(part, now, true); - part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_local_dec(part, in_flight[rw]); - if (part->partno) - part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); - - part_stat_unlock(); -} -EXPORT_SYMBOL(generic_end_io_acct); - static inline bool bio_remaining_done(struct bio *bio) { /* diff --git a/include/linux/bio.h b/include/linux/bio.h index 950c9dc44c4f..941378ec5b39 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -444,12 +444,6 @@ void bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(struct request_queue *q, int op, - unsigned long sectors, struct hd_struct *part); -void generic_end_io_acct(struct request_queue *q, int op, - struct hd_struct *part, - unsigned long start_time); - extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); -- cgit v1.2.3 From 58d4f14fc30ac26288cfed74d7e566921c22cf59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2020 07:24:14 +0200 Subject: block: always use a percpu variable for disk stats percpu variables have a perfectly fine working stub implementation for UP kernels, so use that. Signed-off-by: Christoph Hellwig Reviewed-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 12 +++-------- block/partitions/core.c | 5 +++-- include/linux/genhd.h | 13 ----------- include/linux/part_stat.h | 55 ++++++++++------------------------------------- 5 files changed, 18 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/block/blk.h b/block/blk.h index bdf5e94467aa..0ecba2ab383d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -378,7 +378,7 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - free_part_stats(part); + free_percpu(part->dkstats); kfree(part->info); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index 094ed9096496..3e7df0a3e6bb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -92,7 +92,6 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); -#ifdef CONFIG_SMP static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) { int cpu; @@ -112,12 +111,6 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) stat->io_ticks += ptr->io_ticks; } } -#else /* CONFIG_SMP */ -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) -{ - memcpy(stat, &part->dkstats, sizeof(struct disk_stats)); -} -#endif /* CONFIG_SMP */ static unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) @@ -1688,14 +1681,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (disk) { - if (!init_part_stats(&disk->part0)) { + disk->part0.dkstats = alloc_percpu(struct disk_stats); + if (!disk->part0.dkstats) { kfree(disk); return NULL; } init_rwsem(&disk->lookup_sem); disk->node_id = node_id; if (disk_expand_part_tbl(disk, 0)) { - free_part_stats(&disk->part0); + free_percpu(disk->part0.dkstats); kfree(disk); return NULL; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 297004fd2264..78951e33b2d7 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -387,7 +387,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); - if (!init_part_stats(p)) { + p->dkstats = alloc_percpu(struct disk_stats); + if (!p->dkstats) { err = -ENOMEM; goto out_free; } @@ -468,7 +469,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_free_info: kfree(p->info); out_free_stats: - free_part_stats(p); + free_percpu(p->dkstats); out_free: kfree(p); return ERR_PTR(err); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a9384449465a..f0d6d77309a5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -39,15 +39,6 @@ extern struct class block_class; #include #include -struct disk_stats { - u64 nsecs[NR_STAT_GROUPS]; - unsigned long sectors[NR_STAT_GROUPS]; - unsigned long ios[NR_STAT_GROUPS]; - unsigned long merges[NR_STAT_GROUPS]; - unsigned long io_ticks; - local_t in_flight[2]; -}; - #define PARTITION_META_INFO_VOLNAMELTH 64 /* * Enough for the string representation of any kind of UUID plus NULL. @@ -72,11 +63,7 @@ struct hd_struct { seqcount_t nr_sects_seq; #endif unsigned long stamp; -#ifdef CONFIG_SMP struct disk_stats __percpu *dkstats; -#else - struct disk_stats dkstats; -#endif struct percpu_ref ref; sector_t alignment_offset; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index ece607607a86..6644197980b9 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -4,19 +4,23 @@ #include +struct disk_stats { + u64 nsecs[NR_STAT_GROUPS]; + unsigned long sectors[NR_STAT_GROUPS]; + unsigned long ios[NR_STAT_GROUPS]; + unsigned long merges[NR_STAT_GROUPS]; + unsigned long io_ticks; + local_t in_flight[2]; +}; + /* * Macros to operate on percpu disk statistics: * - * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters - * and should be called between disk_stat_lock() and - * disk_stat_unlock(). + * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should + * be called between disk_stat_lock() and disk_stat_unlock(). * * part_stat_read() can be called at any time. - * - * part_stat_{add|set_all}() and {init|free}_part_stats are for - * internal use only. */ -#ifdef CONFIG_SMP #define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) #define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) @@ -44,43 +48,6 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) sizeof(struct disk_stats)); } -static inline int init_part_stats(struct hd_struct *part) -{ - part->dkstats = alloc_percpu(struct disk_stats); - if (!part->dkstats) - return 0; - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ - free_percpu(part->dkstats); -} - -#else /* !CONFIG_SMP */ -#define part_stat_lock() ({ rcu_read_lock(); 0; }) -#define part_stat_unlock() rcu_read_unlock() - -#define part_stat_get(part, field) ((part)->dkstats.field) -#define part_stat_get_cpu(part, field, cpu) part_stat_get(part, field) -#define part_stat_read(part, field) part_stat_get(part, field) - -static inline void part_stat_set_all(struct hd_struct *part, int value) -{ - memset(&part->dkstats, value, sizeof(struct disk_stats)); -} - -static inline int init_part_stats(struct hd_struct *part) -{ - return 1; -} - -static inline void free_part_stats(struct hd_struct *part) -{ -} - -#endif /* CONFIG_SMP */ - #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ -- cgit v1.2.3 From 8ab1d40a646e753adb6814642432a093d93dbf47 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 May 2020 07:24:17 +0200 Subject: block: remove rcu_read_lock() from part_stat_lock() The RCU lock is required only in disk_map_sector_rcu() to lookup the partition. After that request holds reference to related hd_struct. Replace get_cpu() with preempt_disable() - returned cpu index is unused. [hch: rebased] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 11 ++++++++--- include/linux/part_stat.h | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 3e7df0a3e6bb..1a7659327664 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -321,11 +321,12 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) struct hd_struct *part; int i; + rcu_read_lock(); ptbl = rcu_dereference(disk->part_tbl); part = rcu_dereference(ptbl->last_lookup); if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) - return part; + goto out_unlock; for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); @@ -339,10 +340,14 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) if (!hd_struct_try_get(part)) break; rcu_assign_pointer(ptbl->last_lookup, part); - return part; + goto out_unlock; } } - return &disk->part0; + + part = &disk->part0; +out_unlock: + rcu_read_unlock(); + return part; } /** diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 6644197980b9..a6b0938ce82e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -21,8 +21,8 @@ struct disk_stats { * * part_stat_read() can be called at any time. */ -#define part_stat_lock() ({ rcu_read_lock(); get_cpu(); }) -#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0) +#define part_stat_lock() preempt_disable() +#define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ (per_cpu_ptr((part)->dkstats, (cpu))->field) -- cgit v1.2.3 From b2d76adbc0828e0f108567973bcc500ed1abc139 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 27 May 2020 07:24:18 +0200 Subject: block: use __this_cpu_add() instead of access by smp_processor_id() Most architectures have fast path to access percpu for current cpu. The required preempt_disable() is provided by part_stat_lock(). [hch: rebased] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/part_stat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index a6b0938ce82e..24125778ef3e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -54,7 +54,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ - (part_stat_get(part, field) += (addnd)) + __this_cpu_add((part)->dkstats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ -- cgit v1.2.3 From dc35ada4251f183137ee3a524543c9329d7a4fa2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 15:41:23 +0200 Subject: block: fix a warning when blkdev.h is included for !CONFIG_BLOCK builds disk_start_io_acct and disk_end_io_acct need at least a struct gendisk forward declaration, but for weird historic reasons much of blkdev.h is stubbed out for CONFIG_BLOCK=n. Fix this by stubbing more out for now, but eventually this header will need a massive cleanup. Fixes: 956d510ee78 ("block: add disk/bio-based accounting helpers") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6f7ff0fa8fcf..8fd900998b4e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1892,12 +1892,12 @@ static inline void blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } +#ifdef CONFIG_BLOCK unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -#ifdef CONFIG_BLOCK /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for -- cgit v1.2.3 From 7b11eab041dacfeaaa6d27d9183b247a995bc16d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 29 May 2020 07:51:59 -0700 Subject: blk-mq: blk-mq: provide forced completion method Drivers may need to bypass error injection for error recovery. Rename __blk_mq_complete_request() to blk_mq_force_complete_rq() and export that function so drivers may skip potential fake timeouts after they've reclaimed lost requests. Signed-off-by: Keith Busch Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-mq.c | 15 +++++++++++++-- include/linux/blk-mq.h | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index c606c74463cc..c6dcd390be2d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -556,7 +556,17 @@ static void __blk_mq_complete_request_remote(void *data) q->mq_ops->complete(rq); } -static void __blk_mq_complete_request(struct request *rq) +/** + * blk_mq_force_complete_rq() - Force complete the request, bypassing any error + * injection that could drop the completion. + * @rq: Request to be force completed + * + * Drivers should use blk_mq_complete_request() to complete requests in their + * normal IO path. For timeout error recovery, drivers may call this forced + * completion routine after they've reclaimed timed out requests to bypass + * potentially subsequent fake timeouts. + */ +void blk_mq_force_complete_rq(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct request_queue *q = rq->q; @@ -602,6 +612,7 @@ static void __blk_mq_complete_request(struct request *rq) } put_cpu(); } +EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq); static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) __releases(hctx->srcu) @@ -635,7 +646,7 @@ bool blk_mq_complete_request(struct request *rq) { if (unlikely(blk_should_fake_timeout(rq->q))) return false; - __blk_mq_complete_request(rq); + blk_mq_force_complete_rq(rq); return true; } EXPORT_SYMBOL(blk_mq_complete_request); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d7307795439a..856bb10993cf 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -494,6 +494,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); +void blk_mq_force_complete_rq(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3 From 5d9c305b8ea3fbc95bedfde01f7dd91e68082098 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 15:53:08 +0200 Subject: blk-mq: remove the bio argument to ->prepare_request None of the I/O schedulers actually needs it. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-mq.c | 2 +- block/kyber-iosched.c | 2 +- block/mq-deadline.c | 2 +- include/linux/elevator.h | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3d411716d7ee..50c8f034c01c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6073,7 +6073,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, * comments on bfq_init_rq for the reason behind this delayed * preparation. */ -static void bfq_prepare_request(struct request *rq, struct bio *bio) +static void bfq_prepare_request(struct request *rq) { /* * Regardless of whether we have an icq attached, we have to diff --git a/block/blk-mq.c b/block/blk-mq.c index c6dcd390be2d..f15722cbfac6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -387,7 +387,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, if (e->type->icq_cache) blk_mq_sched_assign_ioc(rq); - e->type->ops.prepare_request(rq, bio); + e->type->ops.prepare_request(rq); rq->rq_flags |= RQF_ELVPRIV; } } diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 34dcea0ef637..a38c5ab103d1 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -579,7 +579,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, return merged; } -static void kyber_prepare_request(struct request *rq, struct bio *bio) +static void kyber_prepare_request(struct request *rq) { rq_set_domain_token(rq, -1); } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b490f47fd553..b57470e154c8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -541,7 +541,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, * Nothing to do here. This is defined only to ensure that .finish_request * method is called upon request completion. */ -static void dd_prepare_request(struct request *rq, struct bio *bio) +static void dd_prepare_request(struct request *rq) { } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 901bda352dcb..bacc40a0bdf3 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -39,7 +39,7 @@ struct elevator_mq_ops { void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); void (*requests_merged)(struct request_queue *, struct request *, struct request *); void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *); - void (*prepare_request)(struct request *, struct bio *bio); + void (*prepare_request)(struct request *); void (*finish_request)(struct request *); void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); -- cgit v1.2.3 From bf0beec0607db3c6f6fb7bd2c6d503792b05cf3f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 29 May 2020 15:53:15 +0200 Subject: blk-mq: drain I/O when all CPUs in a hctx are offline Most of blk-mq drivers depend on managed IRQ's auto-affinity to setup up queue mapping. Thomas mentioned the following point[1]: "That was the constraint of managed interrupts from the very beginning: The driver/subsystem has to quiesce the interrupt line and the associated queue _before_ it gets shutdown in CPU unplug and not fiddle with it until it's restarted by the core when the CPU is plugged in again." However, current blk-mq implementation doesn't quiesce hw queue before the last CPU in the hctx is shutdown. Even worse, CPUHP_BLK_MQ_DEAD is a cpuhp state handled after the CPU is down, so there isn't any chance to quiesce the hctx before shutting down the CPU. Add new CPUHP_AP_BLK_MQ_ONLINE state to stop allocating from blk-mq hctxs where the last CPU goes away, and wait for completion of in-flight requests. This guarantees that there is no inflight I/O before shutting down the managed IRQ. Add a BLK_MQ_F_STACKING and set it for dm-rq and loop, so we don't need to wait for completion of in-flight requests from these drivers to avoid a potential dead-lock. It is safe to do this for stacking drivers as those do not use interrupts at all and their I/O completions are triggered by underlying devices I/O completion. [1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/ [hch: different retry mechanism, merged two patches, minor cleanups] Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 + block/blk-mq-tag.c | 8 ++++ block/blk-mq.c | 112 ++++++++++++++++++++++++++++++++++++++++++++- drivers/block/loop.c | 2 +- drivers/md/dm-rq.c | 2 +- include/linux/blk-mq.h | 10 ++++ include/linux/cpuhotplug.h | 1 + 7 files changed, 133 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 96b7a35c898a..15df3a36e9fa 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), + HCTX_STATE_NAME(INACTIVE), }; #undef HCTX_STATE_NAME @@ -239,6 +240,7 @@ static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(TAG_SHARED), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(NO_SCHED), + HCTX_FLAG_NAME(STACKING), }; #undef HCTX_FLAG_NAME diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 762198b62088..96a39d0724a2 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -180,6 +180,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) sbitmap_finish_wait(bt, ws, &wait); found_tag: + /* + * Give up this allocation if the hctx is inactive. The caller will + * retry on an active hctx. + */ + if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { + blk_mq_put_tag(tags, data->ctx, tag + tag_offset); + return BLK_MQ_NO_TAG; + } return tag + tag_offset; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 560ef5df8993..9a36ac1c1fa1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -375,14 +375,30 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) e->type->ops.limit_depth(data->cmd_flags, data); } +retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); if (!(data->flags & BLK_MQ_REQ_INTERNAL)) blk_mq_tag_busy(data->hctx); + /* + * Waiting allocations only fail because of an inactive hctx. In that + * case just retry the hctx assignment and tag allocation as CPU hotplug + * should have migrated us to an online CPU by now. + */ tag = blk_mq_get_tag(data); - if (tag == BLK_MQ_NO_TAG) - return NULL; + if (tag == BLK_MQ_NO_TAG) { + if (data->flags & BLK_MQ_REQ_NOWAIT) + return NULL; + + /* + * Give up the CPU and sleep for a random short time to ensure + * that thread using a realtime scheduling class are migrated + * off the the CPU, and thus off the hctx that is going away. + */ + msleep(3); + goto retry; + } return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); } @@ -2335,6 +2351,86 @@ fail: return -ENOMEM; } +struct rq_iter_data { + struct blk_mq_hw_ctx *hctx; + bool has_rq; +}; + +static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) +{ + struct rq_iter_data *iter_data = data; + + if (rq->mq_hctx != iter_data->hctx) + return true; + iter_data->has_rq = true; + return false; +} + +static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->sched_tags ? + hctx->sched_tags : hctx->tags; + struct rq_iter_data data = { + .hctx = hctx, + }; + + blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + return data.has_rq; +} + +static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, + struct blk_mq_hw_ctx *hctx) +{ + if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + return false; + if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) + return false; + return true; +} + +static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (!cpumask_test_cpu(cpu, hctx->cpumask) || + !blk_mq_last_cpu_in_hctx(cpu, hctx)) + return 0; + + /* + * Prevent new request from being allocated on the current hctx. + * + * The smp_mb__after_atomic() Pairs with the implied barrier in + * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is + * seen once we return from the tag allocator. + */ + set_bit(BLK_MQ_S_INACTIVE, &hctx->state); + smp_mb__after_atomic(); + + /* + * Try to grab a reference to the queue and wait for any outstanding + * requests. If we could not grab a reference the queue has been + * frozen and there are no requests. + */ + if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { + while (blk_mq_hctx_has_requests(hctx)) + msleep(5); + percpu_ref_put(&hctx->queue->q_usage_counter); + } + + return 0; +} + +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (cpumask_test_cpu(cpu, hctx->cpumask)) + clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); + return 0; +} + /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it @@ -2348,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); + if (!cpumask_test_cpu(cpu, hctx->cpumask)) + return 0; + ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; @@ -2371,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } @@ -2430,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q, { hctx->queue_num = hctx_idx; + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); hctx->tags = set->tags[hctx_idx]; @@ -3684,6 +3789,9 @@ static int __init blk_mq_init(void) { cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); + cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", + blk_mq_hctx_notify_online, + blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index da693e6a834e..d7904b4d8d12 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2037,7 +2037,7 @@ static int loop_add(struct loop_device **l, int i) lo->tag_set.queue_depth = 128; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 3f8577e2c13b..f60c02512121 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) md->tag_set->ops = &dm_mq_ops; md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); md->tag_set->numa_node = md->numa_node_id; - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); md->tag_set->driver_data = md; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 856bb10993cf..d6fcae17da5a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,6 +140,8 @@ struct blk_mq_hw_ctx { */ atomic_t nr_active; + /** @cpuhp_online: List to store request if CPU is going to die */ + struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ @@ -391,6 +393,11 @@ struct blk_mq_ops { enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, + /* + * Set when this device requires underlying blk-mq device for + * completing IO: + */ + BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, @@ -400,6 +407,9 @@ enum { BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, + /* hw queue is inactive after all its CPUs become offline */ + BLK_MQ_S_INACTIVE = 3, + BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_CPU_WORK_BATCH = 8, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 77d70b633531..24b3a77810b6 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -152,6 +152,7 @@ enum cpuhp_state { CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, + CPUHP_AP_BLK_MQ_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE, -- cgit v1.2.3 From abb30460bda232f304f642510adc8c6576ea51ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Jun 2020 10:02:01 -0600 Subject: block: mark bio_wouldblock_error() bio with BIO_QUIET We really don't care about triggering buffer errors for this condition. This avoids a spew of: Buffer I/O error on dev sdc, logical block 785929, async page read Buffer I/O error on dev sdc, logical block 759095, async page read Buffer I/O error on dev sdc, logical block 766922, async page read Buffer I/O error on dev sdc, logical block 17659, async page read Buffer I/O error on dev sdc, logical block 637571, async page read Buffer I/O error on dev sdc, logical block 39241, async page read Buffer I/O error on dev sdc, logical block 397241, async page read Buffer I/O error on dev sdc, logical block 763992, async page read from -EAGAIN conditions on request allocation for async reads. Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 941378ec5b39..683ff5fd8871 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -417,6 +417,7 @@ static inline void bio_io_error(struct bio *bio) static inline void bio_wouldblock_error(struct bio *bio) { + bio_set_flag(bio, BIO_QUIET); bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } -- cgit v1.2.3