diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-13 13:03:54 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-13 13:03:54 -0700 |
commit | 0c9f4ac808b017a0013cee92a30de980550145d5 (patch) | |
tree | 94eedbb9ef4815df9dc8d1dd6424fc92a2fbcd7a /drivers | |
parent | 9961a785944601e32f185ea696347b22ffda634c (diff) | |
parent | a3166c51702bb00b8f8b84022090cbab8f37be1a (diff) | |
download | lwn-0c9f4ac808b017a0013cee92a30de980550145d5.tar.gz lwn-0c9f4ac808b017a0013cee92a30de980550145d5.zip |
Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:
- Add a partscan attribute in sysfs, fixing an issue with systemd
relying on an internal interface that went away.
- Attempt #2 at making long running discards interruptible. The
previous attempt went into 6.9, but we ended up mostly reverting it
as it had issues.
- Remove old ida_simple API in bcache
- Support for zoned write plugging, greatly improving the performance
on zoned devices.
- Remove the old throttle low interface, which has been experimental
since 2017 and never made it beyond that and isn't being used.
- Remove page->index debugging checks in brd, as it hasn't caught
anything and prepares us for removing in struct page.
- MD pull request from Song
- Don't schedule block workers on isolated CPUs
* tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux: (84 commits)
blk-throttle: delay initialization until configuration
blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW
block: fix that util can be greater than 100%
block: support to account io_ticks precisely
block: add plug while submitting IO
bcache: fix variable length array abuse in btree_iter
bcache: Remove usage of the deprecated ida_simple_xx() API
md: Revert "md: Fix overflow in is_mddev_idle"
blk-lib: check for kill signal in ioctl BLKDISCARD
block: add a bio_await_chain helper
block: add a blk_alloc_discard_bio helper
block: add a bio_chain_and_submit helper
block: move discard checks into the ioctl handler
block: remove the discard_granularity check in __blkdev_issue_discard
block/ioctl: prefer different overflow check
null_blk: Fix the WARNING: modpost: missing MODULE_DESCRIPTION()
block: fix and simplify blkdevparts= cmdline parsing
block: refine the EOF check in blkdev_iomap_begin
block: add a partscan sysfs attribute for disks
block: add a disk_has_partscan helper
...
Diffstat (limited to 'drivers')
35 files changed, 495 insertions, 1133 deletions
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index e322cef6596b..b900fe9e0030 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -29,10 +29,7 @@ /* * Each block ramdisk device has a xarray brd_pages of pages that stores - * the pages containing the block device's contents. A brd page's ->index is - * its offset in PAGE_SIZE units. This is similar to, but in no way connected - * with, the kernel's pagecache or buffer cache (which sit above our block - * device). + * the pages containing the block device's contents. */ struct brd_device { int brd_number; @@ -51,15 +48,7 @@ struct brd_device { */ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { - pgoff_t idx; - struct page *page; - - idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ - page = xa_load(&brd->brd_pages, idx); - - BUG_ON(page && page->index != idx); - - return page; + return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); } /* @@ -67,8 +56,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) */ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) { - pgoff_t idx; - struct page *page, *cur; + pgoff_t idx = sector >> PAGE_SECTORS_SHIFT; + struct page *page; int ret = 0; page = brd_lookup_page(brd, sector); @@ -80,23 +69,16 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) return -ENOMEM; xa_lock(&brd->brd_pages); - - idx = sector >> PAGE_SECTORS_SHIFT; - page->index = idx; - - cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp); - - if (unlikely(cur)) { - __free_page(page); - ret = xa_err(cur); - if (!ret && (cur->index != idx)) - ret = -EIO; - } else { + ret = __xa_insert(&brd->brd_pages, idx, page, gfp); + if (!ret) brd->brd_nr_pages++; - } - xa_unlock(&brd->brd_pages); + if (ret < 0) { + __free_page(page); + if (ret == -EBUSY) + ret = 0; + } return ret; } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index ed33cf7192d2..4005a8b685e8 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -225,6 +225,10 @@ static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); +static bool g_fua = true; +module_param_named(fua, g_fua, bool, 0444); +MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true"); + static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); @@ -253,6 +257,11 @@ static unsigned int g_zone_max_active; module_param_named(zone_max_active, g_zone_max_active, uint, 0444); MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); +static int g_zone_append_max_sectors = INT_MAX; +module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); +MODULE_PARM_DESC(zone_append_max_sectors, + "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -436,10 +445,12 @@ NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); +NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); +NULLB_DEVICE_ATTR(fua, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -580,12 +591,14 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, + &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_offline, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, &nullb_device_attr_shared_tags, &nullb_device_attr_shared_tag_bitmap, + &nullb_device_attr_fua, NULL, }; @@ -664,14 +677,14 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "badblocks,blocking,blocksize,cache_size," + "badblocks,blocking,blocksize,cache_size,fua," "completion_nsec,discard,home_node,hw_queue_depth," "irqmode,max_sectors,mbps,memory_backed,no_sched," "poll_queues,power,queue_mode,shared_tag_bitmap," "shared_tags,size,submit_queues,use_per_node_hctx," "virt_boundary,zoned,zone_capacity,zone_max_active," "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size\n"); + "zone_size,zone_append_max_sectors\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -751,10 +764,13 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_nr_conv = g_zone_nr_conv; dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; + dev->zone_append_max_sectors = g_zone_append_max_sectors; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; + dev->fua = g_fua; + return dev; } @@ -1151,7 +1167,7 @@ blk_status_t null_handle_discard(struct nullb_device *dev, return BLK_STS_OK; } -static int null_handle_flush(struct nullb *nullb) +static blk_status_t null_handle_flush(struct nullb *nullb) { int err; @@ -1168,7 +1184,7 @@ static int null_handle_flush(struct nullb *nullb) WARN_ON(!radix_tree_empty(&nullb->dev->cache)); spin_unlock_irq(&nullb->lock); - return err; + return errno_to_blk_status(err); } static int null_transfer(struct nullb *nullb, struct page *page, @@ -1206,7 +1222,7 @@ static int null_handle_rq(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; - int err; + int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); struct req_iterator iter; @@ -1218,15 +1234,13 @@ static int null_handle_rq(struct nullb_cmd *cmd) err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } + if (err) + break; sector += len >> SECTOR_SHIFT; } spin_unlock_irq(&nullb->lock); - return 0; + return errno_to_blk_status(err); } static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) @@ -1273,8 +1287,8 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); - return errno_to_blk_status(null_handle_rq(cmd)); + return null_handle_rq(cmd); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) @@ -1343,7 +1357,7 @@ static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, blk_status_t sts; if (op == REQ_OP_FLUSH) { - cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + cmd->error = null_handle_flush(nullb); goto out; } @@ -1912,7 +1926,7 @@ static int null_add_dev(struct nullb_device *dev) if (dev->cache_size > 0) { set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, true); + blk_queue_write_cache(nullb->q, true, dev->fua); } nullb->q->queuedata = nullb; @@ -2113,10 +2127,13 @@ static void __exit null_exit(void) if (tag_set.ops) blk_mq_free_tag_set(&tag_set); + + mutex_destroy(&lock); } module_init(null_init); module_exit(null_exit); MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); +MODULE_DESCRIPTION("multi queue aware block test driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 477b97746823..3234e6c85eed 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -82,6 +82,7 @@ struct nullb_device { unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int zone_max_open; /* max number of open zones */ unsigned int zone_max_active; /* max number of active zones */ + unsigned int zone_append_max_sectors; /* Max sectors per zone append command */ unsigned int submit_queues; /* number of submission queues */ unsigned int prev_submit_queues; /* number of submission queues before change */ unsigned int poll_queues; /* number of IOPOLL submission queues */ @@ -104,6 +105,7 @@ struct nullb_device { bool no_sched; /* no IO scheduler for the device */ bool shared_tags; /* share tag set between devices for blk-mq */ bool shared_tag_bitmap; /* use hostwide shared tags */ + bool fua; /* Support FUA */ }; struct nullb { diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 1689e2584104..5b5a63adacc1 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -9,6 +9,8 @@ #undef pr_fmt #define pr_fmt(fmt) "null_blk: " fmt +#define NULL_ZONE_INVALID_WP ((sector_t)-1) + static inline sector_t mb_to_sects(unsigned long mb) { return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT; @@ -19,18 +21,6 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) return sect >> ilog2(dev->zone_size_sects); } -static inline void null_lock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_lock_irq(&dev->zone_res_lock); -} - -static inline void null_unlock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_unlock_irq(&dev->zone_res_lock); -} - static inline void null_init_zone_lock(struct nullb_device *dev, struct nullb_zone *zone) { @@ -103,6 +93,11 @@ int null_init_zoned_dev(struct nullb_device *dev, dev->zone_nr_conv); } + dev->zone_append_max_sectors = + min(ALIGN_DOWN(dev->zone_append_max_sectors, + dev->blocksize >> SECTOR_SHIFT), + zone_capacity_sects); + /* Max active zones has to be < nbr of seq zones in order to be enforceable */ if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_active = 0; @@ -154,7 +149,7 @@ int null_init_zoned_dev(struct nullb_device *dev, lim->zoned = true; lim->chunk_sectors = dev->zone_size_sects; - lim->max_zone_append_sectors = dev->zone_size_sects; + lim->max_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; lim->max_active_zones = dev->zone_max_active; return 0; @@ -163,11 +158,16 @@ int null_init_zoned_dev(struct nullb_device *dev, int null_register_zoned_dev(struct nullb *nullb) { struct request_queue *q = nullb->q; + struct gendisk *disk = nullb->disk; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); - return blk_revalidate_disk_zones(nullb->disk, NULL); + disk->nr_zones = bdev_nr_zones(disk->part0); + + pr_info("%s: using %s zone append\n", + disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); + + return blk_revalidate_disk_zones(disk); } void null_free_zoned_dev(struct nullb_device *dev) @@ -241,35 +241,6 @@ size_t null_zone_valid_read_len(struct nullb *nullb, return (zone->wp - sector) << SECTOR_SHIFT; } -static blk_status_t __null_close_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_CLOSED: - /* close operation on closed is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_FULL: - default: - return BLK_STS_IOERR; - } - - if (zone->wp == zone->start) { - zone->cond = BLK_ZONE_COND_EMPTY; - } else { - zone->cond = BLK_ZONE_COND_CLOSED; - dev->nr_zones_closed++; - } - - return BLK_STS_OK; -} - static void null_close_imp_open_zone(struct nullb_device *dev) { struct nullb_zone *zone; @@ -286,7 +257,13 @@ static void null_close_imp_open_zone(struct nullb_device *dev) zno = dev->zone_nr_conv; if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, zone); + dev->nr_zones_imp_open--; + if (zone->wp == zone->start) { + zone->cond = BLK_ZONE_COND_EMPTY; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + dev->nr_zones_closed++; + } dev->imp_close_zone_no = zno; return; } @@ -374,73 +351,73 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, null_lock_zone(dev, zone); - if (zone->cond == BLK_ZONE_COND_FULL || - zone->cond == BLK_ZONE_COND_READONLY || - zone->cond == BLK_ZONE_COND_OFFLINE) { - /* Cannot write to the zone */ - ret = BLK_STS_IOERR; - goto unlock; - } - /* - * Regular writes must be at the write pointer position. - * Zone append writes are automatically issued at the write - * pointer and the position returned using the request or BIO - * sector. + * Regular writes must be at the write pointer position. Zone append + * writes are automatically issued at the write pointer and the position + * returned using the request sector. Note that we do not check the zone + * condition because for FULL, READONLY and OFFLINE zones, the sector + * check against the zone write pointer will always result in failing + * the command. */ if (append) { + if (WARN_ON_ONCE(!dev->zone_append_max_sectors) || + zone->wp == NULL_ZONE_INVALID_WP) { + ret = BLK_STS_IOERR; + goto unlock_zone; + } sector = zone->wp; blk_mq_rq_from_pdu(cmd)->__sector = sector; - } else if (sector != zone->wp) { - ret = BLK_STS_IOERR; - goto unlock; } - if (zone->wp + nr_sectors > zone->start + zone->capacity) { + if (sector != zone->wp || + zone->wp + nr_sectors > zone->start + zone->capacity) { ret = BLK_STS_IOERR; - goto unlock; + goto unlock_zone; } if (zone->cond == BLK_ZONE_COND_CLOSED || zone->cond == BLK_ZONE_COND_EMPTY) { - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) { - null_unlock_zone_res(dev); - goto unlock; - } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; - } + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + goto unlock_zone; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; + spin_unlock(&dev->zone_res_lock); + } - null_unlock_zone_res(dev); + zone->cond = BLK_ZONE_COND_IMP_OPEN; } ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); if (ret != BLK_STS_OK) - goto unlock; + goto unlock_zone; zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { - null_lock_zone_res(dev); - if (zone->cond == BLK_ZONE_COND_EXP_OPEN) - dev->nr_zones_exp_open--; - else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) - dev->nr_zones_imp_open--; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) + dev->nr_zones_exp_open--; + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) + dev->nr_zones_imp_open--; + spin_unlock(&dev->zone_res_lock); + } zone->cond = BLK_ZONE_COND_FULL; - null_unlock_zone_res(dev); } ret = BLK_STS_OK; -unlock: +unlock_zone: null_unlock_zone(dev, zone); return ret; @@ -454,54 +431,100 @@ static blk_status_t null_open_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); - switch (zone->cond) { case BLK_ZONE_COND_EXP_OPEN: - /* open operation on exp open is not an error */ - goto unlock; + /* Open operation on exp open is not an error */ + return BLK_STS_OK; case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; break; case BLK_ZONE_COND_FULL: default: - ret = BLK_STS_IOERR; - goto unlock; + return BLK_STS_IOERR; } - zone->cond = BLK_ZONE_COND_EXP_OPEN; - dev->nr_zones_exp_open++; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); -unlock: - null_unlock_zone_res(dev); + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + dev->nr_zones_closed--; + break; + default: + break; + } - return ret; + dev->nr_zones_exp_open++; + + spin_unlock(&dev->zone_res_lock); + } + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + + return BLK_STS_OK; } static blk_status_t null_close_zone(struct nullb_device *dev, struct nullb_zone *zone) { - blk_status_t ret; - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); - ret = __null_close_zone(dev, zone); - null_unlock_zone_res(dev); + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + /* close operation on closed is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - return ret; + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + default: + break; + } + + if (zone->wp > zone->start) + dev->nr_zones_closed++; + + spin_unlock(&dev->zone_res_lock); + } + + if (zone->wp == zone->start) + zone->cond = BLK_ZONE_COND_EMPTY; + else + zone->cond = BLK_ZONE_COND_CLOSED; + + return BLK_STS_OK; } static blk_status_t null_finish_zone(struct nullb_device *dev, @@ -512,41 +535,47 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - switch (zone->cond) { - case BLK_ZONE_COND_FULL: - /* finish operation on full is not an error */ - goto unlock; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; - break; - default: - ret = BLK_STS_IOERR; - goto unlock; + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* Finish operation on full is not an error */ + spin_unlock(&dev->zone_res_lock); + return BLK_STS_OK; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + dev->nr_zones_closed--; + break; + default: + spin_unlock(&dev->zone_res_lock); + return BLK_STS_IOERR; + } + + spin_unlock(&dev->zone_res_lock); } zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zone->len; -unlock: - null_unlock_zone_res(dev); - - return ret; + return BLK_STS_OK; } static blk_status_t null_reset_zone(struct nullb_device *dev, @@ -555,34 +584,33 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - /* reset operation on empty is not an error */ - null_unlock_zone_res(dev); - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - break; - default: - null_unlock_zone_res(dev); - return BLK_STS_IOERR; + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + break; + default: + spin_unlock(&dev->zone_res_lock); + return BLK_STS_IOERR; + } + + spin_unlock(&dev->zone_res_lock); } zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; - null_unlock_zone_res(dev); - if (dev->memory_backed) return null_handle_discard(dev, zone->start, zone->len); @@ -711,7 +739,7 @@ static void null_set_zone_cond(struct nullb_device *dev, zone->cond != BLK_ZONE_COND_OFFLINE) null_finish_zone(dev, zone); zone->cond = cond; - zone->wp = (sector_t)-1; + zone->wp = NULL_ZONE_INVALID_WP; } null_unlock_zone(dev, zone); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 374e4efa8759..176657dce3e3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -221,7 +221,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub) static int ublk_revalidate_disk_zones(struct ublk_device *ub) { - return blk_revalidate_disk_zones(ub->ub_disk, NULL); + return blk_revalidate_disk_zones(ub->ub_disk); } static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) @@ -249,8 +249,7 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); - blk_queue_required_elevator_features(ub->ub_disk->queue, - ELEVATOR_F_ZBD_SEQ_WRITE); + ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 42dea7601d87..c1af0a7d56c8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1543,7 +1543,7 @@ static int virtblk_probe(struct virtio_device *vdev) */ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); - err = blk_revalidate_disk_zones(vblk->disk, NULL); + err = blk_revalidate_disk_zones(vblk->disk); if (err) goto out_cleanup_disk; } diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 2bba4d6aaaa2..463eb13bd0b2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,7 +54,7 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k; if (b->ops->is_extents) @@ -67,7 +67,7 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; const char *err; for_each_key(b, k, &iter) { @@ -879,7 +879,7 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; @@ -895,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_init(b, &iter, preceding_key_p); + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1100,33 +1100,33 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->size = ARRAY_SIZE(iter->data); - iter->used = 0; + iter->iter.size = ARRAY_SIZE(iter->stack_data); + iter->iter.used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->iter.b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->set); + return __bch_btree_iter_stack_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, @@ -1293,10 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter iter; + struct btree_iter_stack iter; int oldsize = bch_count_data(b); - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1307,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(b, &iter.iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1323,11 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_stack_init(b, &iter, NULL); - btree_mergesort(b, new->set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter.iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index d795c84246b0..011f6062c4c0 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -321,7 +321,14 @@ struct btree_iter { #endif struct btree_iter_set { struct bkey *k, *end; - } data[MAX_BSETS]; + } data[]; +}; + +/* Fixed-size btree_iter that can be allocated on the stack */ + +struct btree_iter_stack { + struct btree_iter iter; + struct btree_iter_set stack_data[MAX_BSETS]; }; typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); @@ -333,9 +340,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search); +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -350,13 +357,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? __bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) +#define for_each_key_filter(b, k, stack_iter, filter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ + filter));) -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) +#define for_each_key(b, k, stack_iter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 196cdacce38f..d011a7154d33 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1309,7 +1309,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct bset_tree *t; gc->nodes++; @@ -1570,7 +1570,7 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; unsigned int ret = 0; for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) @@ -1611,17 +1611,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1911,7 +1912,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1919,10 +1920,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_stack_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, &b->keys, + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1950,7 +1951,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1959,8 +1960,8 @@ static int bch_btree_check_thread(void *arg) ret = 0; /* root node keys are checked before thread created */ - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1978,7 +1979,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -2051,7 +2052,7 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct btree_check_state check_state; /* check and mark root node keys */ @@ -2547,11 +2548,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2580,11 +2581,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : bcache_btree(map_keys_recurse, k, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 330bcd9ea4a9..cba09660148a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -881,8 +881,8 @@ static void bcache_device_free(struct bcache_device *d) bcache_device_detach(d); if (disk) { - ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(disk->first_minor)); + ida_free(&bcache_device_idx, + first_minor_to_idx(disk->first_minor)); put_disk(disk); } @@ -940,8 +940,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, if (!d->full_dirty_stripes) goto out_free_stripe_sectors_dirty; - idx = ida_simple_get(&bcache_device_idx, 0, - BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + idx = ida_alloc_max(&bcache_device_idx, BCACHE_DEVICE_IDX_MAX - 1, + GFP_KERNEL); if (idx < 0) goto out_free_full_dirty_stripes; @@ -986,7 +986,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, out_bioset_exit: bioset_exit(&d->bio_split); out_ida_remove: - ida_simple_remove(&bcache_device_idx, idx); + ida_free(&bcache_device_idx, idx); out_free_full_dirty_stripes: kvfree(d->full_dirty_stripes); out_free_stripe_sectors_dirty: @@ -1914,8 +1914,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * - sizeof(struct btree_iter_set); + iter_size = sizeof(struct btree_iter) + + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); if (!c->devices) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 6956beb55326..826b14cae4e5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -660,7 +660,7 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter iter; + struct btree_iter_stack iter; goto lock_root; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8827a6f130ad..792e070ccf38 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -908,15 +908,15 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; prev_idx = 0; - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -930,7 +930,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -979,7 +979,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) int i; struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index fd852981ef9c..cf433b0cf742 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ b/drivers/md/dm-bio-prison-v2.c @@ -321,8 +321,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison, { BUG_ON(!cell->exclusive_lock); - bio_list_merge(bios, &cell->bios); - bio_list_init(&cell->bios); + bio_list_merge_init(bios, &cell->bios); if (cell->shared_count) { cell->exclusive_lock = false; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 911f73f7ebba..0fcbf8603846 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -115,8 +115,7 @@ static void __commit(struct work_struct *_ws) */ spin_lock_irq(&b->lock); list_splice_init(&b->work_items, &work_items); - bio_list_merge(&bios, &b->bios); - bio_list_init(&b->bios); + bio_list_merge_init(&bios, &b->bios); b->commit_scheduled = false; spin_unlock_irq(&b->lock); @@ -565,8 +564,7 @@ static void defer_bio(struct cache *cache, struct bio *bio) static void defer_bios(struct cache *cache, struct bio_list *bios) { spin_lock_irq(&cache->lock); - bio_list_merge(&cache->deferred_bios, bios); - bio_list_init(bios); + bio_list_merge_init(&cache->deferred_bios, bios); spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); @@ -1816,8 +1814,7 @@ static void process_deferred_bios(struct work_struct *ws) bio_list_init(&bios); spin_lock_irq(&cache->lock); - bio_list_merge(&bios, &cache->deferred_bios); - bio_list_init(&cache->deferred_bios); + bio_list_merge_init(&bios, &cache->deferred_bios); spin_unlock_irq(&cache->lock); while ((bio = bio_list_pop(&bios))) { @@ -1847,8 +1844,7 @@ static void requeue_deferred_bios(struct cache *cache) struct bio_list bios; bio_list_init(&bios); - bio_list_merge(&bios, &cache->deferred_bios); - bio_list_init(&cache->deferred_bios); + bio_list_merge_init(&bios, &cache->deferred_bios); while ((bio = bio_list_pop(&bios))) { bio->bi_status = BLK_STS_DM_REQUEUE; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 94b2fc33f64b..3f68672ab7c9 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -1181,8 +1181,7 @@ static void process_deferred_discards(struct clone *clone) struct bio_list discards = BIO_EMPTY_LIST; spin_lock_irq(&clone->lock); - bio_list_merge(&discards, &clone->deferred_discard_bios); - bio_list_init(&clone->deferred_discard_bios); + bio_list_merge_init(&discards, &clone->deferred_discard_bios); spin_unlock_irq(&clone->lock); if (bio_list_empty(&discards)) @@ -1215,8 +1214,7 @@ static void process_deferred_bios(struct clone *clone) struct bio_list bios = BIO_EMPTY_LIST; spin_lock_irq(&clone->lock); - bio_list_merge(&bios, &clone->deferred_bios); - bio_list_init(&clone->deferred_bios); + bio_list_merge_init(&bios, &clone->deferred_bios); spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios)) @@ -1237,11 +1235,9 @@ static void process_deferred_flush_bios(struct clone *clone) * before issuing them or signaling their completion. */ spin_lock_irq(&clone->lock); - bio_list_merge(&bios, &clone->deferred_flush_bios); - bio_list_init(&clone->deferred_flush_bios); - - bio_list_merge(&bio_completions, &clone->deferred_flush_completions); - bio_list_init(&clone->deferred_flush_completions); + bio_list_merge_init(&bios, &clone->deferred_flush_bios); + bio_list_merge_init(&bio_completions, + &clone->deferred_flush_completions); spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index e6757a30dcca..08700bfc3e23 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -140,7 +140,7 @@ struct mapped_device { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_zones; - unsigned int *zwp_offset; + void *zone_revalidate_map; #endif #ifdef CONFIG_IMA diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 6acfa5bf97a4..8f81e597858d 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1272,8 +1272,7 @@ static void process_deferred_bios(struct era *era) bio_list_init(&marked_bios); spin_lock(&era->deferred_lock); - bio_list_merge(&deferred_bios, &era->deferred_bios); - bio_list_init(&era->deferred_bios); + bio_list_merge_init(&deferred_bios, &era->deferred_bios); spin_unlock(&era->deferred_lock); if (bio_list_empty(&deferred_bios)) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 05d1328d1811..15b681b90153 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -704,8 +704,7 @@ static void process_queued_bios(struct work_struct *work) return; } - bio_list_merge(&bios, &m->queued_bios); - bio_list_init(&m->queued_bios); + bio_list_merge_init(&bios, &m->queued_bios); spin_unlock_irqrestore(&m->lock, flags); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 41f1d731ae5a..2c6fbd87363f 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2042,7 +2042,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; - if (!static_key_enabled(&zoned_enabled.key)) + if (blk_queue_is_zoned(q) && + !static_key_enabled(&zoned_enabled.key)) static_branch_enable(&zoned_enabled); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4793ad2aa1f7..f359984c8ef2 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -592,12 +592,6 @@ struct dm_thin_endio_hook { struct dm_bio_prison_cell *cell; }; -static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) -{ - bio_list_merge(bios, master); - bio_list_init(master); -} - static void error_bio_list(struct bio_list *bios, blk_status_t error) { struct bio *bio; @@ -616,7 +610,7 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, bio_list_init(&bios); spin_lock_irq(&tc->lock); - __merge_bio_list(&bios, master); + bio_list_merge_init(&bios, master); spin_unlock_irq(&tc->lock); error_bio_list(&bios, error); @@ -645,8 +639,8 @@ static void requeue_io(struct thin_c *tc) bio_list_init(&bios); spin_lock_irq(&tc->lock); - __merge_bio_list(&bios, &tc->deferred_bio_list); - __merge_bio_list(&bios, &tc->retry_on_resume_list); + bio_list_merge_init(&bios, &tc->deferred_bio_list); + bio_list_merge_init(&bios, &tc->retry_on_resume_list); spin_unlock_irq(&tc->lock); error_bio_list(&bios, BLK_STS_DM_REQUEUE); diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 94f6f1ccfb7d..ab3ea8337809 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -604,8 +604,7 @@ static void assign_discard_permit(struct limiter *limiter) static void get_waiters(struct limiter *limiter) { - bio_list_merge(&limiter->waiters, &limiter->new_waiters); - bio_list_init(&limiter->new_waiters); + bio_list_merge_init(&limiter->waiters, &limiter->new_waiters); } static inline struct data_vio *get_available_data_vio(struct data_vio_pool *pool) diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 57e87f0d7069..dd4fdee2ca0c 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -369,8 +369,7 @@ void vdo_dump_flusher(const struct flusher *flusher) static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo) { bio_list_init(&flush->bios); - bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios); - bio_list_init(&vdo->flusher->waiting_flush_bios); + bio_list_merge_init(&flush->bios, &vdo->flusher->waiting_flush_bios); } static void launch_flush(struct vdo_flush *flush) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index eb9832b22b14..8e6bcb0d786a 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -60,16 +60,23 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, struct dm_table *map; int srcu_idx, ret; - if (dm_suspended_md(md)) - return -EAGAIN; + if (!md->zone_revalidate_map) { + /* Regular user context */ + if (dm_suspended_md(md)) + return -EAGAIN; - map = dm_get_live_table(md, &srcu_idx); - if (!map) - return -EIO; + map = dm_get_live_table(md, &srcu_idx); + if (!map) + return -EIO; + } else { + /* Zone revalidation during __bind() */ + map = md->zone_revalidate_map; + } ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); - dm_put_live_table(md, srcu_idx); + if (!md->zone_revalidate_map) + dm_put_live_table(md, srcu_idx); return ret; } @@ -138,80 +145,47 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) } } -void dm_cleanup_zoned_dev(struct mapped_device *md) +/* + * Count conventional zones of a mapped zoned device. If the device + * only has conventional zones, do not expose it as zoned. + */ +static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, + void *data) { - if (md->disk) { - bitmap_free(md->disk->conv_zones_bitmap); - md->disk->conv_zones_bitmap = NULL; - bitmap_free(md->disk->seq_zones_wlock); - md->disk->seq_zones_wlock = NULL; - } + unsigned int *nr_conv_zones = data; - kvfree(md->zwp_offset); - md->zwp_offset = NULL; - md->nr_zones = 0; -} + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + (*nr_conv_zones)++; -static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_NOT_WP: - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - default: - /* - * Conventional, offline and read-only zones do not have a valid - * write pointer. Use 0 as for an empty zone. - */ - return 0; - } + return 0; } -static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, - void *data) +static int dm_check_zoned(struct mapped_device *md, struct dm_table *t) { - struct mapped_device *md = data; struct gendisk *disk = md->disk; + unsigned int nr_conv_zones = 0; + int ret; - switch (zone->type) { - case BLK_ZONE_TYPE_CONVENTIONAL: - if (!disk->conv_zones_bitmap) { - disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones, - GFP_NOIO); - if (!disk->conv_zones_bitmap) - return -ENOMEM; - } - set_bit(idx, disk->conv_zones_bitmap); - break; - case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: - if (!disk->seq_zones_wlock) { - disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones, - GFP_NOIO); - if (!disk->seq_zones_wlock) - return -ENOMEM; - } - if (!md->zwp_offset) { - md->zwp_offset = - kvcalloc(disk->nr_zones, sizeof(unsigned int), - GFP_KERNEL); - if (!md->zwp_offset) - return -ENOMEM; - } - md->zwp_offset[idx] = dm_get_zone_wp_offset(zone); - - break; - default: - DMERR("Invalid zone type 0x%x at sectors %llu", - (int)zone->type, zone->start); - return -ENODEV; + /* Count conventional zones */ + md->zone_revalidate_map = t; + ret = dm_blk_report_zones(disk, 0, UINT_MAX, + dm_check_zoned_cb, &nr_conv_zones); + md->zone_revalidate_map = NULL; + if (ret < 0) { + DMERR("Check zoned failed %d", ret); + return ret; + } + + /* + * If we only have conventional zones, expose the mapped device as + * a regular device. + */ + if (nr_conv_zones >= ret) { + disk->queue->limits.max_open_zones = 0; + disk->queue->limits.max_active_zones = 0; + disk->queue->limits.zoned = false; + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + disk->nr_zones = 0; } return 0; @@ -226,41 +200,32 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) { struct gendisk *disk = md->disk; - unsigned int noio_flag; int ret; - /* - * Check if something changed. If yes, cleanup the current resources - * and reallocate everything. - */ + /* Revalidate only if something changed. */ if (!disk->nr_zones || disk->nr_zones != md->nr_zones) - dm_cleanup_zoned_dev(md); + md->nr_zones = 0; + if (md->nr_zones) return 0; /* - * Scan all zones to initialize everything. Ensure that all vmalloc - * operations in this context are done as if GFP_NOIO was specified. + * Our table is not live yet. So the call to dm_get_live_table() + * in dm_blk_report_zones() will fail. Set a temporary pointer to + * our table for dm_blk_report_zones() to use directly. */ - noio_flag = memalloc_noio_save(); - ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones, - dm_zone_revalidate_cb, md); - memalloc_noio_restore(noio_flag); - if (ret < 0) - goto err; - if (ret != disk->nr_zones) { - ret = -EIO; - goto err; + md->zone_revalidate_map = t; + ret = blk_revalidate_disk_zones(disk); + md->zone_revalidate_map = NULL; + + if (ret) { + DMERR("Revalidate zones failed %d", ret); + return ret; } md->nr_zones = disk->nr_zones; return 0; - -err: - DMERR("Revalidate zones failed %d", ret); - dm_cleanup_zoned_dev(md); - return ret; } static int device_not_zone_append_capable(struct dm_target *ti, @@ -289,294 +254,40 @@ static bool dm_table_supports_zone_append(struct dm_table *t) int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; + int ret; /* - * For a zoned target, the number of zones should be updated for the - * correct value to be exposed in sysfs queue/nr_zones. + * Check if zone append is natively supported, and if not, set the + * mapped device queue as needing zone append emulation. */ WARN_ON_ONCE(queue_is_mq(q)); - md->disk->nr_zones = bdev_nr_zones(md->disk->part0); - - /* Check if zone append is natively supported */ if (dm_table_supports_zone_append(t)) { clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - dm_cleanup_zoned_dev(md); - return 0; + } else { + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + blk_queue_max_zone_append_sectors(q, 0); } - /* - * Mark the mapped device as needing zone append emulation and - * initialize the emulation resources once the capacity is set. - */ - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); if (!get_capacity(md->disk)) return 0; - return dm_revalidate_zones(md, t); -} - -static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - unsigned int *wp_offset = data; - - *wp_offset = dm_get_zone_wp_offset(zone); - - return 0; -} - -static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, - unsigned int *wp_ofst) -{ - sector_t sector = zno * bdev_zone_sectors(md->disk->part0); - unsigned int noio_flag; - struct dm_table *t; - int srcu_idx, ret; - - t = dm_get_live_table(md, &srcu_idx); - if (!t) - return -EIO; - - /* - * Ensure that all memory allocations in this context are done as if - * GFP_NOIO was specified. - */ - noio_flag = memalloc_noio_save(); - ret = dm_blk_do_report_zones(md, t, sector, 1, - dm_update_zone_wp_offset_cb, wp_ofst); - memalloc_noio_restore(noio_flag); - - dm_put_live_table(md, srcu_idx); - - if (ret != 1) - return -EIO; - - return 0; -} - -struct orig_bio_details { - enum req_op op; - unsigned int nr_sectors; -}; - -/* - * First phase of BIO mapping for targets with zone append emulation: - * check all BIO that change a zone writer pointer and change zone - * append operations into regular write operations. - */ -static bool dm_zone_map_bio_begin(struct mapped_device *md, - unsigned int zno, struct bio *clone) -{ - sector_t zsectors = bdev_zone_sectors(md->disk->part0); - unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); - - /* - * If the target zone is in an error state, recover by inspecting the - * zone to get its current write pointer position. Note that since the - * target zone is already locked, a BIO issuing context should never - * see the zone write in the DM_ZONE_UPDATING_WP_OFST state. - */ - if (zwp_offset == DM_ZONE_INVALID_WP_OFST) { - if (dm_update_zone_wp_offset(md, zno, &zwp_offset)) - return false; - WRITE_ONCE(md->zwp_offset[zno], zwp_offset); - } - - switch (bio_op(clone)) { - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_FINISH: - return true; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - /* Writes must be aligned to the zone write pointer */ - if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset) - return false; - break; - case REQ_OP_ZONE_APPEND: - /* - * Change zone append operations into a non-mergeable regular - * writes directed at the current write pointer position of the - * target zone. - */ - clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | - (clone->bi_opf & (~REQ_OP_MASK)); - clone->bi_iter.bi_sector += zwp_offset; - break; - default: - DMWARN_LIMIT("Invalid BIO operation"); - return false; - } - - /* Cannot write to a full zone */ - if (zwp_offset >= zsectors) - return false; - - return true; -} - -/* - * Second phase of BIO mapping for targets with zone append emulation: - * update the zone write pointer offset array to account for the additional - * data written to a zone. Note that at this point, the remapped clone BIO - * may already have completed, so we do not touch it. - */ -static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno, - struct orig_bio_details *orig_bio_details, - unsigned int nr_sectors) -{ - unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); - - /* The clone BIO may already have been completed and failed */ - if (zwp_offset == DM_ZONE_INVALID_WP_OFST) - return BLK_STS_IOERR; - - /* Update the zone wp offset */ - switch (orig_bio_details->op) { - case REQ_OP_ZONE_RESET: - WRITE_ONCE(md->zwp_offset[zno], 0); - return BLK_STS_OK; - case REQ_OP_ZONE_FINISH: - WRITE_ONCE(md->zwp_offset[zno], - bdev_zone_sectors(md->disk->part0)); - return BLK_STS_OK; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); - return BLK_STS_OK; - case REQ_OP_ZONE_APPEND: - /* - * Check that the target did not truncate the write operation - * emulating a zone append. - */ - if (nr_sectors != orig_bio_details->nr_sectors) { - DMWARN_LIMIT("Truncated write for zone append"); - return BLK_STS_IOERR; - } - WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); - return BLK_STS_OK; - default: - DMWARN_LIMIT("Invalid BIO operation"); - return BLK_STS_IOERR; - } -} - -static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno, - struct bio *clone) -{ - if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))) - return; - - wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); - bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED); -} - -static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno, - struct bio *clone) -{ - if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) - return; - - WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock)); - clear_bit_unlock(zno, disk->seq_zones_wlock); - smp_mb__after_atomic(); - wake_up_bit(disk->seq_zones_wlock, zno); - - bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); -} - -static bool dm_need_zone_wp_tracking(struct bio *bio) -{ /* - * Special processing is not needed for operations that do not need the - * zone write lock, that is, all operations that target conventional - * zones and all operations that do not modify directly a sequential - * zone write pointer. + * Check that the mapped device will indeed be zoned, that is, that it + * has sequential write required zones. */ - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) - return false; - switch (bio_op(bio)) { - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_FINISH: - case REQ_OP_ZONE_APPEND: - return bio_zone_is_seq(bio); - default: - return false; - } -} - -/* - * Special IO mapping for targets needing zone append emulation. - */ -int dm_zone_map_bio(struct dm_target_io *tio) -{ - struct dm_io *io = tio->io; - struct dm_target *ti = tio->ti; - struct mapped_device *md = io->md; - struct bio *clone = &tio->clone; - struct orig_bio_details orig_bio_details; - unsigned int zno; - blk_status_t sts; - int r; - - /* - * IOs that do not change a zone write pointer do not need - * any additional special processing. - */ - if (!dm_need_zone_wp_tracking(clone)) - return ti->type->map(ti, clone); - - /* Lock the target zone */ - zno = bio_zone_no(clone); - dm_zone_lock(md->disk, zno, clone); - - orig_bio_details.nr_sectors = bio_sectors(clone); - orig_bio_details.op = bio_op(clone); + ret = dm_check_zoned(md, t); + if (ret) + return ret; + if (!blk_queue_is_zoned(q)) + return 0; - /* - * Check that the bio and the target zone write pointer offset are - * both valid, and if the bio is a zone append, remap it to a write. - */ - if (!dm_zone_map_bio_begin(md, zno, clone)) { - dm_zone_unlock(md->disk, zno, clone); - return DM_MAPIO_KILL; + if (!md->disk->nr_zones) { + DMINFO("%s using %s zone append", + md->disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); } - /* Let the target do its work */ - r = ti->type->map(ti, clone); - switch (r) { - case DM_MAPIO_SUBMITTED: - /* - * The target submitted the clone BIO. The target zone will - * be unlocked on completion of the clone. - */ - sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, - *tio->len_ptr); - break; - case DM_MAPIO_REMAPPED: - /* - * The target only remapped the clone BIO. In case of error, - * unlock the target zone here as the clone will not be - * submitted. - */ - sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, - *tio->len_ptr); - if (sts != BLK_STS_OK) - dm_zone_unlock(md->disk, zno, clone); - break; - case DM_MAPIO_REQUEUE: - case DM_MAPIO_KILL: - default: - dm_zone_unlock(md->disk, zno, clone); - sts = BLK_STS_IOERR; - break; - } - - if (sts != BLK_STS_OK) - return DM_MAPIO_KILL; - - return r; + return dm_revalidate_zones(md, t); } /* @@ -587,61 +298,17 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) struct mapped_device *md = io->md; struct gendisk *disk = md->disk; struct bio *orig_bio = io->orig_bio; - unsigned int zwp_offset; - unsigned int zno; /* - * For targets that do not emulate zone append, we only need to - * handle native zone-append bios. + * Get the offset within the zone of the written sector + * and add that to the original bio sector position. */ - if (!dm_emulate_zone_append(md)) { - /* - * Get the offset within the zone of the written sector - * and add that to the original bio sector position. - */ - if (clone->bi_status == BLK_STS_OK && - bio_op(clone) == REQ_OP_ZONE_APPEND) { - sector_t mask = - (sector_t)bdev_zone_sectors(disk->part0) - 1; - - orig_bio->bi_iter.bi_sector += - clone->bi_iter.bi_sector & mask; - } - - return; - } + if (clone->bi_status == BLK_STS_OK && + bio_op(clone) == REQ_OP_ZONE_APPEND) { + sector_t mask = bdev_zone_sectors(disk->part0) - 1; - /* - * For targets that do emulate zone append, if the clone BIO does not - * own the target zone write lock, we have nothing to do. - */ - if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) - return; - - zno = bio_zone_no(orig_bio); - - if (clone->bi_status != BLK_STS_OK) { - /* - * BIOs that modify a zone write pointer may leave the zone - * in an unknown state in case of failure (e.g. the write - * pointer was only partially advanced). In this case, set - * the target zone write pointer as invalid unless it is - * already being updated. - */ - WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST); - } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { - /* - * Get the written sector for zone append operation that were - * emulated using regular write operations. - */ - zwp_offset = READ_ONCE(md->zwp_offset[zno]); - if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio))) - WRITE_ONCE(md->zwp_offset[zno], - DM_ZONE_INVALID_WP_OFST); - else - orig_bio->bi_iter.bi_sector += - zwp_offset - bio_sectors(orig_bio); + orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; } - dm_zone_unlock(disk, zno, clone); + return; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7d0746b37c8e..597dd7a25823 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1428,25 +1428,12 @@ static void __map_bio(struct bio *clone) down(&md->swap_bios_semaphore); } - if (static_branch_unlikely(&zoned_enabled)) { - /* - * Check if the IO needs a special mapping due to zone append - * emulation on zoned target. In this case, dm_zone_map_bio() - * calls the target map operation. - */ - if (unlikely(dm_emulate_zone_append(md))) - r = dm_zone_map_bio(tio); - else - goto do_map; - } else { -do_map: - if (likely(ti->type->map == linear_map)) - r = linear_map(ti, clone); - else if (ti->type->map == stripe_map) - r = stripe_map(ti, clone); - else - r = ti->type->map(ti, clone); - } + if (likely(ti->type->map == linear_map)) + r = linear_map(ti, clone); + else if (ti->type->map == stripe_map) + r = stripe_map(ti, clone); + else + r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: @@ -1774,6 +1761,33 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io, ci->sector_count = 0; } +#ifdef CONFIG_BLK_DEV_ZONED +static inline bool dm_zone_bio_needs_split(struct mapped_device *md, + struct bio *bio) +{ + /* + * For mapped device that need zone append emulation, we must + * split any large BIO that straddles zone boundaries. + */ + return dm_emulate_zone_append(md) && bio_straddles_zones(bio) && + !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); +} +static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) +{ + return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); +} +#else +static inline bool dm_zone_bio_needs_split(struct mapped_device *md, + struct bio *bio) +{ + return false; +} +static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) +{ + return false; +} +#endif + /* * Entry point to split a bio into clones and submit them to the targets. */ @@ -1783,19 +1797,32 @@ static void dm_split_and_process_bio(struct mapped_device *md, struct clone_info ci; struct dm_io *io; blk_status_t error = BLK_STS_OK; - bool is_abnormal; + bool is_abnormal, need_split; + + need_split = is_abnormal = is_abnormal_io(bio); + if (static_branch_unlikely(&zoned_enabled)) + need_split = is_abnormal || dm_zone_bio_needs_split(md, bio); - is_abnormal = is_abnormal_io(bio); - if (unlikely(is_abnormal)) { + if (unlikely(need_split)) { /* * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) * otherwise associated queue_limits won't be imposed. + * Also split the BIO for mapped devices needing zone append + * emulation to ensure that the BIO does not cross zone + * boundaries. */ bio = bio_split_to_limits(bio); if (!bio) return; } + /* + * Use the block layer zone write plugging for mapped devices that + * need zone append emulation (e.g. dm-crypt). + */ + if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio)) + return; + /* Only support nowait for normal IO */ if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { io = alloc_io(md, bio, GFP_NOWAIT); @@ -2016,7 +2043,6 @@ static void cleanup_mapped_device(struct mapped_device *md) md->dax_dev = NULL; } - dm_cleanup_zoned_dev(md); if (md->disk) { spin_lock(&_minor_lock); md->disk->private_data = NULL; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 7f1acbf6bd9e..e0c57f19839b 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -104,13 +104,11 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED -void dm_cleanup_zoned_dev(struct mapped_device *md); int dm_blk_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_map_bio(struct dm_target_io *io); #else -static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {} #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) { diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 059afc24c08b..0a2d37eb38ef 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1424,7 +1424,7 @@ __acquires(bitmap->lock) sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; - sector_t csize; + sector_t csize = ((sector_t)1) << bitmap->chunkshift; int err; if (page >= bitmap->pages) { @@ -1433,6 +1433,7 @@ __acquires(bitmap->lock) * End-of-device while looking for a whole page or * user set a huge number to sysfs bitmap_set_bits. */ + *blocks = csize - (offset & (csize - 1)); return NULL; } err = md_bitmap_checkpage(bitmap, page, create, 0); @@ -1441,8 +1442,7 @@ __acquires(bitmap->lock) bitmap->bp[page].map == NULL) csize = ((sector_t)1) << (bitmap->chunkshift + PAGE_COUNTER_SHIFT); - else - csize = ((sector_t)1) << bitmap->chunkshift; + *blocks = csize - (offset & (csize - 1)); if (err < 0) diff --git a/drivers/md/md.c b/drivers/md/md.c index e575e74aabf5..aff9118ff697 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8087,7 +8087,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread) if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); set_bit(THREAD_WAKEUP, &t->flags); - wake_up(&t->wqueue); + if (wq_has_sleeper(&t->wqueue)) + wake_up(&t->wqueue); } rcu_read_unlock(); } @@ -8582,6 +8583,10 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; + + if (!init && !blk_queue_io_stat(disk->queue)) + continue; + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats diff --git a/drivers/md/md.h b/drivers/md/md.h index 097d9dbd69b8..ca085ecad504 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -621,7 +621,8 @@ extern void mddev_unlock(struct mddev *mddev); static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { - atomic_add(nr_sectors, &bdev->bd_disk->sync_io); + if (blk_queue_io_stat(bdev->bd_disk->queue)) + atomic_add(nr_sectors, &bdev->bd_disk->sync_io); } static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d874abfc1836..2bd1ce9b3922 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,7 +36,6 @@ */ #include <linux/blkdev.h> -#include <linux/delay.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> @@ -6734,6 +6733,9 @@ static void raid5d(struct md_thread *thread) int batch_size, released; unsigned int offset; + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + break; + released = release_stripe_list(conf, conf->temp_inactive_list); if (released) clear_bit(R5_DID_ALLOC, &conf->cache_state); @@ -6770,18 +6772,7 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); - - /* - * Waiting on MD_SB_CHANGE_PENDING below may deadlock - * seeing md_check_recovery() is needed to clear - * the flag when using mdmon. - */ - continue; } - - wait_event_lock_irq(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), - conf->device_lock); } pr_debug("%d stripes handled\n", handled); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 095f59e7aa93..bf7615cb36ee 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2132,7 +2132,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { - ret = blk_revalidate_disk_zones(ns->disk, NULL); + ret = blk_revalidate_disk_zones(ns->disk); if (ret && !nvme_first_scan(ns->disk)) goto out; } diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 3148d9f1bde6..0021d06041c1 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -52,14 +52,10 @@ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1)) return false; /* - * ZNS does not define a conventional zone type. If the underlying - * device has a bitmap set indicating the existence of conventional - * zones, reject the device. Otherwise, use report zones to detect if - * the device has conventional zones. + * ZNS does not define a conventional zone type. Use report zones + * to detect if the device has conventional zones and reject it if + * it does. */ - if (ns->bdev->bd_disk->conv_zones_bitmap) - return false; - ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev), validate_conv_zones_cb, NULL); if (ret < 0) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5b3230ef51fe..967b6d62bb37 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1869,7 +1869,6 @@ out_put_budget: case BLK_STS_OK: break; case BLK_STS_RESOURCE: - case BLK_STS_ZONE_RESOURCE: if (scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; break; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 65cdc8b77e35..64c5129044b3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1260,12 +1260,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) } } - if (req_op(rq) == REQ_OP_ZONE_APPEND) { - ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks); - if (ret) - goto fail; - } - fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); @@ -1348,7 +1342,6 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: - case REQ_OP_ZONE_APPEND: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, @@ -3981,7 +3974,6 @@ static void scsi_disk_release(struct device *dev) struct scsi_disk *sdkp = to_scsi_disk(dev); ida_free(&sd_index_ida, sdkp->index); - sd_zbc_free_zone_info(sdkp); put_device(&sdkp->device->sdev_gendev); free_opal_dev(sdkp->opal_dev); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 5c4285a582b2..49dd600bfa48 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -104,12 +104,6 @@ struct scsi_disk { * between zone starting LBAs is constant. */ u32 zone_starting_lba_gran; - u32 *zones_wp_offset; - spinlock_t zones_wp_offset_lock; - u32 *rev_wp_offset; - struct mutex rev_mutex; - struct work_struct zone_wp_offset_work; - char *zone_wp_update_buf; #endif atomic_t openers; sector_t capacity; /* size in logical blocks */ @@ -245,7 +239,6 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED -void sd_zbc_free_zone_info(struct scsi_disk *sdkp); int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, @@ -255,13 +248,8 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, - unsigned int nr_blocks); - #else /* CONFIG_BLK_DEV_ZONED */ -static inline void sd_zbc_free_zone_info(struct scsi_disk *sdkp) {} - static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) { return 0; @@ -285,13 +273,6 @@ static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, return good_bytes; } -static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, - sector_t *lba, - unsigned int nr_blocks) -{ - return BLK_STS_TARGET; -} - #define sd_zbc_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 26af5ab7d7c1..806036e48abe 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -23,36 +23,6 @@ #define CREATE_TRACE_POINTS #include "sd_trace.h" -/** - * sd_zbc_get_zone_wp_offset - Get zone write pointer offset. - * @zone: Zone for which to return the write pointer offset. - * - * Return: offset of the write pointer from the start of the zone. - */ -static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) -{ - if (zone->type == ZBC_ZONE_TYPE_CONV) - return 0; - - switch (zone->cond) { - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - default: - /* - * Offline and read-only zones do not have a valid - * write pointer. Use 0 as for an empty zone. - */ - return 0; - } -} - /* Whether or not a SCSI zone descriptor describes a gap zone. */ static bool sd_zbc_is_gap_zone(const u8 buf[64]) { @@ -121,9 +91,6 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], if (ret) return ret; - if (sdkp->rev_wp_offset) - sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone); - return 0; } @@ -347,123 +314,6 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) return BLK_STS_OK; } -#define SD_ZBC_INVALID_WP_OFST (~0u) -#define SD_ZBC_UPDATING_WP_OFST (SD_ZBC_INVALID_WP_OFST - 1) - -static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - struct scsi_disk *sdkp = data; - - lockdep_assert_held(&sdkp->zones_wp_offset_lock); - - sdkp->zones_wp_offset[idx] = sd_zbc_get_zone_wp_offset(zone); - - return 0; -} - -/* - * An attempt to append a zone triggered an invalid write pointer error. - * Reread the write pointer of the zone(s) in which the append failed. - */ -static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) -{ - struct scsi_disk *sdkp; - unsigned long flags; - sector_t zno; - int ret; - - sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work); - - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - for (zno = 0; zno < sdkp->zone_info.nr_zones; zno++) { - if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) - continue; - - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf, - SD_BUF_SIZE, - zno * sdkp->zone_info.zone_blocks, true); - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - if (!ret) - sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64, - zno, sd_zbc_update_wp_offset_cb, - sdkp); - } - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - - scsi_device_put(sdkp->device); -} - -/** - * sd_zbc_prepare_zone_append() - Prepare an emulated ZONE_APPEND command. - * @cmd: the command to setup - * @lba: the LBA to patch - * @nr_blocks: the number of LBAs to be written - * - * Called from sd_setup_read_write_cmnd() for REQ_OP_ZONE_APPEND. - * @sd_zbc_prepare_zone_append() handles the necessary zone wrote locking and - * patching of the lba for an emulated ZONE_APPEND command. - * - * In case the cached write pointer offset is %SD_ZBC_INVALID_WP_OFST it will - * schedule a REPORT ZONES command and return BLK_STS_IOERR. - */ -blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, - unsigned int nr_blocks) -{ - struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->q->disk); - unsigned int wp_offset, zno = blk_rq_zone_no(rq); - unsigned long flags; - blk_status_t ret; - - ret = sd_zbc_cmnd_checks(cmd); - if (ret != BLK_STS_OK) - return ret; - - if (!blk_rq_zone_is_seq(rq)) - return BLK_STS_IOERR; - - /* Unlock of the write lock will happen in sd_zbc_complete() */ - if (!blk_req_zone_write_trylock(rq)) - return BLK_STS_ZONE_RESOURCE; - - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - wp_offset = sdkp->zones_wp_offset[zno]; - switch (wp_offset) { - case SD_ZBC_INVALID_WP_OFST: - /* - * We are about to schedule work to update a zone write pointer - * offset, which will cause the zone append command to be - * requeued. So make sure that the scsi device does not go away - * while the work is being processed. - */ - if (scsi_device_get(sdkp->device)) { - ret = BLK_STS_IOERR; - break; - } - sdkp->zones_wp_offset[zno] = SD_ZBC_UPDATING_WP_OFST; - schedule_work(&sdkp->zone_wp_offset_work); - fallthrough; - case SD_ZBC_UPDATING_WP_OFST: - ret = BLK_STS_DEV_RESOURCE; - break; - default: - wp_offset = sectors_to_logical(sdkp->device, wp_offset); - if (wp_offset + nr_blocks > sdkp->zone_info.zone_blocks) { - ret = BLK_STS_IOERR; - break; - } - - trace_scsi_prepare_zone_append(cmd, *lba, wp_offset); - *lba += wp_offset; - } - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - if (ret) - blk_req_zone_write_unlock(rq); - return ret; -} - /** * sd_zbc_setup_zone_mgmt_cmnd - Prepare a zone ZBC_OUT command. The operations * can be RESET WRITE POINTER, OPEN, CLOSE or FINISH. @@ -504,96 +354,6 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, return BLK_STS_OK; } -static bool sd_zbc_need_zone_wp_update(struct request *rq) -{ - switch (req_op(rq)) { - case REQ_OP_ZONE_APPEND: - case REQ_OP_ZONE_FINISH: - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_RESET_ALL: - return true; - case REQ_OP_WRITE: - case REQ_OP_WRITE_ZEROES: - return blk_rq_zone_is_seq(rq); - default: - return false; - } -} - -/** - * sd_zbc_zone_wp_update - Update cached zone write pointer upon cmd completion - * @cmd: Completed command - * @good_bytes: Command reply bytes - * - * Called from sd_zbc_complete() to handle the update of the cached zone write - * pointer value in case an update is needed. - */ -static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, - unsigned int good_bytes) -{ - int result = cmd->result; - struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->q->disk); - unsigned int zno = blk_rq_zone_no(rq); - enum req_op op = req_op(rq); - unsigned long flags; - - /* - * If we got an error for a command that needs updating the write - * pointer offset cache, we must mark the zone wp offset entry as - * invalid to force an update from disk the next time a zone append - * command is issued. - */ - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - - if (result && op != REQ_OP_ZONE_RESET_ALL) { - if (op == REQ_OP_ZONE_APPEND) { - /* Force complete completion (no retry) */ - good_bytes = 0; - scsi_set_resid(cmd, blk_rq_bytes(rq)); - } - - /* - * Force an update of the zone write pointer offset on - * the next zone append access. - */ - if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) - sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST; - goto unlock_wp_offset; - } - - switch (op) { - case REQ_OP_ZONE_APPEND: - trace_scsi_zone_wp_update(cmd, rq->__sector, - sdkp->zones_wp_offset[zno], good_bytes); - rq->__sector += sdkp->zones_wp_offset[zno]; - fallthrough; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp)) - sdkp->zones_wp_offset[zno] += - good_bytes >> SECTOR_SHIFT; - break; - case REQ_OP_ZONE_RESET: - sdkp->zones_wp_offset[zno] = 0; - break; - case REQ_OP_ZONE_FINISH: - sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp); - break; - case REQ_OP_ZONE_RESET_ALL: - memset(sdkp->zones_wp_offset, 0, - sdkp->zone_info.nr_zones * sizeof(unsigned int)); - break; - default: - break; - } - -unlock_wp_offset: - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - - return good_bytes; -} - /** * sd_zbc_complete - ZBC command post processing. * @cmd: Completed command @@ -619,11 +379,7 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, * so be quiet about the error. */ rq->rq_flags |= RQF_QUIET; - } else if (sd_zbc_need_zone_wp_update(rq)) - good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes); - - if (req_op(rq) == REQ_OP_ZONE_APPEND) - blk_req_zone_write_unlock(rq); + } return good_bytes; } @@ -780,46 +536,6 @@ static void sd_zbc_print_zones(struct scsi_disk *sdkp) sdkp->zone_info.zone_blocks); } -static int sd_zbc_init_disk(struct scsi_disk *sdkp) -{ - sdkp->zones_wp_offset = NULL; - spin_lock_init(&sdkp->zones_wp_offset_lock); - sdkp->rev_wp_offset = NULL; - mutex_init(&sdkp->rev_mutex); - INIT_WORK(&sdkp->zone_wp_offset_work, sd_zbc_update_wp_offset_workfn); - sdkp->zone_wp_update_buf = kzalloc(SD_BUF_SIZE, GFP_KERNEL); - if (!sdkp->zone_wp_update_buf) - return -ENOMEM; - - return 0; -} - -void sd_zbc_free_zone_info(struct scsi_disk *sdkp) -{ - if (!sdkp->zone_wp_update_buf) - return; - - /* Serialize against revalidate zones */ - mutex_lock(&sdkp->rev_mutex); - - kvfree(sdkp->zones_wp_offset); - sdkp->zones_wp_offset = NULL; - kfree(sdkp->zone_wp_update_buf); - sdkp->zone_wp_update_buf = NULL; - - sdkp->early_zone_info = (struct zoned_disk_info){ }; - sdkp->zone_info = (struct zoned_disk_info){ }; - - mutex_unlock(&sdkp->rev_mutex); -} - -static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) -{ - struct scsi_disk *sdkp = scsi_disk(disk); - - swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset); -} - /* * Call blk_revalidate_disk_zones() if any of the zoned disk properties have * changed that make it necessary to call that function. Called by @@ -831,18 +547,8 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) struct request_queue *q = disk->queue; u32 zone_blocks = sdkp->early_zone_info.zone_blocks; unsigned int nr_zones = sdkp->early_zone_info.nr_zones; - int ret = 0; unsigned int flags; - - /* - * For all zoned disks, initialize zone append emulation data if not - * already done. - */ - if (sd_is_zoned(sdkp) && !sdkp->zone_wp_update_buf) { - ret = sd_zbc_init_disk(sdkp); - if (ret) - return ret; - } + int ret; /* * There is nothing to do for regular disks, including host-aware disks @@ -851,50 +557,32 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) if (!blk_queue_is_zoned(q)) return 0; - /* - * Make sure revalidate zones are serialized to ensure exclusive - * updates of the scsi disk data. - */ - mutex_lock(&sdkp->rev_mutex); - if (sdkp->zone_info.zone_blocks == zone_blocks && sdkp->zone_info.nr_zones == nr_zones && disk->nr_zones == nr_zones) - goto unlock; + return 0; - flags = memalloc_noio_save(); sdkp->zone_info.zone_blocks = zone_blocks; sdkp->zone_info.nr_zones = nr_zones; - sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL); - if (!sdkp->rev_wp_offset) { - ret = -ENOMEM; - memalloc_noio_restore(flags); - goto unlock; - } blk_queue_chunk_sectors(q, logical_to_sectors(sdkp->device, zone_blocks)); - blk_queue_max_zone_append_sectors(q, - q->limits.max_segments << PAGE_SECTORS_SHIFT); - ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb); + /* Enable block layer zone append emulation */ + blk_queue_max_zone_append_sectors(q, 0); + flags = memalloc_noio_save(); + ret = blk_revalidate_disk_zones(disk); memalloc_noio_restore(flags); - kvfree(sdkp->rev_wp_offset); - sdkp->rev_wp_offset = NULL; - if (ret) { sdkp->zone_info = (struct zoned_disk_info){ }; sdkp->capacity = 0; - goto unlock; + return ret; } sd_zbc_print_zones(sdkp); -unlock: - mutex_unlock(&sdkp->rev_mutex); - - return ret; + return 0; } /** @@ -917,10 +605,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) if (!sd_is_zoned(sdkp)) { /* * Device managed or normal SCSI disk, no special handling - * required. Nevertheless, free the disk zone information in - * case the device type changed. + * required. */ - sd_zbc_free_zone_info(sdkp); return 0; } @@ -941,7 +627,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); if (sdkp->zones_max_open == U32_MAX) disk_set_max_open_zones(disk, 0); else |