diff options
Diffstat (limited to 'drivers/block')
| -rw-r--r-- | drivers/block/loop.c | 106 | ||||
| -rw-r--r-- | drivers/block/mtip32xx/mtip32xx.c | 2 | ||||
| -rw-r--r-- | drivers/block/null_blk/main.c | 188 | ||||
| -rw-r--r-- | drivers/block/null_blk/null_blk.h | 6 | ||||
| -rw-r--r-- | drivers/block/null_blk/zoned.c | 20 | ||||
| -rw-r--r-- | drivers/block/rnbd/rnbd-clt.c | 2 | ||||
| -rw-r--r-- | drivers/block/rnull.rs | 2 | ||||
| -rw-r--r-- | drivers/block/sunvdc.c | 2 | ||||
| -rw-r--r-- | drivers/block/ublk_drv.c | 361 | ||||
| -rw-r--r-- | drivers/block/virtio_blk.c | 7 | ||||
| -rw-r--r-- | drivers/block/xen-blkfront.c | 2 | ||||
| -rw-r--r-- | drivers/block/zram/backend_zstd.c | 11 | ||||
| -rw-r--r-- | drivers/block/zram/zcomp.c | 48 | ||||
| -rw-r--r-- | drivers/block/zram/zcomp.h | 8 | ||||
| -rw-r--r-- | drivers/block/zram/zram_drv.c | 330 | ||||
| -rw-r--r-- | drivers/block/zram/zram_drv.h | 17 |
16 files changed, 683 insertions, 429 deletions
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c05fe27a96b6..674527d770dc 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -45,8 +45,6 @@ enum { Lo_deleting, }; -struct loop_func_table; - struct loop_device { int lo_number; loff_t lo_offset; @@ -54,7 +52,8 @@ struct loop_device { int lo_flags; char lo_file_name[LO_NAME_SIZE]; - struct file * lo_backing_file; + struct file *lo_backing_file; + unsigned int lo_min_dio_size; struct block_device *lo_device; gfp_t old_gfp_mask; @@ -169,29 +168,14 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file) * of backing device, and the logical block size of loop is bigger than that of * the backing device. */ -static bool lo_bdev_can_use_dio(struct loop_device *lo, - struct block_device *backing_bdev) -{ - unsigned int sb_bsize = bdev_logical_block_size(backing_bdev); - - if (queue_logical_block_size(lo->lo_queue) < sb_bsize) - return false; - if (lo->lo_offset & (sb_bsize - 1)) - return false; - return true; -} - static bool lo_can_use_dio(struct loop_device *lo) { - struct inode *inode = lo->lo_backing_file->f_mapping->host; - if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) return false; - - if (S_ISBLK(inode->i_mode)) - return lo_bdev_can_use_dio(lo, I_BDEV(inode)); - if (inode->i_sb->s_bdev) - return lo_bdev_can_use_dio(lo, inode->i_sb->s_bdev); + if (queue_logical_block_size(lo->lo_queue) < lo->lo_min_dio_size) + return false; + if (lo->lo_offset & (lo->lo_min_dio_size - 1)) + return false; return true; } @@ -205,20 +189,12 @@ static bool lo_can_use_dio(struct loop_device *lo) */ static inline void loop_update_dio(struct loop_device *lo) { - bool dio_in_use = lo->lo_flags & LO_FLAGS_DIRECT_IO; - lockdep_assert_held(&lo->lo_mutex); WARN_ON_ONCE(lo->lo_state == Lo_bound && lo->lo_queue->mq_freeze_depth == 0); - if (lo->lo_backing_file->f_flags & O_DIRECT) - lo->lo_flags |= LO_FLAGS_DIRECT_IO; if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo)) lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - - /* flush dirty pages before starting to issue direct I/O */ - if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !dio_in_use) - vfs_fsync(lo->lo_backing_file, 0); } /** @@ -541,6 +517,28 @@ static void loop_reread_partitions(struct loop_device *lo) __func__, lo->lo_number, lo->lo_file_name, rc); } +static unsigned int loop_query_min_dio_size(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev; + struct kstat st; + + /* + * Use the minimal dio alignment of the file system if provided. + */ + if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) && + (st.result_mask & STATX_DIOALIGN)) + return st.dio_offset_align; + + /* + * In a perfect world this wouldn't be needed, but as of Linux 6.13 only + * a handful of file systems support the STATX_DIOALIGN flag. + */ + if (sb_bdev) + return bdev_logical_block_size(sb_bdev); + return SECTOR_SIZE; +} + static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; @@ -573,6 +571,17 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) return 0; } +static void loop_assign_backing_file(struct loop_device *lo, struct file *file) +{ + lo->lo_backing_file = file; + lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); + mapping_set_gfp_mask(file->f_mapping, + lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); + if (lo->lo_backing_file->f_flags & O_DIRECT) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + lo->lo_min_dio_size = loop_query_min_dio_size(lo); +} + /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up @@ -622,14 +631,18 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) goto out_err; + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data the page cache now that so that the individual I/O + * operations don't have to do that. + */ + vfs_fsync(file, 0); + /* and ... switch */ disk_force_media_change(lo->lo_disk); memflags = blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); - mapping_set_gfp_mask(file->f_mapping, - lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue, memflags); partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; @@ -971,12 +984,11 @@ loop_set_status_from_info(struct loop_device *lo, return 0; } -static unsigned int loop_default_blocksize(struct loop_device *lo, - struct block_device *backing_bdev) +static unsigned int loop_default_blocksize(struct loop_device *lo) { - /* In case of direct I/O, match underlying block size */ - if ((lo->lo_backing_file->f_flags & O_DIRECT) && backing_bdev) - return bdev_logical_block_size(backing_bdev); + /* In case of direct I/O, match underlying minimum I/O size */ + if (lo->lo_flags & LO_FLAGS_DIRECT_IO) + return lo->lo_min_dio_size; return SECTOR_SIZE; } @@ -994,7 +1006,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, backing_bdev = inode->i_sb->s_bdev; if (!bsize) - bsize = loop_default_blocksize(lo, backing_bdev); + bsize = loop_default_blocksize(lo); loop_get_discard_config(lo, &granularity, &max_discard_sectors); @@ -1019,7 +1031,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, const struct loop_config *config) { struct file *file = fget(config->fd); - struct address_space *mapping; struct queue_limits lim; int error; loff_t size; @@ -1055,8 +1066,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, if (error) goto out_unlock; - mapping = file->f_mapping; - if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) { error = -EINVAL; goto out_unlock; @@ -1088,9 +1097,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->lo_device = bdev; - lo->lo_backing_file = file; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_assign_backing_file(lo, file); lim = queue_limits_start_update(lo->lo_queue); loop_update_limits(lo, &lim, config->block_size); @@ -1099,6 +1106,13 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, if (error) goto out_unlock; + /* + * We might switch to direct I/O mode for the loop device, write back + * all dirty data the page cache now that so that the individual I/O + * operations don't have to do that. + */ + vfs_fsync(file, 0); + loop_update_dio(lo); loop_sysfs_init(lo); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 95361099a2dc..0d619df03fa9 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2056,7 +2056,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, unsigned int nents; /* Map the scatter list for DMA access */ - nents = blk_rq_map_sg(hctx->queue, rq, command->sg); + nents = blk_rq_map_sg(rq, command->sg); nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); prefetch(&port->flags); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d94ef37480bd..3bb9cee0a9b5 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -473,6 +473,8 @@ NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(fua, bool, NULL); NULLB_DEVICE_ATTR(rotational, bool, NULL); +NULLB_DEVICE_ATTR(badblocks_once, bool, NULL); +NULLB_DEVICE_ATTR(badblocks_partial_io, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -559,14 +561,14 @@ static ssize_t nullb_device_badblocks_store(struct config_item *item, goto out; /* enable badblocks */ cmpxchg(&t_dev->badblocks.shift, -1, 0); - if (buf[0] == '+') - ret = badblocks_set(&t_dev->badblocks, start, - end - start + 1, 1); - else - ret = badblocks_clear(&t_dev->badblocks, start, - end - start + 1); - if (ret == 0) + if (buf[0] == '+') { + if (badblocks_set(&t_dev->badblocks, start, + end - start + 1, 1)) + ret = count; + } else if (badblocks_clear(&t_dev->badblocks, start, + end - start + 1)) { ret = count; + } out: kfree(orig); return ret; @@ -592,41 +594,43 @@ static ssize_t nullb_device_zone_offline_store(struct config_item *item, CONFIGFS_ATTR_WO(nullb_device_, zone_offline); static struct configfs_attribute *nullb_device_attrs[] = { - &nullb_device_attr_size, + &nullb_device_attr_badblocks, + &nullb_device_attr_badblocks_once, + &nullb_device_attr_badblocks_partial_io, + &nullb_device_attr_blocking, + &nullb_device_attr_blocksize, + &nullb_device_attr_cache_size, &nullb_device_attr_completion_nsec, - &nullb_device_attr_submit_queues, - &nullb_device_attr_poll_queues, + &nullb_device_attr_discard, + &nullb_device_attr_fua, &nullb_device_attr_home_node, - &nullb_device_attr_queue_mode, - &nullb_device_attr_blocksize, - &nullb_device_attr_max_sectors, - &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, - &nullb_device_attr_blocking, - &nullb_device_attr_use_per_node_hctx, - &nullb_device_attr_power, - &nullb_device_attr_memory_backed, - &nullb_device_attr_discard, + &nullb_device_attr_irqmode, + &nullb_device_attr_max_sectors, &nullb_device_attr_mbps, - &nullb_device_attr_cache_size, - &nullb_device_attr_badblocks, - &nullb_device_attr_zoned, - &nullb_device_attr_zone_size, + &nullb_device_attr_memory_backed, + &nullb_device_attr_no_sched, + &nullb_device_attr_poll_queues, + &nullb_device_attr_power, + &nullb_device_attr_queue_mode, + &nullb_device_attr_rotational, + &nullb_device_attr_shared_tag_bitmap, + &nullb_device_attr_shared_tags, + &nullb_device_attr_size, + &nullb_device_attr_submit_queues, + &nullb_device_attr_use_per_node_hctx, + &nullb_device_attr_virt_boundary, + &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_capacity, - &nullb_device_attr_zone_nr_conv, - &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_full, &nullb_device_attr_zone_max_active, - &nullb_device_attr_zone_append_max_sectors, - &nullb_device_attr_zone_readonly, + &nullb_device_attr_zone_max_open, + &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_offline, - &nullb_device_attr_zone_full, - &nullb_device_attr_virt_boundary, - &nullb_device_attr_no_sched, - &nullb_device_attr_shared_tags, - &nullb_device_attr_shared_tag_bitmap, - &nullb_device_attr_fua, - &nullb_device_attr_rotational, + &nullb_device_attr_zone_readonly, + &nullb_device_attr_zone_size, + &nullb_device_attr_zoned, NULL, }; @@ -704,16 +708,28 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, - "badblocks,blocking,blocksize,cache_size,fua," - "completion_nsec,discard,home_node,hw_queue_depth," - "irqmode,max_sectors,mbps,memory_backed,no_sched," - "poll_queues,power,queue_mode,shared_tag_bitmap," - "shared_tags,size,submit_queues,use_per_node_hctx," - "virt_boundary,zoned,zone_capacity,zone_max_active," - "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size,zone_append_max_sectors,zone_full," - "rotational\n"); + + struct configfs_attribute **entry; + char delimiter = ','; + size_t left = PAGE_SIZE; + size_t written = 0; + int ret; + + for (entry = &nullb_device_attrs[0]; *entry && left > 0; entry++) { + if (!*(entry + 1)) + delimiter = '\n'; + ret = snprintf(page + written, left, "%s%c", (*entry)->ca_name, + delimiter); + if (ret >= left) { + WARN_ONCE(1, "Too many null_blk features to print\n"); + memzero_explicit(page, PAGE_SIZE); + return -ENOBUFS; + } + left -= ret; + written += ret; + } + + return written; } CONFIGFS_ATTR_RO(memb_group_, features); @@ -1249,25 +1265,37 @@ static int null_transfer(struct nullb *nullb, struct page *page, return err; } -static blk_status_t null_handle_rq(struct nullb_cmd *cmd) +/* + * Transfer data for the given request. The transfer size is capped with the + * nr_sectors argument. + */ +static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, + sector_t nr_sectors) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); + unsigned int max_bytes = nr_sectors << SECTOR_SHIFT; + unsigned int transferred_bytes = 0; struct req_iterator iter; struct bio_vec bvec; spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; + if (transferred_bytes + len > max_bytes) + len = max_bytes - transferred_bytes; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); if (err) break; sector += len >> SECTOR_SHIFT; + transferred_bytes += len; + if (transferred_bytes >= max_bytes) + break; } spin_unlock_irq(&nullb->lock); @@ -1295,31 +1323,51 @@ static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) return sts; } -static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, - sector_t sector, - sector_t nr_sectors) +/* + * Check if the command should fail for the badblocks. If so, return + * BLK_STS_IOERR and return number of partial I/O sectors to be written or read, + * which may be less than the requested number of sectors. + * + * @cmd: The command to handle. + * @sector: The start sector for I/O. + * @nr_sectors: Specifies number of sectors to write or read, and returns the + * number of sectors to be written or read. + */ +blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, + unsigned int *nr_sectors) { struct badblocks *bb = &cmd->nq->dev->badblocks; - sector_t first_bad; - int bad_sectors; + struct nullb_device *dev = cmd->nq->dev; + unsigned int block_sectors = dev->blocksize >> SECTOR_SHIFT; + sector_t first_bad, bad_sectors; + unsigned int partial_io_sectors = 0; - if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) - return BLK_STS_IOERR; + if (!badblocks_check(bb, sector, *nr_sectors, &first_bad, &bad_sectors)) + return BLK_STS_OK; - return BLK_STS_OK; + if (cmd->nq->dev->badblocks_once) + badblocks_clear(bb, first_bad, bad_sectors); + + if (cmd->nq->dev->badblocks_partial_io) { + if (!IS_ALIGNED(first_bad, block_sectors)) + first_bad = ALIGN_DOWN(first_bad, block_sectors); + if (sector < first_bad) + partial_io_sectors = first_bad - sector; + } + *nr_sectors = partial_io_sectors; + + return BLK_STS_IOERR; } -static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_op op, - sector_t sector, - sector_t nr_sectors) +blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); - return null_handle_rq(cmd); + return null_handle_data_transfer(cmd, nr_sectors); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) @@ -1366,18 +1414,19 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; + blk_status_t badblocks_ret = BLK_STS_OK; blk_status_t ret; - if (dev->badblocks.shift != -1) { - ret = null_handle_badblocks(cmd, sector, nr_sectors); + if (dev->badblocks.shift != -1) + badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors); + + if (dev->memory_backed && nr_sectors) { + ret = null_handle_memory_backed(cmd, op, sector, nr_sectors); if (ret != BLK_STS_OK) return ret; } - if (dev->memory_backed) - return null_handle_memory_backed(cmd, op, sector, nr_sectors); - - return BLK_STS_OK; + return badblocks_ret; } static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, @@ -1426,8 +1475,7 @@ static void nullb_setup_bwtimer(struct nullb *nullb) { ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); - hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - nullb->bw_timer.function = nullb_bwtimer_fn; + hrtimer_setup(&nullb->bw_timer, nullb_bwtimer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); } @@ -1549,8 +1597,8 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) cmd = blk_mq_rq_to_pdu(req); cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req), blk_rq_sectors(req)); - if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error, - blk_mq_end_request_batch)) + if (!blk_mq_add_to_batch(req, iob, cmd->error != BLK_STS_OK, + blk_mq_end_request_batch)) blk_mq_end_request(req, cmd->error); nr++; } @@ -1604,8 +1652,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; + hrtimer_setup(&cmd->timer, null_cmd_timer_expired, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); } cmd->error = BLK_STS_OK; cmd->nq = nq; diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 6f9fe6171087..7bb6128dbaaf 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -63,6 +63,8 @@ struct nullb_device { unsigned long flags; /* device flags */ unsigned int curr_cache; struct badblocks badblocks; + bool badblocks_once; + bool badblocks_partial_io; unsigned int nr_zones; unsigned int nr_zones_imp_open; @@ -131,6 +133,10 @@ blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, sector_t nr_sectors); blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors); +blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, sector_t sector, + unsigned int *nr_sectors); +blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, sector_t nr_sectors); #ifdef CONFIG_BLK_DEV_ZONED int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 0d5f9bf95229..4e5728f45989 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -353,6 +353,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, struct nullb_device *dev = cmd->nq->dev; unsigned int zno = null_zone_no(dev, sector); struct nullb_zone *zone = &dev->zones[zno]; + blk_status_t badblocks_ret = BLK_STS_OK; blk_status_t ret; trace_nullb_zone_op(cmd, zno, zone->cond); @@ -412,9 +413,20 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, zone->cond = BLK_ZONE_COND_IMP_OPEN; } - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (ret != BLK_STS_OK) - goto unlock_zone; + if (dev->badblocks.shift != -1) { + badblocks_ret = null_handle_badblocks(cmd, sector, &nr_sectors); + if (badblocks_ret != BLK_STS_OK && !nr_sectors) { + ret = badblocks_ret; + goto unlock_zone; + } + } + + if (dev->memory_backed) { + ret = null_handle_memory_backed(cmd, REQ_OP_WRITE, sector, + nr_sectors); + if (ret != BLK_STS_OK) + goto unlock_zone; + } zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { @@ -429,7 +441,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, zone->cond = BLK_ZONE_COND_FULL; } - ret = BLK_STS_OK; + ret = badblocks_ret; unlock_zone: null_unlock_zone(dev, zone); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 82467ecde7ec..15627417f12e 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1010,7 +1010,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, * See queue limits. */ if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES)) - sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl); + sg_cnt = blk_rq_map_sg(rq, iu->sgt.sgl); if (sg_cnt == 0) sg_mark_end(&iu->sgt.sgl[0]); diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs index ddf3629d8894..d07e76ae2c13 100644 --- a/drivers/block/rnull.rs +++ b/drivers/block/rnull.rs @@ -27,7 +27,7 @@ use kernel::{ module! { type: NullBlkModule, name: "rnull_mod", - author: "Andreas Hindborg", + authors: ["Andreas Hindborg"], description: "Rust implementation of the C null block driver", license: "GPL v2", } diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 282f81616a78..2b33fb5b949b 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -485,7 +485,7 @@ static int __send_request(struct request *req) } sg_init_table(sg, port->ring_cookies); - nsg = blk_rq_map_sg(req->q, req, sg); + nsg = blk_rq_map_sg(req, sg); len = 0; for (i = 0; i < nsg; i++) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 529085181f35..2fd05c1bd30b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -51,6 +51,9 @@ /* private ioctl command mirror */ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) +#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) +#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ @@ -70,15 +73,31 @@ /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ - UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED) + UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ + UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) struct ublk_rq_data { - struct llist_node node; - struct kref ref; }; struct ublk_uring_cmd_pdu { + /* + * Store requests in same batch temporarily for queuing them to + * daemon context. + * + * It should have been stored to request payload, but we do want + * to avoid extra pre-allocation, and uring_cmd payload is always + * free for us + */ + union { + struct request *req; + struct request *req_list; + }; + + /* + * The following two are valid in this cmd whole lifetime, and + * setup in ublk uring_cmd handler + */ struct ublk_queue *ubq; u16 tag; }; @@ -139,12 +158,8 @@ struct ublk_queue { unsigned long flags; struct task_struct *ubq_daemon; - char *io_cmd_buf; - - struct llist_head io_cmds; + struct ublksrv_io_desc *io_cmd_buf; - unsigned long io_addr; /* mapped vm address */ - unsigned int max_io_sz; bool force_abort; bool timeout; bool canceling; @@ -196,12 +211,14 @@ struct ublk_params_header { static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + struct ublk_queue *ubq, int tag, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag); static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) { - return ub->dev_info.flags & UBLK_F_USER_COPY; + return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) @@ -489,15 +506,17 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); + +#define UBLK_MAX_UBLKS UBLK_MINORS + /* - * Max ublk devices allowed to add + * Max unprivileged ublk devices allowed to add * * It can be extended to one per-user limit in future or even controlled * by cgroup. */ -#define UBLK_MAX_UBLKS UBLK_MINORS -static unsigned int ublks_max = 64; -static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ +static unsigned int unprivileged_ublks_max = 64; +static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */ static struct miscdevice ublk_misc; @@ -568,6 +587,28 @@ static int ublk_validate_params(const struct ublk_device *ub) else if (ublk_dev_is_zoned(ub)) return -EINVAL; + if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) { + const struct ublk_param_dma_align *p = &ub->params.dma; + + if (p->alignment >= PAGE_SIZE) + return -EINVAL; + + if (!is_power_of_2(p->alignment + 1)) + return -EINVAL; + } + + if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { + const struct ublk_param_segment *p = &ub->params.seg; + + if (!is_power_of_2(p->seg_boundary_mask + 1)) + return -EINVAL; + + if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE) + return -EINVAL; + if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE) + return -EINVAL; + } + return 0; } @@ -581,7 +622,12 @@ static void ublk_apply_params(struct ublk_device *ub) static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { - return ubq->flags & UBLK_F_USER_COPY; + return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); +} + +static inline bool ublk_need_map_io(const struct ublk_queue *ubq) +{ + return !ublk_support_user_copy(ubq); } static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) @@ -660,11 +706,11 @@ static inline bool ublk_rq_has_data(const struct request *rq) static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag) { - return (struct ublksrv_io_desc *) - &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); + return &ubq->io_cmd_buf[tag]; } -static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) +static inline struct ublksrv_io_desc * +ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) { return ublk_get_queue(ub, q_id)->io_cmd_buf; } @@ -911,7 +957,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (ublk_support_user_copy(ubq)) + if (!ublk_need_map_io(ubq)) return rq_bytes; /* @@ -935,7 +981,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (ublk_support_user_copy(ubq)) + if (!ublk_need_map_io(ubq)) return rq_bytes; if (ublk_need_unmap_req(req)) { @@ -1023,7 +1069,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( struct io_uring_cmd *ioucmd) { - return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu; + return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); } static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) @@ -1095,7 +1141,7 @@ static void ublk_complete_rq(struct kref *ref) } /* - * Since __ublk_rq_task_work always fails requests immediately during + * Since ublk_rq_task_work_cb always fails requests immediately during * exiting, __ublk_fail_req() is only called from abort context during * exiting. So lock is unnecessary. * @@ -1141,10 +1187,10 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, blk_mq_end_request(rq, BLK_STS_IOERR); } -static inline void __ublk_rq_task_work(struct request *req, - unsigned issue_flags) +static void ublk_dispatch_req(struct ublk_queue *ubq, + struct request *req, + unsigned int issue_flags) { - struct ublk_queue *ubq = req->mq_hctx->driver_data; int tag = req->tag; struct ublk_io *io = &ubq->ios[tag]; unsigned int mapped_bytes; @@ -1220,34 +1266,49 @@ static inline void __ublk_rq_task_work(struct request *req, ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); } -static inline void ublk_forward_io_cmds(struct ublk_queue *ubq, - unsigned issue_flags) +static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_queue *ubq = pdu->ubq; + + ublk_dispatch_req(ubq, pdu->req, issue_flags); +} + +static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { - struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); - struct ublk_rq_data *data, *tmp; + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - io_cmds = llist_reverse_order(io_cmds); - llist_for_each_entry_safe(data, tmp, io_cmds, node) - __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags); + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); } -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags) +static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, + unsigned int issue_flags) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct request *rq = pdu->req_list; struct ublk_queue *ubq = pdu->ubq; + struct request *next; - ublk_forward_io_cmds(ubq, issue_flags); + do { + next = rq->rq_next; + rq->rq_next = NULL; + ublk_dispatch_req(ubq, rq, issue_flags); + rq = next; + } while (rq); } -static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) +static void ublk_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) { - struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); - - if (llist_add(&data->node, &ubq->io_cmds)) { - struct ublk_io *io = &ubq->ios[rq->tag]; + struct request *rq = rq_list_peek(l); + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); - io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb); - } + pdu->req_list = rq; + rq_list_init(l); + io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb); } static enum blk_eh_timer_return ublk_timeout(struct request *rq) @@ -1288,21 +1349,12 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq) return BLK_EH_RESET_TIMER; } -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq) { - struct ublk_queue *ubq = hctx->driver_data; - struct request *rq = bd->rq; blk_status_t res; - if (unlikely(ubq->fail_io)) { + if (unlikely(ubq->fail_io)) return BLK_STS_TARGET; - } - - /* fill iod to slot in io cmd buffer */ - res = ublk_setup_iod(ubq, rq); - if (unlikely(res != BLK_STS_OK)) - return BLK_STS_IOERR; /* With recovery feature enabled, force_abort is set in * ublk_stop_dev() before calling del_gendisk(). We have to @@ -1316,17 +1368,68 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) return BLK_STS_IOERR; + if (unlikely(ubq->canceling)) + return BLK_STS_IOERR; + + /* fill iod to slot in io cmd buffer */ + res = ublk_setup_iod(ubq, rq); + if (unlikely(res != BLK_STS_OK)) + return BLK_STS_IOERR; + + blk_mq_start_request(rq); + return BLK_STS_OK; +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + blk_status_t res; + + res = ublk_prep_req(ubq, rq); + if (res != BLK_STS_OK) + return res; + + /* + * ->canceling has to be handled after ->force_abort and ->fail_io + * is dealt with, otherwise this request may not be failed in case + * of recovery, and cause hang when deleting disk + */ if (unlikely(ubq->canceling)) { __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - blk_mq_start_request(bd->rq); ublk_queue_cmd(ubq, rq); - return BLK_STS_OK; } +static void ublk_queue_rqs(struct rq_list *rqlist) +{ + struct rq_list requeue_list = { }; + struct rq_list submit_list = { }; + struct ublk_queue *ubq = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct ublk_queue *this_q = req->mq_hctx->driver_data; + + if (ubq && ubq != this_q && !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(ubq, &submit_list); + ubq = this_q; + + if (ublk_prep_req(ubq, req) == BLK_STS_OK) + rq_list_add_tail(&submit_list, req); + else + rq_list_add_tail(&requeue_list, req); + } + + if (ubq && !rq_list_empty(&submit_list)) + ublk_queue_cmd_list(ubq, &submit_list); + *rqlist = requeue_list; +} + static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { @@ -1339,6 +1442,7 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, + .queue_rqs = ublk_queue_rqs, .init_hctx = ublk_init_hctx, .timeout = ublk_timeout, }; @@ -1440,7 +1544,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) struct request *rq; /* - * Either we fail the request or ublk_rq_task_work_fn + * Either we fail the request or ublk_rq_task_work_cb * will do it */ rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); @@ -1452,17 +1556,27 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) } } +/* Must be called when queue is frozen */ +static bool ublk_mark_queue_canceling(struct ublk_queue *ubq) +{ + bool canceled; + + spin_lock(&ubq->cancel_lock); + canceled = ubq->canceling; + if (!canceled) + ubq->canceling = true; + spin_unlock(&ubq->cancel_lock); + + return canceled; +} + static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) { + bool was_canceled = ubq->canceling; struct gendisk *disk; - spin_lock(&ubq->cancel_lock); - if (ubq->canceling) { - spin_unlock(&ubq->cancel_lock); + if (was_canceled) return false; - } - ubq->canceling = true; - spin_unlock(&ubq->cancel_lock); spin_lock(&ub->lock); disk = ub->ub_disk; @@ -1474,14 +1588,23 @@ static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq) if (!disk) return false; - /* Now we are serialized with ublk_queue_rq() */ + /* + * Now we are serialized with ublk_queue_rq() + * + * Make sure that ubq->canceling is set when queue is frozen, + * because ublk_queue_rq() has to rely on this flag for avoiding to + * touch completed uring_cmd + */ blk_mq_quiesce_queue(disk->queue); - /* abort queue is for making forward progress */ - ublk_abort_queue(ub, ubq); + was_canceled = ublk_mark_queue_canceling(ubq); + if (!was_canceled) { + /* abort queue is for making forward progress */ + ublk_abort_queue(ub, ubq); + } blk_mq_unquiesce_queue(disk->queue); put_device(disk_to_dev(disk)); - return true; + return !was_canceled; } static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, @@ -1747,6 +1870,42 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, io_uring_cmd_mark_cancelable(cmd, issue_flags); } +static void ublk_io_release(void *priv) +{ + struct request *rq = priv; + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + + ublk_put_req_ref(ubq, rq); +} + +static int ublk_register_io_buf(struct io_uring_cmd *cmd, + struct ublk_queue *ubq, unsigned int tag, + unsigned int index, unsigned int issue_flags) +{ + struct ublk_device *ub = cmd->file->private_data; + struct request *req; + int ret; + + req = __ublk_check_and_get_req(ub, ubq, tag, 0); + if (!req) + return -EINVAL; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) { + ublk_put_req_ref(ubq, req); + return ret; + } + + return 0; +} + +static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, + unsigned int index, unsigned int issue_flags) +{ + return io_buffer_unregister_bvec(cmd, index, issue_flags); +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) @@ -1798,6 +1957,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ret = -EINVAL; switch (_IOC_NR(cmd_op)) { + case UBLK_IO_REGISTER_IO_BUF: + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); + case UBLK_IO_UNREGISTER_IO_BUF: + return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); case UBLK_IO_FETCH_REQ: /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ if (ublk_queue_ready(ubq)) { @@ -1811,7 +1974,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) goto out; - if (!ublk_support_user_copy(ubq)) { + if (ublk_need_map_io(ubq)) { /* * FETCH_RQ has to provide IO buffer if NEED GET * DATA is not enabled @@ -1833,7 +1996,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; - if (!ublk_support_user_copy(ubq)) { + if (ublk_need_map_io(ubq)) { /* * COMMIT_AND_FETCH_REQ has to provide IO buffer if * NEED GET DATA is not enabled or it is Read IO. @@ -1866,10 +2029,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; out: - io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", __func__, cmd_op, tag, ret, io->flags); - return -EIOCBQUEUED; + return ret; } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, @@ -1925,7 +2087,10 @@ static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, unsigned int issue_flags) { - ublk_ch_uring_cmd_local(cmd, issue_flags); + int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); + + if (ret != -EIOCBQUEUED) + io_uring_cmd_done(cmd, ret, 0, issue_flags); } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) @@ -2190,7 +2355,8 @@ static int ublk_add_chdev(struct ublk_device *ub) if (ret) goto fail; - ublks_added++; + if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) + unprivileged_ublks_added++; return 0; fail: put_device(dev); @@ -2219,11 +2385,16 @@ static int ublk_add_tag_set(struct ublk_device *ub) static void ublk_remove(struct ublk_device *ub) { + bool unprivileged; + ublk_stop_dev(ub); cancel_work_sync(&ub->nosrv_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); + unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; ublk_put_device(ub); - ublks_added--; + + if (unprivileged) + unprivileged_ublks_added--; } static struct ublk_device *ublk_get_device_from_id(int idx) @@ -2298,6 +2469,15 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL) lim.features |= BLK_FEAT_ROTATIONAL; + if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) + lim.dma_alignment = ub->params.dma.alignment; + + if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) { + lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask; + lim.max_segment_size = ub->params.seg.max_segment_size; + lim.max_segments = ub->params.seg.max_segments; + } + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; @@ -2459,7 +2639,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) * buffer by pwrite() to ublk char device, which can't be * used for unprivileged device */ - if (info.flags & UBLK_F_USER_COPY) + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) return -EINVAL; } @@ -2485,7 +2665,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) return ret; ret = -EACCES; - if (ublks_added >= ublks_max) + if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) && + unprivileged_ublks_added >= unprivileged_ublks_max) goto out_unlock; ret = -ENOMEM; @@ -2527,9 +2708,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) goto out_free_dev_number; } - /* We are not ready to support zero copy */ - ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; - ub->dev_info.nr_hw_queues = min_t(unsigned int, ub->dev_info.nr_hw_queues, nr_cpu_ids); ublk_align_max_io_size(ub); @@ -2715,9 +2893,12 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - /* parameters can only be changed when device isn't live */ mutex_lock(&ub->mutex); - if (ub->dev_info.state == UBLK_S_DEV_LIVE) { + if (test_bit(UB_STATE_USED, &ub->state)) { + /* + * Parameters can only be changed when device hasn't + * been started yet + */ ret = -EACCES; } else if (copy_from_user(&ub->params, argp, ph.len)) { ret = -EFAULT; @@ -2860,7 +3041,7 @@ static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) { const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; - u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; + u64 features = UBLK_F_ALL; if (header->len != UBLK_FEATURES_LEN || !header->addr) return -EINVAL; @@ -3056,10 +3237,9 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (ub) ublk_put_device(ub); out: - io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); - return -EIOCBQUEUED; + return ret; } static const struct file_operations ublk_ctl_fops = { @@ -3123,23 +3303,26 @@ static void __exit ublk_exit(void) module_init(ublk_init); module_exit(ublk_exit); -static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp) +static int ublk_set_max_unprivileged_ublks(const char *buf, + const struct kernel_param *kp) { return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS); } -static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp) +static int ublk_get_max_unprivileged_ublks(char *buf, + const struct kernel_param *kp) { - return sysfs_emit(buf, "%u\n", ublks_max); + return sysfs_emit(buf, "%u\n", unprivileged_ublks_max); } -static const struct kernel_param_ops ublk_max_ublks_ops = { - .set = ublk_set_max_ublks, - .get = ublk_get_max_ublks, +static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = { + .set = ublk_set_max_unprivileged_ublks, + .get = ublk_get_max_unprivileged_ublks, }; -module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644); -MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); +module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops, + &unprivileged_ublks_max, 0644); +MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add(default: 64)"); MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); MODULE_DESCRIPTION("Userspace block device"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6a61ec35f426..7cffea01d868 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -226,7 +226,7 @@ static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req, if (unlikely(err)) return -ENOMEM; - return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl); + return blk_rq_map_sg(req, vbr->sg_table.sgl); } static void virtblk_cleanup_cmd(struct request *req) @@ -1207,11 +1207,12 @@ static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { struct request *req = blk_mq_rq_from_pdu(vbr); + u8 status = virtblk_vbr_status(vbr); found++; if (!blk_mq_complete_request_remote(req) && - !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), - virtblk_complete_batch)) + !blk_mq_add_to_batch(req, iob, status != VIRTIO_BLK_S_OK, + virtblk_complete_batch)) virtblk_request_done(req); } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index edcd08a9dcef..5babe575c288 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -751,7 +751,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri id = blkif_ring_get_request(rinfo, req, &final_ring_req); ring_req = &rinfo->shadow[id].req; - num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); + num_sg = blk_rq_map_sg(req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) diff --git a/drivers/block/zram/backend_zstd.c b/drivers/block/zram/backend_zstd.c index 1184c0036f44..22c8067536f3 100644 --- a/drivers/block/zram/backend_zstd.c +++ b/drivers/block/zram/backend_zstd.c @@ -24,19 +24,10 @@ struct zstd_params { /* * For C/D dictionaries we need to provide zstd with zstd_custom_mem, * which zstd uses internally to allocate/free memory when needed. - * - * This means that allocator.customAlloc() can be called from zcomp_compress() - * under local-lock (per-CPU compression stream), in which case we must use - * GFP_ATOMIC. - * - * Another complication here is that we can be configured as a swap device. */ static void *zstd_custom_alloc(void *opaque, size_t size) { - if (!preemptible()) - return kvzalloc(size, GFP_ATOMIC); - - return kvzalloc(size, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN); + return kvzalloc(size, GFP_NOIO | __GFP_NOWARN); } static void zstd_custom_free(void *opaque, void *address) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index bb514403e305..d26a58c67e95 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -6,8 +6,7 @@ #include <linux/slab.h> #include <linux/wait.h> #include <linux/sched.h> -#include <linux/cpu.h> -#include <linux/crypto.h> +#include <linux/cpuhotplug.h> #include <linux/vmalloc.h> #include "zcomp.h" @@ -46,6 +45,7 @@ static const struct zcomp_ops *backends[] = { static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) { comp->ops->destroy_ctx(&zstrm->ctx); + vfree(zstrm->local_copy); vfree(zstrm->buffer); zstrm->buffer = NULL; } @@ -58,12 +58,13 @@ static int zcomp_strm_init(struct zcomp *comp, struct zcomp_strm *zstrm) if (ret) return ret; + zstrm->local_copy = vzalloc(PAGE_SIZE); /* * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ zstrm->buffer = vzalloc(2 * PAGE_SIZE); - if (!zstrm->buffer) { + if (!zstrm->buffer || !zstrm->local_copy) { zcomp_strm_free(comp, zstrm); return -ENOMEM; } @@ -109,13 +110,29 @@ ssize_t zcomp_available_show(const char *comp, char *buf) struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) { - local_lock(&comp->stream->lock); - return this_cpu_ptr(comp->stream); + for (;;) { + struct zcomp_strm *zstrm = raw_cpu_ptr(comp->stream); + + /* + * Inspired by zswap + * + * stream is returned with ->mutex locked which prevents + * cpu_dead() from releasing this stream under us, however + * there is still a race window between raw_cpu_ptr() and + * mutex_lock(), during which we could have been migrated + * from a CPU that has already destroyed its stream. If + * so then unlock and re-try on the current CPU. + */ + mutex_lock(&zstrm->lock); + if (likely(zstrm->buffer)) + return zstrm; + mutex_unlock(&zstrm->lock); + } } -void zcomp_stream_put(struct zcomp *comp) +void zcomp_stream_put(struct zcomp_strm *zstrm) { - local_unlock(&comp->stream->lock); + mutex_unlock(&zstrm->lock); } int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, @@ -129,6 +146,7 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, }; int ret; + might_sleep(); ret = comp->ops->compress(comp->params, &zstrm->ctx, &req); if (!ret) *dst_len = req.dst_len; @@ -145,18 +163,16 @@ int zcomp_decompress(struct zcomp *comp, struct zcomp_strm *zstrm, .dst_len = PAGE_SIZE, }; + might_sleep(); return comp->ops->decompress(comp->params, &zstrm->ctx, &req); } int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) { struct zcomp *comp = hlist_entry(node, struct zcomp, node); - struct zcomp_strm *zstrm; + struct zcomp_strm *zstrm = per_cpu_ptr(comp->stream, cpu); int ret; - zstrm = per_cpu_ptr(comp->stream, cpu); - local_lock_init(&zstrm->lock); - ret = zcomp_strm_init(comp, zstrm); if (ret) pr_err("Can't allocate a compression stream\n"); @@ -166,16 +182,17 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct zcomp *comp = hlist_entry(node, struct zcomp, node); - struct zcomp_strm *zstrm; + struct zcomp_strm *zstrm = per_cpu_ptr(comp->stream, cpu); - zstrm = per_cpu_ptr(comp->stream, cpu); + mutex_lock(&zstrm->lock); zcomp_strm_free(comp, zstrm); + mutex_unlock(&zstrm->lock); return 0; } static int zcomp_init(struct zcomp *comp, struct zcomp_params *params) { - int ret; + int ret, cpu; comp->stream = alloc_percpu(struct zcomp_strm); if (!comp->stream) @@ -186,6 +203,9 @@ static int zcomp_init(struct zcomp *comp, struct zcomp_params *params) if (ret) goto cleanup; + for_each_possible_cpu(cpu) + mutex_init(&per_cpu_ptr(comp->stream, cpu)->lock); + ret = cpuhp_state_add_instance(CPUHP_ZCOMP_PREPARE, &comp->node); if (ret < 0) goto cleanup; diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index ad5762813842..25339ed1e07e 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -3,7 +3,7 @@ #ifndef _ZCOMP_H_ #define _ZCOMP_H_ -#include <linux/local_lock.h> +#include <linux/mutex.h> #define ZCOMP_PARAM_NO_LEVEL INT_MIN @@ -31,9 +31,11 @@ struct zcomp_ctx { }; struct zcomp_strm { - local_lock_t lock; + struct mutex lock; /* compression buffer */ void *buffer; + /* local copy of handle memory */ + void *local_copy; struct zcomp_ctx ctx; }; @@ -77,7 +79,7 @@ struct zcomp *zcomp_create(const char *alg, struct zcomp_params *params); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); -void zcomp_stream_put(struct zcomp *comp); +void zcomp_stream_put(struct zcomp_strm *zstrm); int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, const void *src, unsigned int *dst_len); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9f5020b077c5..fda7d8624889 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,6 +44,8 @@ static DEFINE_MUTEX(zram_index_mutex); static int zram_major; static const char *default_compressor = CONFIG_ZRAM_DEF_COMP; +#define ZRAM_MAX_ALGO_NAME_SZ 128 + /* Module params (documentation at end) */ static unsigned int num_devices = 1; /* @@ -58,19 +60,56 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_read_from_zspool(struct zram *zram, struct page *page, u32 index); -static int zram_slot_trylock(struct zram *zram, u32 index) +#define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map) + +static void zram_slot_lock_init(struct zram *zram, u32 index) +{ + static struct lock_class_key __key; + + lockdep_init_map(slot_dep_map(zram, index), "zram->table[index].lock", + &__key, 0); +} + +/* + * entry locking rules: + * + * 1) Lock is exclusive + * + * 2) lock() function can sleep waiting for the lock + * + * 3) Lock owner can sleep + * + * 4) Use TRY lock variant when in atomic context + * - must check return value and handle locking failers + */ +static __must_check bool zram_slot_trylock(struct zram *zram, u32 index) { - return spin_trylock(&zram->table[index].lock); + unsigned long *lock = &zram->table[index].flags; + + if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) { + mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_); + lock_acquired(slot_dep_map(zram, index), _RET_IP_); + return true; + } + + return false; } static void zram_slot_lock(struct zram *zram, u32 index) { - spin_lock(&zram->table[index].lock); + unsigned long *lock = &zram->table[index].flags; + + mutex_acquire(slot_dep_map(zram, index), 0, 0, _RET_IP_); + wait_on_bit_lock(lock, ZRAM_ENTRY_LOCK, TASK_UNINTERRUPTIBLE); + lock_acquired(slot_dep_map(zram, index), _RET_IP_); } static void zram_slot_unlock(struct zram *zram, u32 index) { - spin_unlock(&zram->table[index].lock); + unsigned long *lock = &zram->table[index].flags; + + mutex_release(slot_dep_map(zram, index), _RET_IP_); + clear_and_wake_up_bit(ZRAM_ENTRY_LOCK, lock); } static inline bool init_done(struct zram *zram) @@ -93,7 +132,6 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) zram->table[index].handle = handle; } -/* flag operations require table entry bit_spin_lock() being held */ static bool zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { @@ -257,15 +295,24 @@ static void release_pp_ctl(struct zram *zram, struct zram_pp_ctl *ctl) kfree(ctl); } -static void place_pp_slot(struct zram *zram, struct zram_pp_ctl *ctl, - struct zram_pp_slot *pps) +static bool place_pp_slot(struct zram *zram, struct zram_pp_ctl *ctl, + u32 index) { - u32 idx; + struct zram_pp_slot *pps; + u32 bid; + + pps = kmalloc(sizeof(*pps), GFP_NOIO | __GFP_NOWARN); + if (!pps) + return false; + + INIT_LIST_HEAD(&pps->entry); + pps->index = index; - idx = zram_get_obj_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE; - list_add(&pps->entry, &ctl->pp_buckets[idx]); + bid = zram_get_obj_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE; + list_add(&pps->entry, &ctl->pp_buckets[bid]); zram_set_flag(zram, pps->index, ZRAM_PP_SLOT); + return true; } static struct zram_pp_slot *select_pp_slot(struct zram_pp_ctl *ctl) @@ -699,15 +746,8 @@ static int scan_slots_for_writeback(struct zram *zram, u32 mode, unsigned long index, struct zram_pp_ctl *ctl) { - struct zram_pp_slot *pps = NULL; - for (; nr_pages != 0; index++, nr_pages--) { - if (!pps) - pps = kmalloc(sizeof(*pps), GFP_KERNEL); - if (!pps) - return -ENOMEM; - - INIT_LIST_HEAD(&pps->entry); + bool ok = true; zram_slot_lock(zram, index); if (!zram_allocated(zram, index)) @@ -727,14 +767,13 @@ static int scan_slots_for_writeback(struct zram *zram, u32 mode, !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) goto next; - pps->index = index; - place_pp_slot(zram, ctl, pps); - pps = NULL; + ok = place_pp_slot(zram, ctl, index); next: zram_slot_unlock(zram, index); + if (!ok) + break; } - kfree(pps); return 0; } @@ -748,7 +787,7 @@ static ssize_t writeback_store(struct device *dev, unsigned long index = 0; struct bio bio; struct bio_vec bio_vec; - struct page *page; + struct page *page = NULL; ssize_t ret = len; int mode, err; unsigned long blk_idx = 0; @@ -890,8 +929,10 @@ next: if (blk_idx) free_block_bdev(zram, blk_idx); - __free_page(page); + release_init_lock: + if (page) + __free_page(page); release_pp_ctl(zram, ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); @@ -1065,27 +1106,6 @@ static void zram_debugfs_register(struct zram *zram) {}; static void zram_debugfs_unregister(struct zram *zram) {}; #endif -/* - * We switched to per-cpu streams and this attr is not needed anymore. - * However, we will keep it around for some time, because: - * a) we may revert per-cpu streams in the future - * b) it's visible to user space and we need to follow our 2 years - * retirement rule; but we already have a number of 'soon to be - * altered' attrs, so max_comp_streams need to wait for the next - * layoff cycle. - */ -static ssize_t max_comp_streams_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus()); -} - -static ssize_t max_comp_streams_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - return len; -} - static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) { /* Do not free statically defined compression algorithms */ @@ -1112,7 +1132,7 @@ static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf) size_t sz; sz = strlen(buf); - if (sz >= CRYPTO_MAX_ALG_NAME) + if (sz >= ZRAM_MAX_ALGO_NAME_SZ) return -E2BIG; compressor = kstrdup(buf, GFP_KERNEL); @@ -1420,9 +1440,8 @@ static ssize_t debug_stat_show(struct device *dev, down_read(&zram->init_lock); ret = scnprintf(buf, PAGE_SIZE, - "version: %d\n%8llu %8llu\n", + "version: %d\n0 %8llu\n", version, - (u64)atomic64_read(&zram->stats.writestall), (u64)atomic64_read(&zram->stats.miss_free)); up_read(&zram->init_lock); @@ -1473,15 +1492,11 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) huge_class_size = zs_huge_class_size(zram->mem_pool); for (index = 0; index < num_pages; index++) - spin_lock_init(&zram->table[index].lock); + zram_slot_lock_init(zram, index); + return true; } -/* - * To protect concurrent access to the same index entry, - * caller should hold this table index entry's bit_spinlock to - * indicate this index entry is accessing. - */ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; @@ -1548,11 +1563,11 @@ static int read_incompressible_page(struct zram *zram, struct page *page, void *src, *dst; handle = zram_get_handle(zram, index); - src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + src = zs_obj_read_begin(zram->mem_pool, handle, NULL); dst = kmap_local_page(page); copy_page(dst, src); kunmap_local(dst); - zs_unmap_object(zram->mem_pool, handle); + zs_obj_read_end(zram->mem_pool, handle, src); return 0; } @@ -1570,12 +1585,12 @@ static int read_compressed_page(struct zram *zram, struct page *page, u32 index) prio = zram_get_priority(zram, index); zstrm = zcomp_stream_get(zram->comps[prio]); - src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + src = zs_obj_read_begin(zram->mem_pool, handle, zstrm->local_copy); dst = kmap_local_page(page); ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst); kunmap_local(dst); - zs_unmap_object(zram->mem_pool, handle); - zcomp_stream_put(zram->comps[prio]); + zs_obj_read_end(zram->mem_pool, handle, src); + zcomp_stream_put(zstrm); return ret; } @@ -1670,7 +1685,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, u32 index) { unsigned long handle; - void *src, *dst; + void *src; /* * This function is called from preemptible context so we don't need @@ -1678,7 +1693,8 @@ static int write_incompressible_page(struct zram *zram, struct page *page, * like we do for compressible pages. */ handle = zs_malloc(zram->mem_pool, PAGE_SIZE, - GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); + GFP_NOIO | __GFP_NOWARN | + __GFP_HIGHMEM | __GFP_MOVABLE); if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); @@ -1687,11 +1703,9 @@ static int write_incompressible_page(struct zram *zram, struct page *page, return -ENOMEM; } - dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); src = kmap_local_page(page); - memcpy(dst, src, PAGE_SIZE); + zs_obj_write(zram->mem_pool, handle, src, PAGE_SIZE); kunmap_local(src); - zs_unmap_object(zram->mem_pool, handle); zram_slot_lock(zram, index); zram_set_flag(zram, index, ZRAM_HUGE); @@ -1710,11 +1724,11 @@ static int write_incompressible_page(struct zram *zram, struct page *page, static int zram_write_page(struct zram *zram, struct page *page, u32 index) { int ret = 0; - unsigned long handle = -ENOMEM; - unsigned int comp_len = 0; - void *dst, *mem; + unsigned long handle; + unsigned int comp_len; + void *mem; struct zcomp_strm *zstrm; - unsigned long element = 0; + unsigned long element; bool same_filled; /* First, free memory allocated to this slot (if any) */ @@ -1728,7 +1742,6 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) if (same_filled) return write_same_filled_page(zram, element, index); -compress_again: zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); mem = kmap_local_page(page); ret = zcomp_compress(zram->comps[ZRAM_PRIMARY_COMP], zstrm, @@ -1736,59 +1749,32 @@ compress_again: kunmap_local(mem); if (unlikely(ret)) { - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + zcomp_stream_put(zstrm); pr_err("Compression failed! err=%d\n", ret); - zs_free(zram->mem_pool, handle); return ret; } if (comp_len >= huge_class_size) { - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + zcomp_stream_put(zstrm); return write_incompressible_page(zram, page, index); } - /* - * handle allocation has 2 paths: - * a) fast path is executed with preemption disabled (for - * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, - * since we can't sleep; - * b) slow path enables preemption and attempts to allocate - * the page with __GFP_DIRECT_RECLAIM bit set. we have to - * put per-cpu compression stream and, thus, to re-do - * the compression once handle is allocated. - * - * if we have a 'non-null' handle here then we are coming - * from the slow path and handle has already been allocated. - */ - if (IS_ERR_VALUE(handle)) - handle = zs_malloc(zram->mem_pool, comp_len, - __GFP_KSWAPD_RECLAIM | - __GFP_NOWARN | - __GFP_HIGHMEM | - __GFP_MOVABLE); + handle = zs_malloc(zram->mem_pool, comp_len, + GFP_NOIO | __GFP_NOWARN | + __GFP_HIGHMEM | __GFP_MOVABLE); if (IS_ERR_VALUE(handle)) { - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); - atomic64_inc(&zram->stats.writestall); - handle = zs_malloc(zram->mem_pool, comp_len, - GFP_NOIO | __GFP_HIGHMEM | - __GFP_MOVABLE); - if (IS_ERR_VALUE(handle)) - return PTR_ERR((void *)handle); - - goto compress_again; + zcomp_stream_put(zstrm); + return PTR_ERR((void *)handle); } if (!zram_can_store_page(zram)) { - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + zcomp_stream_put(zstrm); zs_free(zram->mem_pool, handle); return -ENOMEM; } - dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); - - memcpy(dst, zstrm->buffer, comp_len); - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); - zs_unmap_object(zram->mem_pool, handle); + zs_obj_write(zram->mem_pool, handle, zstrm->buffer, comp_len); + zcomp_stream_put(zstrm); zram_slot_lock(zram, index); zram_set_handle(zram, index, handle); @@ -1835,20 +1821,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, #define RECOMPRESS_IDLE (1 << 0) #define RECOMPRESS_HUGE (1 << 1) -static int scan_slots_for_recompress(struct zram *zram, u32 mode, +static int scan_slots_for_recompress(struct zram *zram, u32 mode, u32 prio_max, struct zram_pp_ctl *ctl) { unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; - struct zram_pp_slot *pps = NULL; unsigned long index; for (index = 0; index < nr_pages; index++) { - if (!pps) - pps = kmalloc(sizeof(*pps), GFP_KERNEL); - if (!pps) - return -ENOMEM; - - INIT_LIST_HEAD(&pps->entry); + bool ok = true; zram_slot_lock(zram, index); if (!zram_allocated(zram, index)) @@ -1867,14 +1847,17 @@ static int scan_slots_for_recompress(struct zram *zram, u32 mode, zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) goto next; - pps->index = index; - place_pp_slot(zram, ctl, pps); - pps = NULL; + /* Already compressed with same of higher priority */ + if (zram_get_priority(zram, index) + 1 >= prio_max) + goto next; + + ok = place_pp_slot(zram, ctl, index); next: zram_slot_unlock(zram, index); + if (!ok) + break; } - kfree(pps); return 0; } @@ -1896,9 +1879,8 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, unsigned int comp_len_new; unsigned int class_index_old; unsigned int class_index_new; - u32 num_recomps = 0; - void *src, *dst; - int ret; + void *src; + int ret = 0; handle_old = zram_get_handle(zram, index); if (!handle_old) @@ -1923,6 +1905,16 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, zram_clear_flag(zram, index, ZRAM_IDLE); class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old); + + prio = max(prio, zram_get_priority(zram, index) + 1); + /* + * Recompression slots scan should not select slots that are + * already compressed with a higher priority algorithm, but + * just in case + */ + if (prio >= prio_max) + return 0; + /* * Iterate the secondary comp algorithms list (in order of priority) * and try to recompress the page. @@ -1931,14 +1923,6 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, if (!zram->comps[prio]) continue; - /* - * Skip if the object is already re-compressed with a higher - * priority algorithm (or same algorithm). - */ - if (prio <= zram_get_priority(zram, index)) - continue; - - num_recomps++; zstrm = zcomp_stream_get(zram->comps[prio]); src = kmap_local_page(page); ret = zcomp_compress(zram->comps[prio], zstrm, @@ -1946,8 +1930,9 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, kunmap_local(src); if (ret) { - zcomp_stream_put(zram->comps[prio]); - return ret; + zcomp_stream_put(zstrm); + zstrm = NULL; + break; } class_index_new = zs_lookup_class_index(zram->mem_pool, @@ -1956,7 +1941,8 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, /* Continue until we make progress */ if (class_index_new >= class_index_old || (threshold && comp_len_new >= threshold)) { - zcomp_stream_put(zram->comps[prio]); + zcomp_stream_put(zstrm); + zstrm = NULL; continue; } @@ -1965,14 +1951,6 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, } /* - * We did not try to recompress, e.g. when we have only one - * secondary algorithm and the page is already recompressed - * using that algorithm - */ - if (!zstrm) - return 0; - - /* * Decrement the limit (if set) on pages we can recompress, even * when current recompression was unsuccessful or did not compress * the page below the threshold, because we still spent resources @@ -1981,48 +1959,39 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, if (*num_recomp_pages) *num_recomp_pages -= 1; - if (class_index_new >= class_index_old) { + /* Compression error */ + if (ret) + return ret; + + if (!zstrm) { /* * Secondary algorithms failed to re-compress the page - * in a way that would save memory, mark the object as - * incompressible so that we will not try to compress - * it again. + * in a way that would save memory. * - * We need to make sure that all secondary algorithms have - * failed, so we test if the number of recompressions matches - * the number of active secondary algorithms. + * Mark the object incompressible if the max-priority + * algorithm couldn't re-compress it. */ - if (num_recomps == zram->num_active_comps - 1) - zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); + if (prio < zram->num_active_comps) + return 0; + zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); return 0; } - /* Successful recompression but above threshold */ - if (threshold && comp_len_new >= threshold) - return 0; - /* - * No direct reclaim (slow path) for handle allocation and no - * re-compression attempt (unlike in zram_write_bvec()) since - * we already have stored that object in zsmalloc. If we cannot - * alloc memory for recompressed object then we bail out and - * simply keep the old (existing) object in zsmalloc. + * We are holding per-CPU stream mutex and entry lock so better + * avoid direct reclaim. Allocation error is not fatal since + * we still have the old object in the mem_pool. */ handle_new = zs_malloc(zram->mem_pool, comp_len_new, - __GFP_KSWAPD_RECLAIM | - __GFP_NOWARN | - __GFP_HIGHMEM | - __GFP_MOVABLE); + GFP_NOIO | __GFP_NOWARN | + __GFP_HIGHMEM | __GFP_MOVABLE); if (IS_ERR_VALUE(handle_new)) { - zcomp_stream_put(zram->comps[prio]); + zcomp_stream_put(zstrm); return PTR_ERR((void *)handle_new); } - dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO); - memcpy(dst, zstrm->buffer, comp_len_new); - zcomp_stream_put(zram->comps[prio]); - - zs_unmap_object(zram->mem_pool, handle_new); + zs_obj_write(zram->mem_pool, handle_new, zstrm->buffer, comp_len_new); + zcomp_stream_put(zstrm); zram_free_page(zram, index); zram_set_handle(zram, index, handle_new); @@ -2039,16 +2008,19 @@ static ssize_t recompress_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS; struct zram *zram = dev_to_zram(dev); char *args, *param, *val, *algo = NULL; u64 num_recomp_pages = ULLONG_MAX; struct zram_pp_ctl *ctl = NULL; struct zram_pp_slot *pps; u32 mode = 0, threshold = 0; - struct page *page; + u32 prio, prio_max; + struct page *page = NULL; ssize_t ret; + prio = ZRAM_SECONDARY_COMP; + prio_max = zram->num_active_comps; + args = skip_spaces(buf); while (*args) { args = next_arg(args, ¶m, &val); @@ -2101,7 +2073,7 @@ static ssize_t recompress_store(struct device *dev, if (prio == ZRAM_PRIMARY_COMP) prio = ZRAM_SECONDARY_COMP; - prio_max = min(prio + 1, ZRAM_MAX_COMPS); + prio_max = prio + 1; continue; } } @@ -2129,7 +2101,7 @@ static ssize_t recompress_store(struct device *dev, continue; if (!strcmp(zram->comp_algs[prio], algo)) { - prio_max = min(prio + 1, ZRAM_MAX_COMPS); + prio_max = prio + 1; found = true; break; } @@ -2141,6 +2113,12 @@ static ssize_t recompress_store(struct device *dev, } } + prio_max = min(prio_max, (u32)zram->num_active_comps); + if (prio >= prio_max) { + ret = -EINVAL; + goto release_init_lock; + } + page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; @@ -2153,7 +2131,7 @@ static ssize_t recompress_store(struct device *dev, goto release_init_lock; } - scan_slots_for_recompress(zram, mode, ctl); + scan_slots_for_recompress(zram, mode, prio_max, ctl); ret = len; while ((pps = select_pp_slot(ctl))) { @@ -2181,9 +2159,9 @@ next: cond_resched(); } - __free_page(page); - release_init_lock: + if (page) + __free_page(page); release_pp_ctl(zram, ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); @@ -2506,7 +2484,6 @@ static DEVICE_ATTR_WO(reset); static DEVICE_ATTR_WO(mem_limit); static DEVICE_ATTR_WO(mem_used_max); static DEVICE_ATTR_WO(idle); -static DEVICE_ATTR_RW(max_comp_streams); static DEVICE_ATTR_RW(comp_algorithm); #ifdef CONFIG_ZRAM_WRITEBACK static DEVICE_ATTR_RW(backing_dev); @@ -2528,7 +2505,6 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_mem_limit.attr, &dev_attr_mem_used_max.attr, &dev_attr_idle.attr, - &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, #ifdef CONFIG_ZRAM_WRITEBACK &dev_attr_backing_dev.attr, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index db78d7c01b9a..6cee93f9c0d0 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -17,7 +17,6 @@ #include <linux/rwsem.h> #include <linux/zsmalloc.h> -#include <linux/crypto.h> #include "zcomp.h" @@ -28,7 +27,6 @@ #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) - /* * ZRAM is mainly used for memory efficiency so we want to keep memory * footprint small and thus squeeze size and zram pageflags into a flags @@ -46,6 +44,7 @@ /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ + ZRAM_ENTRY_LOCK, /* entry access lock bit */ ZRAM_WB, /* page is stored on backing_device */ ZRAM_PP_SLOT, /* Selected for post-processing */ ZRAM_HUGE, /* Incompressible page */ @@ -58,16 +57,19 @@ enum zram_pageflags { __NR_ZRAM_PAGEFLAGS, }; -/*-- Data structures */ - -/* Allocated for each disk page */ +/* + * Allocated for each disk page. We use bit-lock (ZRAM_ENTRY_LOCK bit + * of flags) to save memory. There can be plenty of entries and standard + * locking primitives (e.g. mutex) will significantly increase sizeof() + * of each entry and hence of the meta table. + */ struct zram_table_entry { unsigned long handle; - unsigned int flags; - spinlock_t lock; + unsigned long flags; #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME ktime_t ac_time; #endif + struct lockdep_map dep_map; }; struct zram_stats { @@ -80,7 +82,6 @@ struct zram_stats { atomic64_t huge_pages_since; /* no. of huge pages since zram set up */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ - atomic64_t writestall; /* no. of write slow paths */ atomic64_t miss_free; /* no. of missed free */ #ifdef CONFIG_ZRAM_WRITEBACK atomic64_t bd_count; /* no. of pages in backing device */ |
