diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-05 14:27:02 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-05 14:27:02 -0700 |
commit | 3526dd0c7832f1011a0477cc6d903662bae05ea8 (patch) | |
tree | 22fbac64eb40a0b29bfa4c029695f39b2f591e62 /block | |
parent | dd972f924df6bdbc0ab185a38d5d2361dbc26311 (diff) | |
parent | bc6d65e6dc89c3b7ff78e4ad797117c122ffde8e (diff) | |
download | lwn-3526dd0c7832f1011a0477cc6d903662bae05ea8.tar.gz lwn-3526dd0c7832f1011a0477cc6d903662bae05ea8.zip |
Merge tag 'for-4.17/block-20180402' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe:
"It's a pretty quiet round this time, which is nice. This contains:
- series from Bart, cleaning up the way we set/test/clear atomic
queue flags.
- series from Bart, fixing races between gendisk and queue
registration and removal.
- set of bcache fixes and improvements from various folks, by way of
Michael Lyle.
- set of lightnvm updates from Matias, most of it being the 1.2 to
2.0 transition.
- removal of unused DIO flags from Nikolay.
- blk-mq/sbitmap memory ordering fixes from Omar.
- divide-by-zero fix for BFQ from Paolo.
- minor documentation patches from Randy.
- timeout fix from Tejun.
- Alpha "can't write a char atomically" fix from Mikulas.
- set of NVMe fixes by way of Keith.
- bsg and bsg-lib improvements from Christoph.
- a few sed-opal fixes from Jonas.
- cdrom check-disk-change deadlock fix from Maurizio.
- various little fixes, comment fixes, etc from various folks"
* tag 'for-4.17/block-20180402' of git://git.kernel.dk/linux-block: (139 commits)
blk-mq: Directly schedule q->timeout_work when aborting a request
blktrace: fix comment in blktrace_api.h
lightnvm: remove function name in strings
lightnvm: pblk: remove some unnecessary NULL checks
lightnvm: pblk: don't recover unwritten lines
lightnvm: pblk: implement 2.0 support
lightnvm: pblk: implement get log report chunk
lightnvm: pblk: rename ppaf* to addrf*
lightnvm: pblk: check for supported version
lightnvm: implement get log report chunk helpers
lightnvm: make address conversions depend on generic device
lightnvm: add support for 2.0 address format
lightnvm: normalize geometry nomenclature
lightnvm: complete geo structure with maxoc*
lightnvm: add shorten OCSSD version in geo
lightnvm: add minor version to generic geometry
lightnvm: simplify geometry structure
lightnvm: pblk: refactor init/exit sequences
lightnvm: Avoid validation of default op value
lightnvm: centralize permission check for lightnvm ioctl
...
Diffstat (limited to 'block')
-rw-r--r-- | block/bfq-iosched.c | 25 | ||||
-rw-r--r-- | block/bfq-iosched.h | 2 | ||||
-rw-r--r-- | block/bio.c | 4 | ||||
-rw-r--r-- | block/blk-cgroup.c | 78 | ||||
-rw-r--r-- | block/blk-core.c | 250 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 150 | ||||
-rw-r--r-- | block/blk-mq-pci.c | 6 | ||||
-rw-r--r-- | block/blk-mq.c | 20 | ||||
-rw-r--r-- | block/blk-settings.c | 6 | ||||
-rw-r--r-- | block/blk-stat.c | 6 | ||||
-rw-r--r-- | block/blk-sysfs.c | 29 | ||||
-rw-r--r-- | block/blk-timeout.c | 8 | ||||
-rw-r--r-- | block/blk-zoned.c | 4 | ||||
-rw-r--r-- | block/blk.h | 69 | ||||
-rw-r--r-- | block/bsg-lib.c | 165 | ||||
-rw-r--r-- | block/bsg.c | 262 | ||||
-rw-r--r-- | block/sed-opal.c | 37 |
17 files changed, 695 insertions, 426 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index aeca22d91101..f0ecd98509d8 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -201,7 +201,20 @@ static struct kmem_cache *bfq_pool; /* Target observation time interval for a peak-rate update (ns) */ #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -/* Shift used for peak rate fixed precision calculations. */ +/* + * Shift used for peak-rate fixed precision calculations. + * With + * - the current shift: 16 positions + * - the current type used to store rate: u32 + * - the current unit of measure for rate: [sectors/usec], or, more precisely, + * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, + * the range of rates that can be stored is + * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = + * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = + * [15, 65G] sectors/sec + * Which, assuming a sector size of 512B, corresponds to a range of + * [7.5K, 33T] B/sec + */ #define BFQ_RATE_SHIFT 16 /* @@ -2637,6 +2650,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) rate /= divisor; /* smoothing constant alpha = 1/divisor */ bfqd->peak_rate += rate; + + /* + * For a very slow device, bfqd->peak_rate can reach 0 (see + * the minimum representable values reported in the comments + * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid + * divisions by zero where bfqd->peak_rate is used as a + * divisor. + */ + bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); + update_thr_responsiveness_params(bfqd); reset_computation: diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 350c39ae2896..ae2f3dadec44 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -499,7 +499,7 @@ struct bfq_data { u64 delta_from_first; /* * Current estimate of the device peak rate, measured in - * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by + * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by * BFQ_RATE_SHIFT is performed to increase precision in * fixed-point calculations. */ diff --git a/block/bio.c b/block/bio.c index e1708db48258..53e0f0a1ed94 100644 --- a/block/bio.c +++ b/block/bio.c @@ -43,9 +43,9 @@ * break badly! cannot be bigger than what you can fit into an * unsigned short */ -#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } +#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { - BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), + BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), }; #undef BV diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c2033a232a44..1c16694ae145 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -307,11 +307,28 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, } } +static void blkg_pd_offline(struct blkcg_gq *blkg) +{ + int i; + + lockdep_assert_held(blkg->q->queue_lock); + lockdep_assert_held(&blkg->blkcg->lock); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && !blkg->pd[i]->offline && + pol->pd_offline_fn) { + pol->pd_offline_fn(blkg->pd[i]); + blkg->pd[i]->offline = true; + } + } +} + static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; struct blkcg_gq *parent = blkg->parent; - int i; lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); @@ -320,13 +337,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) WARN_ON_ONCE(list_empty(&blkg->q_node)); WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg->pd[i]); - } - if (parent) { blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); @@ -369,6 +379,7 @@ static void blkg_destroy_all(struct request_queue *q) struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); + blkg_pd_offline(blkg); blkg_destroy(blkg); spin_unlock(&blkcg->lock); } @@ -995,25 +1006,25 @@ static struct cftype blkcg_legacy_files[] = { * @css: css of interest * * This function is called when @css is about to go away and responsible - * for shooting down all blkgs associated with @css. blkgs should be - * removed while holding both q and blkcg locks. As blkcg lock is nested - * inside q lock, this function performs reverse double lock dancing. + * for offlining all blkgs pd and killing all wbs associated with @css. + * blkgs pd offline should be done while holding both q and blkcg locks. + * As blkcg lock is nested inside q lock, this function performs reverse + * double lock dancing. * * This is the blkcg counterpart of ioc_release_fn(). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; spin_lock_irq(&blkcg->lock); - while (!hlist_empty(&blkcg->blkg_list)) { - struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, - struct blkcg_gq, blkcg_node); + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct request_queue *q = blkg->q; if (spin_trylock(q->queue_lock)) { - blkg_destroy(blkg); + blkg_pd_offline(blkg); spin_unlock(q->queue_lock); } else { spin_unlock_irq(&blkcg->lock); @@ -1027,11 +1038,43 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) wb_blkcg_offline(blkcg); } +/** + * blkcg_destroy_all_blkgs - destroy all blkgs associated with a blkcg + * @blkcg: blkcg of interest + * + * This function is called when blkcg css is about to free and responsible for + * destroying all blkgs associated with @blkcg. + * blkgs should be removed while holding both q and blkcg locks. As blkcg lock + * is nested inside q lock, this function performs reverse double lock dancing. + */ +static void blkcg_destroy_all_blkgs(struct blkcg *blkcg) +{ + spin_lock_irq(&blkcg->lock); + while (!hlist_empty(&blkcg->blkg_list)) { + struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, + struct blkcg_gq, + blkcg_node); + struct request_queue *q = blkg->q; + + if (spin_trylock(q->queue_lock)) { + blkg_destroy(blkg); + spin_unlock(q->queue_lock); + } else { + spin_unlock_irq(&blkcg->lock); + cpu_relax(); + spin_lock_irq(&blkcg->lock); + } + } + spin_unlock_irq(&blkcg->lock); +} + static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); int i; + blkcg_destroy_all_blkgs(blkcg); + mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); @@ -1371,8 +1414,11 @@ void blkcg_deactivate_policy(struct request_queue *q, spin_lock(&blkg->blkcg->lock); if (blkg->pd[pol->plid]) { - if (pol->pd_offline_fn) + if (!blkg->pd[pol->plid]->offline && + pol->pd_offline_fn) { pol->pd_offline_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid]->offline = true; + } pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } diff --git a/block/blk-core.c b/block/blk-core.c index 6d82c4f7fadd..abcb8684ba67 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -71,6 +71,78 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +/** + * blk_queue_flag_set - atomically set a queue flag + * @flag: flag to be set + * @q: request queue + */ +void blk_queue_flag_set(unsigned int flag, struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_set(flag, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL(blk_queue_flag_set); + +/** + * blk_queue_flag_clear - atomically clear a queue flag + * @flag: flag to be cleared + * @q: request queue + */ +void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(flag, q); + spin_unlock_irqrestore(q->queue_lock, flags); +} +EXPORT_SYMBOL(blk_queue_flag_clear); + +/** + * blk_queue_flag_test_and_set - atomically test and set a queue flag + * @flag: flag to be set + * @q: request queue + * + * Returns the previous value of @flag - 0 if the flag was not set and 1 if + * the flag was already set. + */ +bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) +{ + unsigned long flags; + bool res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_set(flag, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); + +/** + * blk_queue_flag_test_and_clear - atomically test and clear a queue flag + * @flag: flag to be cleared + * @q: request queue + * + * Returns the previous value of @flag - 0 if the flag was not set and 1 if + * the flag was set. + */ +bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q) +{ + unsigned long flags; + bool res; + + spin_lock_irqsave(q->queue_lock, flags); + res = queue_flag_test_and_clear(flag, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + return res; +} +EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear); + static void blk_clear_congested(struct request_list *rl, int sync) { #ifdef CONFIG_CGROUP_WRITEBACK @@ -361,25 +433,14 @@ EXPORT_SYMBOL(blk_sync_queue); */ int blk_set_preempt_only(struct request_queue *q) { - unsigned long flags; - int res; - - spin_lock_irqsave(q->queue_lock, flags); - res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); - spin_unlock_irqrestore(q->queue_lock, flags); - - return res; + return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q); } EXPORT_SYMBOL_GPL(blk_set_preempt_only); void blk_clear_preempt_only(struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); + blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q); wake_up_all(&q->mq_freeze_wq); - spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_clear_preempt_only); @@ -629,9 +690,7 @@ EXPORT_SYMBOL_GPL(blk_queue_bypass_end); void blk_set_queue_dying(struct request_queue *q) { - spin_lock_irq(q->queue_lock); - queue_flag_set(QUEUE_FLAG_DYING, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_set(QUEUE_FLAG_DYING, q); /* * When queue DYING flag is set, we need to block new req @@ -719,6 +778,37 @@ void blk_cleanup_queue(struct request_queue *q) del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); + /* + * I/O scheduler exit is only safe after the sysfs scheduler attribute + * has been removed. + */ + WARN_ON_ONCE(q->kobj.state_in_sysfs); + + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + q->elevator = NULL; + } + + /* + * Remove all references to @q from the block cgroup controller before + * restoring @q->queue_lock to avoid that restoring this pointer causes + * e.g. blkcg_print_blkgs() to crash. + */ + blkcg_exit_queue(q); + + /* + * Since the cgroup code may dereference the @q->backing_dev_info + * pointer, only decrease its reference count after having removed the + * association with the block cgroup controller. + */ + bdi_put(q->backing_dev_info); + if (q->mq_ops) blk_mq_free_queue(q); percpu_ref_exit(&q->q_usage_counter); @@ -810,7 +900,7 @@ void blk_exit_rl(struct request_queue *q, struct request_list *rl) struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { - return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); + return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL); } EXPORT_SYMBOL(blk_alloc_queue); @@ -827,7 +917,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) bool success = false; int ret; - rcu_read_lock_sched(); + rcu_read_lock(); if (percpu_ref_tryget_live(&q->q_usage_counter)) { /* * The code that sets the PREEMPT_ONLY flag is @@ -840,7 +930,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) percpu_ref_put(&q->q_usage_counter); } } - rcu_read_unlock_sched(); + rcu_read_unlock(); if (success) return 0; @@ -888,7 +978,21 @@ static void blk_rq_timed_out_timer(struct timer_list *t) kblockd_schedule_work(&q->timeout_work); } -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) +/** + * blk_alloc_queue_node - allocate a request queue + * @gfp_mask: memory allocation flags + * @node_id: NUMA node to allocate memory from + * @lock: For legacy queues, pointer to a spinlock that will be used to e.g. + * serialize calls to the legacy .request_fn() callback. Ignored for + * blk-mq request queues. + * + * Note: pass the queue lock as the third argument to this function instead of + * setting the queue lock pointer explicitly to avoid triggering a sporadic + * crash in the blkcg code. This function namely calls blkcg_init_queue() and + * the queue lock pointer must be set before blkcg_init_queue() is called. + */ +struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, + spinlock_t *lock) { struct request_queue *q; @@ -939,11 +1043,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); - /* - * By default initialize queue_lock to internal lock and driver can - * override it later if need be. - */ - q->queue_lock = &q->__queue_lock; + if (!q->mq_ops) + q->queue_lock = lock ? : &q->__queue_lock; /* * A queue starts its life with bypass turned on to avoid @@ -952,7 +1053,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) * registered by blk_register_queue(). */ q->bypass_depth = 1; - __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); + queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q); init_waitqueue_head(&q->mq_freeze_wq); @@ -1030,13 +1131,11 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { struct request_queue *q; - q = blk_alloc_queue_node(GFP_KERNEL, node_id); + q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock); if (!q) return NULL; q->request_fn = rfn; - if (lock) - q->queue_lock = lock; if (blk_init_allocated_queue(q) < 0) { blk_cleanup_queue(q); return NULL; @@ -2023,7 +2122,7 @@ out_unlock: return BLK_QC_T_NONE; } -static void handle_bad_sector(struct bio *bio) +static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; @@ -2031,7 +2130,7 @@ static void handle_bad_sector(struct bio *bio) printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n", bio_devname(bio, b), bio->bi_opf, (unsigned long long)bio_end_sector(bio), - (long long)get_capacity(bio->bi_disk)); + (long long)maxsector); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -2093,67 +2192,58 @@ static noinline int should_fail_bio(struct bio *bio) ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); /* + * Check whether this bio extends beyond the end of the device or partition. + * This may well happen - the kernel calls bread() without checking the size of + * the device, e.g., when mounting a file system. + */ +static inline int bio_check_eod(struct bio *bio, sector_t maxsector) +{ + unsigned int nr_sectors = bio_sectors(bio); + + if (nr_sectors && maxsector && + (nr_sectors > maxsector || + bio->bi_iter.bi_sector > maxsector - nr_sectors)) { + handle_bad_sector(bio, maxsector); + return -EIO; + } + return 0; +} + +/* * Remap block n of partition p to block n+start(p) of the disk. */ static inline int blk_partition_remap(struct bio *bio) { struct hd_struct *p; - int ret = 0; + int ret = -EIO; rcu_read_lock(); p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) || - bio_check_ro(bio, p))) { - ret = -EIO; + if (unlikely(!p)) + goto out; + if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) + goto out; + if (unlikely(bio_check_ro(bio, p))) goto out; - } /* * Zone reset does not include bi_size so bio_sectors() is always 0. * Include a test for the reset op code and perform the remap if needed. */ - if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET) - goto out; - - bio->bi_iter.bi_sector += p->start_sect; - bio->bi_partno = 0; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); - + if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) { + if (bio_check_eod(bio, part_nr_sects_read(p))) + goto out; + bio->bi_iter.bi_sector += p->start_sect; + bio->bi_partno = 0; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); + } + ret = 0; out: rcu_read_unlock(); return ret; } -/* - * Check whether this bio extends beyond the end of the device. - */ -static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) -{ - sector_t maxsector; - - if (!nr_sectors) - return 0; - - /* Test device or partition size, when known. */ - maxsector = get_capacity(bio->bi_disk); - if (maxsector) { - sector_t sector = bio->bi_iter.bi_sector; - - if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { - /* - * This may well happen - the kernel calls bread() - * without checking the size of the device, e.g., when - * mounting a device. - */ - handle_bad_sector(bio); - return 1; - } - } - - return 0; -} - static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { @@ -2164,9 +2254,6 @@ generic_make_request_checks(struct bio *bio) might_sleep(); - if (bio_check_eod(bio, nr_sectors)) - goto end_io; - q = bio->bi_disk->queue; if (unlikely(!q)) { printk(KERN_ERR @@ -2186,17 +2273,16 @@ generic_make_request_checks(struct bio *bio) if (should_fail_bio(bio)) goto end_io; - if (!bio->bi_partno) { - if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + if (bio->bi_partno) { + if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (blk_partition_remap(bio)) + if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + goto end_io; + if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; } - if (bio_check_eod(bio, nr_sectors)) - goto end_io; - /* * Filter flush bio's early so that make_request based * drivers without flush support don't have to worry diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 21cbc1f071c6..58b3b79cbe83 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -24,6 +24,64 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + if (stat->nr_samples) { + seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); + } else { + seq_puts(m, "samples=0"); + } +} + +static int queue_poll_stat_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + int bucket; + + for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) { + seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket)); + print_stat(m, &q->poll_stat[2*bucket]); + seq_puts(m, "\n"); + + seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket)); + print_stat(m, &q->poll_stat[2*bucket+1]); + seq_puts(m, "\n"); + } + return 0; +} + +static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) + __acquires(&q->requeue_lock) +{ + struct request_queue *q = m->private; + + spin_lock_irq(&q->requeue_lock); + return seq_list_start(&q->requeue_list, *pos); +} + +static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct request_queue *q = m->private; + + return seq_list_next(v, &q->requeue_list, pos); +} + +static void queue_requeue_list_stop(struct seq_file *m, void *v) + __releases(&q->requeue_lock) +{ + struct request_queue *q = m->private; + + spin_unlock_irq(&q->requeue_lock); +} + +static const struct seq_operations queue_requeue_list_seq_ops = { + .start = queue_requeue_list_start, + .next = queue_requeue_list_next, + .stop = queue_requeue_list_stop, + .show = blk_mq_debugfs_rq_show, +}; + static int blk_flags_show(struct seq_file *m, const unsigned long flags, const char *const *flag_name, int flag_name_count) { @@ -125,16 +183,6 @@ inval: return count; } -static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) -{ - if (stat->nr_samples) { - seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", - stat->nr_samples, stat->mean, stat->min, stat->max); - } else { - seq_puts(m, "samples=0"); - } -} - static int queue_write_hint_show(void *data, struct seq_file *m) { struct request_queue *q = data; @@ -158,23 +206,30 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, return count; } -static int queue_poll_stat_show(void *data, struct seq_file *m) +static int queue_zone_wlock_show(void *data, struct seq_file *m) { struct request_queue *q = data; - int bucket; + unsigned int i; - for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) { - seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket)); - print_stat(m, &q->poll_stat[2*bucket]); - seq_puts(m, "\n"); + if (!q->seq_zones_wlock) + return 0; + + for (i = 0; i < blk_queue_nr_zones(q); i++) + if (test_bit(i, q->seq_zones_wlock)) + seq_printf(m, "%u\n", i); - seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket)); - print_stat(m, &q->poll_stat[2*bucket+1]); - seq_puts(m, "\n"); - } return 0; } +static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { + { "poll_stat", 0400, queue_poll_stat_show }, + { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, + { "state", 0600, queue_state_show, queue_state_write }, + { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, + { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, + { }, +}; + #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), @@ -295,6 +350,20 @@ static const char *const rqf_name[] = { }; #undef RQF_NAME +static const char *const blk_mq_rq_state_name_array[] = { + [MQ_RQ_IDLE] = "idle", + [MQ_RQ_IN_FLIGHT] = "in_flight", + [MQ_RQ_COMPLETE] = "complete", +}; + +static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) +{ + if (WARN_ON_ONCE((unsigned int)rq_state > + ARRAY_SIZE(blk_mq_rq_state_name_array))) + return "(?)"; + return blk_mq_rq_state_name_array[rq_state]; +} + int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; @@ -311,7 +380,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); - seq_printf(m, ", complete=%d", blk_rq_is_complete(rq)); + seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) @@ -327,37 +396,6 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) } EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); -static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) - __acquires(&q->requeue_lock) -{ - struct request_queue *q = m->private; - - spin_lock_irq(&q->requeue_lock); - return seq_list_start(&q->requeue_list, *pos); -} - -static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct request_queue *q = m->private; - - return seq_list_next(v, &q->requeue_list, pos); -} - -static void queue_requeue_list_stop(struct seq_file *m, void *v) - __releases(&q->requeue_lock) -{ - struct request_queue *q = m->private; - - spin_unlock_irq(&q->requeue_lock); -} - -static const struct seq_operations queue_requeue_list_seq_ops = { - .start = queue_requeue_list_start, - .next = queue_requeue_list_next, - .stop = queue_requeue_list_stop, - .show = blk_mq_debugfs_rq_show, -}; - static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&hctx->lock) { @@ -747,14 +785,6 @@ static const struct file_operations blk_mq_debugfs_fops = { .release = blk_mq_debugfs_release, }; -static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { - {"poll_stat", 0400, queue_poll_stat_show}, - {"requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops}, - {"state", 0600, queue_state_show, queue_state_write}, - {"write_hints", 0600, queue_write_hint_show, queue_write_hint_store}, - {}, -}; - static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, hctx_state_show}, {"flags", 0400, hctx_flags_show}, diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index 76944e3271bf..e233996bb76f 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -21,6 +21,7 @@ * blk_mq_pci_map_queues - provide a default queue mapping for PCI device * @set: tagset to provide the mapping for * @pdev: PCI device associated with @set. + * @offset: Offset to use for the pci irq vector * * This function assumes the PCI device @pdev has at least as many available * interrupt vectors as @set has queues. It will then query the vector @@ -28,13 +29,14 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev) +int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, + int offset) { const struct cpumask *mask; unsigned int queue, cpu; for (queue = 0; queue < set->nr_hw_queues; queue++) { - mask = pci_irq_get_affinity(pdev, queue); + mask = pci_irq_get_affinity(pdev, queue + offset); if (!mask) goto fallback; diff --git a/block/blk-mq.c b/block/blk-mq.c index 16e83e6df404..f5c7dbcb954f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -194,11 +194,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - queue_flag_set(QUEUE_FLAG_QUIESCED, q); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); @@ -239,11 +235,7 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); */ void blk_mq_unquiesce_queue(struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - queue_flag_clear(QUEUE_FLAG_QUIESCED, q); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); /* dispatch requests which are inserted during quiescing */ blk_mq_run_hw_queues(q, true); @@ -986,9 +978,9 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; - sbitmap_clear_bit(sb, bitnr); spin_lock(&ctx->lock); list_splice_tail_init(&ctx->rq_list, flush_data->list); + sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; } @@ -2556,7 +2548,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); + uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL); if (!uninit_q) return ERR_PTR(-ENOMEM); @@ -2678,7 +2670,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; if (!(set->flags & BLK_MQ_F_SG_MERGE)) - q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; + queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); q->sg_reserved_size = INT_MAX; @@ -3005,7 +2997,7 @@ EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); static bool blk_poll_stats_enable(struct request_queue *q) { if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) return true; blk_stat_add_callback(q, q->poll_cb); return false; diff --git a/block/blk-settings.c b/block/blk-settings.c index 48ebe6be07b7..d1de71124656 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -859,12 +859,10 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment); void blk_queue_flush_queueable(struct request_queue *q, bool queueable) { - spin_lock_irq(q->queue_lock); if (queueable) - clear_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); + blk_queue_flag_clear(QUEUE_FLAG_FLUSH_NQ, q); else - set_bit(QUEUE_FLAG_FLUSH_NQ, &q->queue_flags); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_set(QUEUE_FLAG_FLUSH_NQ, q); } EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); diff --git a/block/blk-stat.c b/block/blk-stat.c index 28003bf9941c..bd365a95fcf8 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -152,7 +152,7 @@ void blk_stat_add_callback(struct request_queue *q, spin_lock(&q->stats->lock); list_add_tail_rcu(&cb->list, &q->stats->callbacks); - set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock(&q->stats->lock); } EXPORT_SYMBOL_GPL(blk_stat_add_callback); @@ -163,7 +163,7 @@ void blk_stat_remove_callback(struct request_queue *q, spin_lock(&q->stats->lock); list_del_rcu(&cb->list); if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) - clear_bit(QUEUE_FLAG_STATS, &q->queue_flags); + blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock(&q->stats->lock); del_timer_sync(&cb->timer); @@ -191,7 +191,7 @@ void blk_stat_enable_accounting(struct request_queue *q) { spin_lock(&q->stats->lock); q->stats->enable_accounting = true; - set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock(&q->stats->lock); } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cbea895a5547..d00d1b0ec109 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -276,12 +276,10 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \ if (neg) \ val = !val; \ \ - spin_lock_irq(q->queue_lock); \ if (val) \ - queue_flag_set(QUEUE_FLAG_##flag, q); \ + blk_queue_flag_set(QUEUE_FLAG_##flag, q); \ else \ - queue_flag_clear(QUEUE_FLAG_##flag, q); \ - spin_unlock_irq(q->queue_lock); \ + blk_queue_flag_clear(QUEUE_FLAG_##flag, q); \ return ret; \ } @@ -414,12 +412,10 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, if (ret < 0) return ret; - spin_lock_irq(q->queue_lock); if (poll_on) - queue_flag_set(QUEUE_FLAG_POLL, q); + blk_queue_flag_set(QUEUE_FLAG_POLL, q); else - queue_flag_clear(QUEUE_FLAG_POLL, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); return ret; } @@ -487,12 +483,10 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, if (set == -1) return -EINVAL; - spin_lock_irq(q->queue_lock); if (set) - queue_flag_set(QUEUE_FLAG_WC, q); + blk_queue_flag_set(QUEUE_FLAG_WC, q); else - queue_flag_clear(QUEUE_FLAG_WC, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_WC, q); return count; } @@ -798,13 +792,6 @@ static void __blk_release_queue(struct work_struct *work) if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); - bdi_put(q->backing_dev_info); - blkcg_exit_queue(q); - - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q, q->elevator); - } blk_free_queue_stats(q->stats); @@ -953,9 +940,7 @@ void blk_unregister_queue(struct gendisk *disk) */ mutex_lock(&q->sysfs_lock); - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_REGISTERED, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); /* * Remove the sysfs attributes before unregistering the queue data diff --git a/block/blk-timeout.c b/block/blk-timeout.c index a05e3676d24a..652d4d4d3e97 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -57,12 +57,10 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, char *p = (char *) buf; val = simple_strtoul(p, &p, 10); - spin_lock_irq(q->queue_lock); if (val) - queue_flag_set(QUEUE_FLAG_FAIL_IO, q); + blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q); else - queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); } return count; @@ -165,7 +163,7 @@ void blk_abort_request(struct request *req) * No need for fancy synchronizations. */ blk_rq_set_deadline(req, jiffies); - mod_timer(&req->q->timeout, 0); + kblockd_schedule_work(&req->q->timeout_work); } else { if (blk_mark_rq_complete(req)) return; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index acb7252c7e81..08e84ef2bc05 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -296,7 +296,7 @@ int blkdev_reset_zones(struct block_device *bdev, } EXPORT_SYMBOL_GPL(blkdev_reset_zones); -/** +/* * BLKREPORTZONE ioctl processing. * Called from blkdev_ioctl. */ @@ -355,7 +355,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return ret; } -/** +/* * BLKRESETZONE ioctl processing. * Called from blkdev_ioctl. */ diff --git a/block/blk.h b/block/blk.h index 46db5dc83dcb..b034fd2460c4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -41,6 +41,75 @@ extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; +/* + * @q->queue_lock is set while a queue is being initialized. Since we know + * that no other threads access the queue object before @q->queue_lock has + * been set, it is safe to manipulate queue flags without holding the + * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and + * blk_init_allocated_queue(). + */ +static inline void queue_lockdep_assert_held(struct request_queue *q) +{ + if (q->queue_lock) + lockdep_assert_held(q->queue_lock); +} + +static inline void queue_flag_set_unlocked(unsigned int flag, + struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) && + kref_read(&q->kobj.kref)) + lockdep_assert_held(q->queue_lock); + __set_bit(flag, &q->queue_flags); +} + +static inline void queue_flag_clear_unlocked(unsigned int flag, + struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) && + kref_read(&q->kobj.kref)) + lockdep_assert_held(q->queue_lock); + __clear_bit(flag, &q->queue_flags); +} + +static inline int queue_flag_test_and_clear(unsigned int flag, + struct request_queue *q) +{ + queue_lockdep_assert_held(q); + + if (test_bit(flag, &q->queue_flags)) { + __clear_bit(flag, &q->queue_flags); + return 1; + } + + return 0; +} + +static inline int queue_flag_test_and_set(unsigned int flag, + struct request_queue *q) +{ + queue_lockdep_assert_held(q); + + if (!test_bit(flag, &q->queue_flags)) { + __set_bit(flag, &q->queue_flags); + return 0; + } + + return 1; +} + +static inline void queue_flag_set(unsigned int flag, struct request_queue *q) +{ + queue_lockdep_assert_held(q); + __set_bit(flag, &q->queue_flags); +} + +static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) +{ + queue_lockdep_assert_held(q); + __clear_bit(flag, &q->queue_flags); +} + static inline struct blk_flush_queue *blk_get_flush_queue( struct request_queue *q, struct blk_mq_ctx *ctx) { diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 1474153f73e3..fc2e5ff2c4b9 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -27,6 +27,94 @@ #include <linux/bsg-lib.h> #include <linux/export.h> #include <scsi/scsi_cmnd.h> +#include <scsi/sg.h> + +#define uptr64(val) ((void __user *)(uintptr_t)(val)) + +static int bsg_transport_check_proto(struct sg_io_v4 *hdr) +{ + if (hdr->protocol != BSG_PROTOCOL_SCSI || + hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_TRANSPORT) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + return 0; +} + +static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, + fmode_t mode) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(rq); + + job->request_len = hdr->request_len; + job->request = memdup_user(uptr64(hdr->request), hdr->request_len); + if (IS_ERR(job->request)) + return PTR_ERR(job->request); + return 0; +} + +static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(rq); + int ret = 0; + + /* + * The assignments below don't make much sense, but are kept for + * bug by bug backwards compatibility: + */ + hdr->device_status = job->result & 0xff; + hdr->transport_status = host_byte(job->result); + hdr->driver_status = driver_byte(job->result); + hdr->info = 0; + if (hdr->device_status || hdr->transport_status || hdr->driver_status) + hdr->info |= SG_INFO_CHECK; + hdr->response_len = 0; + + if (job->result < 0) { + /* we're only returning the result field in the reply */ + job->reply_len = sizeof(u32); + ret = job->result; + } + + if (job->reply_len && hdr->response) { + int len = min(hdr->max_response_len, job->reply_len); + + if (copy_to_user(uptr64(hdr->response), job->reply, len)) + ret = -EFAULT; + else + hdr->response_len = len; + } + + /* we assume all request payload was transferred, residual == 0 */ + hdr->dout_resid = 0; + + if (rq->next_rq) { + unsigned int rsp_len = job->reply_payload.payload_len; + + if (WARN_ON(job->reply_payload_rcv_len > rsp_len)) + hdr->din_resid = 0; + else + hdr->din_resid = rsp_len - job->reply_payload_rcv_len; + } else { + hdr->din_resid = 0; + } + + return ret; +} + +static void bsg_transport_free_rq(struct request *rq) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(rq); + + kfree(job->request); +} + +static const struct bsg_ops bsg_transport_ops = { + .check_proto = bsg_transport_check_proto, + .fill_hdr = bsg_transport_fill_hdr, + .complete_rq = bsg_transport_complete_rq, + .free_rq = bsg_transport_free_rq, +}; /** * bsg_teardown_job - routine to teardown a bsg job @@ -35,7 +123,7 @@ static void bsg_teardown_job(struct kref *kref) { struct bsg_job *job = container_of(kref, struct bsg_job, kref); - struct request *rq = job->req; + struct request *rq = blk_mq_rq_from_pdu(job); put_device(job->dev); /* release reference for the request */ @@ -68,28 +156,9 @@ EXPORT_SYMBOL_GPL(bsg_job_get); void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len) { - struct request *req = job->req; - struct request *rsp = req->next_rq; - struct scsi_request *rq = scsi_req(req); - int err; - - err = scsi_req(job->req)->result = result; - if (err < 0) - /* we're only returning the result field in the reply */ - rq->sense_len = sizeof(u32); - else - rq->sense_len = job->reply_len; - /* we assume all request payload was transferred, residual == 0 */ - rq->resid_len = 0; - - if (rsp) { - WARN_ON(reply_payload_rcv_len > scsi_req(rsp)->resid_len); - - /* set reply (bidi) residual */ - scsi_req(rsp)->resid_len -= - min(reply_payload_rcv_len, scsi_req(rsp)->resid_len); - } - blk_complete_request(req); + job->result = result; + job->reply_payload_rcv_len = reply_payload_rcv_len; + blk_complete_request(blk_mq_rq_from_pdu(job)); } EXPORT_SYMBOL_GPL(bsg_job_done); @@ -114,7 +183,6 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); - scsi_req(req)->resid_len = blk_rq_bytes(req); buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); buf->payload_len = blk_rq_bytes(req); return 0; @@ -125,15 +193,13 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) * @dev: device that is being sent the bsg request * @req: BSG request that needs a job structure */ -static int bsg_prepare_job(struct device *dev, struct request *req) +static bool bsg_prepare_job(struct device *dev, struct request *req) { struct request *rsp = req->next_rq; - struct scsi_request *rq = scsi_req(req); struct bsg_job *job = blk_mq_rq_to_pdu(req); int ret; - job->request = rq->cmd; - job->request_len = rq->cmd_len; + job->timeout = req->timeout; if (req->bio) { ret = bsg_map_buffer(&job->request_payload, req); @@ -149,12 +215,13 @@ static int bsg_prepare_job(struct device *dev, struct request *req) /* take a reference for the request */ get_device(job->dev); kref_init(&job->kref); - return 0; + return true; failjob_rls_rqst_payload: kfree(job->request_payload.sg_list); failjob_rls_job: - return -ENOMEM; + job->result = -ENOMEM; + return false; } /** @@ -183,9 +250,7 @@ static void bsg_request_fn(struct request_queue *q) break; spin_unlock_irq(q->queue_lock); - ret = bsg_prepare_job(dev, req); - if (ret) { - scsi_req(req)->result = ret; + if (!bsg_prepare_job(dev, req)) { blk_end_request_all(req, BLK_STS_OK); spin_lock_irq(q->queue_lock); continue; @@ -202,47 +267,34 @@ static void bsg_request_fn(struct request_queue *q) spin_lock_irq(q->queue_lock); } +/* called right after the request is allocated for the request_queue */ static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) { struct bsg_job *job = blk_mq_rq_to_pdu(req); - struct scsi_request *sreq = &job->sreq; - - /* called right after the request is allocated for the request_queue */ - sreq->sense = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); - if (!sreq->sense) + job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); + if (!job->reply) return -ENOMEM; - return 0; } +/* called right before the request is given to the request_queue user */ static void bsg_initialize_rq(struct request *req) { struct bsg_job *job = blk_mq_rq_to_pdu(req); - struct scsi_request *sreq = &job->sreq; - void *sense = sreq->sense; - - /* called right before the request is given to the request_queue user */ + void *reply = job->reply; memset(job, 0, sizeof(*job)); - - scsi_req_init(sreq); - - sreq->sense = sense; - sreq->sense_len = SCSI_SENSE_BUFFERSIZE; - - job->req = req; - job->reply = sense; - job->reply_len = sreq->sense_len; + job->reply = reply; + job->reply_len = SCSI_SENSE_BUFFERSIZE; job->dd_data = job + 1; } static void bsg_exit_rq(struct request_queue *q, struct request *req) { struct bsg_job *job = blk_mq_rq_to_pdu(req); - struct scsi_request *sreq = &job->sreq; - kfree(sreq->sense); + kfree(job->reply); } /** @@ -275,12 +327,11 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, q->queuedata = dev; q->bsg_job_fn = job_fn; - queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); - queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); + blk_queue_flag_set(QUEUE_FLAG_BIDI, q); blk_queue_softirq_done(q, bsg_softirq_done); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); - ret = bsg_register_queue(q, dev, name, release); + ret = bsg_register_queue(q, dev, name, &bsg_transport_ops, release); if (ret) { printk(KERN_ERR "%s: bsg interface failed to " "initialize - register queue\n", dev->kobj.name); diff --git a/block/bsg.c b/block/bsg.c index 06dc96e1f670..defa06c11858 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -130,114 +130,120 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index) return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; } -static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_v4 *hdr, struct bsg_device *bd, - fmode_t mode) +#define uptr64(val) ((void __user *)(uintptr_t)(val)) + +static int bsg_scsi_check_proto(struct sg_io_v4 *hdr) +{ + if (hdr->protocol != BSG_PROTOCOL_SCSI || + hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD) + return -EINVAL; + return 0; +} + +static int bsg_scsi_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, + fmode_t mode) { - struct scsi_request *req = scsi_req(rq); + struct scsi_request *sreq = scsi_req(rq); - if (hdr->request_len > BLK_MAX_CDB) { - req->cmd = kzalloc(hdr->request_len, GFP_KERNEL); - if (!req->cmd) + sreq->cmd_len = hdr->request_len; + if (sreq->cmd_len > BLK_MAX_CDB) { + sreq->cmd = kzalloc(sreq->cmd_len, GFP_KERNEL); + if (!sreq->cmd) return -ENOMEM; } - if (copy_from_user(req->cmd, (void __user *)(unsigned long)hdr->request, - hdr->request_len)) + if (copy_from_user(sreq->cmd, uptr64(hdr->request), sreq->cmd_len)) return -EFAULT; - - if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(req->cmd, mode)) - return -EPERM; - } else if (!capable(CAP_SYS_RAWIO)) + if (blk_verify_command(sreq->cmd, mode)) return -EPERM; - - /* - * fill in request structure - */ - req->cmd_len = hdr->request_len; - - rq->timeout = msecs_to_jiffies(hdr->timeout); - if (!rq->timeout) - rq->timeout = q->sg_timeout; - if (!rq->timeout) - rq->timeout = BLK_DEFAULT_SG_TIMEOUT; - if (rq->timeout < BLK_MIN_SG_TIMEOUT) - rq->timeout = BLK_MIN_SG_TIMEOUT; - return 0; } -/* - * Check if sg_io_v4 from user is allowed and valid - */ -static int -bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op) +static int bsg_scsi_complete_rq(struct request *rq, struct sg_io_v4 *hdr) { + struct scsi_request *sreq = scsi_req(rq); int ret = 0; - if (hdr->guard != 'Q') - return -EINVAL; + /* + * fill in all the output members + */ + hdr->device_status = sreq->result & 0xff; + hdr->transport_status = host_byte(sreq->result); + hdr->driver_status = driver_byte(sreq->result); + hdr->info = 0; + if (hdr->device_status || hdr->transport_status || hdr->driver_status) + hdr->info |= SG_INFO_CHECK; + hdr->response_len = 0; - switch (hdr->protocol) { - case BSG_PROTOCOL_SCSI: - switch (hdr->subprotocol) { - case BSG_SUB_PROTOCOL_SCSI_CMD: - case BSG_SUB_PROTOCOL_SCSI_TRANSPORT: - break; - default: - ret = -EINVAL; - } - break; - default: - ret = -EINVAL; + if (sreq->sense_len && hdr->response) { + int len = min_t(unsigned int, hdr->max_response_len, + sreq->sense_len); + + if (copy_to_user(uptr64(hdr->response), sreq->sense, len)) + ret = -EFAULT; + else + hdr->response_len = len; + } + + if (rq->next_rq) { + hdr->dout_resid = sreq->resid_len; + hdr->din_resid = scsi_req(rq->next_rq)->resid_len; + } else if (rq_data_dir(rq) == READ) { + hdr->din_resid = sreq->resid_len; + } else { + hdr->dout_resid = sreq->resid_len; } - *op = hdr->dout_xfer_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN; return ret; } -/* - * map sg_io_v4 to a request. - */ +static void bsg_scsi_free_rq(struct request *rq) +{ + scsi_req_free_cmd(scsi_req(rq)); +} + +static const struct bsg_ops bsg_scsi_ops = { + .check_proto = bsg_scsi_check_proto, + .fill_hdr = bsg_scsi_fill_hdr, + .complete_rq = bsg_scsi_complete_rq, + .free_rq = bsg_scsi_free_rq, +}; + static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) +bsg_map_hdr(struct request_queue *q, struct sg_io_v4 *hdr, fmode_t mode) { - struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; int ret; - unsigned int op, dxfer_len; - void __user *dxferp = NULL; - struct bsg_class_device *bcd = &q->bsg_dev; - /* if the LLD has been removed then the bsg_unregister_queue will - * eventually be called and the class_dev was freed, so we can no - * longer use this request_queue. Return no such address. - */ - if (!bcd->class_dev) + if (!q->bsg_dev.class_dev) return ERR_PTR(-ENXIO); - bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n", - (unsigned long long) hdr->dout_xferp, - hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, - hdr->din_xfer_len); + if (hdr->guard != 'Q') + return ERR_PTR(-EINVAL); - ret = bsg_validate_sgv4_hdr(hdr, &op); + ret = q->bsg_dev.ops->check_proto(hdr); if (ret) return ERR_PTR(ret); - /* - * map scatter-gather elements separately and string them to request - */ - rq = blk_get_request(q, op, GFP_KERNEL); + rq = blk_get_request(q, hdr->dout_xfer_len ? + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, + GFP_KERNEL); if (IS_ERR(rq)) return rq; - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, mode); + ret = q->bsg_dev.ops->fill_hdr(rq, hdr, mode); if (ret) goto out; - if (op == REQ_OP_SCSI_OUT && hdr->din_xfer_len) { + rq->timeout = msecs_to_jiffies(hdr->timeout); + if (!rq->timeout) + rq->timeout = q->sg_timeout; + if (!rq->timeout) + rq->timeout = BLK_DEFAULT_SG_TIMEOUT; + if (rq->timeout < BLK_MIN_SG_TIMEOUT) + rq->timeout = BLK_MIN_SG_TIMEOUT; + + if (hdr->dout_xfer_len && hdr->din_xfer_len) { if (!test_bit(QUEUE_FLAG_BIDI, &q->queue_flags)) { ret = -EOPNOTSUPP; goto out; @@ -246,42 +252,39 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) next_rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL); if (IS_ERR(next_rq)) { ret = PTR_ERR(next_rq); - next_rq = NULL; goto out; } - rq->next_rq = next_rq; - dxferp = (void __user *)(unsigned long)hdr->din_xferp; - ret = blk_rq_map_user(q, next_rq, NULL, dxferp, + rq->next_rq = next_rq; + ret = blk_rq_map_user(q, next_rq, NULL, uptr64(hdr->din_xferp), hdr->din_xfer_len, GFP_KERNEL); if (ret) - goto out; + goto out_free_nextrq; } if (hdr->dout_xfer_len) { - dxfer_len = hdr->dout_xfer_len; - dxferp = (void __user *)(unsigned long)hdr->dout_xferp; + ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr->dout_xferp), + hdr->dout_xfer_len, GFP_KERNEL); } else if (hdr->din_xfer_len) { - dxfer_len = hdr->din_xfer_len; - dxferp = (void __user *)(unsigned long)hdr->din_xferp; - } else - dxfer_len = 0; - - if (dxfer_len) { - ret = blk_rq_map_user(q, rq, NULL, dxferp, dxfer_len, - GFP_KERNEL); - if (ret) - goto out; + ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr->din_xferp), + hdr->din_xfer_len, GFP_KERNEL); + } else { + ret = blk_rq_map_user(q, rq, NULL, NULL, 0, GFP_KERNEL); } + if (ret) + goto out_unmap_nextrq; return rq; + +out_unmap_nextrq: + if (rq->next_rq) + blk_rq_unmap_user(rq->next_rq->bio); +out_free_nextrq: + if (rq->next_rq) + blk_put_request(rq->next_rq); out: - scsi_req_free_cmd(scsi_req(rq)); + q->bsg_dev.ops->free_rq(rq); blk_put_request(rq); - if (next_rq) { - blk_rq_unmap_user(next_rq->bio); - blk_put_request(next_rq); - } return ERR_PTR(ret); } @@ -383,56 +386,18 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct bio *bio, struct bio *bidi_bio) { - struct scsi_request *req = scsi_req(rq); - int ret = 0; - - pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result); - /* - * fill in all the output members - */ - hdr->device_status = req->result & 0xff; - hdr->transport_status = host_byte(req->result); - hdr->driver_status = driver_byte(req->result); - hdr->info = 0; - if (hdr->device_status || hdr->transport_status || hdr->driver_status) - hdr->info |= SG_INFO_CHECK; - hdr->response_len = 0; - - if (req->sense_len && hdr->response) { - int len = min_t(unsigned int, hdr->max_response_len, - req->sense_len); + int ret; - ret = copy_to_user((void __user *)(unsigned long)hdr->response, - req->sense, len); - if (!ret) - hdr->response_len = len; - else - ret = -EFAULT; - } + ret = rq->q->bsg_dev.ops->complete_rq(rq, hdr); if (rq->next_rq) { - hdr->dout_resid = req->resid_len; - hdr->din_resid = scsi_req(rq->next_rq)->resid_len; blk_rq_unmap_user(bidi_bio); blk_put_request(rq->next_rq); - } else if (rq_data_dir(rq) == READ) - hdr->din_resid = req->resid_len; - else - hdr->dout_resid = req->resid_len; - - /* - * If the request generated a negative error number, return it - * (providing we aren't already returning an error); if it's - * just a protocol response (i.e. non negative), that gets - * processed above. - */ - if (!ret && req->result < 0) - ret = req->result; + } blk_rq_unmap_user(bio); - scsi_req_free_cmd(req); + rq->q->bsg_dev.ops->free_rq(rq); blk_put_request(rq); - return ret; } @@ -614,7 +579,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, mode); + rq = bsg_map_hdr(bd->queue, &bc->hdr, mode); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -742,11 +707,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode, struct bsg_device *bd; unsigned char buf[32]; - if (!blk_queue_scsi_passthrough(rq)) { - WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); - return ERR_PTR(-EINVAL); - } - if (!blk_get_queue(rq)) return ERR_PTR(-ENXIO); @@ -907,7 +867,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode); + rq = bsg_map_hdr(bd->queue, &hdr, file->f_mode); if (IS_ERR(rq)) return PTR_ERR(rq); @@ -959,7 +919,8 @@ void bsg_unregister_queue(struct request_queue *q) EXPORT_SYMBOL_GPL(bsg_unregister_queue); int bsg_register_queue(struct request_queue *q, struct device *parent, - const char *name, void (*release)(struct device *)) + const char *name, const struct bsg_ops *ops, + void (*release)(struct device *)) { struct bsg_class_device *bcd; dev_t dev; @@ -996,6 +957,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, bcd->queue = q; bcd->parent = get_device(parent); bcd->release = release; + bcd->ops = ops; kref_init(&bcd->ref); dev = MKDEV(bsg_major, bcd->minor); class_dev = device_create(bsg_class, parent, dev, NULL, "%s", devname); @@ -1023,7 +985,17 @@ unlock: mutex_unlock(&bsg_mutex); return ret; } -EXPORT_SYMBOL_GPL(bsg_register_queue); + +int bsg_scsi_register_queue(struct request_queue *q, struct device *parent) +{ + if (!blk_queue_scsi_passthrough(q)) { + WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); + return -EINVAL; + } + + return bsg_register_queue(q, parent, NULL, &bsg_scsi_ops, NULL); +} +EXPORT_SYMBOL_GPL(bsg_scsi_register_queue); static struct cdev bsg_cdev; diff --git a/block/sed-opal.c b/block/sed-opal.c index e4929eec547f..945f4b8610e0 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -554,15 +554,14 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) size_t len; int msb; - u8 n; if (!(number & ~TINY_ATOM_DATA_MASK)) { add_token_u8(err, cmd, number); return; } - msb = fls(number); - len = DIV_ROUND_UP(msb, 4); + msb = fls64(number); + len = DIV_ROUND_UP(msb, 8); if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) { pr_debug("Error adding u64: end of buffer.\n"); @@ -570,10 +569,8 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) return; } add_short_atom_header(cmd, false, false, len); - while (len--) { - n = number >> (len * 8); - add_token_u8(err, cmd, n); - } + while (len--) + add_token_u8(err, cmd, number >> (len * 8)); } static void add_token_bytestring(int *err, struct opal_dev *cmd, @@ -871,6 +868,9 @@ static int response_parse(const u8 *buf, size_t length, static size_t response_get_string(const struct parsed_resp *resp, int n, const char **store) { + u8 skip; + const struct opal_resp_tok *token; + *store = NULL; if (!resp) { pr_debug("Response is NULL\n"); @@ -883,13 +883,30 @@ static size_t response_get_string(const struct parsed_resp *resp, int n, return 0; } - if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) { + token = &resp->toks[n]; + if (token->type != OPAL_DTA_TOKENID_BYTESTRING) { pr_debug("Token is not a byte string!\n"); return 0; } - *store = resp->toks[n].pos + 1; - return resp->toks[n].len - 1; + switch (token->width) { + case OPAL_WIDTH_TINY: + case OPAL_WIDTH_SHORT: + skip = 1; + break; + case OPAL_WIDTH_MEDIUM: + skip = 2; + break; + case OPAL_WIDTH_LONG: + skip = 4; + break; + default: + pr_debug("Token has invalid width!\n"); + return 0; + } + + *store = token->pos + skip; + return token->len - skip; } static u64 response_get_u64(const struct parsed_resp *resp, int n) |