From 834f9f61a525d2f6d3d0c93894e26326c8d3ceed Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 17 Oct 2011 12:57:22 +0200 Subject: blk-flush: fix invalid BUG_ON in blk_insert_flush A user reported a regression due to commit 4853abaae7e4a2af938115ce9071ef8684fb7af4 (block: fix flush machinery for stacking drivers with differring flush flags). Part of the problem is that blk_insert_flush required a single bio be attached to the request. In reality, having no attached bio is also a valid case, as can be observed with an empty flush. [1] http://www.redhat.com/archives/dm-devel/2011-September/msg00154.html Reported-by: Christophe Saout Signed-off-by: Jeff Moyer Stable note: 3.1 Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 491eb30a242d..89ae3b9bf7ca 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -320,7 +320,7 @@ void blk_insert_flush(struct request *rq) return; } - BUG_ON(!rq->bio || rq->bio != rq->biotail); + BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ /* * If there's data but flush is not necessary, the request can be -- cgit v1.2.3 From e67b77c791ca2778198c9e7088f3266ed2da7a55 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 17 Oct 2011 12:57:23 +0200 Subject: blk-flush: move the queue kick into A dm-multipath user reported[1] a problem when trying to boot a kernel with commit 4853abaae7e4a2af938115ce9071ef8684fb7af4 (block: fix flush machinery for stacking drivers with differring flush flags) applied. It turns out that an empty flush request can be sent into blk_insert_flush. When the BUG_ON was fixed to allow for this, I/O on the underlying device would stall. The reason is that blk_insert_cloned_request does not kick the queue. In the aforementioned commit, I had added a special case to kick the queue if data was sent down but the queue flags did not require a flush. A better solution is to push the queue kick up into blk_insert_cloned_request. This patch, along with a follow-on which fixes the BUG_ON, fixes the issue reported. [1] http://www.redhat.com/archives/dm-devel/2011-September/msg00154.html Reported-by: Christophe Saout Signed-off-by: Jeff Moyer Acked-by: Tejun Heo Stable note: 3.1 Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk-flush.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d34433ae7917..795154e54a75 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1725,6 +1725,8 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); + if (where == ELEVATOR_INSERT_FLUSH) + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); return 0; diff --git a/block/blk-flush.c b/block/blk-flush.c index 89ae3b9bf7ca..720ad607ff91 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,6 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { list_add_tail(&rq->queuelist, &q->queue_head); - blk_run_queue_async(q); return; } -- cgit v1.2.3 From f992ae801a7dec34a4ed99a6598bbbbfb82af4fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 17 Oct 2011 13:42:43 +0200 Subject: block: make gendisk hold a reference to its queue The following command sequence triggers an oops. # mount /dev/sdb1 /mnt # echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete # umount /mnt general protection fault: 0000 [#1] PREEMPT SMP CPU 2 Modules linked in: Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs RIP: 0010:[] [] __lock_acquire+0x389/0x1d60 ... Call Trace: [] lock_acquire+0x95/0x140 [] _raw_spin_lock+0x3b/0x50 [] bdi_lock_two+0x5c/0x70 [] bdev_inode_switch_bdi+0x4c/0xf0 [] __blkdev_put+0x11b/0x1d0 [] __blkdev_put+0x160/0x1d0 [] blkdev_put+0x5f/0x190 [] kill_block_super+0x4d/0x80 [] deactivate_locked_super+0x45/0x70 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0xed/0x130 [] sys_umount+0x7e/0x3a0 [] system_call_fastpath+0x16/0x1b This is because bdev holds on to disk but disk doesn't pin the associated queue. If a SCSI device is removed while the device is still open, the sdev puts the base reference to the queue on release. When the bdev is finally released, the associated queue is already gone along with the bdi and bdev_inode_switch_bdi() ends up dereferencing already freed bdi. Even if it were not for this bug, disk not holding onto the associated queue is very unusual and error-prone. Fix it by making add_disk() take an extra reference to its queue and put it on disk_release() and ensuring that disk and its fops owner are put in that order after all accesses to the disk and queue are complete. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++++ fs/block_dev.c | 13 ++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index e2f67902dd02..d261b73b9744 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk) register_disk(disk); blk_register_queue(disk); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. + */ + WARN_ON_ONCE(blk_get_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -1095,6 +1101,8 @@ static void disk_release(struct device *dev) disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); free_part_info(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { diff --git a/fs/block_dev.c b/fs/block_dev.c index 95f786ec7f08..1c44b8d54504 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; + struct module *owner; int ret; int partno; int perm = 0; @@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) goto out; + owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); @@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); goto restart; } } @@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_unlock_bdev; } /* only one opener holds refs to the module and disk */ - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); } bdev->bd_openers++; if (for_part) @@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - module_put(disk->fops->owner); put_disk(disk); + module_put(owner); out: bdput(bdev); @@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev->bd_openers) { struct module *owner = disk->fops->owner; - put_disk(disk); - module_put(owner); disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; + + put_disk(disk); + module_put(owner); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); -- cgit v1.2.3