diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-14 15:32:19 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-14 15:32:19 -0800 |
commit | e2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch) | |
tree | b97a90170c45211bcc437761653aa8016c34afcd /fs | |
parent | abc36be236358162202e86ad88616ff95a755101 (diff) | |
parent | a04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff) | |
download | lwn-e2c5923c349c1738fe8fda980874d93f6fb2e5b6.tar.gz lwn-e2c5923c349c1738fe8fda980874d93f6fb2e5b6.zip |
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe:
"This is the main pull request for block storage for 4.15-rc1.
Nothing out of the ordinary in here, and no API changes or anything
like that. Just various new features for drivers, core changes, etc.
In particular, this pull request contains:
- A patch series from Bart, closing the whole on blk/scsi-mq queue
quescing.
- A series from Christoph, building towards hidden gendisks (for
multipath) and ability to move bio chains around.
- NVMe
- Support for native multipath for NVMe (Christoph).
- Userspace notifications for AENs (Keith).
- Command side-effects support (Keith).
- SGL support (Chaitanya Kulkarni)
- FC fixes and improvements (James Smart)
- Lots of fixes and tweaks (Various)
- bcache
- New maintainer (Michael Lyle)
- Writeback control improvements (Michael)
- Various fixes (Coly, Elena, Eric, Liang, et al)
- lightnvm updates, mostly centered around the pblk interface
(Javier, Hans, and Rakesh).
- Removal of unused bio/bvec kmap atomic interfaces (me, Christoph)
- Writeback series that fix the much discussed hundreds of millions
of sync-all units. This goes all the way, as discussed previously
(me).
- Fix for missing wakeup on writeback timer adjustments (Yafang
Shao).
- Fix laptop mode on blk-mq (me).
- {mq,name} tupple lookup for IO schedulers, allowing us to have
alias names. This means you can use 'deadline' on both !mq and on
mq (where it's called mq-deadline). (me).
- blktrace race fix, oopsing on sg load (me).
- blk-mq optimizations (me).
- Obscure waitqueue race fix for kyber (Omar).
- NBD fixes (Josef).
- Disable writeback throttling by default on bfq, like we do on cfq
(Luca Miccio).
- Series from Ming that enable us to treat flush requests on blk-mq
like any other request. This is a really nice cleanup.
- Series from Ming that improves merging on blk-mq with schedulers,
getting us closer to flipping the switch on scsi-mq again.
- BFQ updates (Paolo).
- blk-mq atomic flags memory ordering fixes (Peter Z).
- Loop cgroup support (Shaohua).
- Lots of minor fixes from lots of different folks, both for core and
driver code"
* 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits)
nvme: fix visibility of "uuid" ns attribute
blk-mq: fixup some comment typos and lengths
ide: ide-atapi: fix compile error with defining macro DEBUG
blk-mq: improve tag waiting setup for non-shared tags
brd: remove unused brd_mutex
blk-mq: only run the hardware queue if IO is pending
block: avoid null pointer dereference on null disk
fs: guard_bio_eod() needs to consider partitions
xtensa/simdisk: fix compile error
nvme: expose subsys attribute to sysfs
nvme: create 'slaves' and 'holders' entries for hidden controllers
block: create 'slaves' and 'holders' entries for hidden gendisks
nvme: also expose the namespace identification sysfs files for mpath nodes
nvme: implement multipath access to nvme subsystems
nvme: track shared namespaces
nvme: introduce a nvme_ns_ids structure
nvme: track subsystems
block, nvme: Introduce blk_mq_req_flags_t
block, scsi: Make SCSI quiesce and resume work reliably
block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/block_dev.c | 20 | ||||
-rw-r--r-- | fs/buffer.c | 70 | ||||
-rw-r--r-- | fs/direct-io.c | 2 | ||||
-rw-r--r-- | fs/fs-writeback.c | 153 | ||||
-rw-r--r-- | fs/iomap.c | 2 | ||||
-rw-r--r-- | fs/ntfs/aops.c | 2 | ||||
-rw-r--r-- | fs/ntfs/mft.c | 2 | ||||
-rw-r--r-- | fs/sync.c | 2 |
8 files changed, 112 insertions, 141 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index 789f55e851ae..4a181fcb5175 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf); - va_end(args); -} - static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; @@ -249,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -414,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -674,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false); @@ -710,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, false); + result = blk_queue_enter(bdev->bd_queue, 0); if (result) return result; diff --git a/fs/buffer.c b/fs/buffer.c index 49b7e9bdcd1d..1c18a22a6013 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -253,27 +253,6 @@ out: } /* - * Kick the writeback threads then try to free up some ZONE_NORMAL memory. - */ -static void free_more_memory(void) -{ - struct zoneref *z; - int nid; - - wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); - yield(); - - for_each_online_node(nid) { - - z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL); - if (z->zone) - try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, - GFP_NOFS, NULL); - } -} - -/* * I/O completion handler for block_read_full_page() - pages * which come unlocked at the end of I/O. */ @@ -861,16 +840,19 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry) + bool retry) { struct buffer_head *bh, *head; + gfp_t gfp = GFP_NOFS; long offset; -try_again: + if (retry) + gfp |= __GFP_NOFAIL; + head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = alloc_buffer_head(GFP_NOFS); + bh = alloc_buffer_head(gfp); if (!bh) goto no_grow; @@ -896,23 +878,7 @@ no_grow: } while (head); } - /* - * Return failure for non-async IO requests. Async IO requests - * are not allowed to fail, so we have to wait until buffer heads - * become available. But we don't want tasks sleeping with - * partially complete buffers, so all were released above. - */ - if (!retry) - return NULL; - - /* We're _really_ low on memory. Now we just - * wait for old buffer heads to become free due to - * finishing IO. Since this is an async request and - * the reserve list is empty, we're sure there are - * async buffer heads in use. - */ - free_more_memory(); - goto try_again; + return NULL; } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1001,8 +967,6 @@ grow_dev_page(struct block_device *bdev, sector_t block, gfp_mask |= __GFP_NOFAIL; page = find_or_create_page(inode->i_mapping, index, gfp_mask); - if (!page) - return ret; BUG_ON(!PageLocked(page)); @@ -1021,9 +985,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, /* * Allocate some buffers for this page */ - bh = alloc_page_buffers(page, size, 0); - if (!bh) - goto failed; + bh = alloc_page_buffers(page, size, true); /* * Link the page to the buffers and initialise them. Take the @@ -1103,8 +1065,6 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; - if (ret == 0) - free_more_memory(); } } @@ -1575,7 +1535,7 @@ void create_empty_buffers(struct page *page, { struct buffer_head *bh, *head, *tail; - head = alloc_page_buffers(page, blocksize, 1); + head = alloc_page_buffers(page, blocksize, true); bh = head; do { bh->b_state |= b_state; @@ -2639,7 +2599,7 @@ int nobh_write_begin(struct address_space *mapping, * Be careful: the buffer linked list is a NULL terminated one, rather * than the circular one we're used to. */ - head = alloc_page_buffers(page, blocksize, 0); + head = alloc_page_buffers(page, blocksize, false); if (!head) { ret = -ENOMEM; goto out_release; @@ -3056,8 +3016,16 @@ void guard_bio_eod(int op, struct bio *bio) sector_t maxsector; struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); - maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; diff --git a/fs/direct-io.c b/fs/direct-io.c index 98fe1325da9d..3aafb3343a65 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -497,7 +497,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 245c430a2e41..08f5debd07d1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -933,33 +933,36 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, #endif /* CONFIG_CGROUP_WRITEBACK */ -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason) +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) { - struct wb_writeback_work *work; + return global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} +static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) +{ if (!wb_has_dirty_io(wb)) return; /* - * This is WB_SYNC_NONE writeback, so if allocation fails just - * wakeup the thread for old dirty data writeback + * All callers of this function want to start writeback of all + * dirty pages. Places like vmscan can call this at a very + * high frequency, causing pointless allocations of tons of + * work items and keeping the flusher threads busy retrieving + * that work. Ensure that we only allow one of them pending and + * inflight at the time. */ - work = kzalloc(sizeof(*work), - GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (!work) { - trace_writeback_nowork(wb); - wb_wakeup(wb); + if (test_bit(WB_start_all, &wb->state) || + test_and_set_bit(WB_start_all, &wb->state)) return; - } - - work->sync_mode = WB_SYNC_NONE; - work->nr_pages = nr_pages; - work->range_cyclic = range_cyclic; - work->reason = reason; - work->auto_free = 1; - wb_queue_work(wb, work); + wb->start_all_reason = reason; + wb_wakeup(wb); } /** @@ -1814,17 +1817,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) return work; } -/* - * Add in the number of potentially dirty inodes, because each inode - * write can dirty pagecache in the underlying blockdev. - */ -static unsigned long get_nr_dirty_pages(void) -{ - return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + - get_nr_dirty_inodes(); -} - static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { @@ -1877,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; } +static long wb_check_start_all(struct bdi_writeback *wb) +{ + long nr_pages; + + if (!test_bit(WB_start_all, &wb->state)) + return 0; + + nr_pages = get_nr_dirty_pages(); + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = wb_split_bdi_pages(wb, nr_pages), + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = wb->start_all_reason, + }; + + nr_pages = wb_writeback(wb, &work); + } + + clear_bit(WB_start_all, &wb->state); + return nr_pages; +} + + /* * Retrieve work items and do the writeback they describe */ @@ -1893,6 +1909,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) } /* + * Check for a flush-everything request + */ + wrote += wb_check_start_all(wb); + + /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); @@ -1947,10 +1968,33 @@ void wb_workfn(struct work_struct *work) } /* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. + * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, + * write back the whole world. */ -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) +static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason) +{ + struct bdi_writeback *wb; + + if (!bdi_has_dirty_io(bdi)) + return; + + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) + wb_start_writeback(wb, reason); +} + +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason) +{ + rcu_read_lock(); + __wakeup_flusher_threads_bdi(bdi, reason); + rcu_read_unlock(); +} + +/* + * Wakeup the flusher threads to start writeback of all currently dirty pages + */ +void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; @@ -1960,20 +2004,9 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) if (blk_needs_flush_plug(current)) blk_schedule_flush_plug(current); - if (!nr_pages) - nr_pages = get_nr_dirty_pages(); - rcu_read_lock(); - list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - struct bdi_writeback *wb; - - if (!bdi_has_dirty_io(bdi)) - continue; - - list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) - wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), - false, reason); - } + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } @@ -2343,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) EXPORT_SYMBOL(writeback_inodes_sb); /** - * try_to_writeback_inodes_sb_nr - try to start writeback if none underway + * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock - * @nr: the number of pages to write - * @reason: the reason of writeback + * @reason: reason why some writeback work was initiated * - * Invoke writeback_inodes_sb_nr if no writeback is currently underway. - * Returns 1 if writeback was started, 0 if not. + * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. */ -bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, - enum wb_reason reason) +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) - return false; + return; - __writeback_inodes_sb_nr(sb, nr, reason, true); + __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); - return true; -} -EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); - -/** - * try_to_writeback_inodes_sb - try to start writeback if none underway - * @sb: the superblock - * @reason: reason why some writeback work was initiated - * - * Implement by try_to_writeback_inodes_sb_nr() - * Returns 1 if writeback was started, 0 if not. - */ -bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) -{ - return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); diff --git a/fs/iomap.c b/fs/iomap.c index 5011a964a550..b9f74803e56c 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1057,7 +1057,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || - !blk_mq_poll(dio->submit.last_queue, + !blk_poll(dio->submit.last_queue, dio->submit.cookie)) io_schedule(); } diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index cc91856b5e2d..3a2e509c77c5 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { spin_lock(&mapping->private_lock); if (unlikely(!page_has_buffers(page))) { spin_unlock(&mapping->private_lock); - bh = head = alloc_page_buffers(page, bh_size, 1); + bh = head = alloc_page_buffers(page, bh_size, true); spin_lock(&mapping->private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index b6f402194f02..ee8392aee9f6 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, if (unlikely(!page_has_buffers(page))) { struct buffer_head *tail; - bh = head = alloc_page_buffers(page, blocksize, 1); + bh = head = alloc_page_buffers(page, blocksize, true); do { set_buffer_uptodate(bh); tail = bh; diff --git a/fs/sync.c b/fs/sync.c index 83ac79a960dd..6e0a2cbaf6de 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -109,7 +109,7 @@ SYSCALL_DEFINE0(sync) { int nowait = 0, wait = 1; - wakeup_flusher_threads(0, WB_REASON_SYNC); + wakeup_flusher_threads(WB_REASON_SYNC); iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); |