summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-14 15:32:19 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-14 15:32:19 -0800
commite2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch)
treeb97a90170c45211bcc437761653aa8016c34afcd /fs
parentabc36be236358162202e86ad88616ff95a755101 (diff)
parenta04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff)
downloadlwn-e2c5923c349c1738fe8fda980874d93f6fb2e5b6.tar.gz
lwn-e2c5923c349c1738fe8fda980874d93f6fb2e5b6.zip
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe: "This is the main pull request for block storage for 4.15-rc1. Nothing out of the ordinary in here, and no API changes or anything like that. Just various new features for drivers, core changes, etc. In particular, this pull request contains: - A patch series from Bart, closing the whole on blk/scsi-mq queue quescing. - A series from Christoph, building towards hidden gendisks (for multipath) and ability to move bio chains around. - NVMe - Support for native multipath for NVMe (Christoph). - Userspace notifications for AENs (Keith). - Command side-effects support (Keith). - SGL support (Chaitanya Kulkarni) - FC fixes and improvements (James Smart) - Lots of fixes and tweaks (Various) - bcache - New maintainer (Michael Lyle) - Writeback control improvements (Michael) - Various fixes (Coly, Elena, Eric, Liang, et al) - lightnvm updates, mostly centered around the pblk interface (Javier, Hans, and Rakesh). - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph) - Writeback series that fix the much discussed hundreds of millions of sync-all units. This goes all the way, as discussed previously (me). - Fix for missing wakeup on writeback timer adjustments (Yafang Shao). - Fix laptop mode on blk-mq (me). - {mq,name} tupple lookup for IO schedulers, allowing us to have alias names. This means you can use 'deadline' on both !mq and on mq (where it's called mq-deadline). (me). - blktrace race fix, oopsing on sg load (me). - blk-mq optimizations (me). - Obscure waitqueue race fix for kyber (Omar). - NBD fixes (Josef). - Disable writeback throttling by default on bfq, like we do on cfq (Luca Miccio). - Series from Ming that enable us to treat flush requests on blk-mq like any other request. This is a really nice cleanup. - Series from Ming that improves merging on blk-mq with schedulers, getting us closer to flipping the switch on scsi-mq again. - BFQ updates (Paolo). - blk-mq atomic flags memory ordering fixes (Peter Z). - Loop cgroup support (Shaohua). - Lots of minor fixes from lots of different folks, both for core and driver code" * 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits) nvme: fix visibility of "uuid" ns attribute blk-mq: fixup some comment typos and lengths ide: ide-atapi: fix compile error with defining macro DEBUG blk-mq: improve tag waiting setup for non-shared tags brd: remove unused brd_mutex blk-mq: only run the hardware queue if IO is pending block: avoid null pointer dereference on null disk fs: guard_bio_eod() needs to consider partitions xtensa/simdisk: fix compile error nvme: expose subsys attribute to sysfs nvme: create 'slaves' and 'holders' entries for hidden controllers block: create 'slaves' and 'holders' entries for hidden gendisks nvme: also expose the namespace identification sysfs files for mpath nodes nvme: implement multipath access to nvme subsystems nvme: track shared namespaces nvme: introduce a nvme_ns_ids structure nvme: track subsystems block, nvme: Introduce blk_mq_req_flags_t block, scsi: Make SCSI quiesce and resume work reliably block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag ...
Diffstat (limited to 'fs')
-rw-r--r--fs/block_dev.c20
-rw-r--r--fs/buffer.c70
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/fs-writeback.c153
-rw-r--r--fs/iomap.c2
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/mft.c2
-rw-r--r--fs/sync.c2
8 files changed, 112 insertions, 141 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 789f55e851ae..4a181fcb5175 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -54,18 +54,6 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
-void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
- va_end(args);
-}
-
static void bdev_write_inode(struct block_device *bdev)
{
struct inode *inode = bdev->bd_inode;
@@ -249,7 +237,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
if (!READ_ONCE(bio.bi_private))
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_mq_poll(bdev_get_queue(bdev), qc))
+ !blk_poll(bdev_get_queue(bdev), qc))
io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -414,7 +402,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
- !blk_mq_poll(bdev_get_queue(bdev), qc))
+ !blk_poll(bdev_get_queue(bdev), qc))
io_schedule();
}
__set_current_state(TASK_RUNNING);
@@ -674,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
- result = blk_queue_enter(bdev->bd_queue, false);
+ result = blk_queue_enter(bdev->bd_queue, 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
@@ -710,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
- result = blk_queue_enter(bdev->bd_queue, false);
+ result = blk_queue_enter(bdev->bd_queue, 0);
if (result)
return result;
diff --git a/fs/buffer.c b/fs/buffer.c
index 49b7e9bdcd1d..1c18a22a6013 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -253,27 +253,6 @@ out:
}
/*
- * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
- */
-static void free_more_memory(void)
-{
- struct zoneref *z;
- int nid;
-
- wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
- yield();
-
- for_each_online_node(nid) {
-
- z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
- gfp_zone(GFP_NOFS), NULL);
- if (z->zone)
- try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
- GFP_NOFS, NULL);
- }
-}
-
-/*
* I/O completion handler for block_read_full_page() - pages
* which come unlocked at the end of I/O.
*/
@@ -861,16 +840,19 @@ int remove_inode_buffers(struct inode *inode)
* which may not fail from ordinary buffer allocations.
*/
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
- int retry)
+ bool retry)
{
struct buffer_head *bh, *head;
+ gfp_t gfp = GFP_NOFS;
long offset;
-try_again:
+ if (retry)
+ gfp |= __GFP_NOFAIL;
+
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
- bh = alloc_buffer_head(GFP_NOFS);
+ bh = alloc_buffer_head(gfp);
if (!bh)
goto no_grow;
@@ -896,23 +878,7 @@ no_grow:
} while (head);
}
- /*
- * Return failure for non-async IO requests. Async IO requests
- * are not allowed to fail, so we have to wait until buffer heads
- * become available. But we don't want tasks sleeping with
- * partially complete buffers, so all were released above.
- */
- if (!retry)
- return NULL;
-
- /* We're _really_ low on memory. Now we just
- * wait for old buffer heads to become free due to
- * finishing IO. Since this is an async request and
- * the reserve list is empty, we're sure there are
- * async buffer heads in use.
- */
- free_more_memory();
- goto try_again;
+ return NULL;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
@@ -1001,8 +967,6 @@ grow_dev_page(struct block_device *bdev, sector_t block,
gfp_mask |= __GFP_NOFAIL;
page = find_or_create_page(inode->i_mapping, index, gfp_mask);
- if (!page)
- return ret;
BUG_ON(!PageLocked(page));
@@ -1021,9 +985,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
/*
* Allocate some buffers for this page
*/
- bh = alloc_page_buffers(page, size, 0);
- if (!bh)
- goto failed;
+ bh = alloc_page_buffers(page, size, true);
/*
* Link the page to the buffers and initialise them. Take the
@@ -1103,8 +1065,6 @@ __getblk_slow(struct block_device *bdev, sector_t block,
ret = grow_buffers(bdev, block, size, gfp);
if (ret < 0)
return NULL;
- if (ret == 0)
- free_more_memory();
}
}
@@ -1575,7 +1535,7 @@ void create_empty_buffers(struct page *page,
{
struct buffer_head *bh, *head, *tail;
- head = alloc_page_buffers(page, blocksize, 1);
+ head = alloc_page_buffers(page, blocksize, true);
bh = head;
do {
bh->b_state |= b_state;
@@ -2639,7 +2599,7 @@ int nobh_write_begin(struct address_space *mapping,
* Be careful: the buffer linked list is a NULL terminated one, rather
* than the circular one we're used to.
*/
- head = alloc_page_buffers(page, blocksize, 0);
+ head = alloc_page_buffers(page, blocksize, false);
if (!head) {
ret = -ENOMEM;
goto out_release;
@@ -3056,8 +3016,16 @@ void guard_bio_eod(int op, struct bio *bio)
sector_t maxsector;
struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned truncated_bytes;
+ struct hd_struct *part;
+
+ rcu_read_lock();
+ part = __disk_get_part(bio->bi_disk, bio->bi_partno);
+ if (part)
+ maxsector = part_nr_sects_read(part);
+ else
+ maxsector = get_capacity(bio->bi_disk);
+ rcu_read_unlock();
- maxsector = get_capacity(bio->bi_disk);
if (!maxsector)
return;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 98fe1325da9d..3aafb3343a65 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -497,7 +497,7 @@ static struct bio *dio_await_one(struct dio *dio)
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
- !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie))
+ !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 245c430a2e41..08f5debd07d1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -933,33 +933,36 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
#endif /* CONFIG_CGROUP_WRITEBACK */
-void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
- bool range_cyclic, enum wb_reason reason)
+/*
+ * Add in the number of potentially dirty inodes, because each inode
+ * write can dirty pagecache in the underlying blockdev.
+ */
+static unsigned long get_nr_dirty_pages(void)
{
- struct wb_writeback_work *work;
+ return global_node_page_state(NR_FILE_DIRTY) +
+ global_node_page_state(NR_UNSTABLE_NFS) +
+ get_nr_dirty_inodes();
+}
+static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
+{
if (!wb_has_dirty_io(wb))
return;
/*
- * This is WB_SYNC_NONE writeback, so if allocation fails just
- * wakeup the thread for old dirty data writeback
+ * All callers of this function want to start writeback of all
+ * dirty pages. Places like vmscan can call this at a very
+ * high frequency, causing pointless allocations of tons of
+ * work items and keeping the flusher threads busy retrieving
+ * that work. Ensure that we only allow one of them pending and
+ * inflight at the time.
*/
- work = kzalloc(sizeof(*work),
- GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
- if (!work) {
- trace_writeback_nowork(wb);
- wb_wakeup(wb);
+ if (test_bit(WB_start_all, &wb->state) ||
+ test_and_set_bit(WB_start_all, &wb->state))
return;
- }
-
- work->sync_mode = WB_SYNC_NONE;
- work->nr_pages = nr_pages;
- work->range_cyclic = range_cyclic;
- work->reason = reason;
- work->auto_free = 1;
- wb_queue_work(wb, work);
+ wb->start_all_reason = reason;
+ wb_wakeup(wb);
}
/**
@@ -1814,17 +1817,6 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
return work;
}
-/*
- * Add in the number of potentially dirty inodes, because each inode
- * write can dirty pagecache in the underlying blockdev.
- */
-static unsigned long get_nr_dirty_pages(void)
-{
- return global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS) +
- get_nr_dirty_inodes();
-}
-
static long wb_check_background_flush(struct bdi_writeback *wb)
{
if (wb_over_bg_thresh(wb)) {
@@ -1877,6 +1869,30 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
return 0;
}
+static long wb_check_start_all(struct bdi_writeback *wb)
+{
+ long nr_pages;
+
+ if (!test_bit(WB_start_all, &wb->state))
+ return 0;
+
+ nr_pages = get_nr_dirty_pages();
+ if (nr_pages) {
+ struct wb_writeback_work work = {
+ .nr_pages = wb_split_bdi_pages(wb, nr_pages),
+ .sync_mode = WB_SYNC_NONE,
+ .range_cyclic = 1,
+ .reason = wb->start_all_reason,
+ };
+
+ nr_pages = wb_writeback(wb, &work);
+ }
+
+ clear_bit(WB_start_all, &wb->state);
+ return nr_pages;
+}
+
+
/*
* Retrieve work items and do the writeback they describe
*/
@@ -1893,6 +1909,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
}
/*
+ * Check for a flush-everything request
+ */
+ wrote += wb_check_start_all(wb);
+
+ /*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
@@ -1947,10 +1968,33 @@ void wb_workfn(struct work_struct *work)
}
/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world.
+ * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero,
+ * write back the whole world.
*/
-void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
+ enum wb_reason reason)
+{
+ struct bdi_writeback *wb;
+
+ if (!bdi_has_dirty_io(bdi))
+ return;
+
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
+ wb_start_writeback(wb, reason);
+}
+
+void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
+ enum wb_reason reason)
+{
+ rcu_read_lock();
+ __wakeup_flusher_threads_bdi(bdi, reason);
+ rcu_read_unlock();
+}
+
+/*
+ * Wakeup the flusher threads to start writeback of all currently dirty pages
+ */
+void wakeup_flusher_threads(enum wb_reason reason)
{
struct backing_dev_info *bdi;
@@ -1960,20 +2004,9 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
if (blk_needs_flush_plug(current))
blk_schedule_flush_plug(current);
- if (!nr_pages)
- nr_pages = get_nr_dirty_pages();
-
rcu_read_lock();
- list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
- struct bdi_writeback *wb;
-
- if (!bdi_has_dirty_io(bdi))
- continue;
-
- list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
- wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
- false, reason);
- }
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+ __wakeup_flusher_threads_bdi(bdi, reason);
rcu_read_unlock();
}
@@ -2343,37 +2376,19 @@ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
EXPORT_SYMBOL(writeback_inodes_sb);
/**
- * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
+ * try_to_writeback_inodes_sb - try to start writeback if none underway
* @sb: the superblock
- * @nr: the number of pages to write
- * @reason: the reason of writeback
+ * @reason: reason why some writeback work was initiated
*
- * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
- * Returns 1 if writeback was started, 0 if not.
+ * Invoke __writeback_inodes_sb_nr if no writeback is currently underway.
*/
-bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
- enum wb_reason reason)
+void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
if (!down_read_trylock(&sb->s_umount))
- return false;
+ return;
- __writeback_inodes_sb_nr(sb, nr, reason, true);
+ __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true);
up_read(&sb->s_umount);
- return true;
-}
-EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
-
-/**
- * try_to_writeback_inodes_sb - try to start writeback if none underway
- * @sb: the superblock
- * @reason: reason why some writeback work was initiated
- *
- * Implement by try_to_writeback_inodes_sb_nr()
- * Returns 1 if writeback was started, 0 if not.
- */
-bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
-{
- return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);
diff --git a/fs/iomap.c b/fs/iomap.c
index 5011a964a550..b9f74803e56c 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1057,7 +1057,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!dio->submit.last_queue ||
- !blk_mq_poll(dio->submit.last_queue,
+ !blk_poll(dio->submit.last_queue,
dio->submit.cookie))
io_schedule();
}
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cc91856b5e2d..3a2e509c77c5 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1739,7 +1739,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
spin_lock(&mapping->private_lock);
if (unlikely(!page_has_buffers(page))) {
spin_unlock(&mapping->private_lock);
- bh = head = alloc_page_buffers(page, bh_size, 1);
+ bh = head = alloc_page_buffers(page, bh_size, true);
spin_lock(&mapping->private_lock);
if (likely(!page_has_buffers(page))) {
struct buffer_head *tail;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b6f402194f02..ee8392aee9f6 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -507,7 +507,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
if (unlikely(!page_has_buffers(page))) {
struct buffer_head *tail;
- bh = head = alloc_page_buffers(page, blocksize, 1);
+ bh = head = alloc_page_buffers(page, blocksize, true);
do {
set_buffer_uptodate(bh);
tail = bh;
diff --git a/fs/sync.c b/fs/sync.c
index 83ac79a960dd..6e0a2cbaf6de 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -109,7 +109,7 @@ SYSCALL_DEFINE0(sync)
{
int nowait = 0, wait = 1;
- wakeup_flusher_threads(0, WB_REASON_SYNC);
+ wakeup_flusher_threads(WB_REASON_SYNC);
iterate_supers(sync_inodes_one_sb, NULL);
iterate_supers(sync_fs_one_sb, &nowait);
iterate_supers(sync_fs_one_sb, &wait);