btrfs: use tagged writepage to mitigate livelock of snapshot

Snapshot is expected to be fast. But if there are writers steadily creating dirty pages in our subvolume, the snapshot may take a very long time to complete. To fix the problem, we use tagged writepage for snapshot flusher as we do in the generic write_cache_pages(), so we can omit pages dirtied after the snapshot command. This does not change the semantics regarding which data get to the snapshot, if there are pages being dirtied during the snapshotting operation. There's a sync called before snapshot is taken in old/new case, any IO in flight just after that may be in the snapshot but this depends on other system effects that might still sync the IO. We do a simple snapshot speed test on a Intel D-1531 box: fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120 --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5; time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio original: 1m58sec patched: 6.54sec This is the best case for this patch since for a sequential write case, we omit nearly all pages dirtied after the snapshot command. For a multi writers, random write test: fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120 --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5; time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio original: 15.83sec patched: 10.35sec The improvement is smaller compared to the sequential write case, since we omit only half of the pages dirtied after snapshot command. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Ethan Lien <ethanlien@synology.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
author: Ethan Lien <ethanlien@synology.com> 2018-11-01 14:49:03 +0800
committer: David Sterba <dsterba@suse.com> 2018-12-17 14:51:33 +0100
commit: 3cd24c698004d2f7668e0eb9fc1f096f533c791b (patch)
tree: bc1ba05b443865b66accf822e6074aa601915031 /fs/btrfs/inode.c
parent: c629732d247e253e811a7ef6667a53349ae5a0ab (diff)
download: lwn-3cd24c698004d2f7668e0eb9fc1f096f533c791b.tar.gz
lwn-3cd24c698004d2f7668e0eb9fc1f096f533c791b.zip
1 files changed, 7 insertions, 4 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d7fa0fba49f8..2585200cb43a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9961,7 +9961,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -9989,6 +9989,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
 		}
 		spin_unlock(&root->delalloc_lock);
 
+		if (snapshot)
+			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+				&binode->runtime_flags);
 		work = btrfs_alloc_delalloc_work(inode);
 		if (!work) {
 			iput(inode);
@@ -10022,7 +10025,7 @@ out:
 	return ret;
 }
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
@@ -10030,7 +10033,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		return -EROFS;
 
-	ret = start_delalloc_inodes(root, -1);
+	ret = start_delalloc_inodes(root, -1, true);
 	if (ret > 0)
 		ret = 0;
 	return ret;
@@ -10059,7 +10062,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
 			       &fs_info->delalloc_roots);
 		spin_unlock(&fs_info->delalloc_root_lock);
 
-		ret = start_delalloc_inodes(root, nr);
+		ret = start_delalloc_inodes(root, nr, false);
 		btrfs_put_fs_root(root);
 		if (ret < 0)
 			goto out;
author	Ethan Lien <ethanlien@synology.com>	2018-11-01 14:49:03 +0800
committer	David Sterba <dsterba@suse.com>	2018-12-17 14:51:33 +0100
commit	3cd24c698004d2f7668e0eb9fc1f096f533c791b (patch)
tree	bc1ba05b443865b66accf822e6074aa601915031 /fs/btrfs/inode.c
parent	c629732d247e253e811a7ef6667a53349ae5a0ab (diff)
download	lwn-3cd24c698004d2f7668e0eb9fc1f096f533c791b.tar.gz lwn-3cd24c698004d2f7668e0eb9fc1f096f533c791b.zip