summaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2022-01-20 11:00:08 +0000
committerDavid Sterba <dsterba@suse.com>2022-03-14 13:13:47 +0100
commit88d2beec7e53fc500a5ac99beb254e6079d03543 (patch)
tree73b9b27cd25d5e38caac3bfd4075c031ca0a5d62 /fs/btrfs/inode.c
parentd5f5bd546552a94eefd68c42f40f778c40a89d2c (diff)
downloadlwn-88d2beec7e53fc500a5ac99beb254e6079d03543.tar.gz
lwn-88d2beec7e53fc500a5ac99beb254e6079d03543.zip
btrfs: avoid logging all directory changes during renames
When doing a rename of a file, if the file or its old parent directory were logged before, we log the new name of the file and then make sure we log the old parent directory, to ensure that after a log replay the old name of the file is deleted and the new name added. The logging of the old parent directory can take some time, because it will scan all leaves modified in the current transaction, check which directory entries were already logged, copy the ones that were not logged before, etc. In this rename context all we need to do is make sure that the old name of the file is deleted on log replay, so instead of triggering a directory log operation, we can just delete the old directory entry from the log if it's there, or in case it isn't there, just log a range item to signal log replay that the old name must be deleted. So change btrfs_log_new_name() to do that. This scenario is actually not uncommon to trigger, and recently on a 5.15 kernel, an openSUSE Tumbleweed user reported package installations and upgrades, with the zypper tool, were often taking a long time to complete, much more than usual. With strace it could be observed that zypper was spending over 99% of its time on rename operations, and then with further analysis we checked that directory logging was happening too frequently and causing high latencies for the rename operations. Taking into account that installation/upgrade of some of these packages needed about a few thousand file renames, the slowdown was very noticeable for the user. The issue was caused indirectly due to an excessive number of inode evictions on a 5.15 kernel, about 100x more compared to a 5.13, 5.14 or a 5.16-rc8 kernel. After an inode eviction we can't tell for sure, in an efficient way, if an inode was previously logged in the current transaction, so we are pessimistic and assume it was, because in case it was we need to update the logged inode. More details on that in one of the patches in the same series (subject "btrfs: avoid inode logging during rename and link when possible"). Either way, in case the parent directory was logged before, we currently do more work then necessary during a rename, and this change minimizes that amount of work. The following script mimics part of what a package installation/upgrade with zypper does, which is basically renaming a lot of files, in some directory under /usr, to a name with a suffix of "-RPMDELETE": $ cat test.sh #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 NUM_FILES=10000 mkfs.btrfs -f $DEV mount $DEV $MNT mkdir $MNT/testdir for ((i = 1; i <= $NUM_FILES; i++)); do echo -n > $MNT/testdir/file_$i done sync # Do some change to testdir and fsync it. echo -n > $MNT/testdir/file_$((NUM_FILES + 1)) xfs_io -c "fsync" $MNT/testdir echo "Renaming $NUM_FILES files..." start=$(date +%s%N) for ((i = 1; i <= $NUM_FILES; i++)); do mv $MNT/testdir/file_$i $MNT/testdir/file_$i-RPMDELETE done end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "Renames took $dur milliseconds" umount $MNT Testing this change on box using a non-debug kernel (Debian's default kernel config) gave the following results: NUM_FILES=10000, before this patch: 27399 ms NUM_FILES=10000, after this patch: 9093 ms (-66.8%) NUM_FILES=5000, before this patch: 9241 ms NUM_FILES=5000, after this patch: 4642 ms (-49.8%) NUM_FILES=2000, before this patch: 2550 ms NUM_FILES=2000, after this patch: 1788 ms (-29.9%) NUM_FILES=1000, before this patch: 1088 ms NUM_FILES=1000, after this patch: 905 ms (-16.9%) Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1193549 Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c33
1 files changed, 24 insertions, 9 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8a172e76aadb..50656edad939 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,11 @@ struct btrfs_dio_data {
struct extent_changeset *data_reserved;
};
+struct btrfs_rename_ctx {
+ /* Output field. Stores the index number of the old directory entry. */
+ u64 index;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -4062,7 +4067,8 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name, int name_len)
+ const char *name, int name_len,
+ struct btrfs_rename_ctx *rename_ctx)
{
struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4118,6 +4124,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
skip_backref:
+ if (rename_ctx)
+ rename_ctx->index = index;
+
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -4158,7 +4167,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
ret = btrfs_update_inode(trans, inode->root, inode);
@@ -6531,7 +6540,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, old_dentry, NULL, parent);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
fail:
@@ -9024,6 +9033,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
+ struct btrfs_rename_ctx old_rename_ctx;
+ struct btrfs_rename_ctx new_rename_ctx;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
@@ -9164,7 +9175,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &old_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9180,7 +9192,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
- new_dentry->d_name.len);
+ new_dentry->d_name.len,
+ &new_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
@@ -9212,13 +9225,13 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (root_log_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
- new_dentry->d_parent);
+ old_rename_ctx.index, new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
- old_dentry->d_parent);
+ new_rename_ctx.index, old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -9324,6 +9337,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
+ struct btrfs_rename_ctx rename_ctx;
u64 index = 0;
int ret;
int ret2;
@@ -9455,7 +9469,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9499,7 +9514,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (log_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
- new_dentry->d_parent);
+ rename_ctx.index, new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}