summaryrefslogtreecommitdiff
path: root/fs/btrfs/tree-log.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2020-11-25 12:19:26 +0000
committerDavid Sterba <dsterba@suse.com>2020-12-09 19:16:06 +0100
commit47d3db41e190ca4a9c6e4a848052f4c5ca633db1 (patch)
tree7b1feb586daeb908adcdcf5f16708ca5b12e1ac6 /fs/btrfs/tree-log.c
parent4d6221d7d83141d58ece6560e9cfd4cc92eab044 (diff)
downloadlwn-47d3db41e190ca4a9c6e4a848052f4c5ca633db1.tar.gz
lwn-47d3db41e190ca4a9c6e4a848052f4c5ca633db1.zip
btrfs: fix race that makes inode logging fallback to transaction commit
When logging an inode and the previous transaction is still committing, we have a time window where we can end up incorrectly think an inode has its last_unlink_trans field with a value greater than the last transaction committed, which results in the logging to fallback to a full transaction commit, which is usually much more expensive than doing a log commit. The race is described by the following steps: 1) We are at transaction 1000; 2) We modify an inode X (a directory) using transaction 1000 and set its last_unlink_trans field to 1000, because for example we removed one of its subdirectories; 3) We create a new inode Y with a dentry in inode X using transaction 1000, so its generation field is set to 1000; 4) The commit for transaction 1000 is started by task A; 5) The task committing transaction 1000 sets the transaction state to unblocked, writes the dirty extent buffers and the super blocks, then unlocks tree_log_mutex; 6) Some task starts a new transaction with a generation of 1001; 7) We do some modification to inode Y (using transaction 1001); 8) The transaction 1000 commit starts unpinning extents. At this point fs_info->last_trans_committed still has a value of 999; 9) Task B starts an fsync on inode Y, and gets a handle for transaction 1001. When it gets to check_parent_dirs_for_sync() it does the checking of the ancestor dentries because the following check does not evaluate to true: if (S_ISREG(inode->vfs_inode.i_mode) && inode->generation <= last_committed && inode->last_unlink_trans <= last_committed) goto out; The generation value for inode Y is 1000 and last_committed, which has the value read from fs_info->last_trans_committed, has a value of 999, so that check evaluates to false and we proceed to check the ancestor inodes. Once we get to the first ancestor, inode X, we call btrfs_must_commit_transaction() on it, which evaluates to true: static bool btrfs_must_commit_transaction(...) { struct btrfs_fs_info *fs_info = inode->root->fs_info; bool ret = false; mutex_lock(&inode->log_mutex); if (inode->last_unlink_trans > fs_info->last_trans_committed) { /* * Make sure any commits to the log are forced to be full * commits. */ btrfs_set_log_full_commit(trans); ret = true; } (...) because inode's X last_unlink_trans has a value of 1000 and fs_info->last_trans_committed still has a value of 999, it returns true to check_parent_dirs_for_sync(), making it return 1 which is propagated up to btrfs_sync_file(), causing it to fallback to a full transaction commit of transaction 1001. We should have not fallen back to commit transaction 1001, since inode X had last_unlink_trans set to 1000 and the super blocks for transaction 1000 were already written. So while not resulting in a functional problem, it leads to a lot more work and higher latencies for a fsync since committing a transaction is usually more expensive than committing a log (if other filesystem changes happened under that transaction). Similar problem happens when logging directories, for the same reason as btrfs_must_commit_transaction() returns true on an inode with its last_unlink_trans having the generation of the previous transaction and that transaction is still committing, unpinning its freed extents. So fix this by comparing last_unlink_trans with the id of the current transaction instead of fs_info->last_trans_committed. This case is often hit when running dbench for a long enough duration, as it does lots of rename and rmdir operations (both update the field last_unlink_trans of an inode) and fsyncs of files and directories. This patch belongs to a patch set that is comprised of the following patches: btrfs: fix race causing unnecessary inode logging during link and rename btrfs: fix race that results in logging old extents during a fast fsync btrfs: fix race that causes unnecessary logging of ancestor inodes btrfs: fix race that makes inode logging fallback to transaction commit btrfs: fix race leading to unnecessary transaction commit when logging inode btrfs: do not block inode logging for so long during transaction commit Performance results are mentioned in the change log of the last patch. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r--fs/btrfs/tree-log.c20
1 files changed, 8 insertions, 12 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f21bef62e349..312704b20158 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5448,11 +5448,10 @@ out_unlock:
static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
bool ret = false;
mutex_lock(&inode->log_mutex);
- if (inode->last_unlink_trans > fs_info->last_trans_committed) {
+ if (inode->last_unlink_trans >= trans->transid) {
/*
* Make sure any commits to the log are forced to be full
* commits.
@@ -5474,8 +5473,7 @@ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- struct super_block *sb,
- u64 last_committed)
+ struct super_block *sb)
{
int ret = 0;
struct dentry *old_parent = NULL;
@@ -5487,8 +5485,8 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
* and other fun in this file.
*/
if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed)
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid)
goto out;
if (!S_ISDIR(inode->vfs_inode.i_mode)) {
@@ -6023,7 +6021,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct super_block *sb;
int ret = 0;
- u64 last_committed = fs_info->last_trans_committed;
bool log_dentries = false;
sb = inode->vfs_inode.i_sb;
@@ -6048,8 +6045,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
- last_committed);
+ ret = check_parent_dirs_for_sync(trans, inode, parent, sb);
if (ret)
goto end_no_trans;
@@ -6079,8 +6075,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* and other fun in this file.
*/
if (S_ISREG(inode->vfs_inode.i_mode) &&
- inode->generation <= last_committed &&
- inode->last_unlink_trans <= last_committed) {
+ inode->generation < trans->transid &&
+ inode->last_unlink_trans < trans->transid) {
ret = 0;
goto end_trans;
}
@@ -6129,7 +6125,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
* and has a link count of 2.
*/
- if (inode->last_unlink_trans > last_committed) {
+ if (inode->last_unlink_trans >= trans->transid) {
ret = btrfs_log_all_parents(trans, inode, ctx);
if (ret)
goto end_trans;