From cf2834a5ed57562d6a1a8170724704149f0ae0a4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 14 Dec 2019 16:42:52 -0500 Subject: ext4: treat buffers containing write errors as valid in ext4_sb_bread() In commit 7963e5ac9012 ("ext4: treat buffers with write errors as containing valid data") we missed changing ext4_sb_bread() to use ext4_buffer_uptodate(). So fix this oversight. Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2937a8873fe1..c3d66bb7fd96 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -154,7 +154,7 @@ ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) if (bh == NULL) return ERR_PTR(-ENOMEM); - if (buffer_uptodate(bh)) + if (ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); wait_on_buffer(bh); -- cgit v1.2.3 From f629afe3369e9885fd6e9cc7a4f514b6a65cf9e9 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 12 Dec 2019 11:25:55 +0530 Subject: ext4: fix ext4_dax_read/write inode locking sequence for IOCB_NOWAIT Apparently our current rwsem code doesn't like doing the trylock, then lock for real scheme. So change our dax read/write methods to just do the trylock for the RWF_NOWAIT case. This seems to fix AIM7 regression in some scalable filesystems up to ~25% in some cases. 
Claimed in commit 942491c9e6d6 ("xfs: fix AIM7 regression") Reviewed-by: Jan Kara Reviewed-by: Matthew Bobrowski Tested-by: Joseph Qi Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/20191212055557.11151-2-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6a7293a5cda2..977ac58dc718 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -88,9 +88,10 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - if (!inode_trylock_shared(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) return -EAGAIN; + } else { inode_lock_shared(inode); } /* @@ -487,9 +488,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) return -EAGAIN; + } else { inode_lock(inode); } -- cgit v1.2.3 From aa9714d0e39788d0688474c9d5f6a9a36159599f Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 12 Dec 2019 11:25:56 +0530 Subject: ext4: Start with shared i_rwsem in case of DIO instead of exclusive Earlier there was no shared lock in DIO read path. But this patch (16c54688592ce: ext4: Allow parallel DIO reads) simplified some of the locking mechanism while still allowing for parallel DIO reads by adding shared lock in inode DIO read path. But this created a problem with mixed read/write workload. It is due to the fact that in DIO path, we first start with exclusive lock and only when we determine that it is an overwrite IO, we downgrade the lock. This causes the problem, since we still have shared locking in DIO reads. 
So, this patch tries to fix this issue by starting with shared lock and then switching to exclusive lock only when required based on ext4_dio_write_checks(). Other than that, it also simplifies below cases:- 1. Simplified ext4_unaligned_aio API to ext4_unaligned_io. Previous API was abused in the sense that it was not really checking for AIO anywhere, and it also used to check for extending writes. So this API was renamed and simplified to ext4_unaligned_io() which actually only checks if the IO is really unaligned. Now, in case of unaligned direct IO, iomap_dio_rw needs to do zeroing of partial block and that will require serialization against other direct IOs in the same block. So we take an exclusive inode lock for any unaligned DIO. In case of AIO we also need to wait for any outstanding IOs to complete so that conversion from unwritten to written is completed before anyone tries to map the overlapping block. Hence we take exclusive inode lock and also wait for inode_dio_wait() for unaligned DIO case. Please note since we are anyway taking an exclusive lock in unaligned IO, inode_dio_wait() becomes a no-op in case of non-AIO DIO. 2. Added ext4_extending_io(). This checks if the IO is extending the file. 3. Added ext4_dio_write_checks(). In this we start with shared inode lock and only switch to exclusive lock if required. So in most cases with aligned, non-extending, dioread_nolock & overwrites, it tries to write with a shared lock. If not, then we restart the operation in ext4_dio_write_checks(), after acquiring exclusive lock. 
Reviewed-by: Jan Kara Tested-by: Joseph Qi Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/20191212055557.11151-3-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 191 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 142 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 977ac58dc718..1da49dffa3df 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -166,19 +166,25 @@ static int ext4_release_file(struct inode *inode, struct file *filp) * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ -static int -ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +static bool +ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; - int blockmask = sb->s_blocksize - 1; - - if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) - return 0; + unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) - return 1; + return true; - return 0; + return false; +} + +static bool +ext4_extending_io(struct inode *inode, loff_t offset, size_t len) +{ + if (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize) + return true; + return false; } /* Is IO overwriting allocated and initialized blocks? 
*/ @@ -204,7 +210,8 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); } -static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_generic_write_checks(struct kiocb *iocb, + struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; @@ -228,11 +235,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + return iov_iter_count(from); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret, count; + + count = ext4_generic_write_checks(iocb, from); + if (count <= 0) + return count; + ret = file_modified(iocb->ki_filp); if (ret) return ret; - - return iov_iter_count(from); + return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, @@ -364,62 +381,139 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; +/* + * The intention here is to start with shared lock acquired then see if any + * condition requires an exclusive inode lock. If yes, then we restart the + * whole operation by releasing the shared lock and acquiring exclusive lock. + * + * - For unaligned_io we never take shared lock as it may cause data corruption + * when two unaligned IO tries to modify the same block e.g. while zeroing. + * + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * + * - shared locking will only be true mostly with overwrites in dioread_nolock + * mode. Otherwise we will switch to exclusive i_rwsem lock. 
+ */ +static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + bool *ilock_shared, bool *extend) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t offset; + size_t count; + ssize_t ret; + +restart: + ret = ext4_generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = ret; + if (ext4_extending_io(inode, offset, count)) + *extend = true; + /* + * Determine whether the IO operation will overwrite allocated + * and initialized blocks. If so, check to see whether it is + * possible to take the dioread_nolock path. + * + * We need exclusive i_rwsem for changing security info + * in file_modified(). + */ + if (*ilock_shared && (!IS_NOSEC(inode) || *extend || + !ext4_should_dioread_nolock(inode) || + !ext4_overwrite_io(inode, offset, count))) { + inode_unlock_shared(inode); + *ilock_shared = false; + inode_lock(inode); + goto restart; + } + + ret = file_modified(file); + if (ret < 0) + goto out; + + return count; +out: + if (*ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ret; +} + static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; - size_t count; - loff_t offset; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); - bool extend = false, overwrite = false, unaligned_aio = false; + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + bool extend = false, unaligned_io = false; + bool ilock_shared = true; + + /* + * We initially start with shared inode lock unless it is + * unaligned IO which needs exclusive lock anyways. + */ + if (ext4_unaligned_io(inode, from, offset)) { + unaligned_io = true; + ilock_shared = false; + } + /* + * Quick check here without any i_rwsem lock to see if it is extending + * IO. A more reliable check is done in ext4_dio_write_checks() with + * proper locking in place. 
+ */ + if (offset + count > i_size_read(inode)) + ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; + if (ilock_shared) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + if (!inode_trylock(inode)) + return -EAGAIN; + } } else { - inode_lock(inode); + if (ilock_shared) + inode_lock_shared(inode); + else + inode_lock(inode); } + /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_dio_supported(inode)) { - inode_unlock(inode); - /* - * Fallback to buffered I/O if the inode does not support - * direct I/O. - */ + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } - ret = ext4_write_checks(iocb, from); - if (ret <= 0) { - inode_unlock(inode); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + if (ret <= 0) return ret; - } - /* - * Unaligned asynchronous direct I/O must be serialized among each - * other as the zeroing of partial blocks of two competing unaligned - * asynchronous direct I/O writes can result in data corruption. - */ offset = iocb->ki_pos; - count = iov_iter_count(from); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { - unaligned_aio = true; - inode_dio_wait(inode); - } + count = ret; /* - * Determine whether the I/O will overwrite allocated and initialized - * blocks. If so, check to see whether it is possible to take the - * dioread_nolock path. + * Unaligned direct IO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned IOs can result in data + * corruption. + * + * So we make sure we don't allow any unaligned IO in flight. + * For IOs where we need not wait (like unaligned non-AIO DIO), + * below inode_dio_wait() may anyway become a no-op, since we start + * with exclusive lock. 
*/ - if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && - ext4_should_dioread_nolock(inode)) { - overwrite = true; - downgrade_write(&inode->i_rwsem); - } + if (unaligned_io) + inode_dio_wait(inode); - if (offset + count > EXT4_I(inode)->i_disksize) { + if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -432,18 +526,17 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - extend = true; ext4_journal_stop(handle); } ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_aio || extend); + is_sync_kiocb(iocb) || unaligned_io || extend); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: - if (overwrite) + if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); -- cgit v1.2.3 From bc6385dab125d20870f0eb9ca9e589f43abb3f56 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 12 Dec 2019 11:25:57 +0530 Subject: ext4: Move to shared i_rwsem even without dioread_nolock mount opt We were using shared locking only in case of dioread_nolock mount option in case of DIO overwrites. This mount condition is not needed anymore with current code, since:- 1. No race between buffered writes & DIO overwrites. Since buffIO writes takes exclusive lock & DIO overwrites will take shared locking. Also DIO path will make sure to flush and wait for any dirty page cache data. 2. No race between buffered reads & DIO overwrites, since there is no block allocation that is possible with DIO overwrites. So no stale data exposure should happen. Same is the case between DIO reads & DIO overwrites. 3. Also other paths like truncate is protected, since we wait there for any DIO in flight to be over. 
Reviewed-by: Jan Kara Tested-by: Joseph Qi Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/20191212055557.11151-4-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1da49dffa3df..9c2711bce0f9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -392,8 +392,8 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * - * - shared locking will only be true mostly with overwrites in dioread_nolock - * mode. Otherwise we will switch to exclusive i_rwsem lock. + * - shared locking will only be true mostly with overwrites. Otherwise we will + * switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend) @@ -415,14 +415,11 @@ restart: *extend = true; /* * Determine whether the IO operation will overwrite allocated - * and initialized blocks. If so, check to see whether it is - * possible to take the dioread_nolock path. - * + * and initialized blocks. * We need exclusive i_rwsem for changing security info * in file_modified(). */ if (*ilock_shared && (!IS_NOSEC(inode) || *extend || - !ext4_should_dioread_nolock(inode) || !ext4_overwrite_io(inode, offset, count))) { inode_unlock_shared(inode); *ilock_shared = false; -- cgit v1.2.3 From 878520ac45f9f698432d4276db3d9144b83931b6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 19 Nov 2019 21:54:15 -0500 Subject: ext4: save the error code which triggered an ext4_error() in the superblock This allows the cause of an ext4_error() report to be categorized based on whether it was triggered due to an I/O error, or a memory allocation error, or other possible causes. 
Most errors are caused by a detected file system inconsistency, so the default code stored in the superblock will be EXT4_ERR_EFSCORRUPTED. Link: https://lore.kernel.org/r/20191204032335.7683-1-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 1 + fs/ext4/ext4.h | 30 ++++++++++++++++++++++- fs/ext4/ext4_jbd2.c | 3 +++ fs/ext4/extents.c | 1 + fs/ext4/ialloc.c | 2 ++ fs/ext4/inline.c | 2 ++ fs/ext4/inode.c | 8 ++++++- fs/ext4/mballoc.c | 4 ++++ fs/ext4/mmp.c | 6 ++++- fs/ext4/namei.c | 4 ++++ fs/ext4/super.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/xattr.c | 4 +++- 12 files changed, 128 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0b202e00d93f..102c38527a10 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -506,6 +506,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, return -EFSCORRUPTED; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f8578caba40d..b00d07bad45b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1343,7 +1343,8 @@ struct ext4_super_block { __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; - __u8 s_pad[2]; + __u8 s_first_error_errcode; + __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_reserved[95]; /* Padding to the end of the block */ @@ -1574,6 +1575,32 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +/* + * Error number codes for s_{first,last}_error_errno + * + * Linux errno numbers are architecture specific, so we need to translate + * them into something which is architecture 
independent. We don't define + * codes for all errno's; just the ones which are most likely to be the cause + * of an ext4_error() call. + */ +#define EXT4_ERR_UNKNOWN 1 +#define EXT4_ERR_EIO 2 +#define EXT4_ERR_ENOMEM 3 +#define EXT4_ERR_EFSBADCRC 4 +#define EXT4_ERR_EFSCORRUPTED 5 +#define EXT4_ERR_ENOSPC 6 +#define EXT4_ERR_ENOKEY 7 +#define EXT4_ERR_EROFS 8 +#define EXT4_ERR_EFBIG 9 +#define EXT4_ERR_EEXIST 10 +#define EXT4_ERR_ERANGE 11 +#define EXT4_ERR_EOVERFLOW 12 +#define EXT4_ERR_EBUSY 13 +#define EXT4_ERR_ENOTDIR 14 +#define EXT4_ERR_ENOTEMPTY 15 +#define EXT4_ERR_ESHUTDOWN 16 +#define EXT4_ERR_EFAULT 17 + /* * Inode dynamic state flags */ @@ -2688,6 +2715,7 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern void ext4_set_errno(struct super_block *sb, int err); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d3b8cdea5df7..19217a3f1ae4 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -58,6 +58,7 @@ static int ext4_journal_check_start(struct super_block *sb) * take the FS itself readonly cleanly. 
*/ if (journal && is_journal_aborted(journal)) { + ext4_set_errno(sb, -journal->j_errno); ext4_abort(sb, "Detected aborted journal"); return -EROFS; } @@ -249,6 +250,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + ext4_set_errno(inode->i_sb, -err); __ext4_abort(inode->i_sb, where, line, "error %d when attempting revoke", err); } @@ -320,6 +322,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_block = cpu_to_le64(bh->b_blocknr); + ext4_set_errno(inode->i_sb, EIO); ext4_error_inode(inode, where, line, bh->b_blocknr, "IO error syncing itable block"); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0e8708b77da6..ee83fe7c98aa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -492,6 +492,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, return 0; corrupted: + ext4_set_errno(inode->i_sb, -err); ext4_error_inode(inode, function, line, 0, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8ca4a23129aa..0151ba8ea439 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -194,6 +194,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) wait_on_buffer(bh); if (!buffer_uptodate(bh)) { put_bh(bh); + ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); @@ -1223,6 +1224,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); + ext4_set_errno(sb, -err); ext4_error(sb, "couldn't read orphan inode %lu (err %d)", ino, err); return inode; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2fec62d764fa..e61603f47035 100644 --- a/fs/ext4/inline.c +++ 
b/fs/ext4/inline.c @@ -98,6 +98,7 @@ int ext4_get_max_inline_size(struct inode *inode) error = ext4_get_inode_loc(inode, &iloc); if (error) { + ext4_set_errno(inode->i_sb, -error); ext4_error_inode(inode, __func__, __LINE__, 0, "can't get inode location %lu", inode->i_ino); @@ -1761,6 +1762,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) err = ext4_get_inode_loc(dir, &iloc); if (err) { + ext4_set_errno(dir->i_sb, -err); EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", err, dir->i_ino); return true; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 629a25d999f0..23fa585206f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -271,6 +271,7 @@ void ext4_evict_inode(struct inode *inode) if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { + ext4_set_errno(inode->i_sb, -err); ext4_error(inode->i_sb, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); @@ -2478,10 +2479,12 @@ update_disksize: EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); - if (err2) + if (err2) { + ext4_set_errno(inode->i_sb, -err2); ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", inode->i_ino); + } if (!err) err = err2; } @@ -4338,6 +4341,7 @@ make_io: blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, block, "unable to read itable block"); brelse(bh); @@ -4552,6 +4556,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, function, line, 0, "iget: checksum invalid"); ret = -EFSBADCRC; @@ -5090,6 +5095,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_set_errno(inode->i_sb, 
EIO); EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, "IO error syncing inode"); err = -EIO; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a3e2767bdf2f..f64838187559 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3895,6 +3895,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); return 0; @@ -4063,6 +4064,7 @@ repeat: err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; @@ -4071,6 +4073,7 @@ repeat: bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); @@ -4325,6 +4328,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 2305b4374fd3..1c44b1a32001 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -173,8 +173,10 @@ static int kmmpd(void *data) * (s_mmp_update_interval * 60) seconds. */ if (retval) { - if ((failed_writes % 60) == 0) + if ((failed_writes % 60) == 0) { + ext4_set_errno(sb, -retval); ext4_error(sb, "Error writing to MMP block"); + } failed_writes++; } @@ -205,6 +207,7 @@ static int kmmpd(void *data) retval = read_mmp_block(sb, &bh_check, mmp_block); if (retval) { + ext4_set_errno(sb, -retval); ext4_error(sb, "error reading MMP data: %d", retval); goto exit_thread; @@ -218,6 +221,7 @@ static int kmmpd(void *data) "Error while updating MMP info. 
" "The filesystem seems to have been" " multiply mounted."); + ext4_set_errno(sb, EBUSY); ext4_error(sb, "abort"); put_bh(bh_check); retval = -EBUSY; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1cb42d940784..1bb6099397af 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -156,6 +156,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dx_csum_verify(inode, dirent)) set_buffer_verified(bh); else { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, func, line, block, "Directory index failed checksum"); brelse(bh); @@ -166,6 +167,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dirblock_csum_verify(inode, bh)) set_buffer_verified(bh); else { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, func, line, block, "Directory block failed checksum"); brelse(bh); @@ -1527,6 +1529,7 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + ext4_set_errno(sb, EIO); EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); @@ -1537,6 +1540,7 @@ restart: !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { + ext4_set_errno(sb, EFSBADCRC); EXT4_ERROR_INODE(dir, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c3d66bb7fd96..f1a5c14c2a93 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -367,6 +367,8 @@ static void __save_error_info(struct super_block *sb, const char *func, ext4_update_tstamp(es, s_last_error_time); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(line); + if (es->s_last_error_errcode == 0) + es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED; if (!es->s_first_error_time) { es->s_first_error_time = es->s_last_error_time; es->s_first_error_time_hi = es->s_last_error_time_hi; @@ -375,6 +377,7 @@ static void 
__save_error_info(struct super_block *sb, const char *func, es->s_first_error_line = cpu_to_le32(line); es->s_first_error_ino = es->s_last_error_ino; es->s_first_error_block = es->s_last_error_block; + es->s_first_error_errcode = es->s_last_error_errcode; } /* * Start the daily error reporting function if it hasn't been @@ -631,6 +634,66 @@ const char *ext4_decode_error(struct super_block *sb, int errno, return errstr; } +void ext4_set_errno(struct super_block *sb, int err) +{ + if (err < 0) + err = -err; + + switch (err) { + case EIO: + err = EXT4_ERR_EIO; + break; + case ENOMEM: + err = EXT4_ERR_ENOMEM; + break; + case EFSBADCRC: + err = EXT4_ERR_EFSBADCRC; + break; + case EFSCORRUPTED: + err = EXT4_ERR_EFSCORRUPTED; + break; + case ENOSPC: + err = EXT4_ERR_ENOSPC; + break; + case ENOKEY: + err = EXT4_ERR_ENOKEY; + break; + case EROFS: + err = EXT4_ERR_EROFS; + break; + case EFBIG: + err = EXT4_ERR_EFBIG; + break; + case EEXIST: + err = EXT4_ERR_EEXIST; + break; + case ERANGE: + err = EXT4_ERR_ERANGE; + break; + case EOVERFLOW: + err = EXT4_ERR_EOVERFLOW; + break; + case EBUSY: + err = EXT4_ERR_EBUSY; + break; + case ENOTDIR: + err = EXT4_ERR_ENOTDIR; + break; + case ENOTEMPTY: + err = EXT4_ERR_ENOTEMPTY; + break; + case ESHUTDOWN: + err = EXT4_ERR_ESHUTDOWN; + break; + case EFAULT: + err = EXT4_ERR_EFAULT; + break; + default: + err = EXT4_ERR_UNKNOWN; + } + EXT4_SB(sb)->s_es->s_last_error_errcode = err; +} + /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. 
*/ @@ -655,6 +718,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, sb->s_id, function, line, errstr); } + ext4_set_errno(sb, -errno); save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -982,8 +1046,10 @@ static void ext4_put_super(struct super_block *sb) aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if ((err < 0) && !aborted) + if ((err < 0) && !aborted) { + ext4_set_errno(sb, -err); ext4_abort(sb, "Couldn't clean up the journal"); + } } ext4_unregister_sysfs(sb); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8966a5439a22..246fbeeb6366 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2879,9 +2879,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); - if (error == -EIO) + if (error == -EIO) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + } bh = NULL; goto cleanup; } -- cgit v1.2.3 From 46f870d690fecc792a66730dcbbf0aa109f5f9ab Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 21 Nov 2019 13:09:43 -0500 Subject: ext4: simulate various I/O and checksum errors when reading metadata This allows us to test various error handling code paths Link: https://lore.kernel.org/r/20191209012317.59398-1-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 4 +++- fs/ext4/ext4.h | 37 +++++++++++++++++++++++++++++++++++++ fs/ext4/ialloc.c | 4 +++- fs/ext4/inode.c | 6 +++++- fs/ext4/namei.c | 11 ++++++++--- fs/ext4/sysfs.c | 23 +++++++++++++++++++++++ 6 files changed, 79 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 102c38527a10..5f993a411251 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, if (buffer_verified(bh)) 
goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh))) { + desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, @@ -505,6 +506,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read block bitmap - " diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b00d07bad45b..5edc16d36a96 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1557,6 +1557,9 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. */ struct percpu_rw_semaphore s_journal_flag_rwsem; struct dax_device *s_daxdev; +#ifdef CONFIG_EXT4_DEBUG + unsigned long s_simulate_fail; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1575,6 +1578,40 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +/* + * Simulate_fail codes + */ +#define EXT4_SIM_BBITMAP_EIO 1 +#define EXT4_SIM_BBITMAP_CRC 2 +#define EXT4_SIM_IBITMAP_EIO 3 +#define EXT4_SIM_IBITMAP_CRC 4 +#define EXT4_SIM_INODE_EIO 5 +#define EXT4_SIM_INODE_CRC 6 +#define EXT4_SIM_DIRBLOCK_EIO 7 +#define EXT4_SIM_DIRBLOCK_CRC 8 + +static inline bool ext4_simulate_fail(struct super_block *sb, + unsigned long code) +{ +#ifdef CONFIG_EXT4_DEBUG + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (unlikely(sbi->s_simulate_fail == code)) { + sbi->s_simulate_fail = 0; + return true; + } +#endif + return false; +} + +static inline void ext4_simulate_fail_bh(struct super_block *sb, + struct buffer_head *bh, + unsigned long code) +{ + if (!IS_ERR(bh) && ext4_simulate_fail(sb, code)) + clear_buffer_uptodate(bh); +} 
+ /* * Error number codes for s_{first,last}_error_errno * diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0151ba8ea439..c66e8f9451a2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8)) { + EXT4_INODES_PER_GROUP(sb) / 8) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); @@ -192,6 +193,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) get_bh(bh); submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_set_errno(sb, EIO); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 23fa585206f0..c3270aaa2b75 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4243,6 +4243,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; + if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) + goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4341,6 +4343,7 @@ make_io: blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + simulate_eio: ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, block, "unable to read itable block"); @@ -4555,7 +4558,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, sizeof(gen)); } - if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + if (!ext4_inode_csum_verify(inode, raw_inode, ei) || + ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, function, line, 0, "iget: checksum invalid"); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1bb6099397af..d4c0d7a18d64 
100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, struct ext4_dir_entry *dirent; int is_dx_block = 0; - bh = ext4_bread(NULL, inode, block, 0); + if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) + bh = ERR_PTR(-EIO); + else + bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " @@ -153,7 +156,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, * caller is sure it should be an index block. */ if (is_dx_block && type == INDEX) { - if (ext4_dx_csum_verify(inode, dirent)) + if (ext4_dx_csum_verify(inode, dirent) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_set_errno(inode->i_sb, EFSBADCRC); @@ -164,7 +168,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, } } if (!is_dx_block) { - if (ext4_dirblock_csum_verify(inode, bh)) + if (ext4_dirblock_csum_verify(inode, bh) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { ext4_set_errno(inode->i_sb, EFSBADCRC); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index eb1efad0e20a..a990d28d191b 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -29,6 +29,7 @@ typedef enum { attr_last_error_time, attr_feature, attr_pointer_ui, + attr_pointer_ul, attr_pointer_atomic, attr_journal_task, } attr_id_t; @@ -160,6 +161,9 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) +#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) + #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -194,6 +198,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, 
s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +#ifdef CONFIG_EXT4_DEBUG +EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); +#endif EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); @@ -228,6 +235,9 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), ATTR_LIST(journal_task), +#ifdef CONFIG_EXT4_DEBUG + ATTR_LIST(simulate_fail), +#endif NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -318,6 +328,11 @@ static ssize_t ext4_attr_show(struct kobject *kobj, else return snprintf(buf, PAGE_SIZE, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%lu\n", + *((unsigned long *) ptr)); case attr_pointer_atomic: if (!ptr) return 0; @@ -361,6 +376,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj, else *((unsigned int *) ptr) = t; return len; + case attr_pointer_ul: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + *((unsigned long *) ptr) = t; + return len; case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: -- cgit v1.2.3 From 4549b49f82ab40c214778f316b6898aa4132723a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 23 Dec 2019 18:44:49 -0500 Subject: ext4: export information about first/last errors via /sys/fs/ext4/ Make {first,last}_error_{ino,block,line,func,errcode} available via sysfs. Also add a missing newline for {first,last}_error_time. 
Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index a990d28d191b..d218ebdafa4a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -30,6 +30,9 @@ typedef enum { attr_feature, attr_pointer_ui, attr_pointer_ul, + attr_pointer_u64, + attr_pointer_u8, + attr_pointer_string, attr_pointer_atomic, attr_journal_task, } attr_id_t; @@ -47,6 +50,7 @@ struct ext4_attr { struct attribute attr; short attr_id; short attr_ptr; + unsigned short attr_size; union { int offset; void *explicit_ptr; @@ -155,9 +159,29 @@ static struct ext4_attr ext4_attr_##_name = { \ }, \ } +#define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_pointer_string, \ + .attr_size = _size, \ + .attr_ptr = ptr_##_struct##_offset, \ + .u = { \ + .offset = offsetof(struct _struct, _elname),\ + }, \ +} + #define EXT4_RO_ATTR_ES_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) +#define EXT4_RO_ATTR_ES_U8(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_U64(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ + EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) @@ -202,6 +226,16 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); +EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); 
+EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino); +EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino); +EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block); +EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block); +EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line); +EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line); +EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32); +EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); @@ -232,6 +266,16 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), ATTR_LIST(errors_count), + ATTR_LIST(first_error_ino), + ATTR_LIST(last_error_ino), + ATTR_LIST(first_error_block), + ATTR_LIST(last_error_block), + ATTR_LIST(first_error_line), + ATTR_LIST(last_error_line), + ATTR_LIST(first_error_func), + ATTR_LIST(last_error_func), + ATTR_LIST(first_error_errcode), + ATTR_LIST(last_error_errcode), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), ATTR_LIST(journal_task), @@ -290,7 +334,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) { - return snprintf(buf, PAGE_SIZE, "%lld", + return snprintf(buf, PAGE_SIZE, "%lld\n", ((time64_t)hi << 32) + le32_to_cpu(lo)); } @@ -333,6 +377,25 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return 0; return snprintf(buf, PAGE_SIZE, "%lu\n", *((unsigned long *) ptr)); + case attr_pointer_u8: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", + *((unsigned char *) ptr)); + case attr_pointer_u64: + if (!ptr) + return 0; + if (a->attr_ptr == ptr_ext4_super_block_offset) + return snprintf(buf, PAGE_SIZE, "%llu\n", + le64_to_cpup(ptr)); + else + return snprintf(buf, PAGE_SIZE, "%llu\n", + *((unsigned long long *) ptr)); + case 
attr_pointer_string: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size, + (char *) ptr); case attr_pointer_atomic: if (!ptr) return 0; -- cgit v1.2.3 From 8cd115bdda17751ee2adab614a80df72228b3809 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 18 Dec 2019 18:44:33 +0100 Subject: ext4: Optimize ext4 DIO overwrites Currently we start transaction for mapping every extent for writing using direct IO. This is unnecessary when we know we are overwriting already allocated blocks and the overhead of starting a transaction can be significant especially for multithreaded workloads doing small writes. Use iomap operations that avoid starting a transaction for direct IO overwrites. This improves throughput of 4k random writes - fio jobfile: [global] rw=randrw norandommap=1 invalidate=0 bs=4k numjobs=16 time_based=1 ramp_time=30 runtime=120 group_reporting=1 ioengine=psync direct=1 size=16G filename=file1.0.0:file1.0.1:file1.0.2:file1.0.3:file1.0.4:file1.0.5:file1.0.6:file1.0.7:file1.0.8:file1.0.9:file1.0.10:file1.0.11:file1.0.12:file1.0.13:file1.0.14:file1.0.15:file1.0.16:file1.0.17:file1.0.18:file1.0.19:file1.0.20:file1.0.21:file1.0.22:file1.0.23:file1.0.24:file1.0.25:file1.0.26:file1.0.27:file1.0.28:file1.0.29:file1.0.30:file1.0.31 file_service_type=random nrfiles=32 from 3018MB/s to 4059MB/s in my test VM running test against simulated pmem device (note that before iomap conversion, this workload was able to achieve 3708MB/s because old direct IO path avoided transaction start for overwrites as well). For dax, the win is even larger improving throughput from 3042MB/s to 4311MB/s. 
Reported-by: Dan Williams Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20191218174433.19380-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/file.c | 5 ++++- fs/ext4/inode.c | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5edc16d36a96..791e54425b00 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3455,6 +3455,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 9c2711bce0f9..5f225881176b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -447,6 +447,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; bool extend = false, unaligned_io = false; bool ilock_shared = true; @@ -526,7 +527,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_journal_stop(handle); } - ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, + if (ilock_shared) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, is_sync_kiocb(iocb) || unaligned_io || extend); if (extend) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c3270aaa2b75..d035acab5b2a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3451,6 +3451,22 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return 0; } +static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, struct iomap *iomap, + struct iomap *srcmap) +{ + 
int ret; + + /* + * Even for writes we don't need to allocate blocks, so just pretend + * we are reading to save overhead of starting a transaction. + */ + flags &= ~IOMAP_WRITE; + ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); + WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + return ret; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { @@ -3472,6 +3488,11 @@ const struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; +const struct iomap_ops ext4_iomap_overwrite_ops = { + .iomap_begin = ext4_iomap_overwrite_begin, + .iomap_end = ext4_iomap_end, +}; + static bool ext4_iomap_is_delalloc(struct inode *inode, struct ext4_map_blocks *map) { -- cgit v1.2.3 From d4c5e960bf202d99ec9a6922ad387eafb798b848 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 28 Nov 2019 22:26:51 -0500 Subject: ext4: avoid fetching btime in ext4_getattr() unless requested Linus observed that in an allmodconfig build, which does a lot of stat(2) calls, ext4_getattr() accounted for a noticeable (1%) amount of CPU time, due to the cache line for i_extra_isize getting pulled in. Since the normal stat system call doesn't return btime, it's a complete waste. So only calculate btime when it is explicitly requested. [ Fixed to check against request_mask instead of query_flags.
] Link: https://lore.kernel.org/r/CAHk-=wivmk_j6KbTX+Er64mLrG8abXZo0M10PNdAnHc8fWXfsQ@mail.gmail.com Reported-by: Linus Torvalds Reviewed-by: Andreas Dilger Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d035acab5b2a..9100460d92e5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5399,7 +5399,8 @@ int ext4_getattr(const struct path *path, struct kstat *stat, struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { + if ((request_mask & STATX_BTIME) && + EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; -- cgit v1.2.3 From 7063743f6860874fedc943fb168e079abdc62f2c Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Fri, 6 Dec 2019 13:43:17 +0800 Subject: ext4: remove unnecessary assignment in ext4_htree_store_dirent() We have allocated memory using kzalloc() so don't have to set 0 again in last byte. 
Signed-off-by: Chengguang Xu Link: https://lore.kernel.org/r/20191206054317.3107-1-cgxu519@mykernel.net Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9f00fc0bf21d..8964778aabef 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -462,7 +462,6 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); - new_fn->name[ent_name->len] = 0; while (*p) { parent = *p; -- cgit v1.2.3 From 64c314ff822786b4634bb5cfe91ed5a34ba8743e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Dec 2019 13:32:25 -0800 Subject: ext4: remove unnecessary ifdefs in htree_dirblock_to_tree() The ifdefs for CONFIG_FS_ENCRYPTION in htree_dirblock_to_tree() are unnecessary, as the called functions are already stubbed out when !CONFIG_FS_ENCRYPTION. Remove them. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191209213225.18477-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/namei.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d4c0d7a18d64..129d2ebae00d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1009,7 +1009,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); -#ifdef CONFIG_FS_ENCRYPTION /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_get_encryption_info(dir); @@ -1024,7 +1023,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, return err; } } -#endif + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, @@ -1072,9 +1071,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, } 
errout: brelse(bh); -#ifdef CONFIG_FS_ENCRYPTION fscrypt_fname_free_buffer(&fname_crypto_str); -#endif return count; } -- cgit v1.2.3 From 46797ad75af747652545f69240fdc78743446ac3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Dec 2019 15:36:02 -0800 Subject: ext4: uninline ext4_inode_journal_mode() Determining an inode's journaling mode has gotten more complicated over time. Move ext4_inode_journal_mode() from an inline function into ext4_jbd2.c to reduce the compiled code size. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191209233602.117778-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4_jbd2.c | 22 ++++++++++++++++++++++ fs/ext4/ext4_jbd2.h | 22 +--------------------- 2 files changed, 23 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 19217a3f1ae4..1f53d64e42a5 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -7,6 +7,28 @@ #include +int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + BUG(); +} + /* Just increment the non-pointer handle value */ 
static handle_t *ext4_get_nojournal(void) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a6b9b66dbfad..7ea4f6fa173b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -463,27 +463,7 @@ int ext4_force_commit(struct super_block *sb); #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ -static inline int ext4_inode_journal_mode(struct inode *inode) -{ - if (EXT4_JOURNAL(inode) == NULL) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - /* We do not support data journalling with delayed allocation */ - if (!S_ISREG(inode->i_mode) || - ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { - /* We do not support data journalling for encrypted data */ - if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - } - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - BUG(); -} +int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { -- cgit v1.2.3 From 4756ee183f25b1fa2a7306a439da3bcd687244e0 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Wed, 25 Dec 2019 10:45:59 +0800 Subject: ext4: use true,false for bool variable Fixes coccicheck warning: fs/ext4/extents.c:5271:6-12: WARNING: Assignment of 0/1 to bool variable fs/ext4/extents.c:5287:4-10: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: zhengbin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/1577241959-138695-1-git-send-email-zhengbin13@huawei.com 
Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ee83fe7c98aa..a7f90470b1dc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5269,7 +5269,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; - bool update = 0; + bool update = false; depth = path->p_depth; while (depth >= 0) { @@ -5285,7 +5285,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, goto out; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) - update = 1; + update = true; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { -- cgit v1.2.3 From 284b3f6edbbb020ce479b8df1d2ac745b100cf53 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 26 Dec 2019 09:39:20 -0600 Subject: ext4: remove unnecessary selections from EXT3_FS Since EXT3_FS already selects EXT4_FS, there's no reason for it to redundantly select all the selections of EXT4_FS -- notwithstanding the comments that claim otherwise. Remove these redundant selections to avoid confusion. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191226153920.4466-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/Kconfig | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index ef42ab040905..5841fd8aa706 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -4,12 +4,7 @@ # kernels after the removal of ext3 driver. config EXT3_FS tristate "The Extended 3 (ext3) filesystem" - # These must match EXT4_FS selects... select EXT4_FS - select JBD2 - select CRC16 - select CRYPTO - select CRYPTO_CRC32C help This config option is here only for backward compatibility. ext3 filesystem is now handled by the ext4 driver. 
@@ -33,7 +28,6 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" - # Please update EXT3_FS selects when changing these select JBD2 select CRC16 select CRYPTO -- cgit v1.2.3 From 834f1565fa3f9c8f78adbfcaa80ae510fe4971c3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 26 Dec 2019 09:41:05 -0600 Subject: ext4: handle decryption error in __ext4_block_zero_page_range() fscrypt_decrypt_pagecache_blocks() can fail, because it uses skcipher_request_alloc(), which uses kmalloc(), which can fail; and also because it calls crypto_skcipher_decrypt(), which can fail depending on the driver that actually implements the crypto. Therefore it's not appropriate to WARN on decryption error in __ext4_block_zero_page_range(). Remove the WARN and just handle the error instead. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191226154105.4704-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9100460d92e5..d3e1539c680b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3725,8 +3725,12 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks( - page, blocksize, bh_offset(bh))); + err = fscrypt_decrypt_pagecache_blocks(page, blocksize, + bh_offset(bh)); + if (err) { + clear_buffer_uptodate(bh); + goto unlock; + } } } if (ext4_should_journal_data(inode)) { -- cgit v1.2.3 From 457b1e353c739af39159269723949f315320446c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 26 Dec 2019 09:42:16 -0600 Subject: ext4: allow ZERO_RANGE on encrypted files When ext4 encryption support was first added, ZERO_RANGE was disallowed, supposedly because test failures (e.g. 
ext4/001) were seen when enabling it, and at the time there wasn't enough time/interest to debug it. However, there's actually no reason why ZERO_RANGE can't work on encrypted files. And in fact it *does* work now. Whole blocks in the zeroed range are converted to unwritten extents, as usual; encryption makes no difference for that part. Partial blocks are zeroed in the pagecache and then ->writepages() encrypts those blocks as usual. ext4_block_zero_page_range() handles reading and decrypting the block if needed before actually doing the pagecache write. Also, f2fs has always supported ZERO_RANGE on encrypted files. As far as I can tell, the reason that ext4/001 was failing in v4.1 was actually because of one of the bugs fixed by commit 36086d43f657 ("ext4 crypto: fix bugs in ext4_encrypted_zeroout()"). The bug made ext4_encrypted_zeroout() always return a positive value, which caused unwritten extents in encrypted files to sometimes not be marked as initialized after being written to. This bug was not actually in ZERO_RANGE; it just happened to trigger during the extents manipulation done in ext4/001 (and probably other tests too). So, let's enable ZERO_RANGE on encrypted files on ext4. Tested with: gce-xfstests -c ext4/encrypt -g auto gce-xfstests -c ext4/encrypt_1k -g auto Got the same set of test failures both with and without this patch. But with this patch 6 fewer tests are skipped: ext4/001, generic/008, generic/009, generic/033, generic/096, and generic/511.
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191226154216.4808-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- Documentation/filesystems/fscrypt.rst | 6 +++--- fs/ext4/extents.c | 7 +------ 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 68c2bc8275cf..07f1f15276bf 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -975,9 +975,9 @@ astute users may notice some differences in behavior: - Direct I/O is not supported on encrypted files. Attempts to use direct I/O on such files will fall back to buffered I/O. -- The fallocate operations FALLOC_FL_COLLAPSE_RANGE, - FALLOC_FL_INSERT_RANGE, and FALLOC_FL_ZERO_RANGE are not supported - on encrypted files and will fail with EOPNOTSUPP. +- The fallocate operations FALLOC_FL_COLLAPSE_RANGE and + FALLOC_FL_INSERT_RANGE are not supported on encrypted files and will + fail with EOPNOTSUPP. - Online defragmentation of encrypted files is not supported. The EXT4_IOC_MOVE_EXT and F2FS_IOC_MOVE_RANGE ioctls will fail with diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a7f90470b1dc..4ba8215fa288 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4891,14 +4891,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). - * - * XXX It's not clear why zero range isn't working, but we'll - * leave it disabled for encrypted inodes for now. This is a - * bug we should fix.... 
*/ if (IS_ENCRYPTED(inode) && - (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | - FALLOC_FL_ZERO_RANGE))) + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ -- cgit v1.2.3 From 33b4cc2501d323feef3cc3ec9a084d80bef5b5e8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 26 Dec 2019 10:10:22 -0600 Subject: ext4: only use fscrypt_zeroout_range() on regular files fscrypt_zeroout_range() is only for encrypted regular files, not for encrypted directories or symlinks. Fortunately, currently it seems it's never called on non-regular files. But to be safe ext4 should explicitly check S_ISREG() before calling it. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191226161022.53490-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d3e1539c680b..73482fc86561 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -403,7 +403,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; - if (IS_ENCRYPTED(inode)) + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); -- cgit v1.2.3 From d85926474ffd5395dc39dbbedb969c31c4059a61 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 26 Dec 2019 10:11:14 -0600 Subject: ext4: re-enable extent zeroout optimization on encrypted files For encrypted files, commit 36086d43f657 ("ext4 crypto: fix bugs in ext4_encrypted_zeroout()") disabled the optimization where when a write occurs to the middle of an unwritten extent, the head and/or tail of the extent (when they aren't too large) are zeroed out, turned into an initialized extent, and merged with the part being written to. This optimization helps prevent fragmentation of the extent tree. 
However, disabling this optimization also made fscrypt_zeroout_range() nearly impossible to test, as now it's only reachable via the very rare case in ext4_split_extent_at() where allocating a new extent tree block fails due to ENOSPC. 'gce-xfstests -c ext4/encrypt -g auto' doesn't even hit this at all. It's preferable to avoid really rare cases that are hard to test. That commit also cited data corruption in xfstest generic/127 as a reason to disable the extent zeroout optimization, but that's no longer reproducible. It also cited fscrypt_zeroout_range() having poor performance, but I've written a patch to fix that. Therefore, re-enable the extent zeroout optimization on encrypted files. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191226161114.53606-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4ba8215fa288..ed55ca77e684 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3719,9 +3719,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); - if (IS_ENCRYPTED(inode)) - max_zeroout = 0; - /* * five cases: * 1. split the extent into three extents. -- cgit v1.2.3 From 8f27fd0ab569f5d09410010b15a3f45399e89fcf Mon Sep 17 00:00:00 2001 From: Naoto Kobayashi Date: Fri, 27 Dec 2019 17:05:21 +0900 Subject: ext4: Delete ext4_kvzalloc() Since we're not using ext4_kvzalloc(), delete this function.
Signed-off-by: Naoto Kobayashi Link: https://lore.kernel.org/r/20191227080523.31808-2-naoto.kobayashi4c@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/super.c | 10 ---------- 2 files changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 791e54425b00..95002a2b09fe 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2744,7 +2744,6 @@ extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); -extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f1a5c14c2a93..b8301b096d29 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -214,16 +214,6 @@ void *ext4_kvmalloc(size_t size, gfp_t flags) return ret; } -void *ext4_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { -- cgit v1.2.3 From 547c556f4db7c09447ecf5f833ab6aaae0c5ab58 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:11:49 -0600 Subject: ext4: fix deadlock allocating crypto bounce page from mempool ext4_writepages() on an encrypted file has to encrypt the data, but it can't modify the pagecache pages in-place, so it encrypts the data into bounce pages and writes those instead. All bounce pages are allocated from a mempool using GFP_NOFS. This is not correct use of a mempool, and it can deadlock. 
This is because GFP_NOFS includes __GFP_DIRECT_RECLAIM, which enables the "never fail" mode for mempool_alloc() where a failed allocation will fall back to waiting for one of the preallocated elements in the pool. But since this mode is used for all a bio's pages and not just the first, it can deadlock waiting for pages already in the bio to be freed. This deadlock can be reproduced by patching mempool_alloc() to pretend that pool->alloc() always fails (so that it always falls back to the preallocations), and then creating an encrypted file of size > 128 KiB. Fix it by only using GFP_NOFS for the first page in the bio. For subsequent pages just use GFP_NOWAIT, and if any of those fail, just submit the bio and start a new one. This will need to be fixed in f2fs too, but that's less straightforward. Fixes: c9af28fdd449 ("ext4 crypto: don't let data integrity writebacks fail with ENOMEM") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231181149.47619-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 24aeedb8fc75..68b39e75446a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -512,17 +512,26 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + /* + * Since bounce page allocation uses a mempool, we can only use + * a waiting mask (i.e. request guaranteed allocation) on the + * first page of the bio. Otherwise it can deadlock. 
+ */ + if (io->io_bio) + gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); - if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { - if (io->io_bio) { + if (ret == -ENOMEM && + (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { + gfp_flags = GFP_NOFS; + if (io->io_bio) ext4_io_submit(io); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } - gfp_flags |= __GFP_NOFAIL; + else + gfp_flags |= __GFP_NOFAIL; + congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry_encrypt; } -- cgit v1.2.3 From 68e45330e341dad2d3a0a3f8ef2ec46a2a0a3bbc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:12:22 -0600 Subject: ext4: fix deadlock allocating bio_post_read_ctx from mempool Without any form of coordination, any case where multiple allocations from the same mempool are needed at a time to make forward progress can deadlock under memory pressure. This is the case for struct bio_post_read_ctx, as one can be allocated to decrypt a Merkle tree page during fsverity_verify_bio(), which itself is running from a post-read callback for a data bio which has its own struct bio_post_read_ctx. Fix this by freeing the first bio_post_read_ctx before calling fsverity_verify_bio(). This works because verity (if enabled) is always the last post-read step. This deadlock can be reproduced by trying to read from an encrypted verity file after reducing NUM_PREALLOC_POST_READ_CTXS to 1 and patching mempool_alloc() to pretend that pool->alloc() always fails. Note that since NUM_PREALLOC_POST_READ_CTXS is actually 128, to actually hit this bug in practice would require reading from lots of encrypted verity files at the same time. But it's theoretically possible, as N available objects isn't enough to guarantee forward progress when > N/2 threads each need 2 objects at a time. 
Fixes: 22cfe4b48ccb ("ext4: add fs-verity read support") Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231181222.47684-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/readpage.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index fef7755300c3..410c904cf59b 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -57,6 +57,7 @@ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, + STEP_MAX, }; struct bio_post_read_ctx { @@ -106,10 +107,22 @@ static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - fsverity_verify_bio(ctx->bio); + /* + * fsverity_verify_bio() may call readpages() again, and although verity + * will be disabled for that, decryption may still be needed, causing + * another bio_post_read_ctx to be allocated. So to guarantee that + * mempool_alloc() never deadlocks we must free the current ctx first. + * This is safe because verity is the last post-read step. + */ + BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; - bio_post_read_processing(ctx); + fsverity_verify_bio(bio); + + __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) -- cgit v1.2.3 From fd5fe2535642982af695f2f77bb7dba5d4aa6aa5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:12:56 -0600 Subject: ext4: remove unneeded check for error allocating bio_post_read_ctx Since allocating an object from a mempool never fails when __GFP_DIRECT_RECLAIM (which is included in GFP_NOFS) is set, the check for failure to allocate a bio_post_read_ctx is unnecessary. Remove it. Also remove the redundant assignment to ->bi_private. 
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231181256.47770-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/readpage.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 410c904cf59b..c1769afbf799 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -189,12 +189,11 @@ static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } -static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, - struct bio *bio, - pgoff_t first_idx) +static void ext4_set_bio_post_read_ctx(struct bio *bio, + const struct inode *inode, + pgoff_t first_idx) { unsigned int post_read_steps = 0; - struct bio_post_read_ctx *ctx = NULL; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) post_read_steps |= 1 << STEP_DECRYPT; @@ -203,14 +202,14 @@ static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { - ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); + /* Due to the mempool, this never fails. */ + struct bio_post_read_ctx *ctx = + mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } - return ctx; } static inline loff_t ext4_readpage_limit(struct inode *inode) @@ -371,24 +370,16 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - struct bio_post_read_ctx *ctx; - /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). 
*/ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - ctx = get_bio_post_read_ctx(inode, bio, page->index); - if (IS_ERR(ctx)) { - bio_put(bio); - bio = NULL; - goto set_error_page; - } + ext4_set_bio_post_read_ctx(bio, inode, page->index); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; - bio->bi_private = ctx; bio_set_op_attrs(bio, REQ_OP_READ, is_readahead ? REQ_RAHEAD : 0); } -- cgit v1.2.3 From dd6683e6efad2339894575da46bc519646e37c24 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:04:37 -0600 Subject: ext4: remove ext4_{ind,ext}_calc_metadata_amount() Remove the ext4_ind_calc_metadata_amount() and ext4_ext_calc_metadata_amount() functions, which have been unused since commit 71d4f7d03214 ("ext4: remove metadata reservation checks"). Also remove the i_da_metadata_calc_last_lblock and i_da_metadata_calc_len fields from struct ext4_inode_info, as these were only used by these removed functions. 
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231180444.46586-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 5 ----- fs/ext4/extents.c | 47 ----------------------------------------------- fs/ext4/indirect.c | 26 -------------------------- fs/ext4/super.c | 2 -- 4 files changed, 80 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 95002a2b09fe..17db2e002822 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1052,8 +1052,6 @@ struct ext4_inode_info { /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; - ext4_lblk_t i_da_metadata_calc_last_lblock; - int i_da_metadata_calc_len; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; @@ -2692,7 +2690,6 @@ extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, @@ -3335,8 +3332,6 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ed55ca77e684..e005d9821c0e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -309,53 +309,6 @@ 
ext4_force_split_extent_at(handle_t *handle, struct inode *inode, (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); } -/* - * Calculate the number of metadata blocks needed - * to allocate @blocks - * Worse case is one block per extent - */ -int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - int idxs; - - idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) - / sizeof(struct ext4_extent_idx)); - - /* - * If the new delayed allocation block is contiguous with the - * previous da block, it can share index blocks with the - * previous block, so we only need to allocate a new index - * block every idxs leaf blocks. At ldxs**2 blocks, we need - * an additional index block, and at ldxs**3 blocks, yet - * another index blocks. - */ - if (ei->i_da_metadata_calc_len && - ei->i_da_metadata_calc_last_lblock+1 == lblock) { - int num = 0; - - if ((ei->i_da_metadata_calc_len % idxs) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { - num++; - ei->i_da_metadata_calc_len = 0; - } else - ei->i_da_metadata_calc_len++; - ei->i_da_metadata_calc_last_lblock++; - return num; - } - - /* - * In the worst case we need a new set of index blocks at - * every level of the inode's extent tree. 
- */ - ei->i_da_metadata_calc_len = 1; - ei->i_da_metadata_calc_last_lblock = lblock; - return ext_depth(inode) + 1; -} - static int ext4_ext_max_entries(struct inode *inode, int depth) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3a4ab70fe9e0..569fc68e8975 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -659,32 +659,6 @@ out: return err; } -/* - * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - /* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b8301b096d29..84a86d9b790f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1141,8 +1141,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_da_metadata_calc_len = 0; - ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA -- cgit v1.2.3 From 9b02e4987ab092caed8f3bccdacaef9acf9015dd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:04:38 -0600 Subject: ext4: clean up len and offset checks in ext4_fallocate() - Fix some comments. 
- Consistently access i_size directly rather than using i_size_read(), since in all relevant cases we're under inode_lock(). - Simplify the alignment checks by using the IS_ALIGNED() macro. - In ext4_insert_range(), do the check against s_maxbytes in a way that is safe against signed overflow. (This doesn't currently matter for ext4 due to ext4's limited max file size, but this is something other filesystems have gotten wrong. We might as well do it safely.) Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231180444.46586-3-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara --- fs/ext4/extents.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e005d9821c0e..9ff19cfc4ecb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4716,7 +4716,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > i_size_read(inode) || + (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); @@ -4800,7 +4800,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Mark that we allocate beyond EOF so the subsequent truncate * can proceed even if the new size is the same as i_size. 
*/ - if ((offset + len) > i_size_read(inode)) + if (offset + len > inode->i_size) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); @@ -4887,7 +4887,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > i_size_read(inode) || + (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); @@ -5435,9 +5435,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs block size aligned offsets. */ - if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || - len & (EXT4_CLUSTER_SIZE(sb) - 1)) + /* Collapse range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; if (!S_ISREG(inode->i_mode)) @@ -5460,7 +5459,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= i_size_read(inode)) { + if (offset + len >= inode->i_size) { ret = -EINVAL; goto out_mutex; } @@ -5538,7 +5537,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_stop; } - new_size = i_size_read(inode) - len; + new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; @@ -5585,9 +5584,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs block size aligned offsets. */ - if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || - len & (EXT4_CLUSTER_SIZE(sb) - 1)) + /* Insert range works only on fs cluster size aligned regions. 
*/ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; if (!S_ISREG(inode->i_mode)) @@ -5612,14 +5610,14 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - /* Check for wrap through zero */ - if (inode->i_size + len > inode->i_sb->s_maxbytes) { + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) { ret = -EFBIG; goto out_mutex; } - /* Offset should be less than i_size */ - if (offset >= i_size_read(inode)) { + /* Offset must be less than i_size */ + if (offset >= inode->i_size) { ret = -EINVAL; goto out_mutex; } -- cgit v1.2.3 From a1180994f52c0867c134e411a6a532ffa166ceac Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:04:39 -0600 Subject: ext4: remove redundant S_ISREG() checks from ext4_fallocate() ext4_fallocate() is only used in the file_operations for regular files. Also, the VFS only allows fallocate() on regular files and block devices, but block devices always use blkdev_fallocate(). For both of these reasons, S_ISREG() is always true in ext4_fallocate(). Therefore the S_ISREG() checks in ext4_zero_range(), ext4_collapse_range(), ext4_insert_range(), and ext4_punch_hole() are redundant. Remove them. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231180444.46586-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara --- fs/ext4/extents.c | 9 --------- fs/ext4/inode.c | 3 --- 2 files changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9ff19cfc4ecb..94a25ca5f08f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4674,9 +4674,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - /* Call ext4_force_commit to flush all data in case of data=journal. 
*/ if (ext4_should_journal_data(inode)) { ret = ext4_force_commit(inode->i_sb); @@ -5439,9 +5436,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); @@ -5588,9 +5582,6 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_insert_range(inode, offset, len); offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 73482fc86561..3d0efb452eb8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3940,9 +3940,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) unsigned int credits; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length, 0); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -- cgit v1.2.3 From 43f816772ff3b4adc5cef24c78916bfbc8fee57f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:04:40 -0600 Subject: ext4: make some functions static in extents.c Make the following functions static since they're only used in extents.c: __ext4_ext_dirty() ext4_can_extents_be_merged() ext4_collapse_range() ext4_insert_range() Also remove the prototype for ext4_ext_writepage_trans_blocks(), as this function is not defined anywhere. 
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231180444.46586-5-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 6 ------ fs/ext4/ext4_extents.h | 5 ----- fs/ext4/extents.c | 22 +++++++++++++++------- 3 files changed, 15 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17db2e002822..5e621b0da4da 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3315,7 +3315,6 @@ struct ext4_extent; #define EXT_MAX_BLOCKS 0xffffffff extern int ext4_ext_tree_init(handle_t *handle, struct inode *); -extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3335,9 +3334,6 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode, extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path **, struct ext4_extent *, int); @@ -3353,8 +3349,6 @@ extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); -extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); -extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 98bd0e9ee7df..1c216fcc202a 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -267,10 +267,5 @@ static inline void 
ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, - struct inode *inode, struct ext4_ext_path *path); - #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 94a25ca5f08f..bae409aad53e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -161,8 +161,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, - struct inode *inode, struct ext4_ext_path *path) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; @@ -179,6 +180,9 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, return err; } +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) + static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) @@ -1696,9 +1700,9 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, return err; } -int -ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, - struct ext4_extent *ex2) +static int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; @@ -4657,6 +4661,10 @@ retry: return ret > 0 ? 
ret2 : ret; } +static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); + +static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); + static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { @@ -5415,7 +5423,7 @@ out: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ -int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; @@ -5559,7 +5567,7 @@ out_mutex: * by len bytes. * Returns 0 on success, error otherwise. */ -int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; handle_t *handle; -- cgit v1.2.3 From adde81cfd5a8d4966f5805e030ab3d0a7825b118 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:04:41 -0600 Subject: ext4: fix documentation for ext4_ext_try_to_merge() Don't mention the nonexistent return value, and mention both types of merges that are attempted. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231180444.46586-6-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara --- fs/ext4/extents.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bae409aad53e..6e4d8a5cb7fb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1828,13 +1828,14 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, } /* - * This function tries to merge the @ex extent to neighbours in the tree. - * return 1 if merge left else 0. + * This function tries to merge the @ex extent to neighbours in the tree, then + * tries to collapse the extent tree into the inode. 
*/ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex) { + struct ext4_extent *ex) +{ struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; -- cgit v1.2.3 From 61a6cb49da8147bc5377b60dfe540f9ef320aeb8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:04:42 -0600 Subject: ext4: remove obsolete comment from ext4_can_extents_be_merged() Support for unwritten extents was added to ext4 a long time ago, so remove a misleading comment that says they're a future feature. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231180444.46586-7-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara --- fs/ext4/extents.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6e4d8a5cb7fb..6a8faa86d45f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1716,11 +1716,6 @@ static int ext4_can_extents_be_merged(struct inode *inode, le32_to_cpu(ex2->ee_block)) return 0; - /* - * To allow future support for preallocated extents to be added - * as an RO_COMPAT feature, refuse to merge to extents if - * this can result in the top bit of ee_len being set. - */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; -- cgit v1.2.3 From 6e89bbb79bc78f31e86a4de1ca3a36474bc3fb62 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:04:43 -0600 Subject: ext4: fix some nonstandard indentation in extents.c Clean up some code that was using 2-character indents. 
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231180444.46586-8-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara --- fs/ext4/extents.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6a8faa86d45f..6f13071d76d5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -607,8 +607,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) ext_debug("path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { - ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), - ext4_idx_pblock(path->p_idx)); + ext_debug(" %d->%llu", + le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), @@ -735,8 +736,8 @@ ext4_ext_binsearch_idx(struct inode *inode, chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { - if (k != 0 && - le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + if (k != 0 && le32_to_cpu(ix->ei_block) <= + le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); @@ -1590,17 +1591,16 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) return EXT_MAX_BLOCKS; while (depth >= 0) { + struct ext4_ext_path *p = &path[depth]; + if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext && - path[depth].p_ext != - EXT_LAST_EXTENT(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_ext[1].ee_block); + if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) + return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ - if (path[depth].p_idx != - EXT_LAST_INDEX(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_idx[1].ei_block); + if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) + return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } -- cgit v1.2.3 
From de7454854d6e8e3c3585c6df4b0b99017820aa0f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 31 Dec 2019 12:04:44 -0600 Subject: ext4: add missing braces in ext4_ext_drop_refs() For clarity, add braces to the loop in ext4_ext_drop_refs(). Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20191231180444.46586-9-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Reviewed-by: Ritesh Harjani Reviewed-by: Jan Kara --- fs/ext4/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6f13071d76d5..393533ff0527 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -690,11 +690,12 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path) if (!path) return; depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) + for (i = 0; i <= depth; i++, path++) { if (path->p_bh) { brelse(path->p_bh); path->p_bh = NULL; } + } } /* -- cgit v1.2.3 From e128d516d81289db7dd977b7706c96501f48c011 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Wed, 1 Jan 2020 15:21:37 +0530 Subject: ext4: remove unused macro MPAGE_DA_EXTENT_TAIL Remove the macro MPAGE_DA_EXTENT_TAIL, which has been unused since commit 4e7ea81d ("ext4: restructure writeback path"). Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200101095137.25656-1-riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3d0efb452eb8..3313168b680f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -48,8 +48,6 @@ #include -#define MPAGE_DA_EXTENT_TAIL 0x01 - static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { -- cgit v1.2.3 From a54d8d34d2354f3a2a9dda00d9dd6666a50c486b Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 27 Dec 2019 14:46:39 +0100 Subject: ext4: Add EXT4_IOC_FSGETXATTR/EXT4_IOC_FSSETXATTR to
compat_ioctl These are backed by 'struct fsxattr' which has the same size on all architectures. Signed-off-by: Martijn Coenen Link: https://lore.kernel.org/r/20191227134639.35869-1-maco@android.com Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e8870fff8224..a0ec750018dd 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1377,6 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: + case EXT4_IOC_FSGETXATTR: + case EXT4_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; -- cgit v1.2.3 From 71b565ceff377a52e7d58cd871745cd339447323 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 16 Jan 2020 10:08:16 -0500 Subject: ext4: drop ext4_kvmalloc() As Jan pointed out[1], as of commit 81378da64de ("jbd2: mark the transaction context with the scope GFP_NOFS context") we use memalloc_nofs_{save,restore}() while a jbd2 handle is active. So ext4_kvmalloc(), which existed so that we could allocate using GFP_NOFS, is no longer necessary.
[1] https://lore.kernel.org/r/20200109100007.GC27035@quack2.suse.cz Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20200116155031.266620-1-tytso@mit.edu Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 1 - fs/ext4/resize.c | 10 ++++------ fs/ext4/super.c | 10 ---------- fs/ext4/xattr.c | 2 +- 4 files changed, 5 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5e621b0da4da..9a2ee2428ecc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2740,7 +2740,6 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); -extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a8c0f2b5b6e1..86a2500ed292 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -824,9 +824,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto errout; - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", @@ -900,9 +899,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb, gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { brelse(gdb_bh); err = -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 84a86d9b790f..ecf36a23e0c4 100644 --- 
a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -204,16 +204,6 @@ void ext4_superblock_csum_set(struct super_block *sb) es->s_checksum = ext4_superblock_csum(sb, es); } -void *ext4_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 246fbeeb6366..8cac7d95c3ad 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1456,7 +1456,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, if (!ce) return NULL; - ea_data = ext4_kvmalloc(value_len, GFP_NOFS); + ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; -- cgit v1.2.3 From a09decff5c32060639a685581c380f51b14e1fc2 Mon Sep 17 00:00:00 2001 From: Kai Li Date: Sat, 11 Jan 2020 10:25:42 +0800 Subject: jbd2: clear JBD2_ABORT flag before journal_reset to update log tail info when load journal If the journal is dirty when the filesystem is mounted, jbd2 will replay the journal but the journal superblock will not be updated by journal_reset() because JBD2_ABORT flag is still set (it was set in journal_init_common()). This is problematic because when a new transaction is then committed, it will be recorded in block 1 (journal->j_tail was set to 1 in journal_reset()). If unclean shutdown happens again before the journal superblock is updated, the new recorded transaction will not be replayed during the next mount (because of stale sb->s_start and sb->s_sequence values) which can lead to filesystem corruption. 
Fixes: 85e0c4e89c1b ("jbd2: if the journal is aborted then don't allow update of the log tail") Signed-off-by: Kai Li Link: https://lore.kernel.org/r/20200111022542.5008-1-li.kai4@h3c.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5e408ee24a1a..069b22eba795 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1710,6 +1710,11 @@ int jbd2_journal_load(journal_t *journal) journal->j_devname); return -EFSCORRUPTED; } + /* + * clear JBD2_ABORT flag initialized in journal_init_common + * here to update log tail information with the newest seq. + */ + journal->j_flags &= ~JBD2_ABORT; /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory @@ -1717,7 +1722,6 @@ int jbd2_journal_load(journal_t *journal) if (journal_reset(journal)) goto recovery_error; - journal->j_flags &= ~JBD2_ABORT; journal->j_flags |= JBD2_LOADED; return 0; -- cgit v1.2.3 From 4068664e3cd2312610ceac05b74c4cf1853b8325 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 6 Nov 2019 12:25:02 +0000 Subject: ext4: fix extent_status fragmentation for plain files Extents are cached in read_extent_tree_block(); as a result, extents are not cached for inodes with depth == 0 when we try to find the extent using ext4_find_extent(). The result of the lookup is cached in ext4_map_blocks() but is only a subset of the extent on disk. As a result, the contents of extents status cache can get very badly fragmented for certain workloads, such as a random 4k read workload. File size of /mnt/test is 33554432 (8192 blocks of 4096 bytes) ext: logical_offset: physical_offset: length: expected: flags: 0: 0.. 8191: 40960.. 
49151: 8192: last,eof $ perf record -e 'ext4:ext4_es_*' /root/bin/fio --name=t --direct=0 --rw=randread --bs=4k --filesize=32M --size=32M --filename=/mnt/test $ perf script | grep ext4_es_insert_extent | head -n 10 fio 131 [000] 13.975421: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [494/1) mapped 41454 status W fio 131 [000] 13.975939: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [6064/1) mapped 47024 status W fio 131 [000] 13.976467: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [6907/1) mapped 47867 status W fio 131 [000] 13.976937: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [3850/1) mapped 44810 status W fio 131 [000] 13.977440: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [3292/1) mapped 44252 status W fio 131 [000] 13.977931: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [6882/1) mapped 47842 status W fio 131 [000] 13.978376: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [3117/1) mapped 44077 status W fio 131 [000] 13.978957: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [2896/1) mapped 43856 status W fio 131 [000] 13.979474: ext4:ext4_es_insert_extent: dev 253,0 ino 12 es [7479/1) mapped 48439 status W Fix this by caching the extents for inodes with depth == 0 in ext4_find_extent(). [ Renamed ext4_es_cache_extents() to ext4_cache_extents() since this newly added function is not in extents_cache.c, and to avoid potential visual confusion with ext4_es_cache_extent(). 
-TYT ] Signed-off-by: Dmitry Monakhov Link: https://lore.kernel.org/r/20191106122502.19986-1-dmonakhov@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 393533ff0527..954013d6076b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -468,6 +468,30 @@ int ext4_ext_check_inode(struct inode *inode) return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } +static void ext4_cache_extents(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } +} + static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, ext4_fsblk_t pblk, int depth, @@ -502,26 +526,7 @@ __read_extent_tree_block(const char *function, unsigned int line, */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); - struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); - ext4_lblk_t prev = 0; - int i; - - for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { - unsigned int status = EXTENT_STATUS_WRITTEN; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - int len = ext4_ext_get_actual_len(ex); - - if (prev && (prev != lblk)) - ext4_es_cache_extent(inode, prev, - lblk - prev, ~0, - EXTENT_STATUS_HOLE); - - if (ext4_ext_is_unwritten(ex)) - status = 
EXTENT_STATUS_UNWRITTEN; - ext4_es_cache_extent(inode, lblk, len, - ext4_ext_pblock(ex), status); - prev = lblk + len; - } + ext4_cache_extents(inode, eh); } return bh; errout: @@ -871,6 +876,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, path[0].p_bh = NULL; i = depth; + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) + ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug("depth %d: num %d, max %d\n", -- cgit v1.2.3 From 244adf6426ee31a83f397b700d964cff12a247d3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 23 Jan 2020 12:23:17 -0500 Subject: ext4: make dioread_nolock the default This fixes the direct I/O versus writeback race which can reveal stale data, and it improves the tail latency of commits on slow devices. Link: https://lore.kernel.org/r/20200125022254.1101588-1-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ecf36a23e0c4..c6fe742db798 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1592,6 +1592,7 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "nodioread_nolock"}, {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, @@ -3764,6 +3765,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); + set_opt(sb, DIOREAD_NOLOCK); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3931,9 +3933,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " - "with data=journal disables delayed " - "allocation and O_DIRECT 
support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + clear_opt(sb, DIOREAD_NOLOCK); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); -- cgit v1.2.3 From ec772f01307a2c06ebf6cdd221e6b518a71ddae7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 23 Jan 2020 20:12:34 -0800 Subject: ext4: fix race conditions in ->d_compare() and ->d_hash() Since ->d_compare() and ->d_hash() can be called in RCU-walk mode, ->d_parent and ->d_inode can be concurrently modified, and in particular, ->d_inode may be changed to NULL. For ext4_d_hash() this resulted in a reproducible NULL dereference if a lookup is done in a directory being deleted, e.g. with: int main() { if (fork()) { for (;;) { mkdir("subdir", 0700); rmdir("subdir"); } } else { for (;;) access("subdir/file", 0); } } ... or by running the 't_encrypted_d_revalidate' program from xfstests. Both repros work in any directory on a filesystem with the encoding feature, even if the directory doesn't actually have the casefold flag. I couldn't reproduce a crash in ext4_d_compare(), but it appears that a similar crash is possible there. Fix these bugs by reading ->d_parent and ->d_inode using READ_ONCE() and falling back to the case sensitive behavior if the inode is NULL. 
Reported-by: Al Viro Fixes: b886ee3e778e ("ext4: Support case-insensitive file name lookups") Cc: # v5.2+ Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20200124041234.159740-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 8964778aabef..0129d1462988 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -671,9 +671,11 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; - struct inode *inode = dentry->d_parent->d_inode; + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *inode = READ_ONCE(parent->d_inode); - if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { + if (!inode || !IS_CASEFOLDED(inode) || + !EXT4_SB(inode->i_sb)->s_encoding) { if (len != name->len) return -1; return memcmp(str, name->name, len); @@ -686,10 +688,11 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) { const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); const struct unicode_map *um = sbi->s_encoding; + const struct inode *inode = READ_ONCE(dentry->d_inode); unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode) || !um) + if (!inode || !IS_CASEFOLDED(inode) || !um) return 0; norm = kmalloc(PATH_MAX, GFP_ATOMIC); -- cgit v1.2.3 From 57c32ea42f8e802bda47010418e25043e0c9337f Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 16 Oct 2019 10:25:01 +0800 Subject: ext4: choose hardlimit when softlimit is larger than hardlimit in ext4_statfs_project() Setting softlimit larger than hardlimit seems meaningless for disk quota but currently it is allowed. In this case, there may be a bit of confusion for users when they run the df command on a directory which has project quota.
For example, we set 20M softlimit and 10M hardlimit of block usage limit for project quota of test_dir(project id 123). [root@hades mnt_ext4]# repquota -P -a *** Report for project quotas on device /dev/loop0 Block grace time: 7days; Inode grace time: 7days Block limits File limits Project used soft hard grace used soft hard grace ---------------------------------------------------------------------- 0 -- 13 0 0 2 0 0 123 -- 10237 20480 10240 5 200 100 The result of the df command is as below: [root@hades mnt_ext4]# df -h test_dir Filesystem Size Used Avail Use% Mounted on /dev/loop0 20M 10M 10M 50% /home/cgxu/test/mnt_ext4 Even though it looks like there is another 10M free space to use, if we write new data to directory test_dir(inherit project id), the write will fail with errno(-EDQUOT). After this patch, the df result looks like below. [root@hades mnt_ext4]# df -h test_dir Filesystem Size Used Avail Use% Mounted on /dev/loop0 10M 10M 3.0K 100% /home/cgxu/test/mnt_ext4 Signed-off-by: Chengguang Xu Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20191016022501.760-1-cgxu519@mykernel.net Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c6fe742db798..88b213bd32bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5585,9 +5585,15 @@ static int ext4_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = (dquot->dq_dqb.dqb_bsoftlimit ?
- dquot->dq_dqb.dqb_bsoftlimit : - dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + limit = 0; + if (dquot->dq_dqb.dqb_bsoftlimit && + (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit)) + limit = dquot->dq_dqb.dqb_bsoftlimit; + if (dquot->dq_dqb.dqb_bhardlimit && + (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) + limit = dquot->dq_dqb.dqb_bhardlimit; + limit >>= sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; @@ -5597,9 +5603,14 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit ? - dquot->dq_dqb.dqb_isoftlimit : - dquot->dq_dqb.dqb_ihardlimit; + limit = 0; + if (dquot->dq_dqb.dqb_isoftlimit && + (!limit || dquot->dq_dqb.dqb_isoftlimit < limit)) + limit = dquot->dq_dqb.dqb_isoftlimit; + if (dquot->dq_dqb.dqb_ihardlimit && + (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) + limit = dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = -- cgit v1.2.3 From 52144d893d76294db9ed79a909397ea81bc25a02 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 14 Nov 2019 20:01:47 +0000 Subject: ext4: fix extent_status trace points Show pblock only if it has meaningful value. 
# before ext4:ext4_es_lookup_extent_exit: dev 253,0 ino 12 found 1 [1/4294967294) 576460752303423487 H ext4:ext4_es_lookup_extent_exit: dev 253,0 ino 12 found 1 [2/4294967293) 576460752303423487 HR # after ext4:ext4_es_lookup_extent_exit: dev 253,0 ino 12 found 1 [1/4294967294) 0 H ext4:ext4_es_lookup_extent_exit: dev 253,0 ino 12 found 1 [2/4294967293) 0 HR Signed-off-by: Dmitry Monakhov Link: https://lore.kernel.org/r/20191114200147.1073-2-dmonakhov@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.h | 6 ++++++ include/trace/events/ext4.h | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 825313c59752..4ec30a798260 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -209,6 +209,12 @@ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) return es->es_pblk & ~ES_MASK; } +static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) +{ + ext4_fsblk_t pblock = ext4_es_pblock(es); + return pblock == ~ES_MASK ? 
0 : pblock; +} + static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 3bf71288f146..19c87661eeec 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2282,7 +2282,7 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; - __entry->pblk = ext4_es_pblock(es); + __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), @@ -2371,7 +2371,7 @@ TRACE_EVENT(ext4_es_find_extent_range_exit, __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; - __entry->pblk = ext4_es_pblock(es); + __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), @@ -2425,7 +2425,7 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; - __entry->pblk = ext4_es_pblock(es); + __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->found = found; ), @@ -2593,7 +2593,7 @@ TRACE_EVENT(ext4_es_insert_delayed_block, __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; - __entry->pblk = ext4_es_pblock(es); + __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->allocated = allocated; ), -- cgit v1.2.3 From 0c1cba6cca862c56bf8cb726314e5196505017f6 Mon Sep 17 00:00:00 2001 From: wangyan Date: Wed, 22 Jan 2020 17:33:10 +0800 Subject: jbd2: delete the duplicated words in the comments Delete the duplicated words "is" in the comments Signed-off-by: Yan Wang Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/12087f77-ab4d-c7ba-53b4-893dbf0026f0@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 
27b9f9dee434..5c3abbaccb57 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -525,7 +525,7 @@ EXPORT_SYMBOL(jbd2__journal_start); * modified buffers in the log. We block until the log can guarantee * that much space. Additionally, if rsv_blocks > 0, we also create another * handle with rsv_blocks reserved blocks in the journal. This handle is - * is stored in h_rsv_handle. It is not attached to any particular transaction + * stored in h_rsv_handle. It is not attached to any particular transaction * and thus doesn't block transaction commit. If the caller uses this reserved * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() * on the parent handle will dispose the reserved one. Reserved handle has to -- cgit v1.2.3 From 8d6ce136790268fba2fc66cb8d6fa2161d4b2385 Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Thu, 23 Jan 2020 01:43:25 -0500 Subject: ext4,jbd2: fix comment and code style Fix comment and remove unnecessary blank. Signed-off-by: Shijie Luo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200123064325.36358-1-luoshijie1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 2 +- fs/jbd2/transaction.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e61603f47035..fad82d08fca5 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -850,7 +850,7 @@ out: /* * Prepare the write for the inline data. - * If the the data can be written into the inode, we just read + * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal.
* Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5c3abbaccb57..e77a5a0b4e46 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1595,7 +1595,7 @@ out: * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ -int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) +int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; -- cgit v1.2.3 From 17c51d836c19ecc58714fc671a914a47b6ae4db7 Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Thu, 23 Jan 2020 02:00:54 -0500 Subject: jbd2: remove pointless assertion in __journal_remove_journal_head Only when jh->b_jcount = 0 in jbd2_journal_put_journal_head, we are allowed to call __journal_remove_journal_head. This assertion is meaningless, just remove it. Signed-off-by: Shijie Luo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20200123070054.50585-1-luoshijie1@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 069b22eba795..1c61491fd86f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2560,7 +2560,6 @@ static void __journal_remove_journal_head(struct buffer_head *bh) { struct journal_head *jh = bh2jh(bh); - J_ASSERT_JH(jh, jh->b_jcount >= 0); J_ASSERT_JH(jh, jh->b_transaction == NULL); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); -- cgit v1.2.3 From 1a8e9cf40c9a6a2e40b1e924b13ed303aeea4418 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 23 Jan 2020 12:05:10 +0300 Subject: jbd2_seq_info_next should increase position index If the seq_file .next function does not change the position index, a read after some lseek can generate unexpected output.
Script below generates endless output $ q=;while read -r r;do echo "$((++q)) $r";done Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/d13805e5-695e-8ac3-b678-26ca2313629f@virtuozzo.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1c61491fd86f..5f9edb12f11a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -982,6 +982,7 @@ static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) { + (*pos)++; return NULL; } -- cgit v1.2.3 From d0a186e0d3e7ac05cc77da7c157dae5aa59f95d9 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 4 Dec 2019 20:46:11 +0800 Subject: jbd2: switch to use jbd2_journal_abort() when failed to submit the commit record We invoke jbd2_journal_abort() to abort the journal and record errno in the jbd2 superblock when committing journal transaction besides the failure on submitting the commit record. But there is no need for the case and we can also invoke jbd2_journal_abort() instead of __jbd2_journal_abort_hard(). 
Fixes: 818d276ceb83a ("ext4: Add the journal checksum feature") Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20191204124614.45424-2-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7f0b362b3842..2494095e0340 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -782,7 +782,7 @@ start_journal_io: err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) - __jbd2_journal_abort_hard(journal); + jbd2_journal_abort(journal, err); } blk_finish_plug(&plug); @@ -875,7 +875,7 @@ start_journal_io: err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) - __jbd2_journal_abort_hard(journal); + jbd2_journal_abort(journal, err); } if (cbh) err = journal_wait_on_commit_record(journal, cbh); -- cgit v1.2.3 From 51f57b01e4a3c7d7bdceffd84de35144e8c538e7 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 4 Dec 2019 20:46:12 +0800 Subject: ext4, jbd2: ensure panic when aborting with zero errno JBD2_REC_ERR flag used to indicate the errno has been updated when jbd2 aborted, and then __ext4_abort() and ext4_handle_error() can invoke panic if ERRORS_PANIC is specified. But if the journal has been aborted with zero errno, jbd2_journal_abort() didn't set this flag so we can no longer panic. Fix this by always record the proper errno in the journal superblock. 
Fixes: 4327ba52afd03 ("ext4, jbd2: ensure entering into panic after recording an error in superblock") Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20191204124614.45424-3-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/journal.c | 15 ++++----------- 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 8fff6677a5da..96bf33986d03 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -164,7 +164,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) "journal space in %s\n", __func__, journal->j_devname); WARN_ON(1); - jbd2_journal_abort(journal, 0); + jbd2_journal_abort(journal, -EIO); } write_lock(&journal->j_state_lock); } else { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5f9edb12f11a..9e9275540071 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2156,12 +2156,10 @@ static void __journal_abort_soft (journal_t *journal, int errno) __jbd2_journal_abort_hard(journal); - if (errno) { - jbd2_journal_update_sb_errno(journal); - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_REC_ERR; - write_unlock(&journal->j_state_lock); - } + jbd2_journal_update_sb_errno(journal); + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_REC_ERR; + write_unlock(&journal->j_state_lock); } /** @@ -2203,11 +2201,6 @@ static void __journal_abort_soft (journal_t *journal, int errno) * failure to disk. ext3_error, for example, now uses this * functionality. * - * Errors which originate from within the journaling layer will NOT - * supply an errno; a null errno implies that absolutely no further - * writes are done to the journal (unless there are any already in - * progress). 
- * */ void jbd2_journal_abort(journal_t *journal, int errno) -- cgit v1.2.3 From 0e98c084a21177ef136149c6a293b3d1eb33ff92 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 4 Dec 2019 20:46:13 +0800 Subject: jbd2: make sure ESHUTDOWN to be recorded in the journal superblock Commit fb7c02445c49 ("ext4: pass -ESHUTDOWN code to jbd2 layer") wants to allow the jbd2 layer to distinguish a shutdown journal abort from other error cases. So ESHUTDOWN should take precedence over any other errno which has already been recorded after EXT4_FLAGS_SHUTDOWN is set, but it only updates errno in the journal superblock now if the old errno is 0. Fixes: fb7c02445c49 ("ext4: pass -ESHUTDOWN code to jbd2 layer") Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20191204124614.45424-4-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9e9275540071..a821c469cab6 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2147,8 +2147,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) if (journal->j_flags & JBD2_ABORT) { write_unlock(&journal->j_state_lock); - if (!old_errno && old_errno != -ESHUTDOWN && - errno == -ESHUTDOWN) + if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) jbd2_journal_update_sb_errno(journal); return; } -- cgit v1.2.3 From 7f6225e446cc8dfa4c3c7959a4de3dd03ec277bf Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 4 Dec 2019 20:46:14 +0800 Subject: jbd2: clean __jbd2_journal_abort_hard() and __journal_abort_soft() __jbd2_journal_abort_hard() is no longer used, so now we can merge __jbd2_journal_abort_hard() and __journal_abort_soft() these two functions into jbd2_journal_abort() and remove them.
Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20191204124614.45424-5-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 103 +++++++++++++++++++++------------------------------ include/linux/jbd2.h | 1 - 2 files changed, 42 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a821c469cab6..60bf8ff78913 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -96,7 +96,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); -static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); #ifdef CONFIG_JBD2_DEBUG @@ -805,7 +804,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, "at offset %lu on %s\n", __func__, blocknr, journal->j_devname); err = -EIO; - __journal_abort_soft(journal, err); + jbd2_journal_abort(journal, err); } } else { *retp = blocknr; /* +journal->j_blk_offset */ @@ -2103,64 +2102,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) return err; } -/* - * Journal abort has very specific semantics, which we describe - * for journal abort. - * - * Two internal functions, which provide abort to the jbd layer - * itself are here. - */ - -/* - * Quick version for internal journal use (doesn't lock the journal). - * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, - * and don't attempt to make any other journal updates. 
- */ -void __jbd2_journal_abort_hard(journal_t *journal) -{ - transaction_t *transaction; - - if (journal->j_flags & JBD2_ABORT) - return; - - printk(KERN_ERR "Aborting journal on device %s.\n", - journal->j_devname); - - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_ABORT; - transaction = journal->j_running_transaction; - if (transaction) - __jbd2_log_start_commit(journal, transaction->t_tid); - write_unlock(&journal->j_state_lock); -} - -/* Soft abort: record the abort error status in the journal superblock, - * but don't do any other IO. */ -static void __journal_abort_soft (journal_t *journal, int errno) -{ - int old_errno; - - write_lock(&journal->j_state_lock); - old_errno = journal->j_errno; - if (!journal->j_errno || errno == -ESHUTDOWN) - journal->j_errno = errno; - - if (journal->j_flags & JBD2_ABORT) { - write_unlock(&journal->j_state_lock); - if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) - jbd2_journal_update_sb_errno(journal); - return; - } - write_unlock(&journal->j_state_lock); - - __jbd2_journal_abort_hard(journal); - - jbd2_journal_update_sb_errno(journal); - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_REC_ERR; - write_unlock(&journal->j_state_lock); -} - /** * void jbd2_journal_abort () - Shutdown the journal immediately. * @journal: the journal to shutdown. @@ -2204,7 +2145,47 @@ static void __journal_abort_soft (journal_t *journal, int errno) void jbd2_journal_abort(journal_t *journal, int errno) { - __journal_abort_soft(journal, errno); + transaction_t *transaction; + + /* + * ESHUTDOWN always takes precedence because a file system check + * caused by any other journal abort error is not required after + * a shutdown triggered. 
+ */ + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + int old_errno = journal->j_errno; + + write_unlock(&journal->j_state_lock); + if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) { + journal->j_errno = errno; + jbd2_journal_update_sb_errno(journal); + } + return; + } + + /* + * Mark the abort as occurred and start current running transaction + * to release all journaled buffer. + */ + pr_err("Aborting journal on device %s.\n", journal->j_devname); + + journal->j_flags |= JBD2_ABORT; + journal->j_errno = errno; + transaction = journal->j_running_transaction; + if (transaction) + __jbd2_log_start_commit(journal, transaction->t_tid); + write_unlock(&journal->j_state_lock); + + /* + * Record errno to the journal super block, so that fsck and jbd2 + * layer could realise that a filesystem check is needed. + */ + jbd2_journal_update_sb_errno(journal); + + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_REC_ERR; + write_unlock(&journal->j_state_lock); } /** diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ce44b687d02b..f613d8529863 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1403,7 +1403,6 @@ extern int jbd2_journal_skip_recovery (journal_t *); extern void jbd2_journal_update_sb_errno(journal_t *); extern int jbd2_journal_update_sb_log_tail (journal_t *, tid_t, unsigned long, int); -extern void __jbd2_journal_abort_hard (journal_t *); extern void jbd2_journal_abort (journal_t *, int); extern int jbd2_journal_errno (journal_t *); extern void jbd2_journal_ack_err (journal_t *); -- cgit v1.2.3