diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-18 10:56:26 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-18 10:56:26 -0700 |
commit | 3530c1886291df061e3972c55590777ef1cb67f8 (patch) | |
tree | bd6755e533eb5a0f37ff600da6bc0d9d1ba33c17 | |
parent | 6952b61de9984073289859073e8195ad0bee8fd5 (diff) | |
parent | 1358870deaf11a752a84fbd89201749aa62498e8 (diff) | |
download | lwn-3530c1886291df061e3972c55590777ef1cb67f8.tar.gz lwn-3530c1886291df061e3972c55590777ef1cb67f8.zip |
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (64 commits)
ext4: Update documentation about quota mount options
ext4: replace MAX_DEFRAG_SIZE with EXT_MAX_BLOCK
ext4: Fix the alloc on close after a truncate hueristic
ext4: Add a tracepoint for ext4_alloc_da_blocks()
ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags
ext4: limit block allocations for indirect-block files to < 2^32
ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT
ext4: Add null extent check to ext_get_path
ext4: Replace BUG_ON() with ext4_error() in move_extents.c
ext4: Replace get_ext_path macro with an inline funciton
ext4: Fix include/trace/events/ext4.h to work with Systemtap
ext4: Fix initalization of s_flex_groups
ext4: Always set dx_node's fake_dirent explicitly.
ext4: Fix async commit mode to be safe by using a barrier
ext4: Don't update superblock write time when filesystem is read-only
ext4: Clarify the locking details in mballoc
ext4: check for need init flag in ext4_mb_load_buddy
ext4: move ext4_mb_init_group() function earlier in the mballoc.c
ext4: Make non-journal fsync work properly
ext4: Assure that metadata blocks are written during fsync in no journal mode
...
-rw-r--r-- | Documentation/filesystems/ext4.txt | 24 | ||||
-rw-r--r-- | fs/ext4/Kconfig | 11 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 2 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 91 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 4 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.c | 9 | ||||
-rw-r--r-- | fs/ext4/extents.c | 112 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 13 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 2 | ||||
-rw-r--r-- | fs/ext4/inode.c | 150 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 7 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 429 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 22 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 22 | ||||
-rw-r--r-- | fs/ext4/move_extent.c | 334 | ||||
-rw-r--r-- | fs/ext4/namei.c | 22 | ||||
-rw-r--r-- | fs/ext4/resize.c | 7 | ||||
-rw-r--r-- | fs/ext4/super.c | 155 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 15 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 11 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 6 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 7 | ||||
-rw-r--r-- | include/linux/jbd2.h | 2 | ||||
-rw-r--r-- | include/trace/events/ext4.h | 142 | ||||
-rw-r--r-- | include/trace/events/jbd2.h | 2 |
25 files changed, 1000 insertions, 601 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 7be02ac5fa36..18b5ec8cea45 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -134,15 +134,9 @@ ro Mount filesystem read only. Note that ext4 will mount options "ro,noload" can be used to prevent writes to the filesystem. -journal_checksum Enable checksumming of the journal transactions. - This will allow the recovery code in e2fsck and the - kernel to detect corruption in the kernel. It is a - compatible change and will be ignored by older kernels. - journal_async_commit Commit block can be written to disk without waiting for descriptor blocks. If enabled older kernels cannot - mount the device. This will enable 'journal_checksum' - internally. + mount the device. journal=update Update the ext4 file system's journal to the current format. @@ -263,10 +257,18 @@ resuid=n The user ID which may use the reserved blocks. sb=n Use alternate superblock at this location. -quota -noquota -grpquota -usrquota +quota These options are ignored by the filesystem. They +noquota are used only by quota tools to recognize volumes +grpquota where quota should be turned on. See documentation +usrquota in the quota-tools package for more details + (http://sourceforge.net/projects/linuxquota). + +jqfmt=<quota type> These options tell filesystem details about quota +usrjquota=<file> so that quota information can be properly updated +grpjquota=<file> during journal replay. They replace the above + quota options. See documentation in the quota-tools + package for more details + (http://sourceforge.net/projects/linuxquota). bh (*) ext4 associates buffer heads to data pages to nobh (a) cache disk block mapping information diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 418b6f3b0ae8..d5c0ea2e8f2d 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -37,7 +37,7 @@ config EXT4DEV_COMPAT To enable backwards compatibility so that systems that are still expecting to mount ext4 filesystems using ext4dev, - chose Y here. This feature will go away by 2.6.31, so + choose Y here. This feature will go away by 2.6.31, so please arrange to get your userspace programs fixed! config EXT4_FS_XATTR @@ -77,3 +77,12 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. + +config EXT4_DEBUG + bool "EXT4 debugging support" + depends on EXT4_FS + help + Enables run-time debugging support for the ext4 filesystem. + + If you select Y here, then you will be able to turn on debugging + with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index e2126d70dff5..1d0418980f8d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, * new bitmap information */ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); - ext4_mb_update_group_info(grp, blocks_freed); + grp->bb_free += blocks_freed; up_write(&grp->alloc_sem); /* We dirtied the bitmap block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9714db393efe..e227eea23f05 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t; /* prefer goal again. length */ -#define EXT4_MB_HINT_MERGE 1 +#define EXT4_MB_HINT_MERGE 0x0001 /* blocks already reserved */ -#define EXT4_MB_HINT_RESERVED 2 +#define EXT4_MB_HINT_RESERVED 0x0002 /* metadata is being allocated */ -#define EXT4_MB_HINT_METADATA 4 +#define EXT4_MB_HINT_METADATA 0x0004 /* first blocks in the file */ -#define EXT4_MB_HINT_FIRST 8 +#define EXT4_MB_HINT_FIRST 0x0008 /* search for the best chunk */ -#define EXT4_MB_HINT_BEST 16 +#define EXT4_MB_HINT_BEST 0x0010 /* data is being allocated */ -#define EXT4_MB_HINT_DATA 32 +#define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ -#define EXT4_MB_HINT_NOPREALLOC 64 +#define EXT4_MB_HINT_NOPREALLOC 0x0040 /* allocate for locality group */ -#define EXT4_MB_HINT_GROUP_ALLOC 128 +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 /* allocate goal blocks or none */ -#define EXT4_MB_HINT_GOAL_ONLY 256 +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 /* goal is meaningful */ -#define EXT4_MB_HINT_TRY_GOAL 512 +#define EXT4_MB_HINT_TRY_GOAL 0x0200 /* blocks already pre-reserved by delayed allocation */ -#define EXT4_MB_DELALLOC_RESERVED 1024 +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 struct ext4_allocation_request { @@ -112,6 +114,21 @@ struct ext4_allocation_request { }; /* + * For delayed allocation tracking + */ +struct mpage_da_data { + struct inode *inode; + sector_t b_blocknr; /* start block number of extent */ + size_t b_size; /* size of extent */ + unsigned long b_state; /* state of the extent */ + unsigned long first_page, next_page; /* extent of pages */ + struct writeback_control *wbc; + int io_done; + int pages_written; + int retval; +}; + +/* * Special inodes numbers */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ @@ -251,7 +268,6 @@ struct flex_groups { #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ -#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ @@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ +#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { @@ -386,6 +403,9 @@ struct ext4_mount_options { #endif }; +/* Max physical block we can addres w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + /* * Structure of an inode on the disk */ @@ -456,7 +476,6 @@ struct move_extent { __u64 len; /* block length to be moved */ __u64 moved_len; /* moved block length */ }; -#define MAX_DEFRAG_SIZE ((1UL<<31) - 1) #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) @@ -694,7 +713,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ @@ -841,6 +859,7 @@ struct ext4_sb_info { unsigned long s_gdb_count; /* Number of group descriptor blocks */ unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead_last; /* Last calculated overhead */ unsigned long s_blocks_last; /* Last seen block count */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ @@ -950,6 +969,7 @@ struct ext4_sb_info { atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; + atomic_t s_lock_busy; /* locality groups */ struct ext4_locality_group *s_locality_groups; @@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *, ext4_fsblk_t, unsigned long, int, unsigned long *); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); -extern void ext4_mb_update_group_info(struct ext4_group_info *grp, - ext4_grpblk_t add); extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); extern void ext4_mb_put_buddy_cache_lock(struct super_block *, ext4_group_t, int); @@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); @@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; - unsigned short bb_counters[]; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. + * bb_counters[3] = 5 means + * 5 free 8-block regions. */ }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 @@ -1591,15 +1613,42 @@ struct ext4_group_info { #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group) { return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { - spin_lock(ext4_group_lock_ptr(sb, group)); + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } } static inline void ext4_unlock_group(struct super_block *sb, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 20a84105a10b..61652f1d15e6 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -43,8 +43,7 @@ #define CHECK_BINSEARCH__ /* - * If EXT_DEBUG is defined you can use the 'extdebug' mount option - * to get lots of info about what's going on. + * Turn on EXT_DEBUG to get lots of info about extents operations. */ #define EXT_DEBUG__ #ifdef EXT_DEBUG @@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, #define EXT_BREAK 1 #define EXT_REPEAT 2 +/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */ #define EXT_MAX_BLOCK 0xffffffff /* diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index eb27fd0f2ee8..6a9409920dee 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle, handle, err); } else - brelse(bh); + bforget(bh); return err; } @@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, handle, err); } else - brelse(bh); + bforget(bh); return err; } @@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, ext4_journal_abort_handle(where, __func__, bh, handle, err); } else { - mark_buffer_dirty(bh); + if (inode && bh) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 73ebfb44ad75..7a3832577923 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } -static int ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) { int err; @@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) err = ext4_journal_extend(handle, needed); if (err <= 0) return err; - return ext4_journal_restart(handle, needed); + err = ext4_truncate_restart_trans(handle, inode, needed); + /* + * We have dropped i_data_sem so someone might have cached again + * an extent we are going to truncate. + */ + ext4_ext_invalidate_cache(inode); + + return err; } /* @@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, return newblock; } -static int ext4_ext_space_block(struct inode *inode) +static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 6) - size = 6; + if (size > 6) + size = 6; #endif + } return size; } -static int ext4_ext_space_block_idx(struct inode *inode) +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 5) - size = 5; + if (size > 5) + size = 5; #endif + } return size; } -static int ext4_ext_space_root(struct inode *inode) +static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 3) - size = 3; + if (size > 3) + size = 3; #endif + } return size; } -static int ext4_ext_space_root_idx(struct inode *inode) +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; size = sizeof(EXT4_I(inode)->i_data); size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); + if (!check) { #ifdef AGGRESSIVE_TEST - if (size > 4) - size = 4; + if (size > 4) + size = 4; #endif + } return size; } @@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) int lcap, icap, rcap, leafs, idxs, num; int newextents = blocks; - rcap = ext4_ext_space_root_idx(inode); - lcap = ext4_ext_space_block(inode); - icap = ext4_ext_space_block_idx(inode); + rcap = ext4_ext_space_root_idx(inode, 0); + lcap = ext4_ext_space_block(inode, 0); + icap = ext4_ext_space_block_idx(inode, 0); /* number of new leaf blocks needed */ num = leafs = (newextents + lcap - 1) / lcap; @@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth) if (depth == ext_depth(inode)) { if (depth == 0) - max = ext4_ext_space_root(inode); + max = ext4_ext_space_root(inode, 1); else - max = ext4_ext_space_root_idx(inode); + max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) - max = ext4_ext_space_block(inode); + max = ext4_ext_space_block(inode, 1); else - max = ext4_ext_space_block_idx(inode); + max = ext4_ext_space_block_idx(inode, 1); } return max; @@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), idx_pblock(path->p_idx)); } else if (path->p_ext) { - ext_debug(" %d:%d:%llu ", + ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext_pblock(path->p_ext)); } else @@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); + ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { - ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), + ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), ext4_ext_get_actual_len(ex), ext_pblock(ex)); } ext_debug("\n"); @@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode, } path->p_ext = l - 1; - ext_debug(" -> %d:%llu:%d ", + ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext_pblock(path->p_ext), + ext4_ext_is_uninitialized(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; - eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); ext4_ext_invalidate_cache(inode); return 0; @@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh = ext_block_hdr(bh); neh->eh_entries = 0; - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; ex = EXT_FIRST_EXTENT(neh); @@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, path[depth].p_ext++; while (path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:%d in new leaf %llu\n", + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(path[depth].p_ext->ee_block), ext_pblock(path[depth].p_ext), + ext4_ext_is_uninitialized(path[depth].p_ext), ext4_ext_get_actual_len(path[depth].p_ext), newblock); /*memmove(ex++, path[depth].p_ext++, @@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; @@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); else - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, goto out; curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; - curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); + curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); curp->p_hdr->eh_entries = cpu_to_le16(1); curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); @@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* try to insert block into found extent and return */ if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append %d block to %d:%d (from %llu)\n", + ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), le32_to_cpu(ex->ee_block), + ext4_ext_is_uninitialized(ex), ext4_ext_get_actual_len(ex), ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -1651,9 +1676,10 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %d:%llu:%d\n", + ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext)); path[depth].p_ext = EXT_FIRST_EXTENT(eh); } else if (le32_to_cpu(newext->ee_block) @@ -1663,10 +1689,11 @@ has_space: len = EXT_MAX_EXTENT(eh) - nearex; len = (len - 1) * sizeof(struct ext4_extent); len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:%d after: nearest 0x%p, " + ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 2, nearex + 1, len); @@ -1676,10 +1703,11 @@ has_space: BUG_ON(newext->ee_block == nearex->ee_block); len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:%d before: nearest 0x%p, " + ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " "move %d from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext_pblock(newext), + ext4_ext_is_uninitialized(newext), ext4_ext_get_actual_len(newext), nearex, len, nearex + 1, nearex + 2); memmove(nearex + 1, nearex, len); @@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, else uninitialized = 0; - ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); + ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + uninitialized, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; @@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, } credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); - err = ext4_ext_journal_restart(handle, credits); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); if (err) goto out; @@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root(inode)); + cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } @@ -2743,6 +2772,7 @@ insert: } else if (err) goto fix_extent_len; out: + ext4_ext_show_leaf(inode, path); return err ? err : allocated; fix_extent_len: @@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, struct ext4_allocation_request ar; __clear_bit(BH_New, &bh_result->b_state); - ext_debug("blocks %u/%u requested for inode %u\n", + ext_debug("blocks %u/%u requested for inode %lu\n", iblock, max_blocks, inode->i_ino); /* check in cache */ @@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = iblock - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (iblock - ee_block); - ext_debug("%u fit into %lu:%d -> %llu\n", iblock, + ext_debug("%u fit into %u:%d -> %llu\n", iblock, ee_block, ee_len, newblock); /* Do not put uninitialized extent in the cache */ @@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; - ext_debug("allocate new block: goal %llu, found %llu/%lu\n", + ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); /* try to insert new extent into found leaf and return */ diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 83cf6415f599..07475740b512 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - int ret = 0; + int err, ret = 0; J_ASSERT(ext4_journal_current_handle() == NULL); @@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) goto out; } + if (!journal) + ret = sync_mapping_buffers(inode->i_mapping); + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; @@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) .sync_mode = WB_SYNC_ALL, .nr_to_write = 0, /* sys_fsync did this */ }; - ret = sync_inode(inode, &wbc); - if (journal && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + err = sync_inode(inode, &wbc); + if (ret == 0) + ret = err; } out: + if (journal && (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 29e6dc7299b8..f3624ead4f6c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - i, ext4_free_inodes_count(sb, gdp), x); + (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9c642b22efa..4abd683b963d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) + int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) { + int ret; + + /* + * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. + */ BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); - return ext4_journal_restart(handle, blocks_for_truncate(inode)); + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + + return ret; } /* @@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode, int n = 0; int final = 0; - if (i_block < 0) { - ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); - } else if (i_block < direct_blocks) { + if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; } else if ((i_block -= direct_blocks) < indirect_blocks) { @@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * * Normally this function find the preferred place for block allocation, * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { + ext4_fsblk_t goal; + /* * XXX need to get goal block from mballoc's data structures */ - return ext4_find_near(inode, partial); + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; } /** @@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (*err) goto failed_out; + BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); + target -= count; /* allocate blocks for indirect blocks */ while (index < indirect_blks && count) { @@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; current_block = ext4_mb_new_blocks(handle, &ar, err); + BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); if (*err && (target == blks)) { /* @@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, bh); if (err) { + /* Don't brelse(bh) here; it's done in + * ext4_journal_forget() below */ unlock_buffer(bh); - brelse(bh); goto failed; } @@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) ext4_discard_preallocations(inode); } -static int check_block_validity(struct inode *inode, sector_t logical, - sector_t phys, int len) +static int check_block_validity(struct inode *inode, const char *msg, + sector_t logical, sector_t phys, int len) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { - ext4_error(inode->i_sb, "check_block_validity", + ext4_error(inode->i_sb, msg, "inode #%lu logical block %llu mapped to %llu " "(size %d)", inode->i_ino, (unsigned long long) logical, (unsigned long long) phys, len); - WARN_ON(1); return -EIO; } return 0; @@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, - bh->b_blocknr, retval); + int ret = check_block_validity(inode, "file system corruption", + block, bh->b_blocknr, retval); if (ret != 0) return ret; } @@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & - ~EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; } } @@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, - bh->b_blocknr, retval); + int ret = check_block_validity(inode, "file system " + "corruption after allocation", + block, bh->b_blocknr, retval); if (ret != 0) return ret; } @@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page, * Delayed allocation stuff */ -struct mpage_da_data { - struct inode *inode; - sector_t b_blocknr; /* start block number of extent */ - size_t b_size; /* size of extent */ - unsigned long b_state; /* state of the extent */ - unsigned long first_page, next_page; /* extent of pages */ - struct writeback_control *wbc; - int io_done; - int pages_written; - int retval; -}; - /* * mpage_da_submit_io - walks through extent of pages and try to write * them with writepage() call back @@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping, long pages_skipped; int range_cyclic, cycled = 1, io_done = 0; int needed_blocks, ret = 0, nr_to_writebump = 0; + loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); trace_ext4_da_writepages(inode, wbc); @@ -2850,6 +2859,7 @@ retry: mpd.io_done = 1; ret = MPAGE_DA_EXTENT_TAIL; } + trace_ext4_da_write_pages(inode, &mpd); wbc->nr_to_write -= mpd.pages_written; ext4_journal_stop(handle); @@ -2905,6 +2915,7 @@ out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; } @@ -3117,6 +3128,8 @@ out: */ int ext4_alloc_da_blocks(struct inode *inode) { + trace_ext4_alloc_da_blocks(inode); + if (!EXT4_I(inode)->i_reserved_data_blocks && !EXT4_I(inode)->i_reserved_meta_blocks) return 0; @@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, ext4_handle_dirty_metadata(handle, inode, bh); } ext4_mark_inode_dirty(handle, inode); - ext4_journal_test_restart(handle, inode); + ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); if (bh) { BUFFER_TRACE(bh, "retaking write access"); ext4_journal_get_write_access(handle, bh); @@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, return; if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); - ext4_journal_test_restart(handle, inode); + ext4_truncate_restart_trans(handle, inode, + blocks_for_truncate(inode)); } ext4_free_blocks(handle, inode, nr, 1, 1); @@ -3958,8 +3973,7 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - if (ei->i_disksize && inode->i_size == 0 && - !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { @@ -4533,7 +4547,8 @@ static int ext4_inode_blocks_set(handle_t *handle, */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, - struct ext4_iloc *iloc) + struct ext4_iloc *iloc, + int do_sync) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); @@ -4581,8 +4596,7 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - /* clear the migrate flag in the raw_inode */ - raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); + raw_inode->i_flags = cpu_to_le32(ei->i_flags); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = @@ -4635,10 +4649,22 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - rc = ext4_handle_dirty_metadata(handle, inode, bh); - if (!err) - err = rc; + /* + * If we're not using a journal and we were called from + * ext4_write_inode() to sync the inode (making do_sync true), + * we can just use sync_dirty_buffer() directly to do our dirty + * work. Testing s_journal here is a bit redundant but it's + * worth it to avoid potential future trouble. + */ + if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) { + BUFFER_TRACE(bh, "call sync_dirty_buffer"); + sync_dirty_buffer(bh); + } else { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, inode, bh); + if (!err) + err = rc; + } ei->i_state &= ~EXT4_STATE_NEW; out_brelse: @@ -4684,19 +4710,32 @@ out_brelse: */ int ext4_write_inode(struct inode *inode, int wait) { + int err; + if (current->flags & PF_MEMALLOC) return 0; - if (ext4_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); - dump_stack(); - return -EIO; - } + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } - if (!wait) - return 0; + if (!wait) + return 0; + + err = ext4_force_commit(inode->i_sb); + } else { + struct ext4_iloc iloc; - return ext4_force_commit(inode->i_sb); + err = ext4_get_inode_loc(inode, &iloc); + if (err) + return err; + err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, + inode, &iloc, wait); + } + return err; } /* @@ -4990,7 +5029,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, get_bh(iloc->bh); /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ - err = ext4_do_update_inode(handle, inode, iloc); + err = ext4_do_update_inode(handle, inode, iloc, 0); put_bh(iloc->bh); return err; } @@ -5281,12 +5320,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) else len = PAGE_CACHE_SIZE; + lock_page(page); + /* + * return if we have all the buffers mapped. This avoid + * the need to call write_begin/write_end which does a + * journal_start/journal_stop which can block and take + * long time + */ if (page_has_buffers(page)) { - /* return if we have all the buffers mapped */ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped)) + ext4_bh_unmapped)) { + unlock_page(page); goto out_unlock; + } } + unlock_page(page); /* * OK, we need to fill the hole... Do write_begin write_end * to do block allocation/reservation.We are not holding diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7050a9cd04a4..c1cdf613e725 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -243,10 +243,9 @@ setversion_out: me.donor_start, me.len, &me.moved_len); fput(donor_filp); - if (!err) - if (copy_to_user((struct move_extent *)arg, - &me, sizeof(me))) - return -EFAULT; + if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) + return -EFAULT; + return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cd258463e2a9..e9c61896d605 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -22,6 +22,7 @@ */ #include "mballoc.h" +#include <linux/debugfs.h> #include <trace/events/ext4.h> /* @@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, /* FIXME!! need more doc */ static void ext4_mb_mark_free_simple(struct super_block *sb, - void *buddy, unsigned first, int len, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned short min; - unsigned short max; - unsigned short chunk; + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; unsigned short border; BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); @@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); - unsigned short i = 0; - unsigned short first; - unsigned short len; + ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); @@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) char *data; char *bitmap; - mb_debug("init page %lu\n", page->index); + mb_debug(1, "init page %lu\n", page->index); inode = page->mapping->host; sb = inode->i_sb; @@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) set_bitmap_uptodate(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); - mb_debug("read bitmap for group %u\n", first_group + i); + mb_debug(1, "read bitmap for group %u\n", first_group + i); } /* wait for I/O completion */ @@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug("put buddy for group %u in page %lu/%x\n", + mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, - sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + sizeof(*grinfo->bb_counters) * + (sb->s_blocksize_bits+2)); /* * incore got set to the group block bitmap below */ @@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug("put bitmap for group %u in page %lu/%x\n", + mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); /* see comments in ext4_mb_put_pa() */ @@ -908,6 +910,100 @@ out: return err; } +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + int ret = 0; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; + + mb_debug(1, "init group %u\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have taken the alloc_sem lock. + */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + ret = 0; + goto err; + } + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); + + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); + return ret; +} + static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) @@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; - mb_debug("load group %u\n", group); + mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); @@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, * groups mapped by the page is blocked * till we are done with allocation */ +repeat_load_buddy: down_read(e4b->alloc_semp); + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* we need to check for group need init flag + * with alloc_semp held so that we can be sure + * that new blocks didn't get added to the group + * when we are loading the buddy cache + */ + up_read(e4b->alloc_semp); + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + goto repeat_load_buddy; + } + /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. @@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->alloc_semp = e4b->alloc_semp; e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; @@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb, } -static noinline_for_stack -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) -{ - - int ret; - void *bitmap; - int blocks_per_page; - int block, pnum, poff; - int num_grp_locked = 0; - struct ext4_group_info *this_grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - struct page *page = NULL, *bitmap_page = NULL; - - mb_debug("init group %lu\n", group); - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - this_grp = ext4_get_group_info(sb, group); - /* - * This ensures we don't add group - * to this buddy cache via resize - */ - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { - /* - * somebody initialized the group - * return without doing anything - */ - ret = 0; - goto err; - } - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, NULL); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { - ret = -EIO; - goto err; - } - mark_page_accessed(page); - bitmap_page = page; - bitmap = page_address(page) + (poff * sb->s_blocksize); - - /* init buddy cache */ - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page == bitmap_page) { - /* - * If both the bitmap and buddy are in - * the same page we don't need to force - * init the buddy - */ - unlock_page(page); - } else if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, bitmap); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { - ret = -EIO; - goto err; - } - mark_page_accessed(page); -err: - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); - if (bitmap_page) - page_cache_release(bitmap_page); - if (page) - page_cache_release(page); - return ret; -} - static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; - loff_t size, isize; sb = ac->ac_sb; sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) + ngroups = sbi->s_blockfile_groups; + BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } bsbits = ac->ac_sb->s_blocksize_bits; - /* if stream allocation is enabled, use global goal */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - if (size < isize) - size = isize; - if (size < sbi->s_mb_stream_request && - (ac->ac_flags & EXT4_MB_HINT_DATA)) { + /* if stream allocation is enabled, use global goal */ + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } + /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -2015,27 +2037,6 @@ repeat: if (grp->bb_free == 0) continue; - /* - * if the group is already init we check whether it is - * a good group and if not we don't load the buddy - */ - if (EXT4_MB_GRP_NEED_INIT(grp)) { - /* - * we need full data about the group - * to make a good selection - */ - err = ext4_mb_init_group(sb, group); - if (err) - goto out; - } - - /* - * If the particular group doesn't satisfy our - * criteria we continue with the next group - */ - if (!ext4_mb_good_group(ac, group, cr)) - continue; - err = ext4_mb_load_buddy(sb, group, &e4b); if (err) goto out; @@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " - "%-5s %-2s %-5s %-5s %-5s %-6s\n", + "%-5s %-2s %-6s %-5s %-5s %-6s\n", "pid", "inode", "original", "goal", "result", "found", "grps", "cr", "flags", "merge", "tail", "broken"); return 0; @@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) if (hs->op == EXT4_MB_HISTORY_ALLOC) { fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " - "%-5u %-5s %-5u %-6u\n"; + "0x%04x %-5s %-5u %-6u\n"; sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len, hs->result.fe_logical); @@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) { } -static struct seq_operations ext4_mb_seq_history_ops = { +static const struct seq_operations ext4_mb_seq_history_ops = { .start = ext4_mb_seq_history_start, .next = ext4_mb_seq_history_next, .stop = ext4_mb_seq_history_stop, @@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file, return count; } -static struct file_operations ext4_mb_seq_history_fops = { +static const struct file_operations ext4_mb_seq_history_fops = { .owner = THIS_MODULE, .open = ext4_mb_seq_history_open, .read = seq_read, @@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_buddy e4b; struct sg { struct ext4_group_info info; - unsigned short counters[16]; + ext4_grpblk_t counters[16]; } sg; group--; @@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } -static struct seq_operations ext4_mb_seq_groups_ops = { +static const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, @@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) } -static struct file_operations ext4_mb_seq_groups_fops = { +static const struct file_operations ext4_mb_seq_groups_fops = { .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, @@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); - meta_group_info[i]->bb_free_root.rb_node = NULL;; + meta_group_info[i]->bb_free_root.rb_node = NULL; #ifdef DOUBLE_CHECK { @@ -2558,26 +2559,15 @@ exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ -/* - * Update an existing group. - * This function is used for online resize - */ -void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) -{ - grp->bb_free += add; -} - static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; - int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int num_meta_group_infos; int num_meta_group_infos_max; int array_size; - struct ext4_group_info **meta_group_info; struct ext4_group_desc *desc; /* This is the number of blocks used by GDT */ @@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb) goto err_freesgi; } EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; - - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); - for (i = 0; i < num_meta_group_infos; i++) { - if ((i + 1) == num_meta_group_infos) - metalen = sizeof(*meta_group_info) * - (ngroups - - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); - meta_group_info = kmalloc(metalen, GFP_KERNEL); - if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); - goto err_freemeta; - } - sbi->s_group_info[i] = meta_group_info; - } - for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { @@ -2655,7 +2629,6 @@ err_freebuddy: while (i-- > 0) kfree(ext4_get_group_info(sb, i)); i = num_meta_group_infos; -err_freemeta: while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); @@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned max; int ret; - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { return -ENOMEM; } - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { kfree(sbi->s_mb_offsets); @@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) kmem_cache_free(ext4_pspace_cachep, pa); } if (count) - mb_debug("mballoc: %u PAs left\n", count); + mb_debug(1, "mballoc: %u PAs left\n", count); } @@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) list_for_each_safe(l, ltmp, &txn->t_private_list) { entry = list_entry(l, struct ext4_free_data, list); - mb_debug("gonna free %u blocks in group %u (0x%p):", + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); err = ext4_mb_load_buddy(sb, entry->group, &e4b); @@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) ext4_mb_release_desc(&e4b); } - mb_debug("freed %u blocks in %u structures\n", count, count2); + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); +} + +#ifdef CONFIG_EXT4_DEBUG +u8 mb_enable_debug __read_mostly; + +static struct dentry *debugfs_dir; +static struct dentry *debugfs_debug; + +static void __init ext4_create_debugfs_entry(void) +{ + debugfs_dir = debugfs_create_dir("ext4", NULL); + if (debugfs_dir) + debugfs_debug = debugfs_create_u8("mballoc-debug", + S_IRUGO | S_IWUSR, + debugfs_dir, + &mb_enable_debug); +} + +static void ext4_remove_debugfs_entry(void) +{ + debugfs_remove(debugfs_debug); + debugfs_remove(debugfs_dir); } +#else + +static void __init ext4_create_debugfs_entry(void) +{ +} + +static void ext4_remove_debugfs_entry(void) +{ +} + +#endif + int __init init_ext4_mballoc(void) { ext4_pspace_cachep = @@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; } + ext4_create_debugfs_entry(); return 0; } @@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_ext_cachep); + ext4_remove_debugfs_entry(); } @@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; else ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug("#%u: goal %u blocks for locality group\n", + mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || ac->ac_o_ex.fe_logical < pa->pa_lstart)); - /* skip PA normalized request doesn't overlap with */ - if (pa->pa_lstart >= end) { - spin_unlock(&pa->pa_lock); - continue; - } - if (pa_end <= start) { + /* skip PAs this normalized request doesn't overlap with */ + if (pa->pa_lstart >= end || pa_end <= start) { spin_unlock(&pa->pa_lock); continue; } BUG_ON(pa->pa_lstart <= start && pa_end >= end); + /* adjust start or end to be adjacent to this pa */ if (pa_end <= ac->ac_o_ex.fe_logical) { BUG_ON(pa_end < start); start = pa_end; - } - - if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { BUG_ON(pa->pa_lstart > end); end = pa->pa_lstart; } @@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, + mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, (unsigned) orig_size, (unsigned) start); } @@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); + mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); } /* @@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ - mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); + mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); } /* @@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) continue; + /* non-extent files can't have physical blocks past 2^32 */ + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + continue; + /* found preallocated blocks, use them */ spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free) { @@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, preallocated += len; count++; } - mb_debug("prellocated %u for group %u\n", preallocated, group); + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; - mb_debug("new inode pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); @@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; - mb_debug("new group pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); @@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, next = mb_find_next_bit(bitmap_bh->b_data, end, bit); start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + le32_to_cpu(sbi->s_es->s_first_data_block); - mb_debug(" free preallocated %u/%u in group %u\n", + mb_debug(1, " free preallocated %u/%u in group %u\n", (unsigned) start, (unsigned) next - bit, (unsigned) group); free += next - bit; @@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug("discard preallocation for group %u\n", group); + mb_debug(1, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; @@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode) return; } - mb_debug("discard preallocation for inode %lu\n", inode->i_ino); + mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); @@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, { BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); } -#ifdef MB_DEBUG +#ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; @@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%lu:%d:%u \n", i, - start, pa->pa_len); + printk(KERN_ERR "PA:%u:%d:%u \n", i, + start, pa->pa_len); } ext4_unlock_group(sb, i); if (grp->bb_free == 0) continue; - printk(KERN_ERR "%lu: %d/%d \n", + printk(KERN_ERR "%u: %d/%d \n", i, grp->bb_free, grp->bb_fragments); } printk(KERN_ERR "\n"); @@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; size = max(size, isize); - /* don't use group allocation for large files */ - if (size >= sbi->s_mb_stream_request) + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; + } - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + /* don't use group allocation for large files */ + if (size >= sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; + } BUG_ON(ac->ac_lg != NULL); /* @@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); - mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, @@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_prealloc_space *pa, *tmp; struct ext4_allocation_context *ac; - mb_debug("discard locality group preallocation\n"); + mb_debug(1, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index c96bb19f58f9..188d3d709b24 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -37,11 +37,19 @@ /* */ -#define MB_DEBUG__ -#ifdef MB_DEBUG -#define mb_debug(fmt, a...) printk(fmt, ##a) +#ifdef CONFIG_EXT4_DEBUG +extern u8 mb_enable_debug; + +#define mb_debug(n, fmt, a...) \ + do { \ + if ((n) <= mb_enable_debug) { \ + printk(KERN_DEBUG "(%s, %d): %s: ", \ + __FILE__, __LINE__, __func__); \ + printk(fmt, ## a); \ + } \ + } while (0) #else -#define mb_debug(fmt, a...) +#define mb_debug(n, fmt, a...) #endif /* @@ -128,8 +136,8 @@ struct ext4_prealloc_space { unsigned pa_deleted; ext4_fsblk_t pa_pstart; /* phys. block */ ext4_lblk_t pa_lstart; /* log. block */ - unsigned short pa_len; /* len of preallocated chunk */ - unsigned short pa_free; /* how many blocks are free */ + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ spinlock_t *pa_obj_lock; struct inode *pa_inode; /* hack, for history only */ @@ -144,7 +152,7 @@ struct ext4_free_extent { ext4_lblk_t fe_logical; ext4_grpblk_t fe_start; ext4_group_t fe_group; - int fe_len; + ext4_grpblk_t fe_len; }; /* diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 313a50b39741..bf519f239ae6 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, down_write(&EXT4_I(inode)->i_data_sem); /* - * if EXT4_EXT_MIGRATE is cleared a block allocation + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation * happened after we started the migrate. We need to * fail the migrate */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { + if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & - ~EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; /* * We have the extent map build with the tmp inode. * Now copy the i_data across @@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode) * when we add extents we extent the journal */ /* - * Even though we take i_mutex we can still cause block allocation - * via mmap write to holes. If we have allocated new blocks we fail - * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. - * The flag is updated with i_data_sem held to prevent racing with - * block allocation. + * Even though we take i_mutex we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. */ down_read((&EXT4_I(inode)->i_data_sem)); - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; + EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, 1); @@ -618,7 +618,7 @@ err_out: tmp_inode->i_nlink = 0; ext4_journal_stop(handle); - + unlock_new_inode(tmp_inode); iput(tmp_inode); return retval; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index bbf2dd9404dc..c07a2915e40b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -19,14 +19,31 @@ #include "ext4_extents.h" #include "ext4.h" -#define get_ext_path(path, inode, block, ret) \ - do { \ - path = ext4_ext_find_extent(inode, block, path); \ - if (IS_ERR(path)) { \ - ret = PTR_ERR(path); \ - path = NULL; \ - } \ - } while (0) +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * on failure. + */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **path) +{ + int ret = 0; + + *path = ext4_ext_find_extent(inode, lblock, *path); + if (IS_ERR(*path)) { + ret = PTR_ERR(*path); + *path = NULL; + } else if ((*path)[ext_depth(inode)].p_ext == NULL) + ret = -ENODATA; + + return ret; +} /** * copy_extent_status - Copy the extent's initialization status @@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, } /** + * mext_check_null_inode - NULL check for two inodes + * + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. + */ +static int +mext_check_null_inode(struct inode *inode1, struct inode *inode2, + const char *function) +{ + int ret = 0; + + if (inode1 == NULL) { + ext4_error(inode2->i_sb, function, + "Both inodes should not be NULL: " + "inode1 NULL inode2 %lu", inode2->i_ino); + ret = -EIO; + } else if (inode2 == NULL) { + ext4_error(inode1->i_sb, function, + "Both inodes should not be NULL: " + "inode1 %lu inode2 NULL", inode1->i_ino); + ret = -EIO; + } + return ret; +} + +/** * mext_double_down_read - Acquire two inodes' read semaphore * * @orig_inode: original inode structure @@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; - BUG_ON(orig_inode == NULL || donor_inode == NULL); - /* * Use the inode number to provide the stable locking order instead * of its address, because the C language doesn't guarantee you can @@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) { struct inode *first = orig_inode, *second = donor_inode; - BUG_ON(orig_inode == NULL || donor_inode == NULL); - /* * Use the inode number to provide the stable locking order instead * of its address, because the C language doesn't guarantee you can @@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) static void mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) { - BUG_ON(orig_inode == NULL || donor_inode == NULL); - up_read(&EXT4_I(orig_inode)->i_data_sem); up_read(&EXT4_I(donor_inode)->i_data_sem); } @@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) static void mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) { - BUG_ON(orig_inode == NULL || donor_inode == NULL); - up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); } @@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, } if (new_flag) { - get_ext_path(orig_path, orig_inode, eblock, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, eblock, &orig_path); + if (err) goto out; if (ext4_ext_insert_extent(handle, orig_inode, @@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, } if (end_flag) { - get_ext_path(orig_path, orig_inode, - le32_to_cpu(end_ext->ee_block) - 1, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (err) goto out; if (ext4_ext_insert_extent(handle, orig_inode, @@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * oext |-----------| * new_ext |-------| */ - BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { + ext4_error(orig_inode->i_sb, __func__, + "new_ext_end(%u) should be less than or equal to " + "oext->ee_block(%u) + oext_alen(%d) - 1", + new_ext_end, le32_to_cpu(oext->ee_block), + oext_alen); + ret = -EIO; + goto out; + } /* * Case: new_ext is smaller than original extent @@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, o_end, &start_ext, &new_ext, &end_ext); +out: return ret; } @@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, * @orig_off: block offset of original inode * @donor_off: block offset of donor inode * @max_count: the maximun length of extents + * + * Return 0 on success, or a negative error value on failure. */ -static void +static int mext_calc_swap_extents(struct ext4_extent *tmp_dext, struct ext4_extent *tmp_oext, ext4_lblk_t orig_off, ext4_lblk_t donor_off, @@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, ext4_lblk_t diff, orig_diff; struct ext4_extent dext_old, oext_old; + BUG_ON(orig_off != donor_off); + + /* original and donor extents have to cover the same block offset */ + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || + le32_to_cpu(tmp_oext->ee_block) + + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) + return -ENODATA; + + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || + le32_to_cpu(tmp_dext->ee_block) + + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) + return -ENODATA; + dext_old = *tmp_dext; oext_old = *tmp_oext; @@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, copy_extent_status(&oext_old, tmp_dext); copy_extent_status(&dext_old, tmp_oext); + + return 0; } /** @@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, mext_double_down_write(orig_inode, donor_inode); /* Get the original extent for the block "orig_off" */ - get_ext_path(orig_path, orig_inode, orig_off, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, orig_off, &orig_path); + if (err) goto out; /* Get the donor extent for the head */ - get_ext_path(donor_path, donor_inode, donor_off, err); - if (donor_path == NULL) + err = get_ext_path(donor_inode, donor_off, &donor_path); + if (err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, dext = donor_path[depth].p_ext; tmp_dext = *dext; - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, donor_off, count); + if (err) + goto out; /* Loop for the donor extents */ while (1) { /* The extent for donor must be found. */ - BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); + if (!dext) { + ext4_error(donor_inode->i_sb, __func__, + "The extent for donor must be found"); + err = -EIO; + goto out; + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { + ext4_error(donor_inode->i_sb, __func__, + "Donor offset(%u) and the first block of donor " + "extent(%u) should be equal", + donor_off, + le32_to_cpu(tmp_dext.ee_block)); + err = -EIO; + goto out; + } /* Set donor extent to orig extent */ err = mext_leaf_block(handle, orig_inode, @@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (orig_path) ext4_ext_drop_refs(orig_path); - get_ext_path(orig_path, orig_inode, orig_off, err); - if (orig_path == NULL) + err = get_ext_path(orig_inode, orig_off, &orig_path); + if (err) goto out; depth = ext_depth(orig_inode); oext = orig_path[depth].p_ext; @@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, if (donor_path) ext4_ext_drop_refs(donor_path); - get_ext_path(donor_path, donor_inode, - donor_off, err); - if (donor_path == NULL) + err = get_ext_path(donor_inode, donor_off, &donor_path); + if (err) goto out; depth = ext_depth(donor_inode); dext = donor_path[depth].p_ext; @@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, } tmp_dext = *dext; - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, - count - replaced_count); + err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count - replaced_count); + if (err) + goto out; } out: @@ -740,7 +815,7 @@ out: * on success, or a negative error value on failure. */ static int -move_extent_par_page(struct file *o_filp, struct inode *donor_inode, +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, int data_offset_in_page, int block_len_in_page, int uninit) { @@ -871,6 +946,7 @@ out: if (PageLocked(page)) unlock_page(page); page_cache_release(page); + ext4_journal_stop(handle); } out2: ext4_journal_stop(handle); @@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len, __u64 moved_len) { + ext4_lblk_t orig_blocks, donor_blocks; + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { ext4_debug("ext4 move extent: The argument files should be " @@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if ((orig_start > MAX_DEFRAG_SIZE) || - (donor_start > MAX_DEFRAG_SIZE) || - (*len > MAX_DEFRAG_SIZE) || - (orig_start + *len > MAX_DEFRAG_SIZE)) { - ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " - "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, + if ((orig_start > EXT_MAX_BLOCK) || + (donor_start > EXT_MAX_BLOCK) || + (*len > EXT_MAX_BLOCK) || + (orig_start + *len > EXT_MAX_BLOCK)) { + ext4_debug("ext4 move extent: Can't handle over [%u] blocks " + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (orig_inode->i_size > donor_inode->i_size) { - if (orig_start >= donor_inode->i_size) { + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; + /* TODO: eliminate this artificial restriction */ + if (orig_start >= donor_blocks) { ext4_debug("ext4 move extent: orig start offset " - "[%llu] should be less than donor file size " - "[%lld] [ino:orig %lu, donor_inode %lu]\n", - orig_start, donor_inode->i_size, + "[%llu] should be less than donor file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, donor_blocks, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_start + *len > donor_inode->i_size) { + /* TODO: eliminate this artificial restriction */ + if (orig_start + *len > donor_blocks) { ext4_debug("ext4 move extent: End offset [%llu] should " - "be less than donor file size [%lld]." - "So adjust length from %llu to %lld " + "be less than donor file blocks [%u]." + "So adjust length from %llu to %llu " "[ino:orig %lu, donor %lu]\n", - orig_start + *len, donor_inode->i_size, - *len, donor_inode->i_size - orig_start, + orig_start + *len, donor_blocks, + *len, donor_blocks - orig_start, orig_inode->i_ino, donor_inode->i_ino); - *len = donor_inode->i_size - orig_start; + *len = donor_blocks - orig_start; } } else { - if (orig_start >= orig_inode->i_size) { + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; + if (orig_start >= orig_blocks) { ext4_debug("ext4 move extent: start offset [%llu] " - "should be less than original file size " - "[%lld] [inode:orig %lu, donor %lu]\n", - orig_start, orig_inode->i_size, + "should be less than original file blocks " + "[%u] [ino:orig %lu, donor %lu]\n", + orig_start, orig_blocks, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - if (orig_start + *len > orig_inode->i_size) { + if (orig_start + *len > orig_blocks) { ext4_debug("ext4 move extent: Adjust length " - "from %llu to %lld. Because it should be " - "less than original file size " + "from %llu to %llu. Because it should be " + "less than original file blocks " "[ino:orig %lu, donor %lu]\n", - *len, orig_inode->i_size - orig_start, + *len, orig_blocks - orig_start, orig_inode->i_ino, donor_inode->i_ino); - *len = orig_inode->i_size - orig_start; + *len = orig_blocks - orig_start; } } @@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode, * @inode1: the inode structure * @inode2: the inode structure * - * Lock two inodes' i_mutex by i_ino order. This function is moved from - * fs/inode.c. + * Lock two inodes' i_mutex by i_ino order. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static void +static int mext_inode_double_lock(struct inode *inode1, struct inode *inode2) { - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { - if (inode1) - mutex_lock(&inode1->i_mutex); - else if (inode2) - mutex_lock(&inode2->i_mutex); - return; + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__); + if (ret < 0) + goto out; + + if (inode1 == inode2) { + mutex_lock(&inode1->i_mutex); + goto out; } if (inode1->i_ino < inode2->i_ino) { @@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); } + +out: + return ret; } /** @@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) * @inode1: the inode that is released first * @inode2: the inode that is released second * - * This function is moved from fs/inode.c. + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. */ -static void +static int mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) { + int ret = 0; + + BUG_ON(inode1 == NULL && inode2 == NULL); + + ret = mext_check_null_inode(inode1, inode2, __func__); + if (ret < 0) + goto out; + if (inode1) mutex_unlock(&inode1->i_mutex); if (inode2 && inode2 != inode1) mutex_unlock(&inode2->i_mutex); + +out: + return ret; } /** @@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; ext4_lblk_t rest_blocks; pgoff_t orig_page_offset = 0, seq_end_page; - int ret, depth, last_extent = 0; + int ret1, ret2, depth, last_extent = 0; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; int data_offset_in_page; int block_len_in_page; int uninit; /* protect orig and donor against a truncate */ - mext_inode_double_lock(orig_inode, donor_inode); + ret1 = mext_inode_double_lock(orig_inode, donor_inode); + if (ret1 < 0) + return ret1; mext_double_down_read(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, donor_start, &len, *moved_len); mext_double_up_read(orig_inode, donor_inode); - if (ret) - goto out2; + if (ret1) + goto out; file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; block_end = block_start + len - 1; if (file_end < block_end) len -= block_end - file_end; - get_ext_path(orig_path, orig_inode, block_start, ret); - if (orig_path == NULL) - goto out2; + ret1 = get_ext_path(orig_inode, block_start, &orig_path); + if (ret1) + goto out; /* Get path structure to check the hole */ - get_ext_path(holecheck_path, orig_inode, block_start, ret); - if (holecheck_path == NULL) + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); + if (ret1) goto out; depth = ext_depth(orig_inode); ext_cur = holecheck_path[depth].p_ext; - if (ext_cur == NULL) { - ret = -EINVAL; - goto out; - } /* - * Get proper extent whose ee_block is beyond block_start - * if block_start was within the hole. + * Get proper starting location of block replacement if block_start was + * within the hole. */ if (le32_to_cpu(ext_cur->ee_block) + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { + /* + * The hole exists between extents or the tail of + * original file. + */ last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; goto out; } last_extent = mext_next_extent(orig_inode, orig_path, &ext_dummy); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; goto out; } - } - seq_start = block_start; + seq_start = le32_to_cpu(ext_cur->ee_block); + } else if (le32_to_cpu(ext_cur->ee_block) > block_start) + /* The hole exists at the beginning of original file. */ + seq_start = le32_to_cpu(ext_cur->ee_block); + else + seq_start = block_start; /* No blocks within the specified range. */ if (le32_to_cpu(ext_cur->ee_block) > block_end) { ext4_debug("ext4 move extent: The specified range of file " "may be the hole\n"); - ret = -EINVAL; + ret1 = -EINVAL; goto out; } @@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, last_extent = mext_next_extent(orig_inode, holecheck_path, &ext_cur); if (last_extent < 0) { - ret = last_extent; + ret1 = last_extent; break; } add_blocks = ext4_ext_get_actual_len(ext_cur); @@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, while (orig_page_offset <= seq_end_page) { /* Swap original branches with new branches */ - ret = move_extent_par_page(o_filp, donor_inode, + ret1 = move_extent_per_page(o_filp, donor_inode, orig_page_offset, data_offset_in_page, block_len_in_page, uninit); - if (ret < 0) + if (ret1 < 0) goto out; orig_page_offset++; /* Count how many blocks we have exchanged */ *moved_len += block_len_in_page; - BUG_ON(*moved_len > len); + if (*moved_len > len) { + ext4_error(orig_inode->i_sb, __func__, + "We replaced blocks too much! " + "sum of replaced: %llu requested: %llu", + *moved_len, len); + ret1 = -EIO; + goto out; + } data_offset_in_page = 0; rest_blocks -= block_len_in_page; @@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, /* Decrease buffer counter */ if (holecheck_path) ext4_ext_drop_refs(holecheck_path); - get_ext_path(holecheck_path, orig_inode, - seq_start, ret); - if (holecheck_path == NULL) + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret1) break; depth = holecheck_path->p_depth; /* Decrease buffer counter */ if (orig_path) ext4_ext_drop_refs(orig_path); - get_ext_path(orig_path, orig_inode, seq_start, ret); - if (orig_path == NULL) + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret1) break; ext_cur = holecheck_path[depth].p_ext; @@ -1307,14 +1422,13 @@ out: ext4_ext_drop_refs(holecheck_path); kfree(holecheck_path); } -out2: - mext_inode_double_unlock(orig_inode, donor_inode); - if (ret) - return ret; + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); - /* All of the specified blocks must be exchanged in succeed */ - BUG_ON(*moved_len != len); + if (ret1) + return ret1; + else if (ret2) + return ret2; return 0; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 114abe5d2c1d..42f81d285cd5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + retval = make_indexed_dir(handle, dentry, inode, bh); + if (retval == -ENOSPC) + brelse(bh); + return retval; + } brelse(bh); } bh = ext4_append(handle, dir, &block, &retval); @@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + if (retval == -ENOSPC) + brelse(bh); + return retval; } /* @@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); - node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, frame->bh); if (err) @@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; + if (err != -ENOSPC) + bh = NULL; goto cleanup; journal_error: @@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry, struct inode *inode = old_dentry->d_inode; int err, retries = 0; - if (EXT4_DIR_LINK_MAX(inode)) + if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; /* @@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && - new_dir->i_nlink >= EXT4_LINK_MAX) + EXT4_DIR_LINK_MAX(new_dir)) goto end_rename; } if (!new_bh) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 68b0351fc647..3cfc343c41b5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) struct inode *inode = NULL; handle_t *handle; int gdb_off, gdb_num; - int num_grp_locked = 0; int err, err2; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); @@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * using the new disk blocks. */ - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)((char *)primary->b_data + gdb_off * EXT4_DESC_SIZE(sb)); @@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * descriptor */ err = ext4_mb_add_groupinfo(sb, input->group, gdp); - if (err) { - ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); + if (err) goto exit_journal; - } /* * Make the new blocks and inodes valid next. We do this before @@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; - ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); ext4_handle_dirty_metadata(handle, NULL, primary); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f4f079e6b9a..a6b1ab734728 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,6 +45,7 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "mballoc.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> @@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, errstr = "Out of memory"; break; case -EROFS: - if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; @@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb, *journal_devnum = option; break; case Opt_journal_checksum: - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); - break; + break; /* Kept for backwards compatibility */ case Opt_journal_async_commit: set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); break; case Opt_noload: set_opt(sbi->s_mount_opt, NOLOAD); @@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, - ext4_free_inodes_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, - ext4_free_blks_count(sb, gdp)); - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, - ext4_used_dirs_count(sb, gdp)); + atomic_add(ext4_free_inodes_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_inodes); + atomic_add(ext4_free_blks_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_blocks); + atomic_add(ext4_used_dirs_count(sb, gdp), + &sbi->s_flex_groups[flex_group].used_dirs); } return 1; @@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = { .release = ext4_sb_release, }; +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be. + */ +static int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (readonly) + return 1; + + /* Check that feature set is OK for a read-write mount */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + /* + * Large file size enabled file system can only be mounted + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (sizeof(blkcnt_t) < sizeof(u64)) { + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " + "cannot be mounted RDWR without " + "CONFIG_LBDAF"); + return 0; + } + } + return 1; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int db_count; unsigned int i; int needs_recovery, has_huge_files; - int features; __u64 blocks_count; int err; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ - features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); - if (features) { - ext4_msg(sb, KERN_ERR, - "Couldn't mount because of " - "unsupported optional features (%x)", - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & - ~EXT4_FEATURE_INCOMPAT_SUPP)); - goto failed_mount; - } - features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); - if (!(sb->s_flags & MS_RDONLY) && features) { - ext4_msg(sb, KERN_ERR, - "Couldn't mount RDWR because of " - "unsupported optional features (%x)", - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & - ~EXT4_FEATURE_RO_COMPAT_SUPP)); + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - } - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (has_huge_files) { - /* - * Large file size enabled file system can only be - * mount if kernel is build with CONFIG_LBDAF - */ - if (sizeof(root->i_blocks) < sizeof(u64) && - !(sb->s_flags & MS_RDONLY)) { - ext4_msg(sb, KERN_ERR, "Filesystem with huge " - "files cannot be mounted read-write " - "without CONFIG_LBDAF"); - goto failed_mount; - } - } + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT4_MIN_BLOCK_SIZE || @@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); @@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (ext4_blocks_count(es) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + if ((ext4_blocks_count(es) > + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || + (ext4_blocks_count(es) > + (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { ext4_msg(sb, KERN_ERR, "filesystem" - " too large to mount safely"); + " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = -EFBIG; goto failed_mount; } @@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), @@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, + jbd2_journal_set_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + jbd2_journal_set_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else if (test_opt(sb, JOURNAL_CHECKSUM)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); + else jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ @@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync) clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } - es->s_wtime = cpu_to_le32(get_seconds()); + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - @@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_mark_recovery_complete(sb, es); } else { - int ret; - if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, - ~EXT4_FEATURE_RO_COMPAT_SUPP))) { - ext4_msg(sb, KERN_WARNING, "couldn't " - "remount RDWR because of unsupported " - "optional features (%x)", - (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & - ~EXT4_FEATURE_RO_COMPAT_SUPP)); + /* Make sure we can mount this feature set readwrite */ + if (!ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } - /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 62b31c246994..fed5b01d7a8d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -810,12 +810,23 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - ext4_fsblk_t goal = ext4_group_first_block_no(sb, + ext4_fsblk_t goal, block; + + goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + block = ext4_new_meta_blocks(handle, inode, goal, NULL, &error); if (error) goto cleanup; + + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + ea_idebug(inode, "creating block %d", block); new_bh = sb_getblk(sb, block); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0df600e9162d..26d991ddc1e6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -25,6 +25,7 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/bio.h> +#include <linux/blkdev.h> #include <trace/events/jbd2.h> /* @@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal, bh->b_end_io = journal_end_buffer_io_sync; if (journal->j_flags & JBD2_BARRIER && - !JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + !JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { set_buffer_ordered(bh); barrier_done = 1; } @@ -706,11 +707,13 @@ start_journal_io: /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); + if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(journal->j_dev, NULL); } /* @@ -833,7 +836,7 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e378cb383979..a8a358bc0f21 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6213ac728f30..a0512700542f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); @@ -238,6 +238,8 @@ repeat_locked: __jbd2_log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); out: if (unlikely(new_transaction)) /* It's usually NULL */ kfree(new_transaction); @@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) handle = ERR_PTR(err); goto out; } - - lock_map_acquire(&handle->h_lockdep_map); out: return handle; } @@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) __jbd2_log_start_commit(journal, transaction->t_tid); spin_unlock(&journal->j_state_lock); + lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle); return ret; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d97eb652d6ca..52695d3dfd0b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -652,7 +652,7 @@ struct transaction_s * This transaction is being forced and some process is * waiting for it to finish. */ - int t_synchronous_commit:1; + unsigned int t_synchronous_commit:1; /* * For use by the filesystem to store fs-specific data diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 8d433c4e3709..c1bd8f1e8b94 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -5,10 +5,15 @@ #define _TRACE_EXT4_H #include <linux/writeback.h> -#include "../../../fs/ext4/ext4.h" -#include "../../../fs/ext4/mballoc.h" #include <linux/tracepoint.h> +struct ext4_allocation_context; +struct ext4_allocation_request; +struct ext4_prealloc_space; +struct ext4_inode_info; + +#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) + TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), @@ -33,8 +38,8 @@ TRACE_EVENT(ext4_free_inode, ), TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, - __entry->uid, __entry->gid, + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, __entry->uid, __entry->gid, (unsigned long long) __entry->blocks) ); @@ -56,7 +61,8 @@ TRACE_EVENT(ext4_request_inode, ), TP_printk("dev %s dir %lu mode %d", - jbd2_dev_to_name(__entry->dev), __entry->dir, __entry->mode) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir, + __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, @@ -79,7 +85,8 @@ TRACE_EVENT(ext4_allocate_inode, ), TP_printk("dev %s ino %lu dir %lu mode %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->dir, __entry->mode) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_write_begin, @@ -106,8 +113,8 @@ TRACE_EVENT(ext4_write_begin, ), TP_printk("dev %s ino %lu pos %llu len %u flags %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->flags) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->flags) ); TRACE_EVENT(ext4_ordered_write_end, @@ -133,8 +140,8 @@ TRACE_EVENT(ext4_ordered_write_end, ), TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->copied) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); TRACE_EVENT(ext4_writeback_write_end, @@ -160,8 +167,8 @@ TRACE_EVENT(ext4_writeback_write_end, ), TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->copied) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); TRACE_EVENT(ext4_journalled_write_end, @@ -186,8 +193,8 @@ TRACE_EVENT(ext4_journalled_write_end, ), TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->copied) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); TRACE_EVENT(ext4_writepage, @@ -209,7 +216,8 @@ TRACE_EVENT(ext4_writepage, ), TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->index) ); TRACE_EVENT(ext4_da_writepages, @@ -243,14 +251,49 @@ TRACE_EVENT(ext4_da_writepages, __entry->range_cyclic = wbc->range_cyclic; ), - TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", + jbd2_dev_to_name(__entry->dev), + (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic) ); +TRACE_EVENT(ext4_da_write_pages, + TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), + + TP_ARGS(inode, mpd), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u64, b_blocknr ) + __field( __u32, b_size ) + __field( __u32, b_state ) + __field( unsigned long, first_page ) + __field( int, io_done ) + __field( int, pages_written ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->b_blocknr = mpd->b_blocknr; + __entry->b_size = mpd->b_size; + __entry->b_state = mpd->b_state; + __entry->first_page = mpd->first_page; + __entry->io_done = mpd->io_done; + __entry->pages_written = mpd->pages_written; + ), + + TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->b_blocknr, __entry->b_size, + __entry->b_state, __entry->first_page, + __entry->io_done, __entry->pages_written) +); + TRACE_EVENT(ext4_da_writepages_result, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int ret, int pages_written), @@ -280,7 +323,8 @@ TRACE_EVENT(ext4_da_writepages_result, ), TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, + jbd2_dev_to_name(__entry->dev), + (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->encountered_congestion, __entry->more_io, __entry->no_nrwrite_index_update) @@ -309,8 +353,8 @@ TRACE_EVENT(ext4_da_write_begin, ), TP_printk("dev %s ino %lu pos %llu len %u flags %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->flags) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->flags) ); TRACE_EVENT(ext4_da_write_end, @@ -336,8 +380,8 @@ TRACE_EVENT(ext4_da_write_end, ), TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pos, __entry->len, - __entry->copied) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); TRACE_EVENT(ext4_discard_blocks, @@ -387,8 +431,8 @@ TRACE_EVENT(ext4_mb_new_inode_pa, ), TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, - __entry->pa_len, __entry->pa_lstart) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); TRACE_EVENT(ext4_mb_new_group_pa, @@ -415,8 +459,8 @@ TRACE_EVENT(ext4_mb_new_group_pa, ), TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->pa_pstart, - __entry->pa_len, __entry->pa_lstart) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); TRACE_EVENT(ext4_mb_release_inode_pa, @@ -442,8 +486,8 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ), TP_printk("dev %s ino %lu block %llu count %u", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, - __entry->count) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, @@ -488,7 +532,7 @@ TRACE_EVENT(ext4_discard_preallocations, ), TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_mb_discard_preallocations, @@ -543,8 +587,8 @@ TRACE_EVENT(ext4_request_blocks, ), TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, - __entry->len, + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->flags, __entry->len, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, (unsigned long long) __entry->lleft, @@ -587,8 +631,8 @@ TRACE_EVENT(ext4_allocate_blocks, ), TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->flags, - __entry->len, __entry->block, + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->flags, __entry->len, __entry->block, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, (unsigned long long) __entry->lleft, @@ -621,8 +665,8 @@ TRACE_EVENT(ext4_free_blocks, ), TP_printk("dev %s ino %lu block %llu count %lu metadata %d", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->block, - __entry->count, __entry->metadata) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->block, __entry->count, __entry->metadata) ); TRACE_EVENT(ext4_sync_file, @@ -645,8 +689,8 @@ TRACE_EVENT(ext4_sync_file, ), TP_printk("dev %s ino %ld parent %ld datasync %d ", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->parent, - __entry->datasync) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + (unsigned long) __entry->parent, __entry->datasync) ); TRACE_EVENT(ext4_sync_fs, @@ -669,6 +713,30 @@ TRACE_EVENT(ext4_sync_fs, __entry->wait) ); +TRACE_EVENT(ext4_alloc_da_blocks, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( unsigned int, data_blocks ) + __field( unsigned int, meta_blocks ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; + __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; + ), + + TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->data_blocks, __entry->meta_blocks) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 10813fa0c8d0..b851f0b4701c 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -159,7 +159,7 @@ TRACE_EVENT(jbd2_submit_inode_data, ), TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino) + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) ); #endif /* _TRACE_JBD2_H */ |