diff options
author | Jose R. Santos <jrs@us.ibm.com> | 2008-07-11 19:27:31 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-07-11 19:27:31 -0400 |
commit | 772cb7c83ba256a11c7bf99a11bef3858d23767c (patch) | |
tree | a42b97e5cbd870a76b2646c2dcb658a92c53f637 /fs/ext4/ialloc.c | |
parent | 736603ab297506f4396cb5af592004499950fcfd (diff) | |
download | lwn-772cb7c83ba256a11c7bf99a11bef3858d23767c.tar.gz lwn-772cb7c83ba256a11c7bf99a11bef3858d23767c.zip |
ext4: New inode allocation for FLEX_BG meta-data groups.
This patch mostly controls the way inode are allocated in order to
make ialloc aware of flex_bg block group grouping. It achieves this
by bypassing the Orlov allocator when block group meta-data are packed
toghether through mke2fs. Since the impact on the block allocator is
minimal, this patch should have little or no effect on other block
allocation algorithms. By controlling the inode allocation, it can
basically control where the initial search for new block begins and
thus indirectly manipulate the block allocator.
This allocator favors data and meta-data locality so the disk will
gradually be filled from block group zero upward. This helps improve
performance by reducing seek time. Since the group of inode tables
within one flex_bg are treated as one giant inode table, uninitialized
block groups would not need to partially initialize as many inode
table as with Orlov which would help fsck time as the filesystem usage
goes up.
Signed-off-by: Jose R. Santos <jrs@us.ibm.com>
Signed-off-by: Valerie Clement <valerie.clement@bull.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r-- | fs/ext4/ialloc.c | 96 |
1 files changed, 96 insertions, 0 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b30cc79b9fcb..8b0a10acd708 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) struct ext4_super_block * es; struct ext4_sb_info *sbi; int fatal = 0, err; + ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk ("ext4_free_inode: inode has count=%d\n", @@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, block_group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes++; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } } BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); err = ext4_journal_dirty_metadata(handle, bh2); @@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, return ret; } +#define free_block_ratio 10 + +static int find_group_flex(struct super_block *sb, struct inode *parent, + ext4_group_t *best_group) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *desc; + struct buffer_head *bh; + struct flex_groups *flex_group = sbi->s_flex_groups; + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); + ext4_group_t ngroups = sbi->s_groups_count; + int flex_size = ext4_flex_bg_size(sbi); + ext4_group_t best_flex = parent_fbg_group; + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; + int flexbg_free_blocks; + int flex_freeb_ratio; + ext4_group_t n_fbg_groups; + ext4_group_t i; + + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + sbi->s_log_groups_per_flex; + +find_close_to_parent: + flexbg_free_blocks = flex_group[best_flex].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + if (flex_group[best_flex].free_inodes && + flex_freeb_ratio > free_block_ratio) + goto found_flexbg; + + if (best_flex && best_flex == parent_fbg_group) { + best_flex--; + goto find_close_to_parent; + } + + for (i = 0; i < n_fbg_groups; i++) { + if (i == parent_fbg_group || i == parent_fbg_group - 1) + continue; + + flexbg_free_blocks = flex_group[i].free_blocks; + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; + + if (flex_freeb_ratio > free_block_ratio && + flex_group[i].free_inodes) { + best_flex = i; + goto found_flexbg; + } + + if (best_flex < 0 || + (flex_group[i].free_blocks > + flex_group[best_flex].free_blocks && + flex_group[i].free_inodes)) + best_flex = i; + } + + if (!flex_group[best_flex].free_inodes || + !flex_group[best_flex].free_blocks) + return -1; + +found_flexbg: + for (i = best_flex * flex_size; i < ngroups && + i < (best_flex + 1) * flex_size; i++) { + desc = ext4_get_group_desc(sb, i, &bh); + if (le16_to_cpu(desc->bg_free_inodes_count)) { + *best_group = i; + goto out; + } + } + + return -1; +out: + return 0; +} + /* * Orlov's allocator for directories. * @@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) struct inode *ret; ext4_group_t i; int free = 0; + ext4_group_t flex_group; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) sbi = EXT4_SB(sb); es = sbi->s_es; + + if (sbi->s_log_groups_per_flex) { + ret2 = find_group_flex(sb, dir, &group); + goto got_group; + } + if (S_ISDIR(mode)) { if (test_opt (sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); @@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) } else ret2 = find_group_other(sb, dir, &group); +got_group: err = -ENOSPC; if (ret2 == -1) goto out; @@ -676,6 +765,13 @@ got: percpu_counter_inc(&sbi->s_dirs_counter); sb->s_dirt = 1; + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + spin_lock(sb_bgl_lock(sbi, flex_group)); + sbi->s_flex_groups[flex_group].free_inodes--; + spin_unlock(sb_bgl_lock(sbi, flex_group)); + } + inode->i_uid = current->fsuid; if (test_opt (sb, GRPID)) inode->i_gid = dir->i_gid; |