diff options
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r-- | fs/ext4/mballoc.c | 286 |
1 files changed, 207 insertions, 79 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b50c2e5abe67..9b4c6033bd69 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3978,6 +3978,24 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } +/* + * This function returns the next element to look at during inode + * PA rbtree walk. We assume that we have held the inode PA rbtree lock + * (ei->i_prealloc_lock) + * + * new_start The start of the range we want to compare + * cur_start The existing start that we are comparing against + * node The node of the rb_tree + */ +static inline struct rb_node* +ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) +{ + if (new_start < cur_start) + return node->rb_left; + else + return node->rb_right; +} + static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, ext4_lblk_t start, ext4_lblk_t end) @@ -3986,19 +4004,22 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct rb_node *iter; - rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { - spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted == 0) { - tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); - } spin_unlock(&tmp_pa->pa_lock); } - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); } /* @@ -4006,60 +4027,140 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, * and adjust boundaries if the range overlaps with any of the existing * preallocatoins stored in the corresponding inode of the allocation context. * - *Parameters: + * Parameters: * ac allocation context * start start of the new range * end end of the new range */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t *start, ext4_lblk_t *end) + ext4_lblk_t *start, ext4_lblk_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_prealloc_space *tmp_pa; + struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; + struct rb_node *iter; ext4_lblk_t new_start, new_end; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; new_start = *start; new_end = *end; - /* check we don't cross already preallocated blocks */ - rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { - if (tmp_pa->pa_deleted) - continue; - spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted) { - spin_unlock(&tmp_pa->pa_lock); - continue; - } - + /* + * Adjust the normalized range so that it doesn't overlap with any + * existing preallocated blocks(PAs). Make sure to hold the rbtree lock + * so it doesn't change underneath us. + */ + read_lock(&ei->i_prealloc_lock); + + /* Step 1: find any one immediate neighboring PA of the normalized range */ + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); /* PA must not overlap original request */ - BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || - ac->ac_o_ex.fe_logical < tmp_pa_start)); + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) + BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || + ac->ac_o_ex.fe_logical < tmp_pa_start)); + spin_unlock(&tmp_pa->pa_lock); + } - /* skip PAs this normalized request doesn't overlap with */ - if (tmp_pa_start >= new_end || tmp_pa_end <= new_start) { + /* + * Step 2: check if the found PA is left or right neighbor and + * get the other neighbor + */ + if (tmp_pa) { + if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { + struct rb_node *tmp; + + left_pa = tmp_pa; + tmp = rb_next(&left_pa->pa_node.inode_node); + if (tmp) { + right_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } else { + struct rb_node *tmp; + + right_pa = tmp_pa; + tmp = rb_prev(&right_pa->pa_node.inode_node); + if (tmp) { + left_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } + } + + /* Step 3: get the non deleted neighbors */ + if (left_pa) { + for (iter = &left_pa->pa_node.inode_node;; + iter = rb_prev(iter)) { + if (!iter) { + left_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + left_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } spin_unlock(&tmp_pa->pa_lock); - continue; } - BUG_ON(tmp_pa_start <= new_start && tmp_pa_end >= new_end); - - /* adjust start or end to be adjacent to this pa */ - if (tmp_pa_end <= ac->ac_o_ex.fe_logical) { - BUG_ON(tmp_pa_end < new_start); - new_start = tmp_pa_end; - } else if (tmp_pa_start > ac->ac_o_ex.fe_logical) { - BUG_ON(tmp_pa_start > new_end); - new_end = tmp_pa_start; + } + + if (right_pa) { + for (iter = &right_pa->pa_node.inode_node;; + iter = rb_next(iter)) { + if (!iter) { + right_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + right_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } + spin_unlock(&tmp_pa->pa_lock); } - spin_unlock(&tmp_pa->pa_lock); } - rcu_read_unlock(); + + if (left_pa) { + left_pa_end = + left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); + } + + if (right_pa) { + right_pa_start = right_pa->pa_lstart; + BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); + } + + /* Step 4: trim our normalized range to not overlap with the neighbors */ + if (left_pa) { + if (left_pa_end > new_start) + new_start = left_pa_end; + } + + if (right_pa) { + if (right_pa_start < new_end) + new_end = right_pa_start; + } + read_unlock(&ei->i_prealloc_lock); /* XXX: extra loop to check we really don't overlap preallocations */ ext4_mb_pa_assert_overlap(ac, new_start, new_end); @@ -4401,6 +4502,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa, *cpa = NULL; ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct rb_node *iter; ext4_fsblk_t goal_block; /* only data can be preallocated */ @@ -4408,14 +4510,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) return false; /* first, try per-file preallocation */ - rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); /* all fields in this condition don't change, * so we can skip locking for them */ tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + /* original request start doesn't lie in this PA */ if (ac->ac_o_ex.fe_logical < tmp_pa_start || ac->ac_o_ex.fe_logical >= tmp_pa_end) continue; @@ -4438,12 +4545,12 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); ac->ac_criteria = 10; - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); return true; } spin_unlock(&tmp_pa->pa_lock); } - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) @@ -4596,6 +4703,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, { ext4_group_t grp; ext4_fsblk_t grp_blk; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); @@ -4641,16 +4749,41 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_unlock_group(sb, grp); if (pa->pa_type == MB_INODE_PA) { - spin_lock(pa->pa_node_lock.inode_lock); - list_del_rcu(&pa->pa_node.inode_list); - spin_unlock(pa->pa_node_lock.inode_lock); + write_lock(pa->pa_node_lock.inode_lock); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_free(pa); } else { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } +} - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); +static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + struct ext4_prealloc_space *iter_pa, *new_pa; + ext4_lblk_t iter_start, new_start; + + while (*iter) { + iter_pa = rb_entry(*iter, struct ext4_prealloc_space, + pa_node.inode_node); + new_pa = rb_entry(new, struct ext4_prealloc_space, + pa_node.inode_node); + iter_start = iter_pa->pa_lstart; + new_start = new_pa->pa_lstart; + + parent = *iter; + if (new_start < iter_start) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } + + rb_link_node(new, parent, iter); + rb_insert_color(new, root); } /* @@ -4724,7 +4857,6 @@ adjust_bex: pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_node.inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; @@ -4744,9 +4876,9 @@ adjust_bex: list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - spin_lock(pa->pa_node_lock.inode_lock); - list_add_rcu(&pa->pa_node.inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_node_lock.inode_lock); + write_lock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); + write_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } @@ -4908,6 +5040,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_prealloc_space *pa, *tmp; struct list_head list; struct ext4_buddy e4b; + struct ext4_inode_info *ei; int err; int free = 0; @@ -4971,18 +5104,21 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); } else { - spin_lock(pa->pa_node_lock.inode_lock); - list_del_rcu(&pa->pa_node.inode_list); - spin_unlock(pa->pa_node_lock.inode_lock); + write_lock(pa->pa_node_lock.inode_lock); + ei = EXT4_I(pa->pa_inode); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); } - if (pa->pa_type == MB_GROUP_PA) + list_del(&pa->u.pa_tmp_list); + + if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); - else + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); - - list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + ext4_mb_pa_free(pa); + } } ext4_unlock_group(sb, group); @@ -5012,6 +5148,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; + struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) { @@ -5033,17 +5170,19 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) repeat: /* first, collect all pa's in the inode */ - spin_lock(&ei->i_prealloc_lock); - while (!list_empty(&ei->i_prealloc_list) && needed) { - pa = list_entry(ei->i_prealloc_list.prev, - struct ext4_prealloc_space, pa_node.inode_list); + write_lock(&ei->i_prealloc_lock); + for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + iter = rb_next(iter)) { + pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); @@ -5054,7 +5193,7 @@ repeat: if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_node.inode_list); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); needed--; continue; @@ -5062,7 +5201,7 @@ repeat: /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from @@ -5079,7 +5218,7 @@ repeat: schedule_timeout_uninterruptible(HZ); goto repeat; } - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); @@ -5111,7 +5250,7 @@ repeat: put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + ext4_mb_pa_free(pa); } } @@ -5485,7 +5624,6 @@ static void ext4_mb_trim_inode_pa(struct inode *inode) static int ext4_mb_release_context(struct ext4_allocation_context *ac) { struct inode *inode = ac->ac_inode; - struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { @@ -5512,16 +5650,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) } } - if (pa->pa_type == MB_INODE_PA) { - /* - * treat per-inode prealloc list as a lru list, then try - * to trim the least recently used PA. - */ - spin_lock(pa->pa_node_lock.inode_lock); - list_move(&pa->pa_node.inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_node_lock.inode_lock); - } - ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) |