Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--   fs/ext4/mballoc.c   286
1 file changed, 207 insertions(+), 79 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b50c2e5abe67..9b4c6033bd69 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3978,6 +3978,24 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}
+/*
+ * This function returns the next element to look at during an inode
+ * PA rbtree walk. The caller is assumed to hold the inode PA rbtree
+ * lock (ei->i_prealloc_lock).
+ *
+ * new_start The start of the range we want to compare
+ * cur_start The existing start that we are comparing against
+ * node The node of the rb_tree
+ */
+static inline struct rb_node*
+ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
+{
+ if (new_start < cur_start)
+ return node->rb_left;
+ else
+ return node->rb_right;
+}
+
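For reference, the helper above is just the descent step of a binary search over a tree ordered by pa_lstart: go left when the start being probed is smaller than the current node's start, otherwise go right. A minimal userspace sketch of the same rule on a plain BST (the struct and function names below are illustrative, not kernel code):

struct pa_node {
	unsigned int start;			/* stands in for pa->pa_lstart */
	struct pa_node *left, *right;
};

/* same decision ext4_mb_pa_rb_next_iter() makes, on a plain BST */
static struct pa_node *next_iter(unsigned int new_start, unsigned int cur_start,
				 struct pa_node *node)
{
	return new_start < cur_start ? node->left : node->right;
}

/* descend until the search bottoms out; the last node visited is the
 * closest candidate to 'key', i.e. one of its immediate neighbours */
static struct pa_node *descend(struct pa_node *root, unsigned int key)
{
	struct pa_node *iter = root, *last = NULL;

	while (iter) {
		last = iter;
		iter = next_iter(key, iter->start, iter);
	}
	return last;				/* NULL only if the tree is empty */
}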
static inline void
ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
ext4_lblk_t start, ext4_lblk_t end)
@@ -3986,19 +4004,22 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_prealloc_space *tmp_pa;
ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ struct rb_node *iter;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) {
- spin_lock(&tmp_pa->pa_lock);
- if (tmp_pa->pa_deleted == 0) {
- tmp_pa_start = tmp_pa->pa_lstart;
- tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+ read_lock(&ei->i_prealloc_lock);
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ tmp_pa_start = tmp_pa->pa_lstart;
+ tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0)
BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
- }
spin_unlock(&tmp_pa->pa_lock);
}
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
}
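The BUG_ON in the walk above is the usual disjointness test for half-open ranges: [start, end) and [tmp_pa_start, tmp_pa_end) overlap unless one ends at or before the point where the other begins. A small sketch of the predicate, with illustrative names:

#include <stdbool.h>

/* ranges are half-open: [a_start, a_end) and [b_start, b_end) */
static bool ranges_overlap(unsigned int a_start, unsigned int a_end,
			   unsigned int b_start, unsigned int b_end)
{
	return !(a_start >= b_end || a_end <= b_start);
}

The assertion is then equivalent to BUG_ON(ranges_overlap(start, end, tmp_pa_start, tmp_pa_end)) for every non-deleted PA visited on the descent.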
/*
@@ -4006,60 +4027,140 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
* and adjust boundaries if the range overlaps with any of the existing
* preallocations stored in the corresponding inode of the allocation context.
*
- *Parameters:
+ * Parameters:
* ac allocation context
* start start of the new range
* end end of the new range
*/
static inline void
ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
- ext4_lblk_t *start, ext4_lblk_t *end)
+ ext4_lblk_t *start, ext4_lblk_t *end)
{
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_prealloc_space *tmp_pa;
+ struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
+ struct rb_node *iter;
ext4_lblk_t new_start, new_end;
- ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1;
new_start = *start;
new_end = *end;
- /* check we don't cross already preallocated blocks */
- rcu_read_lock();
- list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) {
- if (tmp_pa->pa_deleted)
- continue;
- spin_lock(&tmp_pa->pa_lock);
- if (tmp_pa->pa_deleted) {
- spin_unlock(&tmp_pa->pa_lock);
- continue;
- }
-
+ /*
+ * Adjust the normalized range so that it doesn't overlap with any
+ * existing preallocated blocks (PAs). Make sure to hold the rbtree lock
+ * so it doesn't change underneath us.
+ */
+ read_lock(&ei->i_prealloc_lock);
+
+ /* Step 1: find any one immediate neighboring PA of the normalized range */
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+ tmp_pa_start, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
tmp_pa_start = tmp_pa->pa_lstart;
tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
/* PA must not overlap original request */
- BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
- ac->ac_o_ex.fe_logical < tmp_pa_start));
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0)
+ BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
+ ac->ac_o_ex.fe_logical < tmp_pa_start));
+ spin_unlock(&tmp_pa->pa_lock);
+ }
- /* skip PAs this normalized request doesn't overlap with */
- if (tmp_pa_start >= new_end || tmp_pa_end <= new_start) {
+ /*
+ * Step 2: check if the found PA is left or right neighbor and
+ * get the other neighbor
+ */
+ if (tmp_pa) {
+ if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
+ struct rb_node *tmp;
+
+ left_pa = tmp_pa;
+ tmp = rb_next(&left_pa->pa_node.inode_node);
+ if (tmp) {
+ right_pa = rb_entry(tmp,
+ struct ext4_prealloc_space,
+ pa_node.inode_node);
+ }
+ } else {
+ struct rb_node *tmp;
+
+ right_pa = tmp_pa;
+ tmp = rb_prev(&right_pa->pa_node.inode_node);
+ if (tmp) {
+ left_pa = rb_entry(tmp,
+ struct ext4_prealloc_space,
+ pa_node.inode_node);
+ }
+ }
+ }
+
+	/* Step 3: get the non-deleted neighbors */
+ if (left_pa) {
+ for (iter = &left_pa->pa_node.inode_node;;
+ iter = rb_prev(iter)) {
+ if (!iter) {
+ left_pa = NULL;
+ break;
+ }
+
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ left_pa = tmp_pa;
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0) {
+ spin_unlock(&tmp_pa->pa_lock);
+ break;
+ }
spin_unlock(&tmp_pa->pa_lock);
- continue;
}
- BUG_ON(tmp_pa_start <= new_start && tmp_pa_end >= new_end);
-
- /* adjust start or end to be adjacent to this pa */
- if (tmp_pa_end <= ac->ac_o_ex.fe_logical) {
- BUG_ON(tmp_pa_end < new_start);
- new_start = tmp_pa_end;
- } else if (tmp_pa_start > ac->ac_o_ex.fe_logical) {
- BUG_ON(tmp_pa_start > new_end);
- new_end = tmp_pa_start;
+ }
+
+ if (right_pa) {
+ for (iter = &right_pa->pa_node.inode_node;;
+ iter = rb_next(iter)) {
+ if (!iter) {
+ right_pa = NULL;
+ break;
+ }
+
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ right_pa = tmp_pa;
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0) {
+ spin_unlock(&tmp_pa->pa_lock);
+ break;
+ }
+ spin_unlock(&tmp_pa->pa_lock);
}
- spin_unlock(&tmp_pa->pa_lock);
}
- rcu_read_unlock();
+
+ if (left_pa) {
+ left_pa_end =
+ left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len);
+ BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
+ }
+
+ if (right_pa) {
+ right_pa_start = right_pa->pa_lstart;
+ BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
+ }
+
+ /* Step 4: trim our normalized range to not overlap with the neighbors */
+ if (left_pa) {
+ if (left_pa_end > new_start)
+ new_start = left_pa_end;
+ }
+
+ if (right_pa) {
+ if (right_pa_start < new_end)
+ new_end = right_pa_start;
+ }
+ read_unlock(&ei->i_prealloc_lock);
/* XXX: extra loop to check we really don't overlap preallocations */
ext4_mb_pa_assert_overlap(ac, new_start, new_end);
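Steps 1-4 above boil down to: locate the closest non-deleted PA on each side of the original logical block, then clamp the normalized window so it neither starts inside the left neighbour nor runs into the right one. A simplified sketch of that final clamp, assuming the neighbour boundaries have already been collected (the helper and its NULL-means-absent convention are illustrative, not from the patch):

/*
 * Clamp the half-open window [*start, *end) against an optional left
 * neighbour ending at *left_end and an optional right neighbour starting
 * at *right_start; a NULL pointer means there is no neighbour on that side.
 */
static void trim_window(unsigned int *start, unsigned int *end,
			const unsigned int *left_end,
			const unsigned int *right_start)
{
	/* the window may not begin before the left neighbour ends */
	if (left_end && *left_end > *start)
		*start = *left_end;

	/* the window may not extend past where the right neighbour begins */
	if (right_start && *right_start < *end)
		*end = *right_start;
}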
@@ -4401,6 +4502,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
struct ext4_locality_group *lg;
struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ struct rb_node *iter;
ext4_fsblk_t goal_block;
/* only data can be preallocated */
@@ -4408,14 +4510,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
return false;
/* first, try per-file preallocation */
- rcu_read_lock();
- list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) {
+ read_lock(&ei->i_prealloc_lock);
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+ tmp_pa_start, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
/* all fields in this condition don't change,
* so we can skip locking for them */
tmp_pa_start = tmp_pa->pa_lstart;
tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+ /* original request start doesn't lie in this PA */
if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
ac->ac_o_ex.fe_logical >= tmp_pa_end)
continue;
@@ -4438,12 +4545,12 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
ext4_mb_use_inode_pa(ac, tmp_pa);
spin_unlock(&tmp_pa->pa_lock);
ac->ac_criteria = 10;
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
return true;
}
spin_unlock(&tmp_pa->pa_lock);
}
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
/* can we use group allocation? */
if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
@@ -4596,6 +4703,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
{
ext4_group_t grp;
ext4_fsblk_t grp_blk;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
@@ -4641,16 +4749,41 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
ext4_unlock_group(sb, grp);
if (pa->pa_type == MB_INODE_PA) {
- spin_lock(pa->pa_node_lock.inode_lock);
- list_del_rcu(&pa->pa_node.inode_list);
- spin_unlock(pa->pa_node_lock.inode_lock);
+ write_lock(pa->pa_node_lock.inode_lock);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
+ ext4_mb_pa_free(pa);
} else {
spin_lock(pa->pa_node_lock.lg_lock);
list_del_rcu(&pa->pa_node.lg_list);
spin_unlock(pa->pa_node_lock.lg_lock);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
+}
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
+{
+ struct rb_node **iter = &root->rb_node, *parent = NULL;
+ struct ext4_prealloc_space *iter_pa, *new_pa;
+ ext4_lblk_t iter_start, new_start;
+
+ while (*iter) {
+ iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ new_pa = rb_entry(new, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ iter_start = iter_pa->pa_lstart;
+ new_start = new_pa->pa_lstart;
+
+ parent = *iter;
+ if (new_start < iter_start)
+ iter = &((*iter)->rb_left);
+ else
+ iter = &((*iter)->rb_right);
+ }
+
+ rb_link_node(new, parent, iter);
+ rb_insert_color(new, root);
}
/*
@@ -4724,7 +4857,6 @@ adjust_bex:
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
spin_lock_init(&pa->pa_lock);
- INIT_LIST_HEAD(&pa->pa_node.inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
@@ -4744,9 +4876,9 @@ adjust_bex:
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- spin_lock(pa->pa_node_lock.inode_lock);
- list_add_rcu(&pa->pa_node.inode_list, &ei->i_prealloc_list);
- spin_unlock(pa->pa_node_lock.inode_lock);
+ write_lock(pa->pa_node_lock.inode_lock);
+ ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
atomic_inc(&ei->i_prealloc_active);
}
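With the list gone, ei->i_prealloc_lock acts as a rwlock guarding the rbtree: lookups (ext4_mb_use_preallocated(), the overlap checks above) take it for read, while paths that relink nodes (ext4_mb_new_inode_pa() here, ext4_mb_put_pa(), the discard paths) take it for write, which is why the inode PA teardown no longer needs an RCU grace period. A rough userspace analogue of that read/write pattern, using pthread rwlocks and a toy index purely for illustration:

#include <pthread.h>

/* toy shared index standing in for the per-inode PA tree */
static pthread_rwlock_t idx_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned int idx_starts[64];
static unsigned int idx_count;

/* reader side: analogous to read_lock(&ei->i_prealloc_lock) in the lookups */
static int idx_contains(unsigned int start)
{
	int found = 0;
	unsigned int i;

	pthread_rwlock_rdlock(&idx_lock);
	for (i = 0; i < idx_count; i++)
		if (idx_starts[i] == start)
			found = 1;
	pthread_rwlock_unlock(&idx_lock);
	return found;
}

/* writer side: analogous to write_lock() around rb_link_node()/rb_erase() */
static int idx_add(unsigned int start)
{
	int added = 0;

	pthread_rwlock_wrlock(&idx_lock);
	if (idx_count < 64) {
		idx_starts[idx_count++] = start;
		added = 1;
	}
	pthread_rwlock_unlock(&idx_lock);
	return added;
}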
@@ -4908,6 +5040,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct ext4_prealloc_space *pa, *tmp;
struct list_head list;
struct ext4_buddy e4b;
+ struct ext4_inode_info *ei;
int err;
int free = 0;
@@ -4971,18 +5104,21 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
list_del_rcu(&pa->pa_node.lg_list);
spin_unlock(pa->pa_node_lock.lg_lock);
} else {
- spin_lock(pa->pa_node_lock.inode_lock);
- list_del_rcu(&pa->pa_node.inode_list);
- spin_unlock(pa->pa_node_lock.inode_lock);
+ write_lock(pa->pa_node_lock.inode_lock);
+ ei = EXT4_I(pa->pa_inode);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
}
- if (pa->pa_type == MB_GROUP_PA)
+ list_del(&pa->u.pa_tmp_list);
+
+ if (pa->pa_type == MB_GROUP_PA) {
ext4_mb_release_group_pa(&e4b, pa);
- else
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ } else {
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
-
- list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ ext4_mb_pa_free(pa);
+ }
}
ext4_unlock_group(sb, group);
@@ -5012,6 +5148,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
ext4_group_t group = 0;
struct list_head list;
struct ext4_buddy e4b;
+ struct rb_node *iter;
int err;
if (!S_ISREG(inode->i_mode)) {
@@ -5033,17 +5170,19 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
repeat:
/* first, collect all pa's in the inode */
- spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list) && needed) {
- pa = list_entry(ei->i_prealloc_list.prev,
- struct ext4_prealloc_space, pa_node.inode_list);
+ write_lock(&ei->i_prealloc_lock);
+ for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
+ iter = rb_next(iter)) {
+ pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);
+
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/* this shouldn't happen often - nobody should
* use preallocation while we're discarding it */
spin_unlock(&pa->pa_lock);
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
ext4_msg(sb, KERN_ERR,
"uh-oh! used pa while discarding");
WARN_ON(1);
@@ -5054,7 +5193,7 @@ repeat:
if (pa->pa_deleted == 0) {
ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
- list_del_rcu(&pa->pa_node.inode_list);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
list_add(&pa->u.pa_tmp_list, &list);
needed--;
continue;
@@ -5062,7 +5201,7 @@ repeat:
/* someone is deleting pa right now */
spin_unlock(&pa->pa_lock);
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
/* we have to wait here because pa_deleted
* doesn't mean pa is already unlinked from
@@ -5079,7 +5218,7 @@ repeat:
schedule_timeout_uninterruptible(HZ);
goto repeat;
}
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
@@ -5111,7 +5250,7 @@ repeat:
put_bh(bitmap_bh);
list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ ext4_mb_pa_free(pa);
}
}
@@ -5485,7 +5624,6 @@ static void ext4_mb_trim_inode_pa(struct inode *inode)
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct inode *inode = ac->ac_inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -5512,16 +5650,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
}
}
- if (pa->pa_type == MB_INODE_PA) {
- /*
- * treat per-inode prealloc list as a lru list, then try
- * to trim the least recently used PA.
- */
- spin_lock(pa->pa_node_lock.inode_lock);
- list_move(&pa->pa_node.inode_list, &ei->i_prealloc_list);
- spin_unlock(pa->pa_node_lock.inode_lock);
- }
-
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)