diff options
author | Mingming Cao <cmm@us.ibm.com> | 2008-07-14 17:52:37 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-07-14 17:52:37 -0400 |
commit | d2a1763791a634e315ec926b62829c1e88842c86 (patch) | |
tree | 75a701a89829ba7d728fdc19c30aa2794b9706b9 /fs | |
parent | e8ced39d5e8911c662d4d69a342b9d053eaaac4e (diff) | |
download | lwn-d2a1763791a634e315ec926b62829c1e88842c86.tar.gz lwn-d2a1763791a634e315ec926b62829c1e88842c86.zip |
ext4: delayed allocation ENOSPC handling
This patch does block reservation for delayed
allocation, to avoid ENOSPC later at page flush time.
Blocks(data and metadata) are reserved at da_write_begin()
time, the freeblocks counter is updated by then, and the number of
reserved blocks is store in per inode counter.
At the writepage time, the unused reserved meta blocks are returned
back. At unlink/truncate time, reserved blocks are properly released.
Updated fix from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
to fix the oldallocator block reservation accounting with delalloc, added
lock to guard the counters and also fix the reservation for meta blocks.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext4/balloc.c | 49 | ||||
-rw-r--r-- | fs/ext4/dir.c | 3 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 6 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 1 | ||||
-rw-r--r-- | fs/ext4/ext4_i.h | 7 | ||||
-rw-r--r-- | fs/ext4/extents.c | 32 | ||||
-rw-r--r-- | fs/ext4/inode.c | 184 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 20 | ||||
-rw-r--r-- | fs/ext4/super.c | 5 |
9 files changed, 257 insertions, 50 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6369bacf0dcb..495ab21b9832 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1701,7 +1701,12 @@ ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, } sbi = EXT4_SB(sb); - *count = ext4_has_free_blocks(sbi, *count); + if (!EXT4_I(inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + *count = ext4_has_free_blocks(sbi, *count); + } if (*count == 0) { *errp = -ENOSPC; return 0; /*return with ENOSPC error */ @@ -1902,7 +1907,8 @@ allocated: le16_add_cpu(&gdp->bg_free_blocks_count, -num); gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); spin_unlock(sb_bgl_lock(sbi, group_no)); - percpu_counter_sub(&sbi->s_freeblocks_counter, num); + if (!EXT4_I(inode)->i_delalloc_reserved_flag) + percpu_counter_sub(&sbi->s_freeblocks_counter, num); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group_no); @@ -1976,40 +1982,49 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, } /* - * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) + * @count: total number of blocks need * @errp: error code * - * Return allocated block number on success + * Return 1st allocated block numberon success, *count stores total account + * error stores in errp pointer */ -ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp) { - unsigned long count = 1; - return do_blk_alloc(handle, inode, 0, goal, - &count, errp, EXT4_META_BLOCK); + ext4_fsblk_t ret; + ret = do_blk_alloc(handle, inode, 0, goal, + count, errp, EXT4_META_BLOCK); + /* + * Account for the allocated meta blocks + */ + if (!(*errp)) { + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + EXT4_I(inode)->i_allocated_meta_blocks += *count; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + } + return ret; } /* - * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: total number of blocks need * @errp: error code * - * Return 1st allocated block numberon success, *count stores total account - * error stores in errp pointer + * Return allocated block number on success */ -ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp) { - return do_blk_alloc(handle, inode, 0, goal, - count, errp, EXT4_META_BLOCK); + unsigned long count = 1; + return ext4_new_meta_blocks(handle, inode, goal, &count, errp); } /* diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5ed5108766c1..d3d23d73c08b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); + err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, + 0, 0, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ee9576dc0ba1..0962f4e26579 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -74,6 +74,9 @@ #define EXT4_MB_HINT_GOAL_ONLY 256 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 512 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 1024 + struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -1041,6 +1044,7 @@ extern void ext4_mb_update_group_info(struct ext4_group_info *grp, /* inode.c */ +void ext4_da_release_space(struct inode *inode, int used, int to_free); int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, @@ -1234,7 +1238,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, int create, - int extend_disksize); + int extend_disksize, int flag); #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 75333b595fab..6c166c0a54b7 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index c2903ef72159..ef7409f0e7e4 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -163,6 +163,13 @@ struct ext4_inode_info { /* mballoc */ struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + + /* allocation reservation info for delalloc */ + unsigned long i_reserved_data_blocks; + unsigned long i_reserved_meta_blocks; + unsigned long i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + spinlock_t i_block_reservation_lock; }; #endif /* _EXT4_I */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7844bbb2bac0..dabc3b68d249 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -248,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode) return size; } +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worse case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +{ + int lcap, icap, rcap, leafs, idxs, num; + int newextents = blocks; + + rcap = ext4_ext_space_root_idx(inode); + lcap = ext4_ext_space_block(inode); + icap = ext4_ext_space_block_idx(inode); + + /* number of new leaf blocks needed */ + num = leafs = (newextents + lcap - 1) / lcap; + + /* + * Worse case, we need separate index block(s) + * to link all new leaf blocks + */ + idxs = (leafs + icap - 1) / icap; + do { + num += idxs; + idxs = (idxs + icap - 1) / icap; + } while (idxs > rcap); + + return num; +} + static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -2910,7 +2940,7 @@ retry: } ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0); + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2bef4f879e4b..a6b800c58474 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -39,6 +39,7 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) @@ -982,7 +983,7 @@ out: */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize) + int create, int extend_disksize, int flag) { int retval; @@ -1023,6 +1024,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -1044,6 +1054,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, ~EXT4_EXT_MIGRATE; } } + + if (flag) { + EXT4_I(inode)->i_delalloc_reserved_flag = 0; + /* + * Update reserved blocks/metadata blocks + * after successful block allocation + * which were deferred till now + */ + if ((retval > 0) && buffer_delay(bh)) + ext4_da_release_space(inode, retval, 0); + } + up_write((&EXT4_I(inode)->i_data_sem)); return retval; } @@ -1069,7 +1091,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1095,7 +1117,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create, 1, 0); /* * ext4_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1409,6 +1431,122 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } +/* + * Calculate the number of metadata blocks need to reserve + * to allocate @blocks for non extent file based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); + + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + + if (ext4_has_free_blocks(sbi, total) < total) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return -ENOSPC; + } + + /* reduce fs free blocks counter */ + percpu_counter_sub(&sbi->s_freeblocks_counter, total); + + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ +} + +void ext4_da_release_space(struct inode *inode, int used, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free, release; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + release = to_free + mdb_free; + + /* update fs free blocks counter for truncate case */ + percpu_counter_add(&sbi->s_freeblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, 0, to_release); +} /* * Delayed allocation stuff @@ -1829,14 +1967,18 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0); - if (ret == 0) { - /* the block isn't allocated yet, let's reserve space */ - /* XXX: call reservation here */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + if ((ret == 0) && !buffer_delay(bh_result)) { + /* the block isn't (pre)allocated yet, let's reserve space */ /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ + ret = ext4_da_reserve_space(inode, 1); + if (ret) + /* not enough space to reserve */ + return ret; + map_bh(bh_result, inode->i_sb, 0); set_buffer_new(bh_result); set_buffer_delay(bh_result); @@ -1847,7 +1989,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } - +#define EXT4_DELALLOC_RSVED 1 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -1865,7 +2007,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0); + bh_result, create, 0, EXT4_DELALLOC_RSVED); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); @@ -1952,7 +2094,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret; + int ret, retries = 0; struct page *page; pgoff_t index; unsigned from, to; @@ -1963,6 +2105,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; +retry: /* * With delayed allocation, we don't log the i_disksize update * if there is delayed block allocation. But we still need @@ -1988,6 +2131,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, page_cache_release(page); } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; out: return ret; } @@ -2040,9 +2185,6 @@ static int ext4_da_write_end(struct file *file, static void ext4_da_invalidatepage(struct page *page, unsigned long offset) { - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - /* * Drop reserved blocks */ @@ -2050,21 +2192,7 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) if (!page_has_buffers(page)) goto out; - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - /* - * is this block fully invalidated? - */ - if (offset <= curr_off && buffer_delay(bh)) { - clear_buffer_delay(bh); - /* XXX: add real stuff here */ - } - curr_off = next_off; - bh = bh->b_this_page; - } while (bh != head); + ext4_da_page_release_reservation(page, offset); out: ext4_invalidatepage(page, offset); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d254ca83d9e..8d141a25bbee 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2964,7 +2964,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + + /* + * free blocks account has already be reduced/reserved + * at write_begin() time for delayed allocation + * do not double accounting + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + percpu_counter_sub(&sbi->s_freeblocks_counter, + ac->ac_b_ex.fe_len); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -4169,7 +4177,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, &(ar->len), errp); return block; } - ar->len = ext4_has_free_blocks(sbi, ar->len); + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { + /* + * With delalloc we already reserved the blocks + */ + ar->len = ext4_has_free_blocks(sbi, ar->len); + } if (ar->len == 0) { *errp = -ENOSPC; @@ -4186,6 +4199,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } inquota = ar->len; + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index de9d3d0eb203..25e2f2488cd2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -574,6 +574,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); return &ei->vfs_inode; } |