diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2007-02-16 11:46:50 -0800 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2007-04-26 15:02:20 -0700 |
commit | 60b11392f1a09433740bda3048202213daa27736 (patch) | |
tree | a8687fcb0ce62b130b732d663b54a984564d28b2 /fs/ocfs2 | |
parent | 25baf2da1473d9dcde1a4c7b0ab26e7d67d9bf62 (diff) | |
download | lwn-60b11392f1a09433740bda3048202213daa27736.tar.gz lwn-60b11392f1a09433740bda3048202213daa27736.zip |
ocfs2: zero tail of sparse files on truncate
Since we don't zero on extend anymore, truncate needs to be fixed up to zero
the part of a file between i_size and and end of it's cluster. Otherwise a
subsequent extend could expose bad data.
This introduced a new helper, which can be used in ocfs2_write().
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/alloc.c | 224 | ||||
-rw-r--r-- | fs/ocfs2/alloc.h | 2 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 34 | ||||
-rw-r--r-- | fs/ocfs2/aops.h | 12 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 40 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 30 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 11 |
7 files changed, 328 insertions, 25 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9a40603c4d4b..98694a1add43 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -27,6 +27,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/swap.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -34,6 +35,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "aops.h" #include "dlmglue.h" #include "extent_map.h" #include "inode.h" @@ -3342,6 +3344,228 @@ bail: return status; } +static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return 0; +} + +static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return ocfs2_journal_dirty_data(handle, bh); +} + +static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, + struct page **pages, int numpages, + u64 phys, handle_t *handle) +{ + int i, ret, partial = 0; + void *kaddr; + struct page *page; + unsigned int from, to = PAGE_CACHE_SIZE; + struct super_block *sb = inode->i_sb; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + if (numpages == 0) + goto out; + + from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ + if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { + /* + * Since 'from' has been capped to a value below page + * size, this calculation won't be able to overflow + * 'to' + */ + to = ocfs2_align_bytes_to_clusters(sb, from); + + /* + * The truncate tail in this case should never contain + * more than one page at maximum. The loop below also + * assumes this. + */ + BUG_ON(numpages != 1); + } + + for(i = 0; i < numpages; i++) { + page = pages[i]; + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + + ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); + if (ret) + mlog_errno(ret); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + from, 0, to - from); + kunmap_atomic(kaddr, KM_USER0); + + /* + * Need to set the buffers we zero'd into uptodate + * here if they aren't - ocfs2_map_page_blocks() + * might've skipped some + */ + if (ocfs2_should_order_data(inode)) { + ret = walk_page_buffers(handle, + page_buffers(page), + from, to, &partial, + ocfs2_ordered_zero_func); + if (ret < 0) + mlog_errno(ret); + } else { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_writeback_zero_func); + if (ret < 0) + mlog_errno(ret); + } + + if (!partial) + SetPageUptodate(page); + + flush_dcache_page(page); + + /* + * Every page after the 1st one should be completely zero'd. + */ + from = 0; + } +out: + if (pages) { + for (i = 0; i < numpages; i++) { + page = pages[i]; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + } + } +} + +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, + int *num, u64 *phys) +{ + int i, numpages = 0, ret = 0; + unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned long index; + u64 next_cluster_bytes; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + /* Cluster boundary, so we don't need to grab any pages. */ + if ((isize & (csize - 1)) == 0) + goto out; + + ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, + phys, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Tail is a hole. */ + if (*phys == 0) + goto out; + + next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); + index = isize >> PAGE_CACHE_SHIFT; + do { + pages[numpages] = grab_cache_page(mapping, index); + if (!pages[numpages]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + numpages++; + index++; + } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); + +out: + if (ret != 0) { + if (pages) { + for (i = 0; i < numpages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + } + } + numpages = 0; + } + + *num = numpages; + + return ret; +} + +/* + * Zero the area past i_size but still within an allocated + * cluster. This avoids exposing nonzero data on subsequent file + * extends. + * + * We need to call this before i_size is updated on the inode because + * otherwise block_write_full_page() will skip writeout of pages past + * i_size. The new_i_size parameter is passed for this reason. + */ +int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, + u64 new_i_size) +{ + int ret, numpages; + struct page **pages = NULL; + u64 phys; + + /* + * File systems which don't support sparse files zero on every + * extend. + */ + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + return 0; + + pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Truncate on an i_size boundary - nothing more to do. + */ + if (numpages == 0) + goto out; + + ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, + handle); + + /* + * Initiate writeout of the pages we zero'd here. We don't + * wait on them - the truncate_inode_pages() call later will + * do that for us. + */ + ret = filemap_fdatawrite(inode->i_mapping); + if (ret) + mlog_errno(ret); + +out: + if (pages) + kfree(pages); + + return ret; +} + /* * It is expected, that by the time you call this function, * inode->i_size and fe->i_size have been adjusted. diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index bff2a162b030..3cb39cd5e478 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -71,6 +71,8 @@ struct ocfs2_truncate_context { struct buffer_head *tc_last_eb_bh; }; +int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, + u64 new_i_size); int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index acf8f0006725..605c82a93f01 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -308,13 +308,13 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, * functionality yet, but IMHO it's better to cut and paste the whole * thing so we can avoid introducing our own bugs (and easily pick up * their fixes when they happen) --Mark */ -static int walk_page_buffers( handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)( handle_t *handle, - struct buffer_head *bh)) +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -654,9 +654,9 @@ static void ocfs2_clear_page_regions(struct page *page, * * This will also skip zeroing, which is handled externally. */ -static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, - struct inode *inode, unsigned int from, - unsigned int to, int new) +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new) { int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; @@ -675,8 +675,7 @@ static int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, * Ignore blocks outside of our i/o range - * they may belong to unallocated clusters. */ - if (block_start >= to || - (block_start + bsize) <= from) { + if (block_start >= to || block_end <= from) { if (PageUptodate(page)) set_buffer_uptodate(bh); continue; @@ -971,7 +970,6 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, u64 v_blkno, p_blkno; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - unsigned int cbits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; unsigned long index, start; struct page **cpages; @@ -979,13 +977,11 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, /* * Figure out how many pages we'll be manipulating here. For - * non-allocating write, or any writes where cluster size is - * less than page size, we only need one page. Otherwise, - * allocating writes of cluster size larger than page size - * need cluster size pages. + * non allocating write, we just change the one + * page. Otherwise, we'll need a whole clusters worth. */ - if (new && !wc->w_large_pages) - numpages = (1 << cbits) / PAGE_SIZE; + if (new) + numpages = ocfs2_pages_per_cluster(inode->i_sb); cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); if (!cpages) { diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index eeb2c42483e8..7d94071f0ab7 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -30,6 +30,18 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, unsigned from, unsigned to); +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new); + +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)); + struct ocfs2_write_ctxt; typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, u64 *, unsigned int *, unsigned int *); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 667e5a869bf5..5fd49ec169dc 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -262,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, { int status; handle_t *handle; + struct ocfs2_dinode *di; mlog_entry_void(); @@ -275,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + /* + * Do this before setting i_size. + */ + status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); + if (status) { + mlog_errno(status); + goto out_commit; + } + + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di = (struct ocfs2_dinode *) fe_bh->b_data; + di->i_size = cpu_to_le64(new_i_size); + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + status = ocfs2_journal_dirty(handle, fe_bh); if (status < 0) mlog_errno(status); +out_commit: ocfs2_commit_trans(osb, handle); out: + mlog_exit(status); return status; } @@ -343,7 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode, mlog_errno(status); goto bail; } - ocfs2_data_unlock(inode, 1); /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the @@ -352,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode, status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } status = ocfs2_commit_truncate(osb, inode, di_bh, tc); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } /* TODO: orphan dir cleanup here. */ +bail_unlock_data: + ocfs2_data_unlock(inode, 1); + bail: mlog_exit(status); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0bd86a137591..78c99b5050df 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -489,12 +489,38 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, int status = 0; struct ocfs2_truncate_context *tc = NULL; struct ocfs2_dinode *fe; + handle_t *handle = NULL; mlog_entry_void(); fe = (struct ocfs2_dinode *) fe_bh->b_data; if (fe->i_clusters) { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + + i_size_write(inode, 0); + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } + + ocfs2_commit_trans(osb, handle); + handle = NULL; + status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); if (status < 0) { mlog_errno(status); @@ -507,8 +533,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, goto out; } } -out: +out: + if (handle) + ocfs2_commit_trans(osb, handle); mlog_exit(status); return status; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 2699f7cac21a..82cc92dcf8a6 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -495,6 +495,17 @@ static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_bloc return index; } +static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int pages_per_cluster = 1; + + if (PAGE_CACHE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + + return pages_per_cluster; +} + #define ocfs2_set_bit ext2_set_bit #define ocfs2_clear_bit ext2_clear_bit #define ocfs2_test_bit ext2_test_bit |