diff options
author | npiggin@suse.de <npiggin@suse.de> | 2010-05-27 01:05:33 +1000 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2010-05-27 22:15:33 -0400 |
commit | 7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f (patch) | |
tree | e575d9c55e2a6ccc645dcb3ae2564de458b428f2 /fs/buffer.c | |
parent | 7000d3c424e5bb350e502a477fb0e1ed42f8b10e (diff) | |
download | lwn-7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f.tar.gz lwn-7bb46a6734a7e1ad4beaecc11cae7ed3ff81d30f.zip |
fs: introduce new truncate sequence
Introduce a new truncate calling sequence into fs/mm subsystems. Rather than
setattr > vmtruncate > truncate, have filesystems call their truncate sequence
from ->setattr if filesystem specific operations are required. vmtruncate is
deprecated, and truncate_pagecache and inode_newsize_ok helpers introduced
previously should be used.
simple_setattr is introduced for simple in-ram filesystems to implement
the new truncate sequence. Eventually all filesystems should be converted
to implement a setattr, and the default code in notify_change should go
away.
simple_setsize is also introduced to perform just the ATTR_SIZE portion
of simple_setattr (ie. changing i_size and trimming pagecache).
To implement the new truncate sequence:
- filesystem specific manipulations (eg freeing blocks) must be done in
the setattr method rather than ->truncate.
- vmtruncate can not be used by core code to trim blocks past i_size in
the event of write failure after allocation, so this must be performed
in the fs code.
- convert usage of helpers block_write_begin, nobh_write_begin,
cont_write_begin, and *blockdev_direct_IO* to use _newtrunc postfixed
variants. These avoid calling vmtruncate to trim blocks (see previous).
- inode_setattr should not be used. generic_setattr is a new function
to be used to copy simple attributes into the generic inode.
- make use of the better opportunity to handle errors with the new sequence.
Big problem with the previous calling sequence: the filesystem is not called
until i_size has already changed. This means it is not allowed to fail the
call, and also it does not know what the previous i_size was. Also, generic
code calling vmtruncate to truncate allocated blocks in case of error had
no good way to return a meaningful error (or, for example, atomically handle
block deallocation).
Cc: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs/buffer.c')
-rw-r--r-- | fs/buffer.c | 123 |
1 files changed, 98 insertions, 25 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index e8aa7081d25c..d54812b198e9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, } /* - * block_write_begin takes care of the basic task of block allocation and - * bringing partial write blocks uptodate first. - * - * If *pagep is not NULL, then block_write_begin uses the locked page - * at *pagep rather than allocating its own. In this case, the page will - * not be unlocked or deallocated on failure. + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int block_write_begin(struct file *file, struct address_space *mapping, +int block_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); } } out: return status; } +EXPORT_SYMBOL(block_write_begin_newtrunc); + +/* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = block_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + * + * Filesystems which pass down their own page also cannot + * call into vmtruncate here because it would lead to lock + * inversion problems (*pagep is locked). This is a further + * example of where the old truncate sequence is inadequate. + */ + if (unlikely(ret) && *pagep == NULL) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, @@ -2324,7 +2351,7 @@ out: * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ -int cont_write_begin(struct file *file, struct address_space *mapping, +int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block, loff_t *bytes) @@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } *pagep = NULL; - err = block_write_begin(file, mapping, pos, len, + err = block_write_begin_newtrunc(file, mapping, pos, len, flags, pagep, fsdata, get_block); out: return err; } +EXPORT_SYMBOL(cont_write_begin_newtrunc); + +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) +{ + int ret; + + ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block, bytes); + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } + + return ret; +} EXPORT_SYMBOL(cont_write_begin); int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write); * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because - * vmtruncate() writes the inode size before removing pages, once we have the + * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. @@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) } /* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) + * Filesystems implementing the new truncate sequence should use the + * _newtrunc postfix variant which won't incorrectly call vmtruncate. + * The filesystem needs to handle block truncation upon failure. */ -int nobh_write_begin(struct file *file, struct address_space *mapping, +int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, get_block_t *get_block) @@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); *pagep = NULL; - return block_write_begin(file, mapping, pos, len, flags, pagep, - fsdata, get_block); + return block_write_begin_newtrunc(file, mapping, pos, len, + flags, pagep, fsdata, get_block); } if (PageMappedToDisk(page)) @@ -2605,8 +2652,34 @@ out_release: page_cache_release(page); *pagep = NULL; - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + return ret; +} +EXPORT_SYMBOL(nobh_write_begin_newtrunc); + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + int ret; + + ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, + pagep, fsdata, get_block); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (unlikely(ret)) { + loff_t isize = mapping->host->i_size; + if (pos + len > isize) + vmtruncate(mapping->host, isize); + } return ret; } |