summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorNick Piggin <npiggin@suse.de>2007-10-16 01:25:01 -0700
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-16 09:42:55 -0700
commitafddba49d18f346e5cc2938b6ed7c512db18ca68 (patch)
tree4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca /fs
parent637aff46f94a754207c80c8c64bf1b74f24b967d (diff)
downloadlwn-afddba49d18f346e5cc2938b6ed7c512db18ca68.tar.gz
lwn-afddba49d18f346e5cc2938b6ed7c512db18ca68.zip
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more flexible alternatives that are also able to avoid the buffered write deadlock problems efficiently (which prepare_write is unable to do). [mark.fasheh@oracle.com: API design contributions, code review and fixes] [akpm@linux-foundation.org: various fixes] [dmonakhov@sw.ru: new aop block_write_begin fix] Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/buffer.c201
-rw-r--r--fs/libfs.c44
-rw-r--r--fs/namei.c46
-rw-r--r--fs/splice.c69
4 files changed, 231 insertions, 129 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ece6c2086d0..68b8fbdc1b28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1770,6 +1770,48 @@ recover:
goto done;
}
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+ unsigned int block_start, block_end;
+ struct buffer_head *head, *bh;
+
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ return;
+
+ bh = head = page_buffers(page);
+ block_start = 0;
+ do {
+ block_end = block_start + bh->b_size;
+
+ if (buffer_new(bh)) {
+ if (block_end > from && block_start < to) {
+ if (!PageUptodate(page)) {
+ unsigned start, size;
+
+ start = max(from, block_start);
+ size = min(to, block_end) - start;
+
+ zero_user_page(page, start, size, KM_USER0);
+ set_buffer_uptodate(bh);
+ }
+
+ clear_buffer_new(bh);
+ mark_buffer_dirty(bh);
+ }
+ }
+
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+EXPORT_SYMBOL(page_zero_new_buffers);
+
static int __block_prepare_write(struct inode *inode, struct page *page,
unsigned from, unsigned to, get_block_t *get_block)
{
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
- if (!err) {
- bh = head;
- do {
- if (buffer_new(bh))
- clear_buffer_new(bh);
- } while ((bh = bh->b_this_page) != head);
- return 0;
- }
- /* Error case: */
- /*
- * Zero out any newly allocated blocks to avoid exposing stale
- * data. If BH_New is set, we know that the block was newly
- * allocated in the above loop.
- */
- bh = head;
- block_start = 0;
- do {
- block_end = block_start+blocksize;
- if (block_end <= from)
- goto next_bh;
- if (block_start >= to)
- break;
- if (buffer_new(bh)) {
- clear_buffer_new(bh);
- zero_user_page(page, block_start, bh->b_size, KM_USER0);
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- }
-next_bh:
- block_start = block_end;
- bh = bh->b_this_page;
- } while (bh != head);
+ if (unlikely(err))
+ page_zero_new_buffers(page, from, to);
return err;
}
@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
}
+ clear_buffer_new(bh);
}
/*
@@ -1924,6 +1937,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
}
/*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata,
+ get_block_t *get_block)
+{
+ struct inode *inode = mapping->host;
+ int status = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned start, end;
+ int ownpage = 0;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ start = pos & (PAGE_CACHE_SIZE - 1);
+ end = start + len;
+
+ page = *pagep;
+ if (page == NULL) {
+ ownpage = 1;
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ status = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+ } else
+ BUG_ON(!PageLocked(page));
+
+ status = __block_prepare_write(inode, page, start, end, get_block);
+ if (unlikely(status)) {
+ ClearPageUptodate(page);
+
+ if (ownpage) {
+ unlock_page(page);
+ page_cache_release(page);
+ *pagep = NULL;
+
+ /*
+ * prepare_write() may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ */
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+ }
+ goto out;
+ }
+
+out:
+ return status;
+}
+EXPORT_SYMBOL(block_write_begin);
+
+int block_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ unsigned start;
+
+ start = pos & (PAGE_CACHE_SIZE - 1);
+
+ if (unlikely(copied < len)) {
+ /*
+ * The buffers that were written will now be uptodate, so we
+ * don't have to worry about a readpage reading them and
+ * overwriting a partial write. However if we have encountered
+ * a short write and only partially written into a buffer, it
+ * will not be marked uptodate, so a readpage might come in and
+ * destroy our partial write.
+ *
+ * Do the simplest thing, and just treat any short write to a
+ * non uptodate page as a zero-length write, and force the
+ * caller to redo the whole thing.
+ */
+ if (!PageUptodate(page))
+ copied = 0;
+
+ page_zero_new_buffers(page, start+copied, start+len);
+ }
+ flush_dcache_page(page);
+
+ /* This could be a short (even 0-length) commit */
+ __block_commit_write(inode, page, start, start+copied);
+
+ return copied;
+}
+EXPORT_SYMBOL(block_write_end);
+
+int generic_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos+copied > inode->i_size) {
+ i_size_write(inode, pos+copied);
+ mark_inode_dirty(inode);
+ }
+
+ unlock_page(page);
+ page_cache_release(page);
+
+ return copied;
+}
+EXPORT_SYMBOL(generic_write_end);
+
+/*
* Generic "read page" function for block devices that have the normal
* get_block functionality. This is most of the block device filesystems.
* Reads the page asynchronously --- the unlock_buffer() and
diff --git a/fs/libfs.c b/fs/libfs.c
index 5294de1f40c4..f2b32d3a9093 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -351,6 +351,26 @@ int simple_prepare_write(struct file *file, struct page *page,
return 0;
}
+int simple_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct page *page;
+ pgoff_t index;
+ unsigned from;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+
+ page = __grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+
+ *pagep = page;
+
+ return simple_prepare_write(file, page, from, from+len);
+}
+
int simple_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
@@ -369,6 +389,28 @@ int simple_commit_write(struct file *file, struct page *page,
return 0;
}
+int simple_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+ /* zero the stale part of the page if we did a short copy */
+ if (copied < len) {
+ void *kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + from + copied, 0, len - copied);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+
+ simple_commit_write(file, page, from, from+copied);
+
+ unlock_page(page);
+ page_cache_release(page);
+
+ return copied;
+}
+
/*
* the inodes created here are not hashed. If you use iunique to generate
* unique inode values later for this filesystem, then you must take care
@@ -642,6 +684,8 @@ EXPORT_SYMBOL(dcache_dir_open);
EXPORT_SYMBOL(dcache_readdir);
EXPORT_SYMBOL(generic_read_dir);
EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(simple_write_begin);
+EXPORT_SYMBOL(simple_write_end);
EXPORT_SYMBOL(simple_commit_write);
EXPORT_SYMBOL(simple_dir_inode_operations);
EXPORT_SYMBOL(simple_dir_operations);
diff --git a/fs/namei.c b/fs/namei.c
index a83160acd748..b40b8084eefc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2729,53 +2729,29 @@ int __page_symlink(struct inode *inode, const char *symname, int len,
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
+ void *fsdata;
int err;
char *kaddr;
retry:
- err = -ENOMEM;
- page = find_or_create_page(mapping, 0, gfp_mask);
- if (!page)
- goto fail;
- err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
- if (err == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto retry;
- }
+ err = pagecache_write_begin(NULL, mapping, 0, len-1,
+ AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
if (err)
- goto fail_map;
+ goto fail;
+
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, symname, len-1);
kunmap_atomic(kaddr, KM_USER0);
- err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
- if (err == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto retry;
- }
- if (err)
- goto fail_map;
- /*
- * Notice that we are _not_ going to block here - end of page is
- * unmapped, so this will only try to map the rest of page, see
- * that it is unmapped (typically even will not look into inode -
- * ->i_size will be enough for everything) and zero it out.
- * OTOH it's obviously correct and should make the page up-to-date.
- */
- if (!PageUptodate(page)) {
- err = mapping->a_ops->readpage(NULL, page);
- if (err != AOP_TRUNCATED_PAGE)
- wait_on_page_locked(page);
- } else {
- unlock_page(page);
- }
- page_cache_release(page);
+
+ err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+ page, fsdata);
if (err < 0)
goto fail;
+ if (err < len-1)
+ goto retry;
+
mark_inode_dirty(inode);
return 0;
-fail_map:
- unlock_page(page);
- page_cache_release(page);
fail:
return err;
}
diff --git a/fs/splice.c b/fs/splice.c
index 2df6be43c667..a7568bcc0f99 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -563,7 +563,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct address_space *mapping = file->f_mapping;
unsigned int offset, this_len;
struct page *page;
- pgoff_t index;
+ void *fsdata;
int ret;
/*
@@ -573,49 +573,16 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
if (unlikely(ret))
return ret;
- index = sd->pos >> PAGE_CACHE_SHIFT;
offset = sd->pos & ~PAGE_CACHE_MASK;
this_len = sd->len;
if (this_len + offset > PAGE_CACHE_SIZE)
this_len = PAGE_CACHE_SIZE - offset;
-find_page:
- page = find_lock_page(mapping, index);
- if (!page) {
- ret = -ENOMEM;
- page = page_cache_alloc_cold(mapping);
- if (unlikely(!page))
- goto out_ret;
-
- /*
- * This will also lock the page
- */
- ret = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL);
- if (unlikely(ret))
- goto out_release;
- }
-
- ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
- if (unlikely(ret)) {
- loff_t isize = i_size_read(mapping->host);
-
- if (ret != AOP_TRUNCATED_PAGE)
- unlock_page(page);
- page_cache_release(page);
- if (ret == AOP_TRUNCATED_PAGE)
- goto find_page;
-
- /*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
- */
- if (sd->pos + this_len > isize)
- vmtruncate(mapping->host, isize);
-
- goto out_ret;
- }
+ ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
+ AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+ if (unlikely(ret))
+ goto out;
if (buf->page != page) {
/*
@@ -629,31 +596,9 @@ find_page:
kunmap_atomic(dst, KM_USER1);
buf->ops->unmap(pipe, buf, src);
}
-
- ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
- if (ret) {
- if (ret == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
- goto find_page;
- }
- if (ret < 0)
- goto out;
- /*
- * Partial write has happened, so 'ret' already initialized by
- * number of bytes written, Where is nothing we have to do here.
- */
- } else
- ret = this_len;
- /*
- * Return the number of bytes written and mark page as
- * accessed, we are now done!
- */
- mark_page_accessed(page);
+ ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
+ page, fsdata);
out:
- unlock_page(page);
-out_release:
- page_cache_release(page);
-out_ret:
return ret;
}