diff options
author | Sage Weil <sage@inktank.com> | 2012-11-05 11:07:23 -0800 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2012-11-05 11:07:23 -0800 |
commit | 22cddde104d715600a4c218bf9224923208afe90 (patch) | |
tree | 82fc93d9c89ef41145e52ace63484047e600f866 /fs/ceph/file.c | |
parent | 4d1d0534f53863108fdea496288cb3310f88118d (diff) | |
download | lwn-22cddde104d715600a4c218bf9224923208afe90.tar.gz lwn-22cddde104d715600a4c218bf9224923208afe90.zip |
ceph: Fix i_size update race
ceph_aio_write() has an optimization that marks cap EPH_CAP_FILE_WR
dirty before data is copied to page cache and inode size is updated.
If ceph_check_caps() flushes the dirty cap before the inode size is
updated, MDS can miss the new inode size. The fix is move
ceph_{get,put}_cap_refs() into ceph_write_{begin,end}() and call
__ceph_mark_dirty_caps() after inode size is updated.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'fs/ceph/file.c')
-rw-r--r-- | fs/ceph/file.c | 73 |
1 files changed, 30 insertions, 43 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5840d2aaed15..d415096800a6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; loff_t endoff = pos + iov->iov_len; - int want, got = 0; - int ret, err; + int got = 0; + int ret, err, written; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; retry_snap: + written = 0; if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) return -ENOSPC; __ceph_do_pending_vmtruncate(inode); - dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - inode->i_size); - if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); - if (ret < 0) - goto out_put; - - dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); - - if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || - (fi->flags & CEPH_F_SYNC)) { - ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, - &iocb->ki_pos); - } else { - /* - * buffered write; drop Fw early to avoid slow - * revocation if we get stuck on balance_dirty_pages - */ - int dirty; - - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - ceph_put_cap_refs(ci, got); + /* + * try to do a buffered write. if we don't have sufficient + * caps, we'll get -EAGAIN from generic_file_aio_write, or a + * short write if we only get caps for some pages. + */ + if (!(iocb->ki_filp->f_flags & O_DIRECT) && + !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && + !(fi->flags & CEPH_F_SYNC)) { ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (ret >= 0) + written = ret; + if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); + err = vfs_fsync_range(file, pos, pos + written - 1, 1); if (err < 0) ret = err; } + if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) + goto out; + } - if (dirty) - __mark_inode_dirty(inode, dirty); + dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, inode->i_size); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); + if (ret < 0) goto out; - } + dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, ceph_cap_string(got)); + ret = ceph_sync_write(file, iov->iov_base + written, + iov->iov_len - written, &iocb->ki_pos); if (ret >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); @@ -777,13 +767,10 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); } - -out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); + inode, ceph_vinop(inode), pos + written, + (unsigned)iov->iov_len - written, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); - out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", |