From 72f465033702ebfe20db8f50edaad59f0f38b0f5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Aug 2010 13:35:09 +0200 Subject: bio-integrity.c: remove dependency on __GFP_NOFAIL The kmalloc() in bio_integrity_prep() is failable, so remove __GFP_NOFAIL from its mask. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 612a5c38d3c1..a8f4cc679983 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -413,7 +413,7 @@ int bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = sectors * blk_integrity_tuple_size(bi); - buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); return -EIO; -- cgit v1.2.3 From 220eb7fd984bfc7e6b4005fdf32efe9cd8af7cf2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 23 Aug 2010 13:36:20 +0200 Subject: fs/bio-integrity.c: return -ENOMEM on kmalloc failure Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index a8f4cc679983..4d0ff5ee27b8 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -416,7 +416,7 @@ int bio_integrity_prep(struct bio *bio) buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); - return -EIO; + return -ENOMEM; } end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; -- cgit v1.2.3 From b76b4014f9d988d2412b873e4d4c13c7f9afc4e4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 28 Aug 2010 08:52:10 +0200 Subject: writeback: Fix lost wake-up shutting down writeback thread Setting the task state here may cause us to miss the wake up from kthread_stop(), so we need to recheck kthread_should_stop() or risk sleeping forever in the following schedule(). Symptom was an indefinite hang on an NFSv4 mount. (NFSv4 may create multiple mounts in a temporary namespace while traversing the mount path, and since the temporary namespace is immediately destroyed, it may end up destroying a mount very soon after it was created, possibly making this race more likely.) INFO: task mount.nfs4:4314 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. mount.nfs4 D 0000000000000000 2880 4314 4313 0x00000000 ffff88001ed6da28 0000000000000046 ffff88001ed6dfd8 ffff88001ed6dfd8 ffff88001ed6c000 ffff88001ed6c000 ffff88001ed6c000 ffff88001e5003a0 ffff88001ed6dfd8 ffff88001e5003a8 ffff88001ed6c000 ffff88001ed6dfd8 Call Trace: [] schedule_timeout+0x1cd/0x2e0 [] ? mark_held_locks+0x6c/0xa0 [] ? _raw_spin_unlock_irq+0x30/0x60 [] ? trace_hardirqs_on_caller+0x14d/0x190 [] ? sub_preempt_count+0xe/0xd0 [] wait_for_common+0x120/0x190 [] ? default_wake_function+0x0/0x20 [] wait_for_completion+0x1d/0x20 [] kthread_stop+0x4a/0x150 [] ? thaw_process+0x70/0x80 [] bdi_unregister+0x10a/0x1a0 [] nfs_put_super+0x19/0x20 [] generic_shutdown_super+0x54/0xe0 [] kill_anon_super+0x16/0x60 [] nfs4_kill_super+0x39/0x90 [] deactivate_locked_super+0x45/0x60 [] deactivate_super+0x49/0x70 [] mntput_no_expire+0x84/0xe0 [] release_mounts+0x9f/0xc0 [] put_mnt_ns+0x65/0x80 [] nfs_follow_remote_path+0x1e6/0x420 [] nfs4_try_mount+0x6f/0xd0 [] nfs4_get_sb+0xa2/0x360 [] vfs_kern_mount+0x88/0x1f0 [] do_kern_mount+0x52/0x130 [] ? _lock_kernel+0x6a/0x170 [] do_mount+0x26e/0x7f0 [] ? copy_mount_options+0xea/0x190 [] sys_mount+0x98/0xf0 [] system_call_fastpath+0x16/0x1b 1 lock held by mount.nfs4/4314: #0: (&type->s_umount_key#24){+.+...}, at: [] deactivate_super+0x41/0x70 Signed-off-by: J. Bruce Fields Signed-off-by: Jens Axboe Acked-by: Artem Bityutskiy --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7d9d06ba184b..81e086d8aa57 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -808,7 +808,7 @@ int bdi_writeback_thread(void *data) wb->last_active = jiffies; set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list)) { + if (!list_empty(&bdi->work_list) || kthread_should_stop()) { __set_current_state(TASK_RUNNING); continue; } -- cgit v1.2.3 From 4afc31345e5f543e5d89a47aeadaaad1d91a5bc8 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 29 Aug 2010 01:55:38 +0900 Subject: nilfs2: fix leak of shadow dat inode in error path of load_nilfs If load_nilfs() gets an error while doing recovery, it will fail to free the shadow inode of dat (nilfs->ns_gc_dat). This fixes the leak issue. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/the_nilfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 4317f177ea7c..ba7c10c917fc 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) nilfs_mdt_destroy(nilfs->ns_cpfile); nilfs_mdt_destroy(nilfs->ns_sufile); nilfs_mdt_destroy(nilfs->ns_dat); + nilfs_mdt_destroy(nilfs->ns_gc_dat); failed: nilfs_clear_recovery_info(&ri); -- cgit v1.2.3 From 8f587df479c3cea14ba1a9b9d58f34fd2fd6d58b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Aug 2010 16:27:45 +0000 Subject: 9p: potential ERR_PTR() dereference p9_client_walk() can return error values if we run out of space or there is a problem with the network. Signed-off-by: Dan Carpenter Signed-off-by: Eric Van Hensbergen --- fs/9p/fid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 358563689064..6406f896bf95 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) } kfree(wnames); fid_out: - v9fs_fid_add(dentry, fid); + if (!IS_ERR(fid)) + v9fs_fid_add(dentry, fid); err_out: up_read(&v9ses->rename_sem); return fid; -- cgit v1.2.3 From 9bc08a45fb117c696e4940cfa1208cb1cc7a2f25 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Sep 2010 15:14:38 +1000 Subject: xfs: improve buffer cache hash scalability When doing large parallel file creates on a 16p machines, large amounts of time is being spent in _xfs_buf_find(). A system wide profile with perf top shows this: 1134740.00 19.3% _xfs_buf_find 733142.00 12.5% __ticket_spin_lock The problem is that the hash contains 45,000 buffers, and the hash table width is only 256 buffers. That means we've got around 200 buffers per chain, and searching it is quite expensive. The hash table size needs to increase. Secondly, every time we do a lookup, we promote the buffer we find to the head of the hash chain. This is causing cachelines to be dirtied and causes invalidation of cachelines across all CPUs that may have walked the hash chain recently. hence every walk of the hash chain is effectively a cold cache walk. Remove the promotion to avoid this invalidation. The results are: 1045043.00 21.2% __ticket_spin_lock 326184.00 6.6% _xfs_buf_find A 70% drop in the CPU usage when looking up buffers. Unfortunately that does not result in an increase in performance underthis workload as contention on the inode_lock soaks up most of the reduction in CPU usage. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_buf.c | 8 +------- fs/xfs/linux-2.6/xfs_buf.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ea79072f5210..d72cf2bb054a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -440,12 +440,7 @@ _xfs_buf_find( ASSERT(btp == bp->b_target); if (bp->b_file_offset == range_base && bp->b_buffer_length == range_length) { - /* - * If we look at something, bring it to the - * front of the list for next time. - */ atomic_inc(&bp->b_hold); - list_move(&bp->b_hash_list, &hash->bh_list); goto found; } } @@ -1443,8 +1438,7 @@ xfs_alloc_bufhash( { unsigned int i; - btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ - btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; + btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); for (i = 0; i < (1 << btp->bt_hashshift); i++) { diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index d072e5ff923b..2a05614f0b92 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -137,7 +137,6 @@ typedef struct xfs_buftarg { size_t bt_smask; /* per device buffer hash table */ - uint bt_hashmask; uint bt_hashshift; xfs_bufhash_t *bt_hash; -- cgit v1.2.3 From 23963e54ce187ca6e907c83176c15508b0f6e60d Mon Sep 17 00:00:00 2001 From: Arkadiusz Mi?kiewicz Date: Thu, 26 Aug 2010 10:19:43 +0000 Subject: xfs: Disallow 32bit project quota id Currently on-disk structure is able to keep only 16bit project quota id, so disallow 32bit ones. This fixes a problem where parts of kernel structures holding project quota id are 32bit while parts (on-disk) are 16bit variables which causes project quota member files to be inaccessible for some operations (like mv/rm). Signed-off-by: Arkadiusz Mi?kiewicz Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 237f5ffb2ee8..4fec427b83ef 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -906,6 +906,13 @@ xfs_ioctl_setattr( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + /* + * Disallow 32bit project ids because on-disk structure + * is 16bit only. + */ + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) + return XFS_ERROR(EINVAL); + /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later -- cgit v1.2.3 From 8f34a430ac16d5fbd9d6b383184d35e152f5a963 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 2 Sep 2010 15:23:16 -0400 Subject: nfsd4: mask out non-access bits in nfs4_access_to_omode This fixes an unnecessary BUG(). Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3dfef0623968..cf0d2ffb3c84 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -440,7 +440,7 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { static int nfs4_access_to_omode(u32 access) { - switch (access) { + switch (access & NFS4_SHARE_ACCESS_BOTH) { case NFS4_SHARE_ACCESS_READ: return O_RDONLY; case NFS4_SHARE_ACCESS_WRITE: -- cgit v1.2.3 From 72656c46f50b8dfe50e15793692982e636e3df20 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 3 Sep 2010 12:19:33 +1000 Subject: xfs: prevent 32bit overflow in space reservation If we attempt to preallocate more than 2^32 blocks of space in a single syscall, the transaction block reservation will overflow leading to a hangs in the superblock block accounting code. This is trivially reproduced with xfs_io. Fix the problem by capping the allocation reservation to the maximum number of blocks a single xfs_bmapi() call can allocate (2^21 blocks). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 66d585c6917c..4c7c7bfb2b2f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2299,15 +2299,22 @@ xfs_alloc_file_space( e = allocatesize_fsb; } + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = qblocks = (uint)(e - s); + resrtextents = qblocks = resblks; resrtextents /= mp->m_sb.sb_rextsize; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); quota_flag = XFS_QMOPT_RES_RTBLKS; } else { resrtextents = 0; - resblks = qblocks = \ - XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); quota_flag = XFS_QMOPT_RES_REGBLKS; } -- cgit v1.2.3 From 9af25465081480a75824fd7a16a37a5cfebeede9 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 30 Aug 2010 02:44:03 +0000 Subject: xfs: Make fiemap work with sparse files In xfs_vn_fiemap, we set bvm_count to fi_extent_max + 1 and want to return fi_extent_max extents, but actually it won't work for a sparse file. The reason is that in xfs_getbmap we will calculate holes and set it in 'out', while out is malloced by bmv_count(fi_extent_max+1) which didn't consider holes. So in the worst case, if 'out' vector looks like [hole, extent, hole, extent, hole, ... hole, extent, hole], we will only return half of fi_extent_max extents. This patch add a new parameter BMV_IF_NO_HOLES for bvm_iflags. So with this flags, we don't use our 'out' in xfs_getbmap for a hole. The solution is a bit ugly by just don't increasing index of 'out' vector. I felt that it is not easy to skip it at the very beginning since we have the complicated check and some function like xfs_getbmapx_fix_eof_hole to adjust 'out'. Cc: Dave Chinner Signed-off-by: Tao Ma Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_iops.c | 2 +- fs/xfs/xfs_bmap.c | 14 +++++++++++++- fs/xfs/xfs_fs.h | 4 +++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 68be25dcd301..b1fc2a6bfe83 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -664,7 +664,7 @@ xfs_vn_fiemap( fieinfo->fi_extents_max + 1; bm.bmv_count = min_t(__s32, bm.bmv_count, (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC; + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 23f14e595c18..f90dadd5a968 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5533,12 +5533,24 @@ xfs_getbmap( map[i].br_startblock)) goto out_free_map; - nexleft--; bmv->bmv_offset = out[cur_ext].bmv_offset + out[cur_ext].bmv_length; bmv->bmv_length = max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; bmv->bmv_entries++; cur_ext++; } diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 7cf7220e7d5f..87c2e9d02288 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -114,8 +114,10 @@ struct getbmapx { #define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ #define BMV_IF_VALID \ - (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) /* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ -- cgit v1.2.3 From 57f9bdac2510cd7fda58e4a111d250861eb1ebeb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2010 09:12:29 +0200 Subject: sysfs: checking for NULL instead of ERR_PTR d_path() returns an ERR_PTR and it doesn't return NULL. Signed-off-by: Dan Carpenter Cc: stable Reviewed-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b27b5688f62..da3fefe91a8f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) char *p; p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (p) + if (!IS_ERR(p)) memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ -- cgit v1.2.3 From 595afaf9e6ee1b48e13ec4b8bcc8c7dee888161a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 7 Sep 2010 13:42:41 +0200 Subject: fuse: flush background queue on connection close David Bartly reported that fuse can hang in fuse_get_req_nofail() when the connection to the filesystem server is no longer active. If bg_queue is not empty then flush_bg_queue() called from request_end() can put more requests on to the pending queue. If this happens while ending requests on the processing queue then those background requests will be queued to the pending list and never ended. Another problem is that fuse_dev_release() didn't wake up processes sleeping on blocked_waitq. Solve this by: a) flushing the background queue before calling end_requests() on the pending and processing queues b) setting blocked = 0 and waking up processes waiting on blocked_waitq() Thanks to David for an excellent bug report. Reported-by: David Bartley Signed-off-by: Miklos Szeredi CC: stable@kernel.org --- fs/fuse/dev.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 69ad053ffd78..b4fc47ff4bc2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1769,6 +1769,14 @@ __acquires(&fc->lock) } } +static void end_queued_requests(struct fuse_conn *fc) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); +} + /* * Abort all requests. * @@ -1795,8 +1803,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; end_io_requests(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + end_queued_requests(fc); wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -1811,8 +1818,9 @@ int fuse_dev_release(struct inode *inode, struct file *file) if (fc) { spin_lock(&fc->lock); fc->connected = 0; - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + fc->blocked = 0; + end_queued_requests(fc); + wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); fuse_conn_put(fc); } -- cgit v1.2.3 From b9ca67b2ddf021491a3741d9555e8ff59b2175ba Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 7 Sep 2010 13:42:41 +0200 Subject: fuse: fix lock annotations Sparse doesn't understand lock annotations of the form __releases(&foo->lock). Change them to __releases(foo->lock). Same for __acquires(). Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 26 ++++++++++++++------------ fs/fuse/file.c | 8 ++++---- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4fc47ff4bc2..d367af1514ef 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc) * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) +__releases(fc->lock) { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -306,8 +306,8 @@ __releases(&fc->lock) static void wait_answer_interruptible(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { if (signal_pending(current)) return; @@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { if (!fc->no_interrupt) { /* Any signal may interrupt this */ @@ -905,8 +905,8 @@ static int request_pending(struct fuse_conn *fc) /* Wait until a request is available on the pending list */ static void request_wait(struct fuse_conn *fc) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { DECLARE_WAITQUEUE(wait, current); @@ -934,7 +934,7 @@ __acquires(&fc->lock) */ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(&fc->lock) +__releases(fc->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1720,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { while (!list_empty(head)) { struct fuse_req *req; @@ -1744,8 +1744,8 @@ __acquires(&fc->lock) * locked). */ static void end_io_requests(struct fuse_conn *fc) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { while (!list_empty(&fc->io)) { struct fuse_req *req = @@ -1770,6 +1770,8 @@ __acquires(&fc->lock) } static void end_queued_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) { fc->max_background = UINT_MAX; flush_bg_queue(fc); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 147c1f71bdb9..c8224587123f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) /* Called under fc->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); loff_t size = i_size_read(req->inode); @@ -1183,8 +1183,8 @@ __acquires(&fc->lock) * Called with fc->lock */ void fuse_flush_writepages(struct inode *inode) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); -- cgit v1.2.3 From 7a2e8a8faab76386d8eaae9ded739ee5615be174 Mon Sep 17 00:00:00 2001 From: Valerie Aurora Date: Thu, 26 Aug 2010 11:07:22 -0700 Subject: VFS: Sanity check mount flags passed to change_mnt_propagation() Sanity check the flags passed to change_mnt_propagation(). Exactly one flag should be set. Return EINVAL otherwise. Userspace can pass in arbitrary combinations of MS_* flags to mount(). do_change_type() is called if any of MS_SHARED, MS_PRIVATE, MS_SLAVE, or MS_UNBINDABLE is set. do_change_type() clears MS_REC and then calls change_mnt_propagation() with the rest of the user-supplied flags. change_mnt_propagation() clearly assumes only one flag is set but do_change_type() does not check that this is true. For example, mount() with flags MS_SHARED | MS_RDONLY does not actually make the mount shared or read-only but does clear MNT_UNBINDABLE. Signed-off-by: Valerie Aurora Signed-off-by: Linus Torvalds --- fs/namespace.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index de402eb6eafb..a72eaabfe8f2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1483,6 +1483,23 @@ out_unlock: return err; } +/* + * Sanity check the flags to change_mnt_propagation. + */ + +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~MS_REC; + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; +} + /* * recursively change the type of the mountpoint. */ @@ -1490,7 +1507,7 @@ static int do_change_type(struct path *path, int flag) { struct vfsmount *m, *mnt = path->mnt; int recurse = flag & MS_REC; - int type = flag & ~MS_REC; + int type; int err = 0; if (!capable(CAP_SYS_ADMIN)) @@ -1499,6 +1516,10 @@ static int do_change_type(struct path *path, int flag) if (path->dentry != path->mnt->mnt_root) return -EINVAL; + type = flags_to_propagation_type(flag); + if (!type) + return -EINVAL; + down_write(&namespace_sem); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); -- cgit v1.2.3 From dc696aced9f09f05b1f927b93f5a7918017a3e49 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 12 Aug 2010 16:24:25 -0700 Subject: ocfs2: Fix metaecc error messages Like tools, the checksum validate function now prints the values in hex. Signed-off-by: Sunil Mushran Singed-off-by: Tao Ma --- fs/ocfs2/blockcheck.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index ec6d12339593..c7ee03c22226 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, - "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ @@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, goto out; } - mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); rc = -EIO; -- cgit v1.2.3 From f5ce5a08a40f2086435858ddc80cb40394b082eb Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 12 Aug 2010 16:24:26 -0700 Subject: ocfs2: Fix incorrect checksum validation error For local mounts, ocfs2_read_locked_inode() calls ocfs2_read_blocks_sync() to read the inode off the disk. The latter first checks to see if that block is cached in the journal, and, if so, returns that block. That is ok. But ocfs2_read_locked_inode() goes wrong when it tries to validate the checksum of such blocks. Blocks that are cached in the journal may not have had their checksum computed as yet. We should not validate the checksums of such blocks. Fixes ossbz#1282 http://oss.oracle.com/bugzilla/show_bug.cgi?id=1282 Signed-off-by: Sunil Mushran Cc: stable@kernel.org Singed-off-by: Tao Ma --- fs/ocfs2/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0492464916b1..eece3e05d9d0 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); - if (!status) + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) status = ocfs2_validate_inode_block(osb->sb, bh); } if (status < 0) { -- cgit v1.2.3 From f63afdb2c32db850fa1bfccf84643a8885cbeb61 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 17 Jul 2010 21:45:49 +0800 Subject: ocfs2: make __ocfs2_page_mkwrite handle file end properly. __ocfs2_page_mkwrite now is broken in handling file end. 1. the last page should be the page contains i_size - 1. 2. the len in the last page is also calculated wrong. So change them accordingly. Acked-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/mmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af2b8fe1f139..4c18f4ad93b4 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -74,9 +74,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, /* * Another node might have truncated while we were waiting on * cluster locks. + * We don't check size == 0 before the shift. This is borrowed + * from do_generic_file_read. */ - last_index = size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) { + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!size || page->index > last_index)) { ret = -EINVAL; goto out; } @@ -107,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, * because the "write" would invalidate their data. */ if (page->index == last_index) - len = size & ~PAGE_CACHE_MASK; + len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); -- cgit v1.2.3 From 04eda1a18019bb387dc7e97ee99979dd88dc608a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 5 Aug 2010 20:32:45 +0200 Subject: ocfs2: Flush drive's caches on fdatasync When 'barrier' mount option is specified, we have to issue a cache flush during fdatasync(2). We have to do this even if inode doesn't have I_DIRTY_DATASYNC set because we still have to get written *data* to disk so that they are not lost in case of crash. Acked-by: Tao Ma Signed-off-by: Jan Kara Singed-off-by: Tao Ma --- fs/ocfs2/file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 81296b4e3646..6b2be0f2eacd 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -36,6 +36,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_INODE #include @@ -190,8 +191,16 @@ static int ocfs2_sync_file(struct file *file, int datasync) if (err) goto bail; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { + /* + * We still have to flush drive's caches to get data to the + * platter + */ + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); goto bail; + } journal = osb->journal->j_journal; err = jbd2_journal_force_commit(journal); -- cgit v1.2.3 From 889f004a8c83d515f275078687f859bc0d5ede9d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 2 Sep 2010 13:10:10 +0800 Subject: ocfs2: Use the right group in nfs sync check. We have added discontig block group now, and now an inode can be allocated in an discontig block group. So get it in ocfs2_get_suballoc_slot_bit. The old ocfs2_test_suballoc_bit gets group block no from the allocation inode which is wrong. Fix it by passing the right group. Acked-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/suballoc.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index a8e6a95a353f..8a009ee1f7fd 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2567,7 +2567,8 @@ out: * suballoc_bit. */ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, - u16 *suballoc_slot, u16 *suballoc_bit) + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) { int status; struct buffer_head *inode_bh = NULL; @@ -2604,6 +2605,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); if (suballoc_bit) *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); bail: brelse(inode_bh); @@ -2621,7 +2624,8 @@ bail: */ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, - struct buffer_head *alloc_bh, u64 blkno, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, u16 bit, int *res) { struct ocfs2_dinode *alloc_di; @@ -2642,10 +2646,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, goto bail; } - if (alloc_di->i_suballoc_loc) - bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); - else - bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + bg_blkno = group_blkno ? group_blkno : + ocfs2_which_suballoc_group(blkno, bit); status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, &group_bh); if (status < 0) { @@ -2680,6 +2682,7 @@ bail: int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) { int status; + u64 group_blkno = 0; u16 suballoc_bit = 0, suballoc_slot = 0; struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; @@ -2687,7 +2690,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog_entry("blkno: %llu", (unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, - &suballoc_bit); + &group_blkno, &suballoc_bit); if (status < 0) { mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); goto bail; @@ -2715,7 +2718,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) } status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, - blkno, suballoc_bit, res); + group_blkno, blkno, suballoc_bit, res); if (status < 0) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); -- cgit v1.2.3 From b2b6ebf5f740e015b2155343958f067e594323ea Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 26 Aug 2010 13:06:50 -0700 Subject: ocfs2: properly set and use inode group alloc hint We were setting ac->ac_last_group in ocfs2_claim_suballoc_bits from res->sr_bg_blkno. Unfortunately, res->sr_bg_blkno is going to be zero under normal (non-fragmented) circumstances. The discontig block group patches effectively turned off that feature. Fix this by correctly calculating what the next group hint should be. Acked-by: Tao Ma Signed-off-by: Mark Fasheh Tested-by: Goldwyn Rodrigues Signed-off-by: Tao Ma --- fs/ocfs2/suballoc.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8a009ee1f7fd..b93d7e72175a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -62,6 +62,17 @@ struct ocfs2_suballoc_result { unsigned int sr_bits; /* How many bits we claimed */ }; +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -1845,6 +1856,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, int status; u16 victim, i; u16 bits_left = 0; + u64 hint = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1872,7 +1884,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, goto bail; } - res->sr_bg_blkno = ac->ac_last_group; + res->sr_bg_blkno = hint; if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used @@ -1896,8 +1908,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); goto set_hint; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1920,8 +1934,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); break; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1936,7 +1952,7 @@ set_hint: if (bits_left < min_bits) ac->ac_last_group = 0; else - ac->ac_last_group = res->sr_bg_blkno; + ac->ac_last_group = hint; } bail: -- cgit v1.2.3 From 9b4c0ff32ccd87ab52d4c5bd0a0536febce11370 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Aug 2010 14:28:03 +0200 Subject: ocfs2: Fix deadlock when allocating page We cannot call grab_cache_page() when holding filesystem locks or with a transaction started as grab_cache_page() calls page allocation with GFP_KERNEL flag and thus page reclaim can recurse back into the filesystem causing deadlocks or various assertion failures. We have to use find_or_create_page() instead and pass it GFP_NOFS as we do with other allocations. Acked-by: Mark Fasheh Signed-off-by: Jan Kara Signed-off-by: Tao Ma --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/file.c | 2 +- fs/ocfs2/refcounttree.c | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 215e12ce1d85..592fae5007d1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; do { - pages[numpages] = grab_cache_page(mapping, index); + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { ret = -ENOMEM; mlog_errno(ret); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6b2be0f2eacd..2caa3a7a1a39 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -783,7 +783,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { ret = -ENOMEM; mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 73a11ccfd4c2..0afeda83120f 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2960,7 +2960,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_CACHE_SIZE - 1)) to = map_end & (PAGE_CACHE_SIZE - 1); - page = grab_cache_page(mapping, page_index); + page = find_or_create_page(mapping, page_index, GFP_NOFS); /* * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page @@ -3179,7 +3179,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb, if (map_end > end) map_end = end; - page = grab_cache_page(context->inode->i_mapping, page_index); + page = find_or_create_page(context->inode->i_mapping, + page_index, GFP_NOFS); BUG_ON(!page); wait_on_page_writeback(page); -- cgit v1.2.3 From 81c8c82b5a39f9127e8b239e9b406a6c3a41b228 Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Thu, 19 Aug 2010 15:15:00 +0800 Subject: Ocfs2: Fix a regression bug from mainline commit(6b933c8e6f1a2f3118082c455eef25f9b1ac7b45). The patch is to fix the regression bug brought from commit 6b933c8...( 'ocfs2: Avoid direct write if we fall back to buffered I/O'): http://oss.oracle.com/bugzilla/show_bug.cgi?id=1285 The commit 6b933c8e6f1a2f3118082c455eef25f9b1ac7b45 changed __generic_file_aio_write to generic_file_buffered_write, which didn't call filemap_{write,wait}_range to flush the pagecaches when we were falling O_DIRECT writes back to buffered ones. it did hurt the O_DIRECT semantics somehow in extented odirect writes. This patch tries to guarantee O_DIRECT writes of 'fall back to buffered' to be correctly flushed. Signed-off-by: Tristan Ye Signed-off-by: Tao Ma --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2caa3a7a1a39..9a03c151b5ce 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2338,7 +2338,7 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || - ((file->f_flags & O_DIRECT) && has_refcount)) { + ((file->f_flags & O_DIRECT) && !direct_io)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) -- cgit v1.2.3 From 021960cab320ae3cc4e9aba9cca42f9f5ce785f3 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:15 -0700 Subject: ocfs2: split out inode alloc code from ocfs2_mknod_locked Do this by splitting the bulk of the function away from the inode allocation code at the very tom of ocfs2_mknod_locked(). Existing callers don't need to change and won't see any difference. The new function created, __ocfs2_mknod_locked() will be used shortly. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/namei.c | 55 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f171b51a74f7..2aa66b695fab 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -472,32 +472,23 @@ leave: return status; } -static int ocfs2_mknod_locked(struct ocfs2_super *osb, - struct inode *dir, - struct inode *inode, - dev_t dev, - struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *inode_ac) +static int __ocfs2_mknod_locked(struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac, + u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) { int status = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; - u64 suballoc_loc, fe_blkno = 0; - u16 suballoc_bit; u16 feat; *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, - inode_ac, &suballoc_loc, - &suballoc_bit, &fe_blkno); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* populate as many fields early on as possible - many of * these are used by the support functions here and in * callers. */ @@ -591,6 +582,34 @@ leave: return status; } +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + u64 suballoc_loc, fe_blkno = 0; + u16 suballoc_bit; + + *new_fe_bh = NULL; + + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); + if (status < 0) { + mlog_errno(status); + return status; + } + + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + parent_fe_bh, handle, inode_ac, + fe_blkno, suballoc_loc, suballoc_bit); +} + static int ocfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) -- cgit v1.2.3 From d51349829c378c06ba4aa7d4b16ca23739858608 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:16 -0700 Subject: ocfs2: use ocfs2_alloc_dinode_update_counts() instead of open coding ocfs2_search_chain() makes the same updates as ocfs2_alloc_dinode_update_counts to the alloc inode. Instead of open coding the bitmap update, use our helper function. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/suballoc.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index b93d7e72175a..e7edda8c6a11 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1719,7 +1719,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; - u32 tmp_used; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1807,22 +1806,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } } - /* Ok, claim our bits now: set the info on dinode, chainlist - * and then the group */ - status = ocfs2_journal_access_di(handle, - INODE_CACHE(alloc_inode), - ac->ac_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { mlog_errno(status); goto bail; } - tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); - fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits); - ocfs2_journal_dirty(handle, ac->ac_bh); - status = ocfs2_block_group_set_bits(handle, alloc_inode, bg, -- cgit v1.2.3 From e49e27674d1dd2717ad90b21ece8f83102153315 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:17 -0700 Subject: ocfs2: allow return of new inode block location before allocation of the inode This allows code which needs to know the eventual block number of an inode but can't allocate it yet due to transaction or lock ordering. For example, ocfs2_create_inode_in_orphan() currently gives a junk blkno for preparation of the orphan dir because it can't yet know where the actual inode is placed - that code is actually in ocfs2_mknod_locked. This is a problem when the orphan dirs are indexed as the junk inode number will create an index entry which goes unused (and fails the later removal from the orphan dir). Now with these interfaces, ocfs2_create_inode_in_orphan() can run the block group search (and get back the inode block number) *before* any actual allocation occurs. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/suballoc.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/suballoc.h | 21 +++++++ 2 files changed, 180 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index e7edda8c6a11..8a286f54dca1 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -57,6 +57,12 @@ struct ocfs2_suballoc_result { u64 sr_bg_blkno; /* The bg we allocated from. Set to 0 when a block group is contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ @@ -149,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -1689,6 +1699,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (!ret) ocfs2_bg_discontig_fix_result(ac, gd, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, le16_to_cpu(gd->bg_chain)); @@ -1702,6 +1721,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (ret < 0) mlog_errno(ret); +out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); out: @@ -1780,6 +1800,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, if (!status) ocfs2_bg_discontig_fix_result(ac, bg, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; /* * Keep track of previous block descriptor read. When @@ -1806,6 +1831,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } } + if (ac->ac_find_loc_only) + goto out_loc_only; + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, chain); @@ -1828,6 +1856,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, (unsigned long long)le64_to_cpu(fe->i_blkno)); +out_loc_only: *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: brelse(group_bh); @@ -2023,6 +2052,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir, OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; } +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, + (unsigned long long)di_blkno); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a017dd3ee7d9..b8afabfeede4 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -56,6 +56,9 @@ struct ocfs2_alloc_context { u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + int ac_find_loc_only; /* hack for reflink operation ordering */ + struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ + struct ocfs2_alloc_reservation *ac_resv; }; @@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, struct ocfs2_alloc_context **meta_ac); int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); + + + +/* + * The following two interfaces are for ocfs2_create_inode_in_orphan(). + */ +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno); + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno); + #endif /* _CHAINALLOC_H_ */ -- cgit v1.2.3 From dd43bcde23c527f64897eef41aa1fed2c9905ea9 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:18 -0700 Subject: ocfs2: split out ocfs2_prepare_orphan_dir() into locking and prep functions We do this because ocfs2_create_inode_in_orphan() wants to order locking of the orphan dir with respect to locking of the inode allocator *before* making any changes to the directory. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/namei.c | 120 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2aa66b695fab..54c629855357 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1871,61 +1871,117 @@ bail: return status; } -static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, - struct inode **ret_orphan_dir, - u64 blkno, - char *name, - struct ocfs2_dir_lookup_result *lookup) +static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + struct buffer_head **ret_orphan_dir_bh) { struct inode *orphan_dir_inode; struct buffer_head *orphan_dir_bh = NULL; - int status = 0; - - status = ocfs2_blkno_stringify(blkno, name); - if (status < 0) { - mlog_errno(status); - return status; - } + int ret = 0; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -ENOENT; - mlog_errno(status); - return status; + ret = -ENOENT; + mlog_errno(ret); + return ret; } mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); - if (status < 0) { - mlog_errno(status); - goto leave; + ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (ret < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + mlog_errno(ret); + return ret; } - status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, - orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); - if (status < 0) { - ocfs2_inode_unlock(orphan_dir_inode, 1); + *ret_orphan_dir = orphan_dir_inode; + *ret_orphan_dir_bh = orphan_dir_bh; - mlog_errno(status); - goto leave; + return 0; +} + +static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + + ret = ocfs2_blkno_stringify(blkno, name); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, lookup); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return 0; +} + +/** + * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for + * insertion of an orphan. + * @osb: ocfs2 file system + * @ret_orphan_dir: Orphan dir inode - returned locked! + * @blkno: Actual block number of the inode to be inserted into orphan dir. + * @lookup: dir lookup result, to be passed back into functions like + * ocfs2_orphan_add + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode, + &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, + blkno, name, lookup); + if (ret < 0) { + mlog_errno(ret); + goto out; } *ret_orphan_dir = orphan_dir_inode; -leave: - if (status) { +out: + brelse(orphan_dir_bh); + + if (ret) { + ocfs2_inode_unlock(orphan_dir_inode, 1); mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); } - brelse(orphan_dir_bh); - - mlog_exit(status); - return status; + mlog_exit(ret); + return ret; } static int ocfs2_orphan_add(struct ocfs2_super *osb, -- cgit v1.2.3 From 97b8f4a9dfd932997677136e11980eb2fafea91d Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:19 -0700 Subject: ocfs2: Fix orphan add in ocfs2_create_inode_in_orphan ocfs2_create_inode_in_orphan() is used by reflink to create the newly reflinked inode simultaneously in the orphan dir. This allows us to easily handle partially-reflinked files during recovery cleanup. We have a problem though - the orphan dir stringifies inode # to determine a unique name under which the orphan entry dirent can be created. Since ocfs2_create_inode_in_orphan() needs the space allocated in the orphan dir before it can allocate the inode, we currently call into the orphan code: /* * We give the orphan dir the root blkno to fake an orphan name, * and allocate enough space for our insertion. */ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, osb->root_blkno, orphan_name, &orphan_insert); Using osb->root_blkno might work fine on unindexed directories, but the orphan dir can have an index. When it has that index, the above code fails to allocate the proper index entry. Later, when we try to remove the file from the orphan dir (using the actual inode #), the reflink operation will fail. To fix this, I created a function ocfs2_alloc_orphaned_file() which uses the newly split out orphan and inode alloc code to figure out what the inode block number will be (once allocated) and then prepare the orphan dir from that data. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/namei.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 54c629855357..a00dda2e4f16 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2128,6 +2128,99 @@ leave: return status; } +/** + * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly + * allocated file. This is different from the typical 'add to orphan dir' + * operation in that the inode does not yet exist. This is a problem because + * the orphan dir stringifies the inode block number to come up with it's + * dirent. Obviously if the inode does not yet exist we have a chicken and egg + * problem. This function works around it by calling deeper into the orphan + * and suballoc code than other callers. Use this only by necessity. + * @dir: The directory which this inode will ultimately wind up under - not the + * orphan dir! + * @dir_bh: buffer_head the @dir inode block + * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled + * with the string to be used for orphan dirent. Pass back to the orphan dir + * code. + * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan + * dir code. + * @ret_di_blkno: block number where the new inode will be allocated. + * @orphan_insert: Dir insert context to be passed back into orphan dir code. + * @ret_inode_ac: Inode alloc context to be passed back to the allocator. + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prep_new_orphaned_file(struct inode *dir, + struct buffer_head *dir_bh, + char *orphan_name, + struct inode **ret_orphan_dir, + u64 *ret_di_blkno, + struct ocfs2_dir_lookup_result *orphan_insert, + struct ocfs2_alloc_context **ret_inode_ac) +{ + int ret; + u64 di_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct inode *orphan_dir = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* reserve an inode spot */ + ret = ocfs2_reserve_new_inode(osb, &inode_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac, + &di_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, + di_blkno, orphan_name, orphan_insert); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (ret == 0) { + *ret_orphan_dir = orphan_dir; + *ret_di_blkno = di_blkno; + *ret_inode_ac = inode_ac; + /* + * orphan_name and orphan_insert are already up to + * date via prepare_orphan_dir + */ + } else { + /* Unroll reserve_new_inode* */ + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + /* Unroll orphan dir locking */ + mutex_unlock(&orphan_dir->i_mutex); + ocfs2_inode_unlock(orphan_dir, 1); + iput(orphan_dir); + } + + brelse(orphan_dir_bh); + + return 0; +} + int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode) @@ -2143,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct buffer_head *new_di_bh = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + u64 uninitialized_var(di_blkno), suballoc_loc; + u16 suballoc_bit; status = ocfs2_inode_lock(dir, &parent_di_bh, 1); if (status < 0) { @@ -2151,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, return status; } - /* - * We give the orphan dir the root blkno to fake an orphan name, - * and allocate enough space for our insertion. - */ - status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, - osb->root_blkno, - orphan_name, &orphan_insert); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - /* reserve an inode spot */ - status = ocfs2_reserve_new_inode(osb, &inode_ac); + status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh, + orphan_name, &orphan_dir, + &di_blkno, &orphan_insert, &inode_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -2191,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; did_quota_inode = 1; - inode->i_nlink = 0; - /* do the real work now. */ - status = ocfs2_mknod_locked(osb, dir, inode, - 0, &new_di_bh, parent_di_bh, handle, - inode_ac); + status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac, + &suballoc_loc, + &suballoc_bit, di_blkno); if (status < 0) { mlog_errno(status); goto leave; } - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); + inode->i_nlink = 0; + /* do the real work now. */ + status = __ocfs2_mknod_locked(dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac, di_blkno, suballoc_loc, + suballoc_bit); if (status < 0) { mlog_errno(status); goto leave; -- cgit v1.2.3 From 39aa3cb3e8250db9188a6f1e3fb62ffa1a717678 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Tue, 31 Aug 2010 15:52:27 +0200 Subject: mm: Move vma_stack_continue into mm.h So it can be used by all that need to check for that. Signed-off-by: Stefan Bader Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- include/linux/mm.h | 6 ++++++ mm/mlock.c | 6 ------ 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 439fc1f1c1c4..271afc48b9a5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -224,7 +224,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; if (vma->vm_flags & VM_GROWSDOWN) - start += PAGE_SIZE; + if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) + start += PAGE_SIZE; seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", start, diff --git a/include/linux/mm.h b/include/linux/mm.h index e6b1210772ce..74949fbef8c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -864,6 +864,12 @@ int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); +/* Is the vma a continuation of the stack vma above it? */ +static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) +{ + return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); +} + extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len); diff --git a/mm/mlock.c b/mm/mlock.c index cbae7c5b9568..b70919ce4f72 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -135,12 +135,6 @@ void munlock_vma_page(struct page *page) } } -/* Is the vma a continuation of the stack vma above it? */ -static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) -{ - return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); -} - static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) { return (vma->vm_flags & VM_GROWSDOWN) && -- cgit v1.2.3 From 7a801ac6f5067539ceb5fad0fe90ec49fc156e47 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 9 Sep 2010 16:37:33 -0700 Subject: O_DIRECT: fix the splitting up of contiguous I/O commit c2c6ca4 (direct-io: do not merge logically non-contiguous requests) introduced a bug whereby all O_DIRECT I/Os were submitted a page at a time to the block layer. The problem is that the code expected dio->block_in_file to correspond to the current page in the dio. In fact, it corresponds to the previous page submitted via submit_page_section. This was purely an oversight, as the dio->cur_page_fs_offset field was introduced for just this purpose. This patch simply uses the correct variable when calculating whether there is a mismatch between contiguous logical blocks and contiguous physical blocks (as described in the comments). I also switched the if conditional following this check to an else if, to ensure that we never call dio_bio_submit twice for the same dio (in theory, this should not happen, anyway). I've tested this by running blktrace and verifying that a 64KB I/O was submitted as a single I/O. I also ran the patched kernel through xfstests' aio tests using xfs, ext4 (with 1k and 4k block sizes) and btrfs and verified that there were no regressions as compared to an unpatched kernel. Signed-off-by: Jeff Moyer Acked-by: Josef Bacik Cc: Christoph Hellwig Cc: Chris Mason Cc: [2.6.35.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 51f270b479b6..48d74c7391d1 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -634,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio) int ret = 0; if (dio->bio) { - loff_t cur_offset = dio->block_in_file << dio->blkbits; + loff_t cur_offset = dio->cur_page_fs_offset; loff_t bio_next_offset = dio->logical_offset_in_bio + dio->bio->bi_size; @@ -659,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio) * Submit now if the underlying fs is about to perform a * metadata read */ - if (dio->boundary) + else if (dio->boundary) dio_bio_submit(dio); } -- cgit v1.2.3 From ed430fec756ad65f7cfba24f8ad17c3d5a403290 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 9 Sep 2010 16:37:36 -0700 Subject: proc: export uncached bit properly in /proc/kpageflags Fix the left-over old ifdef for PG_uncached in /proc/kpageflags. Now it's used by x86, too. Signed-off-by: Takashi Iwai Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 180cf5a0bd67..3b8b45660331 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -146,7 +146,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif -- cgit v1.2.3 From ee3aebdd8f5f8eac41c25c80ceee3d728f920f3b Mon Sep 17 00:00:00 2001 From: Jan Sembera Date: Thu, 9 Sep 2010 16:37:54 -0700 Subject: binfmt_misc: fix binfmt_misc priority Commit 74641f584da ("alpha: binfmt_aout fix") (May 2009) introduced a regression - binfmt_misc is now consulted after binfmt_elf, which will unfortunately break ia32el. ia32 ELF binaries on ia64 used to be matched using binfmt_misc and executed using wrapper. As 32bit binaries are now matched by binfmt_elf before bindmt_misc kicks in, the wrapper is ignored. The fix increases precedence of binfmt_misc to the original state. Signed-off-by: Jan Sembera Cc: Ivan Kokshaysky Cc: Al Viro Cc: Richard Henderson [2.6.everything.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a7528b913936..fd0cc0bf9a40 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -724,7 +724,7 @@ static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) { - err = register_binfmt(&misc_format); + err = insert_binfmt(&misc_format); if (err) unregister_filesystem(&bm_fs_type); } -- cgit v1.2.3 From 3ab04d5cf9736b7a4e9dfcf28285d8663b01aa0e Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 9 Sep 2010 16:38:12 -0700 Subject: vfs: take O_NONBLOCK out of the O_* uniqueness test O_NONBLOCK on parisc has a dual value: #define O_NONBLOCK 000200004 /* HPUX has separate NDELAY & NONBLOCK */ It is caught by the O_* bits uniqueness check and leads to a parisc compile error. The fix would be to take O_NONBLOCK out. Signed-off-by: Wu Fengguang Signed-off-by: James Bottomley Cc: Jamie Lokier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 6769fd0f35b8..f8cc34f542c3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -769,11 +769,15 @@ EXPORT_SYMBOL(kill_fasync); static int __init fcntl_init(void) { - /* please add new bits here to ensure allocation uniqueness */ - BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + /* + * Please add new bits here to ensure allocation uniqueness. + * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY + * is defined as O_NONBLOCK on some platforms and not on others. + */ + BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | - O_TRUNC | O_APPEND | O_NONBLOCK | + O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | -- cgit v1.2.3 From eee743fd7eac9f2ea69ad06d093dfb5a12538fe5 Mon Sep 17 00:00:00 2001 From: "Jorge Boncompte [DTI2]" Date: Thu, 9 Sep 2010 16:38:19 -0700 Subject: minix: fix regression in minix_mkdir() Commit 9eed1fb721c ("minix: replace inode uid,gid,mode init with helper") broke directory creation on minix filesystems. Fix it by passing the needed mode flag to inode init helper. Signed-off-by: Jorge Boncompte [DTI2] Cc: Dmitry Monakhov Cc: Al Viro Cc: [2.6.35.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/minix/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/minix/namei.c b/fs/minix/namei.c index e20ee85955d1..f3f3578393a4 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) inode_inc_link_count(dir); - inode = minix_new_inode(dir, mode, &err); + inode = minix_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out_dir; -- cgit v1.2.3 From a122eb2fdfd78b58c6dd992d6f4b1aaef667eef9 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Mon, 6 Sep 2010 18:24:57 -0400 Subject: xfs: prevent reading uninitialized stack memory The XFS_IOC_FSGETXATTR ioctl allows unprivileged users to read 12 bytes of uninitialized stack memory, because the fsxattr struct declared on the stack in xfs_ioc_fsgetxattr() does not alter (or zero) the 12-byte fsx_pad member before copying it back to the user. This patch takes care of it. Signed-off-by: Dan Rosenberg Reviewed-by: Eric Sandeen Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 4fec427b83ef..3b9e626f7cd1 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -785,6 +785,8 @@ xfs_ioc_fsgetxattr( { struct fsxattr fa; + memset(&fa, 0, sizeof(struct fsxattr)); + xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; -- cgit v1.2.3 From 1b528181b2ffa14721fb28ad1bd539fe1732c583 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 7 Sep 2010 19:35:49 -0700 Subject: setup_arg_pages: diagnose excessive argument size The CONFIG_STACK_GROWSDOWN variant of setup_arg_pages() does not check the size of the argument/environment area on the stack. When it is unworkably large, shift_arg_pages() hits its BUG_ON. This is exploitable with a very large RLIMIT_STACK limit, to create a crash pretty easily. Check that the initial stack is not too large to make it possible to map in any executable. We're not checking that the actual executable (or intepreter, for binfmt_elf) will fit. So those mappings might clobber part of the initial stack mapping. But that is just userland lossage that userland made happen, not a kernel problem. Signed-off-by: Roland McGrath Reviewed-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- fs/exec.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 2d9455282744..1b63237fc6dc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -594,6 +594,11 @@ int setup_arg_pages(struct linux_binprm *bprm, #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); + + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; + stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; -- cgit v1.2.3 From 7993bc1f4663c0db67bb8f0d98e6678145b387cd Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 7 Sep 2010 19:36:28 -0700 Subject: execve: improve interactivity with large arguments This adds a preemption point during the copying of the argument and environment strings for execve, in copy_strings(). There is already a preemption point in the count() loop, so this doesn't add any new points in the abstract sense. When the total argument+environment strings are very large, the time spent copying them can be much more than a normal user time slice. So this change improves the interactivity of the rest of the system when one process is doing an execve with very large arguments. Signed-off-by: Roland McGrath Reviewed-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1b63237fc6dc..6f2d777431a8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -419,6 +419,8 @@ static int copy_strings(int argc, const char __user *const __user *argv, while (len > 0) { int offset, bytes_to_copy; + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; -- cgit v1.2.3 From 9aea5a65aa7a1af9a4236dfaeb0088f1624f9919 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 7 Sep 2010 19:37:06 -0700 Subject: execve: make responsive to SIGKILL with large arguments An execve with a very large total of argument/environment strings can take a really long time in the execve system call. It runs uninterruptibly to count and copy all the strings. This change makes it abort the exec quickly if sent a SIGKILL. Note that this is the conservative change, to interrupt only for SIGKILL, by using fatal_signal_pending(). It would be perfectly correct semantics to let any signal interrupt the string-copying in execve, i.e. use signal_pending() instead of fatal_signal_pending(). We'll save that change for later, since it could have user-visible consequences, such as having a timer set too quickly make it so that an execve can never complete, though it always happened to work before. Signed-off-by: Roland McGrath Reviewed-by: KOSAKI Motohiro Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6f2d777431a8..828dd2461d6b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -376,6 +376,9 @@ static int count(const char __user * const __user * argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; cond_resched(); } } @@ -419,6 +422,10 @@ static int copy_strings(int argc, const char __user *const __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } cond_resched(); offset = pos % PAGE_SIZE; -- cgit v1.2.3 From 51749e47e191db8e588ad5cebea731caf7b705d7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 8 Sep 2010 09:00:22 +0000 Subject: xfs: log IO completion workqueue is a high priority queue The workqueue implementation in 2.6.36-rcX has changed, resulting in the workqueues no longer having dedicated threads for work processing. This has caused severe livelocks under heavy parallel create workloads because the log IO completions have been getting held up behind metadata IO completions. Hence log commits would stall, memory allocation would stall because pages could not be cleaned, and lock contention on the AIL during inode IO completion processing was being seen to slow everything down even further. By making the log Io completion workqueue a high priority workqueue, they are queued ahead of all data/metadata IO completions and processed before the data/metadata completions. Hence the log never gets stalled, and operations needed to clean memory can continue as quickly as possible. This avoids the livelock conditions and allos the system to keep running under heavy load as per normal. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_buf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index d72cf2bb054a..286e36e21dae 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1932,7 +1932,8 @@ xfs_buf_init(void) if (!xfs_buf_zone) goto out; - xfslogd_workqueue = create_workqueue("xfslogd"); + xfslogd_workqueue = alloc_workqueue("xfslogd", + WQ_RESCUER | WQ_HIGHPRI, 1); if (!xfslogd_workqueue) goto out_free_buf_zone; -- cgit v1.2.3 From b1bde04c6d9a120dec602cc8a70b8a7f21600883 Mon Sep 17 00:00:00 2001 From: Fabio Olive Leite Date: Sun, 12 Sep 2010 19:55:25 -0400 Subject: Remove incorrect do_vfs_lock message The do_vfs_lock function on fs/nfs/file.c is only called if NLM is not being used, via the -onolock mount option. Therefore it cannot really be "out of sync with lock manager" when the local locking function called returns an error, as there will be no corresponding call to the NLM. For details, simply check the if/else on do_setlk and do_unlk on fs/nfs/file.c. Signed-Off-By: Fabio Olive Leite Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index eb51bd6201da..05bf3c0dc751 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -723,10 +723,6 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) default: BUG(); } - if (res < 0) - dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager" - " - error %d!\n", - __func__, res); return res; } -- cgit v1.2.3 From b20d37ca9561711c6a3c4b859c2855f49565e061 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 12 Sep 2010 19:55:26 -0400 Subject: NFS: Fix a typo in nfs_sockaddr_match_ipaddr6 Reported-by: Ben Greear Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4e7df2adb212..e7340729af89 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -275,7 +275,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, sin1->sin6_scope_id != sin2->sin6_scope_id) return 0; - return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, -- cgit v1.2.3 From fbf3fdd2443965d9ba6fb4b5fecd1f6e0847218f Mon Sep 17 00:00:00 2001 From: Menyhart Zoltan Date: Sun, 12 Sep 2010 19:55:26 -0400 Subject: statfs() gives ESTALE error Hi, An NFS client executes a statfs("file", &buff) call. "file" exists / existed, the client has read / written it, but it has already closed it. user_path(pathname, &path) looks up "file" successfully in the directory-cache and restarts the aging timer of the directory-entry. Even if "file" has already been removed from the server, because the lookupcache=positive option I use, keeps the entries valid for a while. nfs_statfs() returns ESTALE if "file" has already been removed from the server. If the user application repeats the statfs("file", &buff) call, we are stuck: "file" remains young forever in the directory-cache. Signed-off-by: Zoltan Menyhart Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ec3966e4706b..f4cbf0c306c6 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -431,7 +431,15 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) goto out_err; error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + if (unlikely(error == -ESTALE)) { + struct dentry *pd_dentry; + pd_dentry = dget_parent(dentry); + if (pd_dentry != NULL) { + nfs_zap_caches(pd_dentry->d_inode); + dput(pd_dentry); + } + } nfs_free_fattr(res.fattr); if (error < 0) goto out_err; -- cgit v1.2.3 From 827e3457022d0bb0b1bb8a0eb88501876fe7dcf0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 12 Sep 2010 19:57:50 -0400 Subject: SUNRPC: Fix the NFSv4 and RPCSEC_GSS Kconfig dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NFSv4 client's callback server calls svc_gss_principal(), which is defined in the auth_rpcgss.ko The NFSv4 server has the same dependency, and in addition calls svcauth_gss_flavor(), gss_mech_get_by_pseudoflavor(), gss_pseudoflavor_to_service() and gss_mech_put() from the same module. The module auth_rpcgss itself has no dependencies aside from sunrpc, so we only need to select RPCSEC_GSS. Reported-by: Uwe Kleine-König Signed-off-by: Trond Myklebust --- fs/nfs/Kconfig | 1 + fs/nfsd/Kconfig | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 6c2aad49d731..f7e13db613cb 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -63,6 +63,7 @@ config NFS_V3_ACL config NFS_V4 bool "NFS client support for NFS version 4" depends on NFS_FS + select SUNRPC_GSS help This option enables support for version 4 of the NFS protocol (RFC 3530) in the kernel's NFS client. diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 95932f523aef..4264377552e2 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -69,6 +69,7 @@ config NFSD_V4 depends on NFSD && PROC_FS && EXPERIMENTAL select NFSD_V3 select FS_POSIX_ACL + select SUNRPC_GSS help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). -- cgit v1.2.3