From f04de505e3fa322728d1a851e08bf7060b117743 Mon Sep 17 00:00:00 2001
From: Steve Glendinning <steve.glendinning@smsc.com>
Date: Tue, 21 Oct 2008 13:25:51 +0100
Subject: [JFFS2] Fix build failure with !CONFIG_JFFS2_FS_WRITEBUFFER

Build failure introduced by 5bf1723723487ddb0b9c9641b6559da96b27cc93
[JFFS2] Write buffer offset adjustment for NOR-ECC (Sibley) flash

Signed-off-by: Steve Glendinning <steve.glendinning@smsc.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/nodemgmt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 0875b60b4bf7..21a052915aa9 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -261,9 +261,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 
 	jffs2_sum_reset_collected(c->summary); /* reset collected summary */
 
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	/* adjust write buffer offset, else we get a non contiguous write bug */
 	if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
 		c->wbuf_ofs = 0xffffffff;
+#endif
 
 	D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
 
-- 
cgit v1.2.3


From 526719ba51e7d7bd31f7af9ab04b015b70096685 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 27 Oct 2008 15:19:48 +0000
Subject: Switch to a valid email address...

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/psdev.c | 2 +-
 fs/nfs/inode.c  | 2 +-
 fs/nfs/super.c  | 2 +-
 fs/proc/array.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index cfd29da714d1..0376ac66c44a 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -2,7 +2,7 @@
  *      	An implementation of a loadable kernel mode driver providing
  *		multiple kernel/user space bidirectional communications links.
  *
- * 		Author: 	Alan Cox <alan@redhat.com>
+ * 		Author: 	Alan Cox <alan@lxorguk.ukuu.org.uk>
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b9195c02a863..dc52793ff8f8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -5,7 +5,7 @@
  *
  *  nfs inode and superblock handling functions
  *
- *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
  *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
  *
  *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a3b0061dfd45..f48db679a1c6 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -5,7 +5,7 @@
  *
  *  nfs superblock handling functions
  *
- *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some
  *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
  *
  *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bb9f4b05703d..6af7fba7abb1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -40,7 +40,7 @@
  *
  *
  * Alan Cox	     :  security fixes.
- *			<Alan.Cox@linux.org>
+ *			<alan@lxorguk.ukuu.org.uk>
  *
  * Al Viro           :  safe handling of mm_struct
  *
-- 
cgit v1.2.3


From 6c87df37dcb9c6c33923707fa5191e0a65874d60 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 27 Oct 2008 22:38:27 +0300
Subject: proc: revert /proc/uptime to ->read_proc hook

Turned out some VMware userspace does pread(2) on /proc/uptime, but
seqfiles currently don't allow pread() resulting in -ESPIPE.

Seqfiles in theory can do pread(), but this can be a long story,
so revert to ->read_proc until then.

http://bugzilla.kernel.org/show_bug.cgi?id=11856

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/uptime.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 0c10a0b3f146..df26aa88fa47 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,43 +1,45 @@
-#include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/sched.h>
-#include <linux/seq_file.h>
 #include <linux/time.h>
 #include <asm/cputime.h>
 
-static int uptime_proc_show(struct seq_file *m, void *v)
+static int proc_calc_metrics(char *page, char **start, off_t off,
+				 int count, int *eof, int len)
+{
+	if (len <= off + count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len > count)
+		len = count;
+	if (len < 0)
+		len = 0;
+	return len;
+}
+
+static int uptime_read_proc(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
 {
 	struct timespec uptime;
 	struct timespec idle;
+	int len;
 	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	monotonic_to_bootbased(&uptime);
 	cputime_to_timespec(idletime, &idle);
-	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
+	len = sprintf(page, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
 			(unsigned long) idle.tv_sec,
 			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
-	return 0;
+	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
-static int uptime_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, uptime_proc_show, NULL);
-}
-
-static const struct file_operations uptime_proc_fops = {
-	.open		= uptime_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_uptime_init(void)
 {
-	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	create_proc_read_entry("uptime", 0, NULL, uptime_read_proc, NULL);
 	return 0;
 }
 module_init(proc_uptime_init);
-- 
cgit v1.2.3


From 44d6f78756560e95903de239e10f8a40a6eae444 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Mon, 27 Oct 2008 22:51:46 -0400
Subject: ext3: fix a bug accessing freed memory in ext3_abort

Vegard Nossum reported a bug which accesses freed memory (found via
kmemcheck).  When journal has been aborted, ext3_put_super() calls
ext3_abort() after freeing the journal_t object, and then ext3_abort()
accesses it.  This patch fixes it.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext3/super.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 18eaa78ecb4e..e5717a4fae67 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -281,7 +281,8 @@ void ext3_abort (struct super_block * sb, const char * function,
 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
-	journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+	if (EXT3_SB(sb)->s_journal)
+		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
 
 void ext3_warning (struct super_block * sb, const char * function,
@@ -390,11 +391,14 @@ static void ext3_put_super (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	int i;
+	int i, err;
 
 	ext3_xattr_put_super(sb);
-	if (journal_destroy(sbi->s_journal) < 0)
+	err = journal_destroy(sbi->s_journal);
+	sbi->s_journal = NULL;
+	if (err < 0)
 		ext3_abort(sb, __func__, "Couldn't clean up the journal");
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-- 
cgit v1.2.3


From ef2cabf7c6d838eb0ee2b4fb8ef84f7c06ce16d9 Mon Sep 17 00:00:00 2001
From: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Date: Mon, 27 Oct 2008 22:53:05 -0400
Subject: ext4: fix a bug accessing freed memory in ext4_abort

Vegard Nossum reported a bug which accesses freed memory (found via
kmemcheck).  When journal has been aborted, ext4_put_super() calls
ext4_abort() after freeing the journal_t object, and then ext4_abort()
accesses it.  This patch fixes it.

Signed-off-by: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index bdddea14e782..994859df010e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -333,7 +333,8 @@ void ext4_abort(struct super_block *sb, const char *function,
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
-	jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+	if (EXT4_SB(sb)->s_journal)
+		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
 void ext4_warning(struct super_block *sb, const char *function,
@@ -442,14 +443,16 @@ static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
-	int i;
+	int i, err;
 
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	if (jbd2_journal_destroy(sbi->s_journal) < 0)
-		ext4_abort(sb, __func__, "Couldn't clean up the journal");
+	err = jbd2_journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
+	if (err < 0)
+		ext4_abort(sb, __func__, "Couldn't clean up the journal");
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-- 
cgit v1.2.3


From 6c20ec850360bc6e5c66a787f0523a80450d65ab Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 28 Oct 2008 21:08:20 -0400
Subject: jbd2: Call the commit callback before the transaction could get
 dropped

The transaction can potentially get dropped if there are no buffers
that need to be written.  Make sure we call the commit callback before
potentially deciding to drop the transaction.  Also avoid
dereferencing the commit_transaction pointer in the marker for the
same reason.

This patch fixes the bug reported by Eric Paris at:
http://bugzilla.kernel.org/show_bug.cgi?id=11838

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Eric Sandeen <sandeen@redhat.com>
Tested-by: Eric Paris <eparis@redhat.com>
---
 fs/jbd2/commit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8b119e16aa36..ebc667bc54a8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -974,6 +974,9 @@ restart_loop:
 	journal->j_committing_transaction = NULL;
 	spin_unlock(&journal->j_state_lock);
 
+	if (journal->j_commit_callback)
+		journal->j_commit_callback(journal, commit_transaction);
+
 	if (commit_transaction->t_checkpoint_list == NULL &&
 	    commit_transaction->t_checkpoint_io_list == NULL) {
 		__jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -995,11 +998,8 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 
-	if (journal->j_commit_callback)
-		journal->j_commit_callback(journal, commit_transaction);
-
 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-		   journal->j_devname, commit_transaction->t_tid,
+		   journal->j_devname, journal->j_commit_sequence,
 		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
-- 
cgit v1.2.3


From 8c3bf8a01c005385e9be0bc992e10abfb355278c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 28 Oct 2008 00:08:12 -0400
Subject: merge ext4_claim_free_blocks & ext4_has_free_blocks

Mingming pointed out that ext4_claim_free_blocks & ext4_has_free_blocks
are largely cut & pasted; they can be collapsed/merged as follows.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 57 ++++++++++++++++----------------------------------------
 fs/ext4/ext4.h   |  3 +--
 2 files changed, 17 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b9821be709bd..e28203ec45bd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -589,8 +589,15 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	return;
 }
 
-int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-						s64 nblocks)
+/**
+ * ext4_has_free_blocks()
+ * @sbi:	in-core super block structure.
+ * @nblocks:	number of needed blocks
+ *
+ * Check if filesystem has nblocks free & available for allocation.
+ * On success return 1, return 0 on failure.
+ */
+int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
 	s64 free_blocks, dirty_blocks;
 	s64 root_blocks = 0;
@@ -620,53 +627,21 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 	 */
 	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
 		/* we don't have free space */
-		return -ENOSPC;
+		return 0;
 
-	/* Add the blocks to nblocks */
-	percpu_counter_add(dbc, nblocks);
-	return 0;
+	return 1;
 }
 
-/**
- * ext4_has_free_blocks()
- * @sbi:	in-core super block structure.
- * @nblocks:	number of neeed blocks
- *
- * Check if filesystem has free blocks available for allocation.
- * Return the number of blocks avaible for allocation for this request
- * On success, return nblocks
- */
-ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 						s64 nblocks)
 {
-	s64 free_blocks, dirty_blocks;
-	s64 root_blocks = 0;
-	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
-	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
-
-	free_blocks  = percpu_counter_read_positive(fbc);
-	dirty_blocks = percpu_counter_read_positive(dbc);
-
-	if (!capable(CAP_SYS_RESOURCE) &&
-		sbi->s_resuid != current->fsuid &&
-		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-		root_blocks = ext4_r_blocks_count(sbi->s_es);
-
-	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
-						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum(fbc);
-		dirty_blocks = percpu_counter_sum(dbc);
-	}
-	if (free_blocks <= (root_blocks + dirty_blocks))
-		/* we don't have free space */
+	if (ext4_has_free_blocks(sbi, nblocks)) {
+		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
-
-	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
-		return free_blocks - (root_blocks + dirty_blocks);
-	return nblocks;
+	} else
+		return -ENOSPC;
 }
 
-
 /**
  * ext4_should_retry_alloc()
  * @sb:			super block
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4880cc3e6727..b0537c827024 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1003,8 +1003,7 @@ extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 					ext4_lblk_t iblock, ext4_fsblk_t goal,
 					unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-					 s64 nblocks);
+extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-- 
cgit v1.2.3


From a996031c87e093017c0763326a08896a3a4817f4 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 28 Oct 2008 00:08:17 -0400
Subject: delay capable() check in ext4_has_free_blocks()

As reported by Eric Paris, the capable() check in ext4_has_free_blocks()
sometimes causes SELinux denials.

We can rearrange the logic so that we only try to use the root-reserved
blocks when necessary, and even then we can move the capable() test
to last, to avoid the check most of the time.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e28203ec45bd..d2003cdc36aa 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -599,18 +599,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
  */
 int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 {
-	s64 free_blocks, dirty_blocks;
-	s64 root_blocks = 0;
+	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
 	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
 	free_blocks  = percpu_counter_read_positive(fbc);
 	dirty_blocks = percpu_counter_read_positive(dbc);
-
-	if (!capable(CAP_SYS_RESOURCE) &&
-		sbi->s_resuid != current->fsuid &&
-		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
-		root_blocks = ext4_r_blocks_count(sbi->s_es);
+	root_blocks = ext4_r_blocks_count(sbi->s_es);
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
@@ -623,13 +618,20 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 		}
 	}
 	/* Check whether we have space after
-	 * accounting for current dirty blocks
+	 * accounting for current dirty blocks & root reserved blocks.
 	 */
-	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
-		/* we don't have free space */
-		return 0;
+	if (free_blocks >= ((root_blocks + nblocks) + dirty_blocks))
+		return 1;
 
-	return 1;
+	/* Hm, nope.  Are (enough) root reserved blocks available? */
+	if (sbi->s_resuid == current->fsuid ||
+	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
+	    capable(CAP_SYS_RESOURCE)) {
+		if (free_blocks >= (nblocks + dirty_blocks))
+			return 1;
+	}
+
+	return 0;
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-- 
cgit v1.2.3


From ae05f269400533cbb32bfba131ab528d78dffd16 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 28 Oct 2008 15:21:40 -0400
Subject: NFS: Convert nfs_attr_generation_counter into an atomic_long

The most important property we need from nfs_attr_generation_counter is
monotonicity, which is not guaranteed by the current system of smp memory
barriers. We should convert it to an atomic_long_t, and drop the memory
barriers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index dc52793ff8f8..d22eb383e1cf 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -908,21 +908,16 @@ static int nfs_size_need_update(const struct inode *inode, const struct nfs_fatt
 	return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
 }
 
-static unsigned long nfs_attr_generation_counter;
+static atomic_long_t nfs_attr_generation_counter;
 
 static unsigned long nfs_read_attr_generation_counter(void)
 {
-	smp_rmb();
-	return nfs_attr_generation_counter;
+	return atomic_long_read(&nfs_attr_generation_counter);
 }
 
 unsigned long nfs_inc_attr_generation_counter(void)
 {
-	unsigned long ret;
-	smp_rmb();
-	ret = ++nfs_attr_generation_counter;
-	smp_wmb();
-	return ret;
+	return atomic_long_inc_return(&nfs_attr_generation_counter);
 }
 
 void nfs_fattr_init(struct nfs_fattr *fattr)
-- 
cgit v1.2.3


From edf1ae403896cb7750800508b14996ba6be39a53 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 29 Oct 2008 00:47:57 +0000
Subject: [CIFS] Reduce number of socket retries in large write path

In some heavy stress conditions cifs could get EAGAIN
repeatedly in smb_send2, which led to repeated retries and eventually
failure of large writes, which could lead to data corruption.

There are three changes that were suggested by various network
developers:

1) convert cifs from non-blocking to blocking tcp sendmsg
(we left in the retry on failure)
2) change cifs to not set sendbuf and rcvbuf size for the socket
(let tcp autotune the buffer sizes since that works much better
in the TCP stack now)
3) if we have a partial frame sent in smb_send2, mark the tcp
session as invalid (close the socket and reconnect) so we do
not corrupt the remaining part of the SMB with the beginning
of the next SMB.

This does not appear to hurt performance measurably and has
been run in various scenarios, but it definitely removes
a corruption that we were seeing in some high stress
test cases.

Acked-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES     |  6 +++++-
 fs/cifs/cifsglob.h  |  2 ++
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/connect.c   | 50 +++++++++++++++++++++++++++++++++++++-------------
 fs/cifs/transport.c | 41 +++++++++++++++++++++++++++++++----------
 5 files changed, 76 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8f528ea24c48..8855331b2fba 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -4,7 +4,11 @@ Various fixes to make delete of open files behavior more predictable
 (when delete of an open file fails we mark the file as "delete-on-close"
 in a way that more servers accept, but only if we can first rename the
 file to a temporary name).  Add experimental support for more safely
-handling fcntl(F_SETLEASE).
+handling fcntl(F_SETLEASE).  Convert cifs to using blocking tcp
+sends, and also let tcp autotune the socket send and receive buffers.
+This reduces the number of EAGAIN errors returned by TCP/IP in
+high stress workloads (and the number of retries on socket writes
+when sending large SMBWriteX requests).
 
 Version 1.54
 ------------
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c791e5b5a914..1cb1189f24e0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -141,6 +141,8 @@ struct TCP_Server_Info {
 	char versionMajor;
 	char versionMinor;
 	bool svlocal:1;			/* local server or remote */
+	bool noblocksnd;		/* use non-blocking sendmsg (MSG_DONTWAIT) */
+	bool noautotune;		/* do not autotune send buf sizes */
 	atomic_t socketUseCount; /* number of open cifs sessions on socket */
 	atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0cff7fe986e8..6f21ecb85ce5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -36,7 +36,7 @@ extern void cifs_buf_release(void *);
 extern struct smb_hdr *cifs_small_buf_get(void);
 extern void cifs_small_buf_release(void *);
 extern int smb_send(struct socket *, struct smb_hdr *,
-			unsigned int /* length */ , struct sockaddr *);
+			unsigned int /* length */ , struct sockaddr *, bool);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current->fsuid));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 71b7661e2260..e9f9248cb3fe 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -92,6 +92,8 @@ struct smb_vol {
 	bool seal:1;       /* request transport encryption on share */
 	bool nodfs:1;      /* Do not request DFS, even if available */
 	bool local_lease:1; /* check leases only on local system, not remote */
+	bool noblocksnd:1;
+	bool noautotune:1;
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -102,9 +104,11 @@ struct smb_vol {
 static int ipv4_connect(struct sockaddr_in *psin_server,
 			struct socket **csocket,
 			char *netb_name,
-			char *server_netb_name);
+			char *server_netb_name,
+			bool noblocksnd,
+			bool noautotune); /* ipv6 never sets sndbuf size */
 static int ipv6_connect(struct sockaddr_in6 *psin_server,
-			struct socket **csocket);
+			struct socket **csocket, bool noblocksnd);
 
 
 	/*
@@ -191,12 +195,13 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		try_to_freeze();
 		if (server->protocolType == IPV6) {
 			rc = ipv6_connect(&server->addr.sockAddr6,
-					  &server->ssocket);
+					  &server->ssocket, server->noautotune);
 		} else {
 			rc = ipv4_connect(&server->addr.sockAddr,
 					&server->ssocket,
 					server->workstation_RFC1001_name,
-					server->server_RFC1001_name);
+					server->server_RFC1001_name,
+					server->noblocksnd, server->noautotune);
 		}
 		if (rc) {
 			cFYI(1, ("reconnect error %d", rc));
@@ -1192,6 +1197,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 			/* ignore */
 		} else if (strnicmp(data, "rw", 2) == 0) {
 			vol->rw = true;
+		} else if (strnicmp(data, "noblocksend", 11) == 0) {
+			vol->noblocksnd = 1;
+		} else if (strnicmp(data, "noautotune", 10) == 0) {
+			vol->noautotune = 1;
 		} else if ((strnicmp(data, "suid", 4) == 0) ||
 				   (strnicmp(data, "nosuid", 6) == 0) ||
 				   (strnicmp(data, "exec", 4) == 0) ||
@@ -1518,7 +1527,8 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
 
 static int
 ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
-	     char *netbios_name, char *target_name)
+	     char *netbios_name, char *target_name,
+	     bool noblocksnd, bool noautotune)
 {
 	int rc = 0;
 	int connected = 0;
@@ -1590,11 +1600,16 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 		 (*csocket)->sk->sk_sndbuf,
 		 (*csocket)->sk->sk_rcvbuf, (*csocket)->sk->sk_rcvtimeo));
 	(*csocket)->sk->sk_rcvtimeo = 7 * HZ;
+	if (!noblocksnd)
+		(*csocket)->sk->sk_sndtimeo = 3 * HZ;
+
 	/* make the bufsizes depend on wsize/rsize and max requests */
-	if ((*csocket)->sk->sk_sndbuf < (200 * 1024))
-		(*csocket)->sk->sk_sndbuf = 200 * 1024;
-	if ((*csocket)->sk->sk_rcvbuf < (140 * 1024))
-		(*csocket)->sk->sk_rcvbuf = 140 * 1024;
+	if (noautotune) {
+		if ((*csocket)->sk->sk_sndbuf < (200 * 1024))
+			(*csocket)->sk->sk_sndbuf = 200 * 1024;
+		if ((*csocket)->sk->sk_rcvbuf < (140 * 1024))
+			(*csocket)->sk->sk_rcvbuf = 140 * 1024;
+	}
 
 	/* send RFC1001 sessinit */
 	if (psin_server->sin_port == htons(RFC1001_PORT)) {
@@ -1631,7 +1646,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 			/* sizeof RFC1002_SESSION_REQUEST with no scope */
 			smb_buf->smb_buf_length = 0x81000044;
 			rc = smb_send(*csocket, smb_buf, 0x44,
-				(struct sockaddr *)psin_server);
+				(struct sockaddr *)psin_server, noblocksnd);
 			kfree(ses_init_buf);
 			msleep(1); /* RFC1001 layer in at least one server
 				      requires very short break before negprot
@@ -1651,7 +1666,8 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 }
 
 static int
-ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
+ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket,
+	     bool noblocksnd)
 {
 	int rc = 0;
 	int connected = 0;
@@ -1720,6 +1736,9 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
 		the default. sock_setsockopt not used because it expects
 		user space buffer */
 	(*csocket)->sk->sk_rcvtimeo = 7 * HZ;
+	if (!noblocksnd)
+		(*csocket)->sk->sk_sndtimeo = 3 * HZ;
+
 
 	return rc;
 }
@@ -1983,11 +2002,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
 			/* other OS never observed in Wild doing 139 with v6 */
-			rc = ipv6_connect(&sin_server6, &csocket);
+			rc = ipv6_connect(&sin_server6, &csocket,
+					volume_info.noblocksnd);
 		} else
 			rc = ipv4_connect(&sin_server, &csocket,
 				  volume_info.source_rfc1001_name,
-				  volume_info.target_rfc1001_name);
+				  volume_info.target_rfc1001_name,
+				  volume_info.noblocksnd,
+				  volume_info.noautotune);
 		if (rc < 0) {
 			cERROR(1, ("Error connecting to IPv4 socket. "
 				   "Aborting operation"));
@@ -2002,6 +2024,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			sock_release(csocket);
 			goto out;
 		} else {
+			srvTcp->noblocksnd = volume_info.noblocksnd;
+			srvTcp->noautotune = volume_info.noautotune;
 			memcpy(&srvTcp->addr.sockAddr, &sin_server,
 				sizeof(struct sockaddr_in));
 			atomic_set(&srvTcp->inFlight, 0);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bf0e6d8e382a..ba4d66644ebf 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -161,7 +161,7 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 
 int
 smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
-	 unsigned int smb_buf_length, struct sockaddr *sin)
+	 unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
 {
 	int rc = 0;
 	int i = 0;
@@ -178,7 +178,10 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
 	smb_msg.msg_namelen = sizeof(struct sockaddr);
 	smb_msg.msg_control = NULL;
 	smb_msg.msg_controllen = 0;
-	smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/
+	if (noblocksnd)
+		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+	else
+		smb_msg.msg_flags = MSG_NOSIGNAL;
 
 	/* smb header is converted in header_assemble. bcc and rest of SMB word
 	   area, and byte area if necessary, is converted to littleendian in
@@ -229,8 +232,8 @@ smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
 }
 
 static int
-smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
-	  struct sockaddr *sin)
+smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
+	  struct sockaddr *sin, bool noblocksnd)
 {
 	int rc = 0;
 	int i = 0;
@@ -240,6 +243,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 	unsigned int total_len;
 	int first_vec = 0;
 	unsigned int smb_buf_length = smb_buffer->smb_buf_length;
+	struct socket *ssocket = server->ssocket;
 
 	if (ssocket == NULL)
 		return -ENOTSOCK; /* BB eventually add reconnect code here */
@@ -248,7 +252,10 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 	smb_msg.msg_namelen = sizeof(struct sockaddr);
 	smb_msg.msg_control = NULL;
 	smb_msg.msg_controllen = 0;
-	smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL; /* BB add more flags?*/
+	if (noblocksnd)
+		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
+	else
+		smb_msg.msg_flags = MSG_NOSIGNAL;
 
 	/* smb header is converted in header_assemble. bcc and rest of SMB word
 	   area, and byte area if necessary, is converted to littleendian in
@@ -312,6 +319,16 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 		i = 0; /* in case we get ENOSPC on the next send */
 	}
 
+	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
+		cFYI(1, ("partial send (%d remaining), terminating session",
+			total_len));
+		/* If we have only sent part of an SMB then the next SMB
+		   could be taken as the remainder of this one.  We need
+		   to kill the socket so the server throws away the partial
+		   SMB */
+		server->tcpStatus = CifsNeedReconnect;
+	}
+
 	if (rc < 0) {
 		cERROR(1, ("Error %d sending data on socket to server", rc));
 	} else
@@ -518,8 +535,9 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send2(ses->server->ssocket, iov, n_vec,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+	rc = smb_send2(ses->server, iov, n_vec,
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		       ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -711,7 +729,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	atomic_inc(&ses->server->inSend);
 #endif
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -851,7 +870,8 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
 		return rc;
 	}
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-	      (struct sockaddr *) &(ses->server->addr.sockAddr));
+	      (struct sockaddr *) &(ses->server->addr.sockAddr),
+	      ses->server->noblocksnd);
 	up(&ses->server->tcpSem);
 	return rc;
 }
@@ -941,7 +961,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	atomic_inc(&ses->server->inSend);
 #endif
 	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr));
+		      (struct sockaddr *) &(ses->server->addr.sockAddr),
+		      ses->server->noblocksnd);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
-- 
cgit v1.2.3


From 4e02ed4b4a2fae34aae766a5bb93ae235f60adb8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 29 Oct 2008 14:00:55 -0700
Subject: fs: remove prepare_write/commit_write

Nothing uses prepare_write or commit_write. Remove them from the tree
completely.

[akpm@linux-foundation.org: schedule simple_prepare_write() for unexporting]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/Locking |  12 +-
 Documentation/filesystems/vfs.txt |  39 +-----
 drivers/block/loop.c              |   5 +-
 fs/fat/inode.c                    |   2 +-
 fs/libfs.c                        |   2 +-
 fs/ocfs2/file.c                   |   3 +-
 fs/splice.c                       |   4 +-
 include/linux/fs.h                |   7 --
 mm/filemap.c                      | 242 +-------------------------------------
 9 files changed, 23 insertions(+), 293 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 8362860e21a7..23d2f4460deb 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -161,8 +161,12 @@ prototypes:
 	int (*set_page_dirty)(struct page *page);
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
@@ -180,8 +184,6 @@ sync_page:		no	maybe
 writepages:		no
 set_page_dirty		no	no
 readpages:		no
-prepare_write:		no	yes			yes
-commit_write:		no	yes			yes
 write_begin:		no	locks the page		yes
 write_end:		no	yes, unlocks		yes
 perform_write:		no	n/a			yes
@@ -191,7 +193,7 @@ releasepage:		no	yes
 direct_IO:		no
 launder_page:		no	yes
 
-	->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
+	->write_begin(), ->write_end(), ->sync_page() and ->readpage()
 may be called from the request handler (/dev/loop).
 
 	->readpage() unlocks the page, either synchronously or via I/O
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index c4d348dabe94..5579bda58a6d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -492,7 +492,7 @@ written-back to storage typically in whole pages, however the
 address_space has finer control of write sizes.
 
 The read process essentially only requires 'readpage'.  The write
-process is more complicated and uses prepare_write/commit_write or
+process is more complicated and uses write_begin/write_end or
 set_page_dirty to write data into the address_space, and writepage,
 sync_page, and writepages to writeback data to storage.
 
@@ -521,8 +521,6 @@ struct address_space_operations {
 	int (*set_page_dirty)(struct page *page);
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
@@ -598,37 +596,7 @@ struct address_space_operations {
 	readpages is only used for read-ahead, so read errors are
   	ignored.  If anything goes wrong, feel free to give up.
 
-  prepare_write: called by the generic write path in VM to set up a write
-  	request for a page.  This indicates to the address space that
-  	the given range of bytes is about to be written.  The
-  	address_space should check that the write will be able to
-  	complete, by allocating space if necessary and doing any other
-  	internal housekeeping.  If the write will update parts of
-  	any basic-blocks on storage, then those blocks should be
-  	pre-read (if they haven't been read already) so that the
-  	updated blocks can be written out properly.
-	The page will be locked.
-
-	Note: the page _must not_ be marked uptodate in this function
-	(or anywhere else) unless it actually is uptodate right now. As
-	soon as a page is marked uptodate, it is possible for a concurrent
-	read(2) to copy it to userspace.
-
-  commit_write: If prepare_write succeeds, new data will be copied
-        into the page and then commit_write will be called.  It will
-        typically update the size of the file (if appropriate) and
-        mark the inode as dirty, and do any other related housekeeping
-        operations.  It should avoid returning an error if possible -
-        errors should have been handled by prepare_write.
-
-  write_begin: This is intended as a replacement for prepare_write. The
-	key differences being that:
-		- it returns a locked page (in *pagep) rather than being
-		  given a pre locked page;
-		- it must be able to cope with short writes (where the
-		  length passed to write_begin is greater than the number
-		  of bytes copied into the page).
-
+  write_begin:
 	Called by the generic buffered write code to ask the filesystem to
 	prepare to write len bytes at the given offset in the file. The
 	address_space should check that the write will be able to complete,
@@ -640,6 +608,9 @@ struct address_space_operations {
         The filesystem must return the locked pagecache page for the specified
 	offset, in *pagep, for the caller to write into.
 
+	It must be able to cope with short writes (where the length passed to
+	write_begin is greater than the number of bytes copied into the page).
+
 	flags is a field for AOP_FLAG_xxx flags, described in
 	include/linux/fs.h.
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3f09cd8bcc38..5c4ee70d5cf3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -40,8 +40,7 @@
  * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
  *
  * Support for falling back on the write file operation when the address space
- * operations prepare_write and/or commit_write are not available on the
- * backing filesystem.
+ * operations write_begin is not available on the backing filesystem.
  * Anton Altaparmakov, 16 Feb 2005
  *
  * Still To Fix:
@@ -765,7 +764,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 		 */
 		if (!file->f_op->splice_read)
 			goto out_putf;
-		if (aops->prepare_write || aops->write_begin)
+		if (aops->write_begin)
 			lo_flags |= LO_FLAGS_USE_AOPS;
 		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
 			lo_flags |= LO_FLAGS_READ_ONLY;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 19eafbe3c379..2b2eec1283bf 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -175,7 +175,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 
 	if (rw == WRITE) {
 		/*
-		 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+		 * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
 		 * so we need to update the ->mmu_private to block boundary.
 		 *
 		 * But we must fill the remaining area or hole by nul for
diff --git a/fs/libfs.c b/fs/libfs.c
index 74688598bcf7..e960a8321902 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -814,7 +814,7 @@ EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
 EXPORT_SYMBOL(simple_lookup);
 EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_UNUSED_SYMBOL(simple_prepare_write);
 EXPORT_SYMBOL(simple_readpage);
 EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8d3225a78073..7efe937a415f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -679,8 +679,7 @@ leave:
 
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->prepare_write() and
- * ->commit_write(). */
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
 static int ocfs2_write_zero_page(struct inode *inode,
 				 u64 size)
 {
diff --git a/fs/splice.c b/fs/splice.c
index a1e701c27156..1abab5cee4ba 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -731,8 +731,8 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	};
 
 	/*
-	 * The actor worker might be calling ->prepare_write and
-	 * ->commit_write. Most of the time, these expect i_mutex to
+	 * The actor worker might be calling ->write_begin and
+	 * ->write_end. Most of the time, these expect i_mutex to
 	 * be held. Since this may result in an ABBA deadlock with
 	 * pipe->inode, we have to order lock acquiry here.
 	 */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b248d61430c..0dcdd9458f4b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -489,13 +489,6 @@ struct address_space_operations {
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
 
-	/*
-	 * ext3 requires that a successful prepare_write() call be followed
-	 * by a commit_write() call - they must be balanced
-	 */
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
-
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
diff --git a/mm/filemap.c b/mm/filemap.c
index ab8553658af3..f3e5f8944d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2029,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping,
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 
-	if (aops->write_begin) {
-		return aops->write_begin(file, mapping, pos, len, flags,
+	return aops->write_begin(file, mapping, pos, len, flags,
 							pagep, fsdata);
-	} else {
-		int ret;
-		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-		struct page *page;
-again:
-		page = __grab_cache_page(mapping, index);
-		*pagep = page;
-		if (!page)
-			return -ENOMEM;
-
-		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
-			/*
-			 * There is no way to resolve a short write situation
-			 * for a !Uptodate page (except by double copying in
-			 * the caller done by generic_perform_write_2copy).
-			 *
-			 * Instead, we have to bring it uptodate here.
-			 */
-			ret = aops->readpage(file, page);
-			page_cache_release(page);
-			if (ret) {
-				if (ret == AOP_TRUNCATED_PAGE)
-					goto again;
-				return ret;
-			}
-			goto again;
-		}
-
-		ret = aops->prepare_write(file, page, offset, offset+len);
-		if (ret) {
-			unlock_page(page);
-			page_cache_release(page);
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		}
-		return ret;
-	}
 }
 EXPORT_SYMBOL(pagecache_write_begin);
 
@@ -2079,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 				struct page *page, void *fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
-	int ret;
-
-	if (aops->write_end) {
-		mark_page_accessed(page);
-		ret = aops->write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	} else {
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset, offset+len);
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-
-		if (ret < 0) {
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		} else if (ret > 0)
-			ret = min_t(size_t, copied, ret);
-		else
-			ret = copied;
-	}
 
-	return ret;
+	mark_page_accessed(page);
+	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
 
@@ -2226,174 +2163,6 @@ repeat:
 }
 EXPORT_SYMBOL(__grab_cache_page);
 
-static ssize_t generic_perform_write_2copy(struct file *file,
-				struct iov_iter *i, loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode *inode = mapping->host;
-	long status = 0;
-	ssize_t written = 0;
-
-	do {
-		struct page *src_page;
-		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-		size_t copied;		/* Bytes copied from user */
-
-		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(i));
-
-		/*
-		 * a non-NULL src_page indicates that we're doing the
-		 * copy via get_user_pages and kmap.
-		 */
-		src_page = NULL;
-
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
-		page = __grab_cache_page(mapping, index);
-		if (!page) {
-			status = -ENOMEM;
-			break;
-		}
-
-		/*
-		 * non-uptodate pages cannot cope with short copies, and we
-		 * cannot take a pagefault with the destination page locked.
-		 * So pin the source page to copy it.
-		 */
-		if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
-			unlock_page(page);
-
-			src_page = alloc_page(GFP_KERNEL);
-			if (!src_page) {
-				page_cache_release(page);
-				status = -ENOMEM;
-				break;
-			}
-
-			/*
-			 * Cannot get_user_pages with a page locked for the
-			 * same reason as we can't take a page fault with a
-			 * page locked (as explained below).
-			 */
-			copied = iov_iter_copy_from_user(src_page, i,
-								offset, bytes);
-			if (unlikely(copied == 0)) {
-				status = -EFAULT;
-				page_cache_release(page);
-				page_cache_release(src_page);
-				break;
-			}
-			bytes = copied;
-
-			lock_page(page);
-			/*
-			 * Can't handle the page going uptodate here, because
-			 * that means we would use non-atomic usercopies, which
-			 * zero out the tail of the page, which can cause
-			 * zeroes to become transiently visible. We could just
-			 * use a non-zeroing copy, but the APIs aren't too
-			 * consistent.
-			 */
-			if (unlikely(!page->mapping || PageUptodate(page))) {
-				unlock_page(page);
-				page_cache_release(page);
-				page_cache_release(src_page);
-				continue;
-			}
-		}
-
-		status = a_ops->prepare_write(file, page, offset, offset+bytes);
-		if (unlikely(status))
-			goto fs_write_aop_error;
-
-		if (!src_page) {
-			/*
-			 * Must not enter the pagefault handler here, because
-			 * we hold the page lock, so we might recursively
-			 * deadlock on the same lock, or get an ABBA deadlock
-			 * against a different lock, or against the mmap_sem
-			 * (which nests outside the page lock).  So increment
-			 * preempt count, and use _atomic usercopies.
-			 *
-			 * The page is uptodate so we are OK to encounter a
-			 * short copy: if unmodified parts of the page are
-			 * marked dirty and written out to disk, it doesn't
-			 * really matter.
-			 */
-			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, i,
-								offset, bytes);
-			pagefault_enable();
-		} else {
-			void *src, *dst;
-			src = kmap_atomic(src_page, KM_USER0);
-			dst = kmap_atomic(page, KM_USER1);
-			memcpy(dst + offset, src + offset, bytes);
-			kunmap_atomic(dst, KM_USER1);
-			kunmap_atomic(src, KM_USER0);
-			copied = bytes;
-		}
-		flush_dcache_page(page);
-
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (unlikely(status < 0))
-			goto fs_write_aop_error;
-		if (unlikely(status > 0)) /* filesystem did partial write */
-			copied = min_t(size_t, copied, status);
-
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
-
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		continue;
-
-fs_write_aop_error:
-		unlock_page(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size.  Trim these off again. Don't need
-		 * i_size_read because we hold i_mutex.
-		 */
-		if (pos + bytes > inode->i_size)
-			vmtruncate(inode, inode->i_size);
-		break;
-	} while (iov_iter_count(i));
-
-	return written ? written : status;
-}
-
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
 {
@@ -2494,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 
 	iov_iter_init(&i, iov, nr_segs, count, written);
-	if (a_ops->write_begin)
-		status = generic_perform_write(file, &i, pos);
-	else
-		status = generic_perform_write_2copy(file, &i, pos);
+	status = generic_perform_write(file, &i, pos);
 
 	if (likely(status >= 0)) {
 		written += status;
-- 
cgit v1.2.3


From 87b811c3f96559e466403e22b1fa99d472571625 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 29 Oct 2008 14:01:08 -0700
Subject: ecryptfs: fix memory corruption when storing crypto info in xattrs

When ecryptfs allocates space to write crypto headers into, before copying
it out to file headers or to xattrs, it looks at the value of
crypt_stat->num_header_bytes_at_front to determine how much space it
needs.  This is also used as the file offset to the actual encrypted data,
so for xattr-stored crypto info, the value was zero.

So, we kzalloc'd 0 bytes, and then ran off to write to that memory.
(Which returned as ZERO_SIZE_PTR, so we explode quickly).

The right answer is to always allocate a page to write into; the current
code won't ever write more than that (this is enforced by the
(PAGE_CACHE_SIZE - offset) length in the call to
ecryptfs_generate_key_packet_set).  To be explicit about this, we now send
in a "max" parameter, rather than magically using PAGE_CACHE_SIZE there.

Also, since the pointer we pass down the callchain eventually gets the
virt_to_page() treatment, we should be using an alloc_page variant, not
kzalloc (see also 7fcba054373d5dfc43d26e243a5c9b92069972ee)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 06db79d05c12..6046239465a1 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1251,6 +1251,7 @@ struct kmem_cache *ecryptfs_header_cache_2;
 /**
  * ecryptfs_write_headers_virt
  * @page_virt: The virtual address to write the headers to
+ * @max: The size of memory allocated at page_virt
  * @size: Set to the number of bytes written by this function
  * @crypt_stat: The cryptographic context
  * @ecryptfs_dentry: The eCryptfs dentry
@@ -1278,7 +1279,8 @@ struct kmem_cache *ecryptfs_header_cache_2;
  *
  * Returns zero on success
  */
-static int ecryptfs_write_headers_virt(char *page_virt, size_t *size,
+static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
+				       size_t *size,
 				       struct ecryptfs_crypt_stat *crypt_stat,
 				       struct dentry *ecryptfs_dentry)
 {
@@ -1296,7 +1298,7 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t *size,
 	offset += written;
 	rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
 					      ecryptfs_dentry, &written,
-					      PAGE_CACHE_SIZE - offset);
+					      max - offset);
 	if (rc)
 		ecryptfs_printk(KERN_WARNING, "Error generating key packet "
 				"set; rc = [%d]\n", rc);
@@ -1368,14 +1370,14 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		goto out;
 	}
 	/* Released in this function */
-	virt = kzalloc(crypt_stat->num_header_bytes_at_front, GFP_KERNEL);
+	virt = (char *)get_zeroed_page(GFP_KERNEL);
 	if (!virt) {
 		printk(KERN_ERR "%s: Out of memory\n", __func__);
 		rc = -ENOMEM;
 		goto out;
 	}
-	rc = ecryptfs_write_headers_virt(virt, &size, crypt_stat,
-					 ecryptfs_dentry);
+	rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size,
+					 crypt_stat, ecryptfs_dentry);
 	if (unlikely(rc)) {
 		printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
 		       __func__, rc);
@@ -1393,8 +1395,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		goto out_free;
 	}
 out_free:
-	memset(virt, 0, crypt_stat->num_header_bytes_at_front);
-	kfree(virt);
+	free_page((unsigned long)virt);
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From e74481e23283fb080d4591c258de20785cc3b6c3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 29 Oct 2008 14:01:10 -0700
Subject: fs: remove excess kernel-doc

Delete excess kernel-doc notation in fs/ subdirectory:

Warning(linux-2.6.27-git10//fs/jbd/transaction.c:886): Excess function parameter or struct member 'credits' description in 'journal_get_undo_access'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/transaction.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index d15cd6e7251e..60d4c32c8808 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -860,7 +860,6 @@ out:
  * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
  * @handle: transaction
  * @bh: buffer to undo
- * @credits: store the number of taken credits here (if not NULL)
  *
  * Sometimes there is a need to distinguish between metadata which has
  * been committed to disk and that which has not.  The ext3fs code uses
-- 
cgit v1.2.3


From 61de800d33af585cb7e6f27b5cdd51029c6855cb Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 30 Oct 2008 20:15:22 +0000
Subject: [CIFS] fix error in smb_send2

smb_send2 exit logic was strange and, with the previous change, could
cause us to fail large SMB writes when the whole SMB was not sent as
one chunk.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c   | 2 +-
 fs/cifs/file.c      | 2 +-
 fs/cifs/transport.c | 7 +++++--
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 843a85fb8b9a..d5eac48fc415 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1536,7 +1536,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	__u32 bytes_sent;
 	__u16 byte_count;
 
-	/* cFYI(1,("write at %lld %d bytes",offset,count));*/
+	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 62d8bd8f14c0..ead1a3bb0256 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1824,7 +1824,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	pTcon = cifs_sb->tcon;
 
 	pagevec_init(&lru_pvec, 0);
-		cFYI(DBG2, ("rpages: num pages %d", num_pages));
+	cFYI(DBG2, ("rpages: num pages %d", num_pages));
 	for (i = 0; i < num_pages; ) {
 		unsigned contig_pages;
 		struct page *tmp_page;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ba4d66644ebf..ff8243a8fe3e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -290,8 +290,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
 		if (rc < 0)
 			break;
 
-		if (rc >= total_len) {
-			WARN_ON(rc > total_len);
+		if (rc == total_len) {
+			total_len = 0;
+			break;
+		} else if (rc > total_len) {
+			cERROR(1, ("sent %d requested %d", rc, total_len));
 			break;
 		}
 		if (rc == 0) {
-- 
cgit v1.2.3


From 8d7c4203c681a3ec359eccff4e53bc8c0ccf403b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 30 Oct 2008 13:48:33 -0400
Subject: nfsd: fix failure to set eof in readdir in some situations

Before 14f7dd632011bb89c035722edd6ea0d90ca6b078 "[PATCH] Copy XFS
readdir hack into nfsd code", readdir_cd->err was reset to eof before
each call to vfs_readdir; afterwards, it is set only once.  Similarly,
c002a6c7977320f95b5edede5ce4e0eeecf291ff "[PATCH] Optimise NFS readdir
hack slightly", can cause us to exit without nfserr_eof set.  Fix this.

This ensures the "eof" bit is set when needed in readdir replies.  (The
particular case I saw was an nfsv4 readdir of an empty directory, which
returned with no entries (the protocol requires "." and ".." to be
filtered out), but with eof unset.)

Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 0bc56f6d9276..848a03e83a42 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1912,6 +1912,7 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
 		offset = vfs_llseek(file, 0, SEEK_CUR);
+		cdp->err = nfserr_eof;
 		if (!buf.full)
 			break;
 	}
-- 
cgit v1.2.3


From d7dc61d0a70371b1c6557ea8ffbc60fff94c8168 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@ORACLE.COM>
Date: Thu, 23 Oct 2008 00:50:35 -0400
Subject: NLM: Set address family before calling nlm_host_rebooted()

The nlm_host_rebooted() function uses nlm_cmp_addr() to find an
nsm_handle that matches the rebooted peer.  In order for this to work,
the passed-in address must have a proper address family.

This fixes a post-2.6.28 regression introduced by commit 781b61a6, which
added AF_INET6 support to nlm_cmp_addr().  Before that commit,
nlm_cmp_addr() didn't care about the address family; it compared only
the sin_addr.s_addr field for equality.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc4proc.c | 1 +
 fs/lockd/svcproc.c  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 014f6ce48172..4dfdcbc6bf68 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -434,6 +434,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 	 * reclaim all locks we hold on this server.
 	 */
 	memset(&saddr, 0, sizeof(saddr));
+	saddr.sin_family = AF_INET;
 	saddr.sin_addr.s_addr = argp->addr;
 	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
 
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 548b0bb2b84d..3ca89e2a9381 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -466,6 +466,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 	 * reclaim all locks we hold on this server.
 	 */
 	memset(&saddr, 0, sizeof(saddr));
+	saddr.sin_family = AF_INET;
 	saddr.sin_addr.s_addr = argp->addr;
 	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
 
-- 
cgit v1.2.3


From b27cf88e9592953ae292d05324887f2f44979433 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Fri, 31 Oct 2008 14:52:24 +0000
Subject: [JFFS2] Fix lack of locking in thread_should_wake()

The thread_should_wake() function trawls through the list of 'very
dirty' eraseblocks, determining whether the background GC thread should
wake. Doing this without holding the appropriate locks is a bad idea.

OLPC Trac #8615

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: stable@kernel.org
---
 fs/jffs2/background.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 8adebd3e43c6..3cceef4ad2b7 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -85,15 +85,15 @@ static int jffs2_garbage_collect_thread(void *_c)
 	for (;;) {
 		allow_signal(SIGHUP);
 	again:
+		spin_lock(&c->erase_completion_lock);
 		if (!jffs2_thread_should_wake(c)) {
 			set_current_state (TASK_INTERRUPTIBLE);
+			spin_unlock(&c->erase_completion_lock);
 			D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread sleeping...\n"));
-			/* Yes, there's a race here; we checked jffs2_thread_should_wake()
-			   before setting current->state to TASK_INTERRUPTIBLE. But it doesn't
-			   matter - We don't care if we miss a wakeup, because the GC thread
-			   is only an optimisation anyway. */
 			schedule();
-		}
+		} else
+			spin_unlock(&c->erase_completion_lock);
+			
 
 		/* This thread is purely an optimisation. But if it runs when
 		   other things could be running, it actually makes things a
-- 
cgit v1.2.3


From 233e70f4228e78eb2f80dc6650f65d3ae3dbf17c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Fri, 31 Oct 2008 23:28:30 +0000
Subject: saner FASYNC handling on file close

As it is, all instances of ->release() for files that have ->fasync()
need to remember to evict file from fasync lists; forgetting that
creates a hole and we actually have a bunch that *does* forget.

So let's keep our lives simple - let __fput() check FASYNC in
file->f_flags and call ->fasync() there if it's been set.  And lose that
crap in ->release() instances - leaving it there is still valid, but we
don't have to bother anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/perfmon.c            |  5 -----
 drivers/char/hpet.c                   |  3 ---
 drivers/char/ipmi/ipmi_devintf.c      |  2 --
 drivers/char/ipmi/ipmi_watchdog.c     |  1 -
 drivers/char/random.c                 |  7 -------
 drivers/char/rtc.c                    |  2 --
 drivers/char/sonypi.c                 |  1 -
 drivers/gpu/drm/drm_fops.c            |  2 --
 drivers/hid/usbhid/hiddev.c           |  2 --
 drivers/ieee1394/dv1394.c             |  3 ---
 drivers/infiniband/core/uverbs_main.c |  2 --
 drivers/input/evdev.c                 |  1 -
 drivers/input/joydev.c                |  1 -
 drivers/input/misc/hp_sdc_rtc.c       | 13 -------------
 drivers/input/mousedev.c              |  1 -
 drivers/input/serio/serio_raw.c       |  1 -
 drivers/message/fusion/mptctl.c       |  7 -------
 drivers/message/i2o/i2o_config.c      | 21 +++++----------------
 drivers/misc/sony-laptop.c            |  1 -
 drivers/net/tun.c                     |  2 --
 drivers/rtc/rtc-dev.c                 |  3 ---
 drivers/scsi/megaraid/megaraid_sas.c  | 12 ------------
 drivers/scsi/sg.c                     |  1 -
 drivers/staging/me4000/me4000.c       |  3 ---
 drivers/telephony/ixj.c               |  1 -
 drivers/uio/uio.c                     |  3 ---
 drivers/usb/gadget/inode.c            |  1 -
 fs/file_table.c                       |  4 ++++
 fs/fuse/dev.c                         |  1 -
 fs/inotify_user.c                     |  3 ---
 fs/pipe.c                             |  3 ---
 net/socket.c                          |  1 -
 sound/core/control.c                  |  1 -
 sound/core/init.c                     |  5 ++++-
 sound/core/pcm_native.c               |  1 -
 sound/core/timer.c                    |  1 -
 36 files changed, 13 insertions(+), 109 deletions(-)

(limited to 'fs')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index ada4605d1223..6543a5547c84 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -1995,11 +1995,6 @@ pfm_close(struct inode *inode, struct file *filp)
 		return -EBADF;
 	}
 
-	if (filp->f_flags & FASYNC) {
-		DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue));
-		pfm_do_fasync(-1, filp, ctx, 0);
-	}
-
 	PROTECT_CTX(ctx, flags);
 
 	state     = ctx->ctx_state;
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 408f5f92cb4e..53fdc7ff3870 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -427,9 +427,6 @@ static int hpet_release(struct inode *inode, struct file *file)
 	if (irq)
 		free_irq(irq, devp);
 
-	if (file->f_flags & FASYNC)
-		hpet_fasync(-1, file, 0);
-
 	file->private_data = NULL;
 	return 0;
 }
diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index 1d7b429f7ffa..41fc11dc921c 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -162,8 +162,6 @@ static int ipmi_release(struct inode *inode, struct file *file)
 	if (rv)
 		return rv;
 
-	ipmi_fasync (-1, file, 0);
-
 	/* FIXME - free the messages in the list. */
 	kfree(priv);
 
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 235fab0bdf79..a4d57e31f713 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -870,7 +870,6 @@ static int ipmi_close(struct inode *ino, struct file *filep)
 		clear_bit(0, &ipmi_wdog_open);
 	}
 
-	ipmi_fasync(-1, filep, 0);
 	expect_close = 0;
 
 	return 0;
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 705a839f1796..675076f5fca8 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1139,18 +1139,12 @@ static int random_fasync(int fd, struct file *filp, int on)
 	return fasync_helper(fd, filp, on, &fasync);
 }
 
-static int random_release(struct inode *inode, struct file *filp)
-{
-	return fasync_helper(-1, filp, 0, &fasync);
-}
-
 const struct file_operations random_fops = {
 	.read  = random_read,
 	.write = random_write,
 	.poll  = random_poll,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
-	.release = random_release,
 };
 
 const struct file_operations urandom_fops = {
@@ -1158,7 +1152,6 @@ const struct file_operations urandom_fops = {
 	.write = random_write,
 	.unlocked_ioctl = random_ioctl,
 	.fasync = random_fasync,
-	.release = random_release,
 };
 
 /***************************************************************
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 32dc89720d58..20d6efb6324e 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -788,8 +788,6 @@ static int rtc_release(struct inode *inode, struct file *file)
 	}
 	spin_unlock_irq(&rtc_lock);
 
-	if (file->f_flags & FASYNC)
-		rtc_fasync(-1, file, 0);
 no_irq:
 #endif
 
diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c
index 85e0eb76eeab..2457b07dabd6 100644
--- a/drivers/char/sonypi.c
+++ b/drivers/char/sonypi.c
@@ -898,7 +898,6 @@ static int sonypi_misc_fasync(int fd, struct file *filp, int on)
 
 static int sonypi_misc_release(struct inode *inode, struct file *file)
 {
-	sonypi_misc_fasync(-1, file, 0);
 	mutex_lock(&sonypi_device.lock);
 	sonypi_device.open_count--;
 	mutex_unlock(&sonypi_device.lock);
diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c
index 0d46627663b1..78eeed5caaff 100644
--- a/drivers/gpu/drm/drm_fops.c
+++ b/drivers/gpu/drm/drm_fops.c
@@ -406,8 +406,6 @@ int drm_release(struct inode *inode, struct file *filp)
 	if (dev->driver->driver_features & DRIVER_GEM)
 		drm_gem_release(dev, file_priv);
 
-	drm_fasync(-1, filp, 0);
-
 	mutex_lock(&dev->ctxlist_mutex);
 	if (!list_empty(&dev->ctxlist)) {
 		struct drm_ctx_list *pos, *n;
diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index 3ac320785fc5..83e851a5ed30 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -242,8 +242,6 @@ static int hiddev_release(struct inode * inode, struct file * file)
 	struct hiddev_list *list = file->private_data;
 	unsigned long flags;
 
-	hiddev_fasync(-1, file, 0);
-
 	spin_lock_irqsave(&list->hiddev->list_lock, flags);
 	list_del(&list->node);
 	spin_unlock_irqrestore(&list->hiddev->list_lock, flags);
diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c
index 2f83543a9dfc..965cfdb84ebc 100644
--- a/drivers/ieee1394/dv1394.c
+++ b/drivers/ieee1394/dv1394.c
@@ -1828,9 +1828,6 @@ static int dv1394_release(struct inode *inode, struct file *file)
 	/* OK to free the DMA buffer, no more mappings can exist */
 	do_dv1394_shutdown(video, 1);
 
-	/* clean up async I/O users */
-	dv1394_fasync(-1, file, 0);
-
 	/* give someone else a turn */
 	clear_bit(0, &video->open);
 
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index d85af1b67027..eb36a81dd09b 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -358,8 +358,6 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
 	}
 	spin_unlock_irq(&file->lock);
 
-	ib_uverbs_event_fasync(-1, filp, 0);
-
 	if (file->is_async) {
 		ib_unregister_event_handler(&file->uverbs_file->event_handler);
 		kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 3524bef62be6..1070db330d35 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -235,7 +235,6 @@ static int evdev_release(struct inode *inode, struct file *file)
 		evdev_ungrab(evdev, client);
 	mutex_unlock(&evdev->mutex);
 
-	evdev_fasync(-1, file, 0);
 	evdev_detach_client(evdev, client);
 	kfree(client);
 
diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index 65d7077a75a1..a85b1485e774 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -244,7 +244,6 @@ static int joydev_release(struct inode *inode, struct file *file)
 	struct joydev_client *client = file->private_data;
 	struct joydev *joydev = client->joydev;
 
-	joydev_fasync(-1, file, 0);
 	joydev_detach_client(joydev, client);
 	kfree(client);
 
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c
index 82ec6b1b6467..216a559f55ea 100644
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -71,7 +71,6 @@ static int hp_sdc_rtc_ioctl(struct inode *inode, struct file *file,
 static unsigned int hp_sdc_rtc_poll(struct file *file, poll_table *wait);
 
 static int hp_sdc_rtc_open(struct inode *inode, struct file *file);
-static int hp_sdc_rtc_release(struct inode *inode, struct file *file);
 static int hp_sdc_rtc_fasync (int fd, struct file *filp, int on);
 
 static int hp_sdc_rtc_read_proc(char *page, char **start, off_t off,
@@ -414,17 +413,6 @@ static int hp_sdc_rtc_open(struct inode *inode, struct file *file)
         return 0;
 }
 
-static int hp_sdc_rtc_release(struct inode *inode, struct file *file)
-{
-	/* Turn off interrupts? */
-
-        if (file->f_flags & FASYNC) {
-                hp_sdc_rtc_fasync (-1, file, 0);
-        }
-
-        return 0;
-}
-
 static int hp_sdc_rtc_fasync (int fd, struct file *filp, int on)
 {
         return fasync_helper (fd, filp, on, &hp_sdc_rtc_async_queue);
@@ -680,7 +668,6 @@ static const struct file_operations hp_sdc_rtc_fops = {
         .poll =		hp_sdc_rtc_poll,
         .ioctl =	hp_sdc_rtc_ioctl,
         .open =		hp_sdc_rtc_open,
-        .release =	hp_sdc_rtc_release,
         .fasync =	hp_sdc_rtc_fasync,
 };
 
diff --git a/drivers/input/mousedev.c b/drivers/input/mousedev.c
index 8137e50ded87..d8c056fe7e98 100644
--- a/drivers/input/mousedev.c
+++ b/drivers/input/mousedev.c
@@ -519,7 +519,6 @@ static int mousedev_release(struct inode *inode, struct file *file)
 	struct mousedev_client *client = file->private_data;
 	struct mousedev *mousedev = client->mousedev;
 
-	mousedev_fasync(-1, file, 0);
 	mousedev_detach_client(mousedev, client);
 	kfree(client);
 
diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c
index 470770c09260..06bbd0e74c6f 100644
--- a/drivers/input/serio/serio_raw.c
+++ b/drivers/input/serio/serio_raw.c
@@ -135,7 +135,6 @@ static int serio_raw_release(struct inode *inode, struct file *file)
 
 	mutex_lock(&serio_raw_mutex);
 
-	serio_raw_fasync(-1, file, 0);
 	serio_raw_cleanup(serio_raw);
 
 	mutex_unlock(&serio_raw_mutex);
diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c
index f5233f3d9eff..b89f476cd0a9 100644
--- a/drivers/message/fusion/mptctl.c
+++ b/drivers/message/fusion/mptctl.c
@@ -559,12 +559,6 @@ mptctl_fasync(int fd, struct file *filep, int mode)
 	return ret;
 }
 
-static int
-mptctl_release(struct inode *inode, struct file *filep)
-{
-	return fasync_helper(-1, filep, 0, &async_queue);
-}
-
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /*
  *  MPT ioctl handler
@@ -2706,7 +2700,6 @@ mptctl_hp_targetinfo(unsigned long arg)
 static const struct file_operations mptctl_fops = {
 	.owner =	THIS_MODULE,
 	.llseek =	no_llseek,
-	.release =	mptctl_release,
 	.fasync = 	mptctl_fasync,
 	.unlocked_ioctl = mptctl_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index a3fabdbe6ca6..f3384c32b9a1 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -1097,28 +1097,17 @@ static int cfg_fasync(int fd, struct file *fp, int on)
 static int cfg_release(struct inode *inode, struct file *file)
 {
 	ulong id = (ulong) file->private_data;
-	struct i2o_cfg_info *p1, *p2;
+	struct i2o_cfg_info *p, **q;
 	unsigned long flags;
 
 	lock_kernel();
-	p1 = p2 = NULL;
-
 	spin_lock_irqsave(&i2o_config_lock, flags);
-	for (p1 = open_files; p1;) {
-		if (p1->q_id == id) {
-
-			if (p1->fasync)
-				cfg_fasync(-1, file, 0);
-			if (p2)
-				p2->next = p1->next;
-			else
-				open_files = p1->next;
-
-			kfree(p1);
+	for (q = &open_files; (p = *q) != NULL; q = &p->next) {
+		if (p->q_id == id) {
+			*q = p->next;
+			kfree(p);
 			break;
 		}
-		p2 = p1;
-		p1 = p1->next;
 	}
 	spin_unlock_irqrestore(&i2o_config_lock, flags);
 	unlock_kernel();
diff --git a/drivers/misc/sony-laptop.c b/drivers/misc/sony-laptop.c
index f483c4221f76..06f07e19dc70 100644
--- a/drivers/misc/sony-laptop.c
+++ b/drivers/misc/sony-laptop.c
@@ -1920,7 +1920,6 @@ static int sonypi_misc_fasync(int fd, struct file *filp, int on)
 
 static int sonypi_misc_release(struct inode *inode, struct file *file)
 {
-	sonypi_misc_fasync(-1, file, 0);
 	atomic_dec(&sonypi_compat.open_count);
 	return 0;
 }
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 6daea0c91862..33b6d1b122fb 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1070,8 +1070,6 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 
 	DBG(KERN_INFO "%s: tun_chr_close\n", tun->dev->name);
 
-	tun_chr_fasync(-1, file, 0);
-
 	rtnl_lock();
 
 	/* Detach from net device */
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index 079e9ed907e0..ecdea44ae4e5 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -446,9 +446,6 @@ static int rtc_dev_release(struct inode *inode, struct file *file)
 	if (rtc->ops->release)
 		rtc->ops->release(rtc->dev.parent);
 
-	if (file->f_flags & FASYNC)
-		rtc_dev_fasync(-1, file, 0);
-
 	clear_bit_unlock(RTC_DEV_BUSY, &rtc->flags);
 	return 0;
 }
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index afe1de998763..a454f94623d7 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -2987,17 +2987,6 @@ static int megasas_mgmt_open(struct inode *inode, struct file *filep)
 	return 0;
 }
 
-/**
- * megasas_mgmt_release - char node "release" entry point
- */
-static int megasas_mgmt_release(struct inode *inode, struct file *filep)
-{
-	filep->private_data = NULL;
-	fasync_helper(-1, filep, 0, &megasas_async_queue);
-
-	return 0;
-}
-
 /**
  * megasas_mgmt_fasync -	Async notifier registration from applications
  *
@@ -3345,7 +3334,6 @@ megasas_mgmt_compat_ioctl(struct file *file, unsigned int cmd,
 static const struct file_operations megasas_mgmt_fops = {
 	.owner = THIS_MODULE,
 	.open = megasas_mgmt_open,
-	.release = megasas_mgmt_release,
 	.fasync = megasas_mgmt_fasync,
 	.unlocked_ioctl = megasas_mgmt_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 9adf35bd8b56..5103855242ae 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -327,7 +327,6 @@ sg_release(struct inode *inode, struct file *filp)
 	if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp)))
 		return -ENXIO;
 	SCSI_LOG_TIMEOUT(3, printk("sg_release: %s\n", sdp->disk->disk_name));
-	sg_fasync(-1, filp, 0);	/* remove filp from async notification list */
 	if (0 == sg_remove_sfp(sdp, sfp)) {	/* Returns 1 when sdp gone */
 		if (!sdp->detached) {
 			scsi_device_put(sdp->device);
diff --git a/drivers/staging/me4000/me4000.c b/drivers/staging/me4000/me4000.c
index 0b33773bb4f6..cf8b01bcac8d 100644
--- a/drivers/staging/me4000/me4000.c
+++ b/drivers/staging/me4000/me4000.c
@@ -1633,9 +1633,6 @@ static int me4000_release(struct inode *inode_p, struct file *file_p)
 
 		free_irq(ext_int_context->irq, ext_int_context);
 
-		/* Delete the fasync structure and free memory */
-		me4000_ext_int_fasync(0, file_p, 0);
-
 		/* Mark as unused */
 		ext_int_context->in_use = 0;
 	} else {
diff --git a/drivers/telephony/ixj.c b/drivers/telephony/ixj.c
index 41b6530b8f25..a913efc69669 100644
--- a/drivers/telephony/ixj.c
+++ b/drivers/telephony/ixj.c
@@ -2328,7 +2328,6 @@ static int ixj_release(struct inode *inode, struct file *file_p)
 	j->rec_codec = j->play_codec = 0;
 	j->rec_frame_size = j->play_frame_size = 0;
 	j->flags.cidsent = j->flags.cidring = 0;
-	ixj_fasync(-1, file_p, 0);	/* remove from list of async notification */
 
 	if(j->cardtype == QTI_LINEJACK && !j->readers && !j->writers) {
 		ixj_set_port(j, PORT_PSTN);
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index f9b4647255aa..2d2440cd57a9 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -367,9 +367,6 @@ static int uio_release(struct inode *inode, struct file *filep)
 		ret = idev->info->release(idev->info, inode);
 
 	module_put(idev->owner);
-
-	if (filep->f_flags & FASYNC)
-		ret = uio_fasync(-1, filep, 0);
 	kfree(listener);
 	return ret;
 }
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index f4585d3e90d7..eeb26c0f88e5 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -1251,7 +1251,6 @@ dev_release (struct inode *inode, struct file *fd)
 	 * alternatively, all host requests will time out.
 	 */
 
-	fasync_helper (-1, fd, 0, &dev->fasync);
 	kfree (dev->buf);
 	dev->buf = NULL;
 	put_dev (dev);
diff --git a/fs/file_table.c b/fs/file_table.c
index efc06faede6c..5ad0eca6eea2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -269,6 +269,10 @@ void __fput(struct file *file)
 	eventpoll_release(file);
 	locks_remove_flock(file);
 
+	if (unlikely(file->f_flags & FASYNC)) {
+		if (file->f_op && file->f_op->fasync)
+			file->f_op->fasync(-1, file, 0);
+	}
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 87250b6a8682..b72361479be2 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1056,7 +1056,6 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
 		spin_unlock(&fc->lock);
-		fasync_helper(-1, file, 0, &fc->fasync);
 		fuse_conn_put(fc);
 	}
 
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index d85c7d931cdf..d367e9b92862 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -537,9 +537,6 @@ static int inotify_release(struct inode *ignored, struct file *file)
 		inotify_dev_event_dequeue(dev);
 	mutex_unlock(&dev->ev_mutex);
 
-	if (file->f_flags & FASYNC)
-		inotify_fasync(-1, file, 0);
-
 	/* free this device: the put matching the get in inotify_init() */
 	put_inotify_dev(dev);
 
diff --git a/fs/pipe.c b/fs/pipe.c
index fcba6542b8d0..7aea8b89baac 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -717,14 +717,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
 static int
 pipe_read_release(struct inode *inode, struct file *filp)
 {
-	pipe_read_fasync(-1, filp, 0);
 	return pipe_release(inode, 1, 0);
 }
 
 static int
 pipe_write_release(struct inode *inode, struct file *filp)
 {
-	pipe_write_fasync(-1, filp, 0);
 	return pipe_release(inode, 0, 1);
 }
 
@@ -733,7 +731,6 @@ pipe_rdwr_release(struct inode *inode, struct file *filp)
 {
 	int decr, decw;
 
-	pipe_rdwr_fasync(-1, filp, 0);
 	decr = (filp->f_mode & FMODE_READ) != 0;
 	decw = (filp->f_mode & FMODE_WRITE) != 0;
 	return pipe_release(inode, decr, decw);
diff --git a/net/socket.c b/net/socket.c
index 2b7a4b5c9b72..57550c3bcabe 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -990,7 +990,6 @@ static int sock_close(struct inode *inode, struct file *filp)
 		printk(KERN_DEBUG "sock_close: NULL inode\n");
 		return 0;
 	}
-	sock_fasync(-1, filp, 0);
 	sock_release(SOCKET_I(inode));
 	return 0;
 }
diff --git a/sound/core/control.c b/sound/core/control.c
index b0bf42691047..636b3b52ef8b 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -113,7 +113,6 @@ static int snd_ctl_release(struct inode *inode, struct file *file)
 	unsigned int idx;
 
 	ctl = file->private_data;
-	fasync_helper(-1, file, 0, &ctl->fasync);
 	file->private_data = NULL;
 	card = ctl->card;
 	write_lock_irqsave(&card->ctl_files_rwlock, flags);
diff --git a/sound/core/init.c b/sound/core/init.c
index ef2352c2e451..b47ff8b44be8 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -264,8 +264,11 @@ static int snd_disconnect_release(struct inode *inode, struct file *file)
 	}
 	spin_unlock(&shutdown_lock);
 
-	if (likely(df))
+	if (likely(df)) {
+		if ((file->f_flags & FASYNC) && df->disconnected_f_op->fasync)
+			df->disconnected_f_op->fasync(-1, file, 0);
 		return df->disconnected_f_op->release(inode, file);
+	}
 
 	panic("%s(%p, %p) failed!", __func__, inode, file);
 }
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index aef18682c035..a789efc9df39 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2169,7 +2169,6 @@ static int snd_pcm_release(struct inode *inode, struct file *file)
 	if (snd_BUG_ON(!substream))
 		return -ENXIO;
 	pcm = substream->pcm;
-	fasync_helper(-1, file, 0, &substream->runtime->fasync);
 	mutex_lock(&pcm->open_mutex);
 	snd_pcm_release_substream(substream);
 	kfree(pcm_file);
diff --git a/sound/core/timer.c b/sound/core/timer.c
index e582face89d2..c584408c9f17 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -1263,7 +1263,6 @@ static int snd_timer_user_release(struct inode *inode, struct file *file)
 	if (file->private_data) {
 		tu = file->private_data;
 		file->private_data = NULL;
-		fasync_helper(-1, file, 0, &tu->fasync);
 		if (tu->timeri)
 			snd_timer_close(tu->timeri);
 		kfree(tu->queue);
-- 
cgit v1.2.3


From e219cca082f52e7dfea41f3be264b7b5eb204227 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 22:37:59 -0500
Subject: jbd: don't give up looking for space so easily in
 __log_wait_for_space

Commit be07c4ed introduced a regression because it assumed that if
there were no transactions ready to be checkpointed, that no progress
could be made on making space available in the journal, and so the
journal should be aborted.  This assumption is false; it could be the
case that simply calling cleanup_journal_tail() will recover the
necessary space, or, for small journals, the currently committing
transaction could be responsible for chewing up the required space in
the log, so we need to wait for the currently committing transaction
to finish before trying to force a checkpoint operation.

This patch fixes the bug reported by Meelis Roos at:
http://bugzilla.kernel.org/show_bug.cgi?id=11937

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Duane Griffin <duaneg@dghda.com>
Cc: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
---
 fs/jbd/checkpoint.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 1bd8d4acc6f2..61f32f3868cd 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -115,7 +115,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
  */
 void __log_wait_for_space(journal_t *journal)
 {
-	int nblocks;
+	int nblocks, space_left;
 	assert_spin_locked(&journal->j_state_lock);
 
 	nblocks = jbd_space_needed(journal);
@@ -128,25 +128,42 @@ void __log_wait_for_space(journal_t *journal)
 		/*
 		 * Test again, another process may have checkpointed while we
 		 * were waiting for the checkpoint lock. If there are no
-		 * outstanding transactions there is nothing to checkpoint and
-		 * we can't make progress. Abort the journal in this case.
+		 * transactions ready to be checkpointed, try to recover
+		 * journal space by calling cleanup_journal_tail(), and if
+		 * that doesn't work, by waiting for the currently committing
+		 * transaction to complete.  If there is absolutely no way
+		 * to make progress, this is either a BUG or corrupted
+		 * filesystem, so abort the journal and leave a stack
+		 * trace for forensic evidence.
 		 */
 		spin_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
-		if (__log_space_left(journal) < nblocks) {
+		space_left = __log_space_left(journal);
+		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
+			tid_t tid = 0;
 
+			if (journal->j_committing_transaction)
+				tid = journal->j_committing_transaction->t_tid;
 			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
 			if (chkpt) {
 				log_do_checkpoint(journal);
+			} else if (cleanup_journal_tail(journal) == 0) {
+				/* We were able to recover space; yay! */
+				;
+			} else if (tid) {
+				log_wait_commit(journal, tid);
 			} else {
-				printk(KERN_ERR "%s: no transactions\n",
-				       __func__);
+				printk(KERN_ERR "%s: needed %d blocks and "
+				       "only had %d space available\n",
+				       __func__, nblocks, space_left);
+				printk(KERN_ERR "%s: no way to get more "
+				       "journal space\n", __func__);
+				WARN_ON(1);
 				journal_abort(journal, 0);
 			}
-
 			spin_lock(&journal->j_state_lock);
 		} else {
 			spin_unlock(&journal->j_list_lock);
-- 
cgit v1.2.3


From 8c3f25d8950c3e9fe6c9849f88679b3f2a071550 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 22:38:07 -0500
Subject: jbd2: don't give up looking for space so easily in
 __jbd2_log_wait_for_space
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 23f8b79e introduced a regression because it assumed that if
there were no transactions ready to be checkpointed, that no progress
could be made on making space available in the journal, and so the
journal should be aborted.  This assumption is false; it could be the
case that simply calling jbd2_cleanup_journal_tail() will recover the
necessary space, or, for small journals, the currently committing
transaction could be responsible for chewing up the required space in
the log, so we need to wait for the currently committing transaction
to finish before trying to force a checkpoint operation.

This patch fixes a bug reported by Mihai Harpau at:
https://bugzilla.redhat.com/show_bug.cgi?id=469582

This patch fixes a bug reported by François Valenduc at:
http://bugzilla.kernel.org/show_bug.cgi?id=11840

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Duane Griffin <duaneg@dghda.com>
Cc: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
---
 fs/jbd2/checkpoint.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9203c3332f17..9497718fe920 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -116,7 +116,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
  */
 void __jbd2_log_wait_for_space(journal_t *journal)
 {
-	int nblocks;
+	int nblocks, space_left;
 	assert_spin_locked(&journal->j_state_lock);
 
 	nblocks = jbd_space_needed(journal);
@@ -129,25 +129,43 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 		/*
 		 * Test again, another process may have checkpointed while we
 		 * were waiting for the checkpoint lock. If there are no
-		 * outstanding transactions there is nothing to checkpoint and
-		 * we can't make progress. Abort the journal in this case.
+		 * transactions ready to be checkpointed, try to recover
+		 * journal space by calling cleanup_journal_tail(), and if
+		 * that doesn't work, by waiting for the currently committing
+		 * transaction to complete.  If there is absolutely no way
+		 * to make progress, this is either a BUG or corrupted
+		 * filesystem, so abort the journal and leave a stack
+		 * trace for forensic evidence.
 		 */
 		spin_lock(&journal->j_state_lock);
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
-		if (__jbd2_log_space_left(journal) < nblocks) {
+		space_left = __jbd2_log_space_left(journal);
+		if (space_left < nblocks) {
 			int chkpt = journal->j_checkpoint_transactions != NULL;
+			tid_t tid = 0;
 
+			if (journal->j_committing_transaction)
+				tid = journal->j_committing_transaction->t_tid;
 			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
 			if (chkpt) {
 				jbd2_log_do_checkpoint(journal);
+			} else if (jbd2_cleanup_journal_tail(journal) == 0) {
+				/* We were able to recover space; yay! */
+				;
+			} else if (tid) {
+				jbd2_log_wait_commit(journal, tid);
 			} else {
-				printk(KERN_ERR "%s: no transactions\n",
-				       __func__);
+				printk(KERN_ERR "%s: needed %d blocks and "
+				       "only had %d space available\n",
+				       __func__, nblocks, space_left);
+				printk(KERN_ERR "%s: no way to get more "
+				       "journal space in %s\n", __func__,
+				       journal->j_devname);
+				WARN_ON(1);
 				jbd2_journal_abort(journal, 0);
 			}
-
 			spin_lock(&journal->j_state_lock);
 		} else {
 			spin_unlock(&journal->j_list_lock);
-- 
cgit v1.2.3


From 2423840ded13e6d3b52d88aff8d033bb78fafd08 Mon Sep 17 00:00:00 2001
From: Sami Liedes <sliedes@cc.hut.fi>
Date: Sun, 2 Nov 2008 19:23:30 -0500
Subject: jbd2: deregister proc on failure in jbd2_journal_init_inode

jbd2_journal_init_inode() does not call jbd2_stats_proc_exit() on all
failure paths after calling jbd2_stats_proc_init(). This leaves
dangling references to the fs in proc.

This patch fixes a bug reported by Sami Liedes at:
http://bugzilla.kernel.org/show_bug.cgi?id=11493

Signed-off-by: Sami Liedes <sliedes@cc.hut.fi>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 783de118de92..e70d657a19f8 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1089,6 +1089,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
+		jbd2_stats_proc_exit(journal);
 		kfree(journal);
 		return NULL;
 	}
@@ -1098,6 +1099,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (err) {
 		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
 		       __func__);
+		jbd2_stats_proc_exit(journal);
 		kfree(journal);
 		return NULL;
 	}
-- 
cgit v1.2.3


From ae6884a9da56f8921e432e663b4ccb4a1851b2ea Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 3 Nov 2008 14:05:08 -0500
Subject: cifs: fix renaming one hardlink on top of another

cifs: fix renaming one hardlink on top of another

POSIX says that renaming one hardlink on top of another to the same
inode is a no-op. We had the logic mostly right, but forgot to clear
the return code.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index d54fa8aeaea9..ff8c68de4a92 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1361,9 +1361,11 @@ int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 		if (tmprc == 0 && (info_buf_source->UniqueId ==
-				   info_buf_target->UniqueId))
+				   info_buf_target->UniqueId)) {
 			/* same file, POSIX says that this is a noop */
+			rc = 0;
 			goto cifs_rename_exit;
+		}
 	} /* else ... BB we could add the same check for Windows by
 		     checking the UniqueId via FILE_INTERNAL_INFO */
 
-- 
cgit v1.2.3


From c527c8a7ffa18400c2c1488f7ab5aff5e83f3c8e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 3 Nov 2008 20:46:21 +0000
Subject: [CIFS] Can't rely on iov length and base when kernel_recvmsg returns
 error

When retrying kernel_recvmsg, reset iov_base and iov_len.

Note comment from Sridhar: "In the normal path, iov.iov_len is clearly set to 4. But i think you are
running into a case where kernel_recvmsg() is called via 'goto incomplete_rcv'
It happens if the previous call fails with EAGAIN.
If you want to call recvmsg() after EAGAIN failure, you need to reset iov."

Signed-off-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9f9248cb3fe..c682be8f2984 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -417,9 +417,14 @@ incomplete_rcv:
 			msleep(1); /* minimum sleep to prevent looping
 				allowing socket to clear and app threads to set
 				tcpStatus CifsNeedReconnect if server hung */
-			if (pdu_length < 4)
+			if (pdu_length < 4) {
+				iov.iov_base = (4 - pdu_length) +
+							(char *)smb_buffer;
+				iov.iov_len = pdu_length;
+				smb_msg.msg_control = NULL;
+				smb_msg.msg_controllen = 0;
 				goto incomplete_rcv;
-			else
+			} else
 				continue;
 		} else if (length <= 0) {
 			if (server->tcpStatus == CifsNew) {
-- 
cgit v1.2.3


From ae2d9fb18e575ed37ffc241ece4bf68f0be4ae32 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 4 Nov 2008 09:10:50 -0500
Subject: ext4: fix missing ext4_unlock_group in error path

If we try to free a block which is already freed, the code was
returning without first unlocking the group.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dfe17a134052..444ad998f72e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4441,6 +4441,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else if (block >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
+			ext4_unlock_group(sb, group);
 			ext4_error(sb, __func__,
 			    "Double free of blocks %d (%d %d)\n",
 			    block, entry->start_blk, entry->count);
-- 
cgit v1.2.3


From d94e99a64c3beece22dbfb2b335771a59184eb0a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 4 Nov 2008 09:11:26 -0500
Subject: ext4: Convert to host order before using the values.

Use le16_to_cpu to read the s_reserved_gdt_blocks values
from super block.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 994859df010e..e27acd18b4b0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1458,9 +1458,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
 
 	/* We allocate both existing and potentially added groups */
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
-			    ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
-			      EXT4_DESC_PER_BLOCK_BITS(sb))) /
-			   groups_per_flex;
+			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
+			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
 	sbi->s_flex_groups = kzalloc(flex_group_count *
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
-- 
cgit v1.2.3


From 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 3 Nov 2008 18:10:55 -0500
Subject: ext4: wait on all pending commits in ext4_sync_fs()

In ext4_sync_fs, we only wait for a commit to finish if we started it,
but there may be one already in progress which will not be synced.

In the case of a data=ordered umount with pending long symlinks which
are delayed due to a long list of other I/O on the backing block
device, this causes the buffer associated with the long symlinks to
not be moved to the inode dirty list in the second phase of
fsync_super.  Then, before they can be dirtied again, kjournald exits,
seeing the UMOUNT flag and the dirty pages are never written to the
backing block device, causing long symlink corruption and exposing new
or previously freed block data to userspace.

To ensure all commits are synced, we flush all journal commits now
when sync_fs'ing ext4.

Signed-off-by: Arthur Jones <ajones@riverbed.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
---
 fs/ext4/super.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e27acd18b4b0..e4a241c65dbe 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2884,12 +2884,9 @@ int ext4_force_commit(struct super_block *sb)
 /*
  * Ext4 always journals updates to the superblock itself, so we don't
  * have to propagate any other updates to the superblock on disk at this
- * point.  Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
+ * point.  (We can probably nuke this function altogether, and remove
+ * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
  */
-
 static void ext4_write_super(struct super_block *sb)
 {
 	if (mutex_trylock(&sb->s_lock) != 0)
@@ -2899,15 +2896,15 @@ static void ext4_write_super(struct super_block *sb)
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
-	tid_t target;
+	int ret = 0;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
-	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
-		if (wait)
-			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
-	}
-	return 0;
+	if (wait)
+		ret = ext4_force_commit(sb);
+	else
+		jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From dc8a0843a435b2c0891e7eaea64faaf1ebec9b11 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Date: Wed, 5 Nov 2008 23:21:16 +0100
Subject: [JFFS2] fix race condition in jffs2_lzo_compress()

deflate_mutex protects the globals lzo_mem and lzo_compress_buf.  However,
jffs2_lzo_compress() unlocks deflate_mutex _before_ it has copied out the
compressed data from lzo_compress_buf.  Correct this by moving the mutex
unlock after the copy.

In addition, document what deflate_mutex actually protects.

Cc: stable@kernel.org
Signed-off-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Acked-by: Richard Purdie <rpurdie@openedhand.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/compr_lzo.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 47b045797e42..90cb60d09787 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -19,7 +19,7 @@
 
 static void *lzo_mem;
 static void *lzo_compress_buf;
-static DEFINE_MUTEX(deflate_mutex);
+static DEFINE_MUTEX(deflate_mutex);	/* for lzo_mem and lzo_compress_buf */
 
 static void free_workspace(void)
 {
@@ -49,18 +49,21 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out,
 
 	mutex_lock(&deflate_mutex);
 	ret = lzo1x_1_compress(data_in, *sourcelen, lzo_compress_buf, &compress_size, lzo_mem);
-	mutex_unlock(&deflate_mutex);
-
 	if (ret != LZO_E_OK)
-		return -1;
+		goto fail;
 
 	if (compress_size > *dstlen)
-		return -1;
+		goto fail;
 
 	memcpy(cpage_out, lzo_compress_buf, compress_size);
-	*dstlen = compress_size;
+	mutex_unlock(&deflate_mutex);
 
+	*dstlen = compress_size;
 	return 0;
+
+ fail:
+	mutex_unlock(&deflate_mutex);
+	return -1;
 }
 
 static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out,
-- 
cgit v1.2.3


From 89f97496e81d2112b5e41416fe3020688c443818 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 5 Nov 2008 10:21:06 +0100
Subject: block: fix __blkdev_get() for removable devices

Commit 0762b8bde9729f10f8e6249809660ff2ec3ad735 moved disk_get_part()
in front of recursive get on the whole disk, which caused removable
devices to try disk_get_part() before rescanning after a new media is
inserted, which might fail legit open attempts or give the old
partition.

This patch fixes the problem by moving disk_get_part() after
__blkdev_get() on the whole disk.

This problem was spotted by Borislav Petkov.

Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 88a776fa0ef6..db831efbdbbd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -986,7 +986,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
-	struct hd_struct *part = NULL;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1004,24 +1003,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		return ret;
 	}
 
-	ret = -ENXIO;
-
 	lock_kernel();
 
+	ret = -ENXIO;
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out_unlock_kernel;
-	part = disk_get_part(disk, partno);
-	if (!part)
-		goto out_unlock_kernel;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
-		bdev->bd_part = part;
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
+
+			ret = -ENXIO;
+			bdev->bd_part = disk_get_part(disk, partno);
+			if (!bdev->bd_part)
+				goto out_clear;
+
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev, mode);
 				if (ret)
@@ -1049,18 +1049,17 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bdev->bd_contains = whole;
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
+			bdev->bd_part = disk_get_part(disk, partno);
 			if (!(disk->flags & GENHD_FL_UP) ||
-			    !part || !part->nr_sects) {
+			    !bdev->bd_part || !bdev->bd_part->nr_sects) {
 				ret = -ENXIO;
 				goto out_clear;
 			}
-			bd_set_size(bdev, (loff_t)part->nr_sects << 9);
+			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
 		}
 	} else {
-		disk_put_part(part);
 		put_disk(disk);
 		module_put(disk->fops->owner);
-		part = NULL;
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
@@ -1080,6 +1079,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	return 0;
 
  out_clear:
+	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
@@ -1091,7 +1091,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_kernel:
 	unlock_kernel();
 
-	disk_put_part(part);
 	if (disk)
 		module_put(disk->fops->owner);
 	put_disk(disk);
-- 
cgit v1.2.3


From 069782a1ee55105220e5ae2db448495dac267cb1 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 21 Oct 2008 12:56:31 +0300
Subject: UBIFS: remove printk

Remove the "UBIFS background thread ubifs_bgd0_0 started" message.
We kill the background thread when we switch to R/O mode, and
start it again when we switch to R/W mode. OLPC is doing this
many times during boot, and we see this message many times as
well, which is irritating. So just kill the message.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/commit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 0a6aa2cc78f0..b49884c8c10e 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -234,8 +234,8 @@ int ubifs_bg_thread(void *info)
 	int err;
 	struct ubifs_info *c = info;
 
-	ubifs_msg("background thread \"%s\" started, PID %d",
-		  c->bgt_name, current->pid);
+	dbg_msg("background thread \"%s\" started, PID %d",
+		c->bgt_name, current->pid);
 	set_freezable();
 
 	while (1) {
-- 
cgit v1.2.3


From 0ecb9529a4d47825778e7b0d226eb36019252a9d Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 24 Oct 2008 10:52:57 -0700
Subject: UBIFS: endian handling fixes and annotations

Noticed by sparse:
fs/ubifs/file.c:75:2: warning: restricted __le64 degrades to integer
fs/ubifs/file.c:629:4: warning: restricted __le64 degrades to integer
fs/ubifs/dir.c:431:3: warning: restricted __le64 degrades to integer

This should be checked to ensure the ubifs_assert is working as
intended, I've done the suggested annotation in this patch.

fs/ubifs/sb.c:298:6: warning: incorrect type in assignment (different base types)
fs/ubifs/sb.c:298:6:    expected int [signed] [assigned] tmp
fs/ubifs/sb.c:298:6:    got restricted __le64 [usertype] <noident>
fs/ubifs/sb.c:299:19: warning: incorrect type in assignment (different base types)
fs/ubifs/sb.c:299:19:    expected restricted __le64 [usertype] atime_sec
fs/ubifs/sb.c:299:19:    got int [signed] [assigned] tmp
fs/ubifs/sb.c:300:19: warning: incorrect type in assignment (different base types)
fs/ubifs/sb.c:300:19:    expected restricted __le64 [usertype] ctime_sec
fs/ubifs/sb.c:300:19:    got int [signed] [assigned] tmp
fs/ubifs/sb.c:301:19: warning: incorrect type in assignment (different base types)
fs/ubifs/sb.c:301:19:    expected restricted __le64 [usertype] mtime_sec
fs/ubifs/sb.c:301:19:    got int [signed] [assigned] tmp

This looks like a bugfix as your tmp was a u32 so there was truncation in
the atime, mtime, ctime value, probably not intentional, add a tmp_le64
and use it here.

fs/ubifs/key.h:348:9: warning: cast to restricted __le32
fs/ubifs/key.h:348:9: warning: cast to restricted __le32
fs/ubifs/key.h:419:9: warning: cast to restricted __le32

Read from the annotated union member instead.

fs/ubifs/recovery.c:175:13: warning: incorrect type in assignment (different base types)
fs/ubifs/recovery.c:175:13:    expected unsigned int [unsigned] [usertype] save_flags
fs/ubifs/recovery.c:175:13:    got restricted __le32 [usertype] flags
fs/ubifs/recovery.c:186:13: warning: incorrect type in assignment (different base types)
fs/ubifs/recovery.c:186:13:    expected restricted __le32 [usertype] flags
fs/ubifs/recovery.c:186:13:    got unsigned int [unsigned] [usertype] save_flags

Do byteshifting at compile time of the flag value.  Annotate the saved_flags
as le32.

fs/ubifs/debug.c:368:10: warning: cast to restricted __le32
fs/ubifs/debug.c:368:10: warning: cast from restricted __le64

Should be checked if the truncation was intentional, I've changed the
printk to print the full width.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c    | 4 ++--
 fs/ubifs/dir.c      | 3 ++-
 fs/ubifs/file.c     | 4 ++--
 fs/ubifs/key.h      | 4 ++--
 fs/ubifs/recovery.c | 4 ++--
 fs/ubifs/sb.c       | 9 +++++----
 6 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7186400750e7..f9deccbc90c0 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -364,8 +364,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		       le32_to_cpu(mst->ihead_lnum));
 		printk(KERN_DEBUG "\tihead_offs     %u\n",
 		       le32_to_cpu(mst->ihead_offs));
-		printk(KERN_DEBUG "\tindex_size     %u\n",
-		       le32_to_cpu(mst->index_size));
+		printk(KERN_DEBUG "\tindex_size     %llu\n",
+		       (unsigned long long)le64_to_cpu(mst->index_size));
 		printk(KERN_DEBUG "\tlpt_lnum       %u\n",
 		       le32_to_cpu(mst->lpt_lnum));
 		printk(KERN_DEBUG "\tlpt_offs       %u\n",
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 526c01ec8003..37a9e604c3e9 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -428,7 +428,8 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 		dbg_gen("feed '%s', ino %llu, new f_pos %#x",
 			dent->name, (unsigned long long)le64_to_cpu(dent->inum),
 			key_hash_flash(c, &dent->key));
-		ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
+		ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
+			     ubifs_inode(dir)->creat_sqnum);
 
 		nm.len = le16_to_cpu(dent->nlen);
 		over = filldir(dirent, dent->name, nm.len, file->f_pos,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 51cf511d44d9..9124eee73aea 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,7 +72,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
 		return err;
 	}
 
-	ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum);
+	ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum);
 
 	len = le32_to_cpu(dn->size);
 	if (len <= 0 || len > UBIFS_BLOCK_SIZE)
@@ -626,7 +626,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 
 			dn = bu->buf + (bu->zbranch[nn].offs - offs);
 
-			ubifs_assert(dn->ch.sqnum >
+			ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
 				     ubifs_inode(inode)->creat_sqnum);
 
 			len = le32_to_cpu(dn->size);
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 9ee65086f627..3f1f16bc25c9 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -345,7 +345,7 @@ static inline int key_type_flash(const struct ubifs_info *c, const void *k)
 {
 	const union ubifs_key *key = k;
 
-	return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
+	return le32_to_cpu(key->j32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
 }
 
 /**
@@ -416,7 +416,7 @@ static inline unsigned int key_block_flash(const struct ubifs_info *c,
 {
 	const union ubifs_key *key = k;
 
-	return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK;
+	return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_BLOCK_MASK;
 }
 
 /**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 77d26c141cf6..bed97421b972 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -168,12 +168,12 @@ static int write_rcvrd_mst_node(struct ubifs_info *c,
 				struct ubifs_mst_node *mst)
 {
 	int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz;
-	uint32_t save_flags;
+	__le32 save_flags;
 
 	dbg_rcvry("recovery");
 
 	save_flags = mst->flags;
-	mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY);
+	mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
 
 	ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
 	err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM);
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 2bf753b38889..0f392351dc5a 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -81,6 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
 	int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
 	int min_leb_cnt = UBIFS_MIN_LEB_CNT;
 	uint64_t tmp64, main_bytes;
+	__le64 tmp_le64;
 
 	/* Some functions called from here depend on the @c->key_len filed */
 	c->key_len = UBIFS_SK_LEN;
@@ -295,10 +296,10 @@ static int create_default_filesystem(struct ubifs_info *c)
 	ino->ch.node_type = UBIFS_INO_NODE;
 	ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
 	ino->nlink = cpu_to_le32(2);
-	tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
-	ino->atime_sec   = tmp;
-	ino->ctime_sec   = tmp;
-	ino->mtime_sec   = tmp;
+	tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
+	ino->atime_sec   = tmp_le64;
+	ino->ctime_sec   = tmp_le64;
+	ino->mtime_sec   = tmp_le64;
 	ino->atime_nsec  = 0;
 	ino->ctime_nsec  = 0;
 	ino->mtime_nsec  = 0;
-- 
cgit v1.2.3


From e84461ad9c4f0ff91ab8553596acdb7bf5522df4 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 29 Oct 2008 12:08:43 +0200
Subject: UBIFS: fix compilation warnings

We print 'ino_t' type using '%lu' printk() placeholder, but this
results in many warnings when compiling for Alpha platform. Fix
this by adding (unsigned long) casts.

Fixes these warnings:

fs/ubifs/journal.c:693: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/journal.c:1131: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/dir.c:163: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/tnc.c:2680: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/tnc.c:2700: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t'
fs/ubifs/replay.c:1066: warning: format '%lu' expects type 'long unsigned int', but argument 7 has type 'ino_t'
fs/ubifs/orphan.c:108: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:135: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:142: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:154: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:159: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:451: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:539: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:612: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:843: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/orphan.c:856: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/recovery.c:1438: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/recovery.c:1443: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/recovery.c:1475: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/recovery.c:1495: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:105: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'
fs/ubifs/debug.c:105: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'
fs/ubifs/debug.c:110: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'
fs/ubifs/debug.c:110: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'
fs/ubifs/debug.c:114: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'
fs/ubifs/debug.c:114: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'
fs/ubifs/debug.c:118: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'
fs/ubifs/debug.c:118: warning: format '%lu' expects type 'long unsigned int', but argument 3 has type 'ino_t'
fs/ubifs/debug.c:1591: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1671: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1674: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t'
fs/ubifs/debug.c:1680: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1699: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t'
fs/ubifs/debug.c:1788: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t'
fs/ubifs/debug.c:1821: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t'
fs/ubifs/debug.c:1833: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t'
fs/ubifs/debug.c:1924: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1932: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1938: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1945: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1953: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1960: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1967: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1973: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1988: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'ino_t'
fs/ubifs/debug.c:1991: warning: format '%lu' expects type 'long unsigned int', but argument 5 has type 'ino_t'
fs/ubifs/debug.c:2009: warning: format '%lu' expects type 'long unsigned int', but argument 2 has type 'ino_t'

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c    | 62 ++++++++++++++++++++++++++++++++---------------------
 fs/ubifs/dir.c      |  2 +-
 fs/ubifs/journal.c  |  8 ++++---
 fs/ubifs/orphan.c   | 28 +++++++++++++-----------
 fs/ubifs/recovery.c | 13 +++++------
 fs/ubifs/replay.c   |  2 +-
 fs/ubifs/tnc.c      |  5 +++--
 7 files changed, 70 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index f9deccbc90c0..510ffa0bbda4 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -101,21 +101,24 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
 	if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
 		switch (type) {
 		case UBIFS_INO_KEY:
-			sprintf(p, "(%lu, %s)", key_inum(c, key),
+			sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key),
 			       get_key_type(type));
 			break;
 		case UBIFS_DENT_KEY:
 		case UBIFS_XENT_KEY:
-			sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key),
+			sprintf(p, "(%lu, %s, %#08x)",
+				(unsigned long)key_inum(c, key),
 				get_key_type(type), key_hash(c, key));
 			break;
 		case UBIFS_DATA_KEY:
-			sprintf(p, "(%lu, %s, %u)", key_inum(c, key),
+			sprintf(p, "(%lu, %s, %u)",
+				(unsigned long)key_inum(c, key),
 				get_key_type(type), key_block(c, key));
 			break;
 		case UBIFS_TRUN_KEY:
 			sprintf(p, "(%lu, %s)",
-				key_inum(c, key), get_key_type(type));
+				(unsigned long)key_inum(c, key),
+				get_key_type(type));
 			break;
 		default:
 			sprintf(p, "(bad key type: %#08x, %#08x)",
@@ -1589,7 +1592,7 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
 
 	if (inum > c->highest_inum) {
 		ubifs_err("too high inode number, max. is %lu",
-			  c->highest_inum);
+			  (unsigned long)c->highest_inum);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -1668,16 +1671,18 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c,
 	ino_key_init(c, &key, inum);
 	err = ubifs_lookup_level0(c, &key, &znode, &n);
 	if (!err) {
-		ubifs_err("inode %lu not found in index", inum);
+		ubifs_err("inode %lu not found in index", (unsigned long)inum);
 		return ERR_PTR(-ENOENT);
 	} else if (err < 0) {
-		ubifs_err("error %d while looking up inode %lu", err, inum);
+		ubifs_err("error %d while looking up inode %lu",
+			  err, (unsigned long)inum);
 		return ERR_PTR(err);
 	}
 
 	zbr = &znode->zbranch[n];
 	if (zbr->len < UBIFS_INO_NODE_SZ) {
-		ubifs_err("bad node %lu node length %d", inum, zbr->len);
+		ubifs_err("bad node %lu node length %d",
+			  (unsigned long)inum, zbr->len);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -1697,7 +1702,7 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c,
 	kfree(ino);
 	if (IS_ERR(fscki)) {
 		ubifs_err("error %ld while adding inode %lu node",
-			  PTR_ERR(fscki), inum);
+			  PTR_ERR(fscki), (unsigned long)inum);
 		return fscki;
 	}
 
@@ -1786,7 +1791,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		if (IS_ERR(fscki)) {
 			err = PTR_ERR(fscki);
 			ubifs_err("error %d while processing data node and "
-				  "trying to find inode node %lu", err, inum);
+				  "trying to find inode node %lu",
+				  err, (unsigned long)inum);
 			goto out_dump;
 		}
 
@@ -1819,7 +1825,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		if (IS_ERR(fscki)) {
 			err = PTR_ERR(fscki);
 			ubifs_err("error %d while processing entry node and "
-				  "trying to find inode node %lu", err, inum);
+				  "trying to find inode node %lu",
+				  err, (unsigned long)inum);
 			goto out_dump;
 		}
 
@@ -1832,7 +1839,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 			err = PTR_ERR(fscki);
 			ubifs_err("error %d while processing entry node and "
 				  "trying to find parent inode node %lu",
-				  err, inum);
+				  err, (unsigned long)inum);
 			goto out_dump;
 		}
 
@@ -1923,7 +1930,8 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
 			    fscki->references != 1) {
 				ubifs_err("directory inode %lu has %d "
 					  "direntries which refer it, but "
-					  "should be 1", fscki->inum,
+					  "should be 1",
+					  (unsigned long)fscki->inum,
 					  fscki->references);
 				goto out_dump;
 			}
@@ -1931,27 +1939,29 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
 			    fscki->references != 0) {
 				ubifs_err("root inode %lu has non-zero (%d) "
 					  "direntries which refer it",
-					  fscki->inum, fscki->references);
+					  (unsigned long)fscki->inum,
+					  fscki->references);
 				goto out_dump;
 			}
 			if (fscki->calc_sz != fscki->size) {
 				ubifs_err("directory inode %lu size is %lld, "
 					  "but calculated size is %lld",
-					  fscki->inum, fscki->size,
-					  fscki->calc_sz);
+					  (unsigned long)fscki->inum,
+					  fscki->size, fscki->calc_sz);
 				goto out_dump;
 			}
 			if (fscki->calc_cnt != fscki->nlink) {
 				ubifs_err("directory inode %lu nlink is %d, "
 					  "but calculated nlink is %d",
-					  fscki->inum, fscki->nlink,
-					  fscki->calc_cnt);
+					  (unsigned long)fscki->inum,
+					  fscki->nlink, fscki->calc_cnt);
 				goto out_dump;
 			}
 		} else {
 			if (fscki->references != fscki->nlink) {
 				ubifs_err("inode %lu nlink is %d, but "
-					  "calculated nlink is %d", fscki->inum,
+					  "calculated nlink is %d",
+					  (unsigned long)fscki->inum,
 					  fscki->nlink, fscki->references);
 				goto out_dump;
 			}
@@ -1959,20 +1969,21 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
 		if (fscki->xattr_sz != fscki->calc_xsz) {
 			ubifs_err("inode %lu has xattr size %u, but "
 				  "calculated size is %lld",
-				  fscki->inum, fscki->xattr_sz,
+				  (unsigned long)fscki->inum, fscki->xattr_sz,
 				  fscki->calc_xsz);
 			goto out_dump;
 		}
 		if (fscki->xattr_cnt != fscki->calc_xcnt) {
 			ubifs_err("inode %lu has %u xattrs, but "
-				  "calculated count is %lld", fscki->inum,
+				  "calculated count is %lld",
+				  (unsigned long)fscki->inum,
 				  fscki->xattr_cnt, fscki->calc_xcnt);
 			goto out_dump;
 		}
 		if (fscki->xattr_nms != fscki->calc_xnms) {
 			ubifs_err("inode %lu has xattr names' size %u, but "
 				  "calculated names' size is %lld",
-				  fscki->inum, fscki->xattr_nms,
+				  (unsigned long)fscki->inum, fscki->xattr_nms,
 				  fscki->calc_xnms);
 			goto out_dump;
 		}
@@ -1985,11 +1996,12 @@ out_dump:
 	ino_key_init(c, &key, fscki->inum);
 	err = ubifs_lookup_level0(c, &key, &znode, &n);
 	if (!err) {
-		ubifs_err("inode %lu not found in index", fscki->inum);
+		ubifs_err("inode %lu not found in index",
+			  (unsigned long)fscki->inum);
 		return -ENOENT;
 	} else if (err < 0) {
 		ubifs_err("error %d while looking up inode %lu",
-			  err, fscki->inum);
+			  err, (unsigned long)fscki->inum);
 		return err;
 	}
 
@@ -2007,7 +2019,7 @@ out_dump:
 	}
 
 	ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
-		  fscki->inum, zbr->lnum, zbr->offs);
+		  (unsigned long)fscki->inum, zbr->lnum, zbr->offs);
 	dbg_dump_node(c, ino);
 	kfree(ino);
 	return -EINVAL;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 37a9e604c3e9..0422c98e1793 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -161,7 +161,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 			return ERR_PTR(-EINVAL);
 		}
 		ubifs_warn("running out of inode numbers (current %lu, max %d)",
-			   c->highest_inum, INUM_WATERMARK);
+			   (unsigned long)c->highest_inum, INUM_WATERMARK);
 	}
 
 	inode->i_ino = ++c->highest_inum;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 22993f867d19..f91b745908ea 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -690,8 +690,9 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 	int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
-	dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key),
-		key_block(c, key), len, DBGKEY(key));
+	dbg_jnl("ino %lu, blk %u, len %d, key %s",
+		(unsigned long)key_inum(c, key), key_block(c, key), len,
+		DBGKEY(key));
 	ubifs_assert(len <= UBIFS_BLOCK_SIZE);
 
 	data = kmalloc(dlen, GFP_NOFS);
@@ -1128,7 +1129,8 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 	ino_t inum = inode->i_ino;
 	unsigned int blk;
 
-	dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size);
+	dbg_jnl("ino %lu, size %lld -> %lld",
+		(unsigned long)inum, old_size, new_size);
 	ubifs_assert(!ui->data_len);
 	ubifs_assert(S_ISREG(inode->i_mode));
 	ubifs_assert(mutex_is_locked(&ui->ui_mutex));
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 02d3462f4d3e..9bd5a43d4526 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -105,7 +105,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
 	list_add_tail(&orphan->list, &c->orph_list);
 	list_add_tail(&orphan->new_list, &c->orph_new);
 	spin_unlock(&c->orphan_lock);
-	dbg_gen("ino %lu", inum);
+	dbg_gen("ino %lu", (unsigned long)inum);
 	return 0;
 }
 
@@ -132,14 +132,16 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
 		else {
 			if (o->dnext) {
 				spin_unlock(&c->orphan_lock);
-				dbg_gen("deleted twice ino %lu", inum);
+				dbg_gen("deleted twice ino %lu",
+					(unsigned long)inum);
 				return;
 			}
 			if (o->cnext) {
 				o->dnext = c->orph_dnext;
 				c->orph_dnext = o;
 				spin_unlock(&c->orphan_lock);
-				dbg_gen("delete later ino %lu", inum);
+				dbg_gen("delete later ino %lu",
+					(unsigned long)inum);
 				return;
 			}
 			rb_erase(p, &c->orph_tree);
@@ -151,12 +153,12 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
 			}
 			spin_unlock(&c->orphan_lock);
 			kfree(o);
-			dbg_gen("inum %lu", inum);
+			dbg_gen("inum %lu", (unsigned long)inum);
 			return;
 		}
 	}
 	spin_unlock(&c->orphan_lock);
-	dbg_err("missing orphan ino %lu", inum);
+	dbg_err("missing orphan ino %lu", (unsigned long)inum);
 	dbg_dump_stack();
 }
 
@@ -448,7 +450,7 @@ static void erase_deleted(struct ubifs_info *c)
 		rb_erase(&orphan->rb, &c->orph_tree);
 		list_del(&orphan->list);
 		c->tot_orphans -= 1;
-		dbg_gen("deleting orphan ino %lu", orphan->inum);
+		dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum);
 		kfree(orphan);
 	}
 	c->orph_dnext = NULL;
@@ -536,8 +538,8 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
 	list_add_tail(&orphan->list, &c->orph_list);
 	orphan->dnext = c->orph_dnext;
 	c->orph_dnext = orphan;
-	dbg_mnt("ino %lu, new %d, tot %d",
-		inum, c->new_orphans, c->tot_orphans);
+	dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum,
+		c->new_orphans, c->tot_orphans);
 	return 0;
 }
 
@@ -609,7 +611,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
 		for (i = 0; i < n; i++) {
 			inum = le64_to_cpu(orph->inos[i]);
-			dbg_rcvry("deleting orphaned inode %lu", inum);
+			dbg_rcvry("deleting orphaned inode %lu",
+				  (unsigned long)inum);
 			err = ubifs_tnc_remove_ino(c, inum);
 			if (err)
 				return err;
@@ -840,8 +843,8 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 	if (inum != ci->last_ino) {
 		/* Lowest node type is the inode node, so it comes first */
 		if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
-			ubifs_err("found orphan node ino %lu, type %d", inum,
-				  key_type(c, &zbr->key));
+			ubifs_err("found orphan node ino %lu, type %d",
+				  (unsigned long)inum, key_type(c, &zbr->key));
 		ci->last_ino = inum;
 		ci->tot_inos += 1;
 		err = ubifs_tnc_read_node(c, zbr, ci->node);
@@ -853,7 +856,8 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 			/* Must be recorded as an orphan */
 			if (!dbg_find_check_orphan(&ci->root, inum) &&
 			    !dbg_find_orphan(c, inum)) {
-				ubifs_err("missing orphan, ino %lu", inum);
+				ubifs_err("missing orphan, ino %lu",
+					  (unsigned long)inum);
 				ci->missing += 1;
 			}
 	}
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index bed97421b972..90acac603e63 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -1435,13 +1435,13 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
 	err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
 	if (err)
 		goto out;
-	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs,
-		  i_size, e->d_size);
+	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ",
+		  (unsigned long)e->inum, lnum, offs, i_size, e->d_size);
 	return 0;
 
 out:
 	ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d",
-		   e->inum, e->i_size, e->d_size, err);
+		   (unsigned long)e->inum, e->i_size, e->d_size, err);
 	return err;
 }
 
@@ -1472,7 +1472,8 @@ int ubifs_recover_size(struct ubifs_info *c)
 				return err;
 			if (err == -ENOENT) {
 				/* Remove data nodes that have no inode */
-				dbg_rcvry("removing ino %lu", e->inum);
+				dbg_rcvry("removing ino %lu",
+					  (unsigned long)e->inum);
 				err = ubifs_tnc_remove_ino(c, e->inum);
 				if (err)
 					return err;
@@ -1493,8 +1494,8 @@ int ubifs_recover_size(struct ubifs_info *c)
 					return PTR_ERR(inode);
 				if (inode->i_size < e->d_size) {
 					dbg_rcvry("ino %lu size %lld -> %lld",
-						  e->inum, e->d_size,
-						  inode->i_size);
+						  (unsigned long)e->inum,
+						  e->d_size, inode->i_size);
 					inode->i_size = e->d_size;
 					ubifs_inode(inode)->ui_size = e->d_size;
 					e->inode = inode;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 7399692af859..21f7d047c306 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1065,7 +1065,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
 	ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
 	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
 		"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
-		c->highest_inum);
+		(unsigned long)c->highest_inum);
 out:
 	destroy_replay_tree(c);
 	destroy_bud_list(c);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index d27fd918b9c9..99e9a744cfd0 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2677,7 +2677,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
 	struct ubifs_dent_node *xent, *pxent = NULL;
 	struct qstr nm = { .name = NULL };
 
-	dbg_tnc("ino %lu", inum);
+	dbg_tnc("ino %lu", (unsigned long)inum);
 
 	/*
 	 * Walk all extended attribute entries and remove them together with
@@ -2697,7 +2697,8 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
 		}
 
 		xattr_inum = le64_to_cpu(xent->inum);
-		dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum);
+		dbg_tnc("xent '%s', ino %lu", xent->name,
+			(unsigned long)xattr_inum);
 
 		nm.name = xent->name;
 		nm.len = le16_to_cpu(xent->nlen);
-- 
cgit v1.2.3


From ac51d83705c2a38c71f39cde99708b14e6212a60 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 6 Nov 2008 16:49:36 -0500
Subject: ext4: calculate journal credits correctly

This fixes a 2.6.27 regression which was introduced in commit a02908f1.

We weren't passing the chunk parameter down to the two subfunctions,
ext4_indirect_trans_blocks() and ext4_ext_index_trans_blocks(), with
the result that we massively overestimate the amount of credits needed by
ext4_da_writepages, especially in the non-extents case.  This causes
failures especially on /boot partitions, which tend to be small and
non-extent using since GRUB doesn't handle extents.

This patch fixes the bug reported by Joseph Fannin at:
http://bugzilla.kernel.org/show_bug.cgi?id=11964

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8dbf6953845b..5a130b56f1cf 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4580,9 +4580,10 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
-		return ext4_indirect_trans_blocks(inode, nrblocks, 0);
-	return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
+		return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
+	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
+
 /*
  * Account for index blocks, block groups bitmaps and block group
  * descriptor blocks if modify datablocks and index blocks
-- 
cgit v1.2.3


From bc9c4068388eea01d3b5da31016879f2341ecec5 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Thu, 6 Nov 2008 12:53:22 -0800
Subject: autofs4: correct offset mount expire check

When checking a directory tree in autofs_tree_busy() we can incorrectly
decide that the tree isn't busy.  This happens for the case of an active
offset mount as autofs4_follow_mount() follows past the active offset
mount, which has an open file handle used for expires, causing the file
handle not to count toward the busyness check.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/expire.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index cde2f8e8935a..4b6fb3f628c0 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -56,12 +56,23 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	mntget(mnt);
 	dget(dentry);
 
-	if (!autofs4_follow_mount(&mnt, &dentry))
+	if (!follow_down(&mnt, &dentry))
 		goto done;
 
-	/* This is an autofs submount, we can't expire it */
-	if (is_autofs4_dentry(dentry))
-		goto done;
+	if (is_autofs4_dentry(dentry)) {
+		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+
+		/* This is an autofs submount, we can't expire it */
+		if (sbi->type == AUTOFS_TYPE_INDIRECT)
+			goto done;
+
+		/*
+		 * Otherwise it's an offset mount and we need to check
+		 * if we can umount its mount, if there is one.
+		 */
+		if (!d_mountpoint(dentry))
+			goto done;
+	}
 
 	/* Update the expiry counter if fs is busy */
 	if (!may_umount_tree(mnt)) {
-- 
cgit v1.2.3


From 96b0317906690997c16c7efffbc4c0fafcd6f7f2 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Thu, 6 Nov 2008 12:53:23 -0800
Subject: autofs4: collect version check return

The function check_dev_ioctl_version() returns an error code upon failure but
it isn't captured and returned in validate_dev_ioctl() as it should be.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/dev-ioctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 625abf5422e2..33bf8cbfd051 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -128,9 +128,10 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
-	int err = -EINVAL;
+	int err;
 
-	if (check_dev_ioctl_version(cmd, param)) {
+	err = check_dev_ioctl_version(cmd, param);
+	if (err) {
 		AUTOFS_WARN("invalid device control module version "
 		     "supplied for cmd(0x%08x)", cmd);
 		goto out;
-- 
cgit v1.2.3


From c87591b719737b4e91eb1a9fa8fd55a4ff1886d6 Mon Sep 17 00:00:00 2001
From: Arthur Jones <ajones@riverbed.com>
Date: Thu, 6 Nov 2008 12:53:35 -0800
Subject: ext3: wait on all pending commits in ext3_sync_fs

In ext3_sync_fs, we only wait for a commit to finish if we started it, but
there may be one already in progress which will not be synced.

In the case of a data=ordered umount with pending long symlinks which are
delayed due to a long list of other I/O on the backing block device, this
causes the buffer associated with the long symlinks to not be moved to the
inode dirty list in the second phase of fsync_super.  Then, before they
can be dirtied again, kjournald exits, seeing the UMOUNT flag and the
dirty pages are never written to the backing block device, causing long
symlink corruption and exposing new or previously freed block data to
userspace.

This can be reproduced with a script created
by Eric Sandeen <sandeen@redhat.com>:

	#!/bin/bash

	umount /mnt/test2
	mount /dev/sdb4 /mnt/test2
	rm -f /mnt/test2/*
	dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512
	touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename
	ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename /mnt/test2/link
	umount /mnt/test2
	mount /dev/sdb4 /mnt/test2
	ls /mnt/test2/
	umount /mnt/test2

To ensure all commits are synced, we flush all journal commits now when
sync_fs'ing ext3.

Signed-off-by: Arthur Jones <ajones@riverbed.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: <linux-ext4@vger.kernel.org>
Cc: <stable@kernel.org>		[2.6.everything]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e5717a4fae67..5dec6d1356c4 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2390,13 +2390,12 @@ static void ext3_write_super (struct super_block * sb)
 
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
-	tid_t target;
-
 	sb->s_dirt = 0;
-	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
-		if (wait)
-			log_wait_commit(EXT3_SB(sb)->s_journal, target);
-	}
+	if (wait)
+		ext3_force_commit(sb);
+	else
+		journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 990e194e69009028e029b7d25da68c38241ec4f0 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:45 -0800
Subject: fat: move fs/vfat/* and fs/msdos/* to fs/fat

This just moves those files, but changes the link order from MSDOS, VFAT
to VFAT, MSDOS.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Makefile          |    2 -
 fs/fat/Makefile      |    6 +-
 fs/fat/namei_msdos.c |  702 +++++++++++++++++++++++++++++++++
 fs/fat/namei_vfat.c  | 1055 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/msdos/Makefile    |    7 -
 fs/msdos/namei.c     |  702 ---------------------------------
 fs/vfat/Makefile     |    7 -
 fs/vfat/namei.c      | 1055 --------------------------------------------------
 8 files changed, 1762 insertions(+), 1774 deletions(-)
 create mode 100644 fs/fat/namei_msdos.c
 create mode 100644 fs/fat/namei_vfat.c
 delete mode 100644 fs/msdos/Makefile
 delete mode 100644 fs/msdos/namei.c
 delete mode 100644 fs/vfat/Makefile
 delete mode 100644 fs/vfat/namei.c

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index 2168c902d5ca..d9f8afe6f0c4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -81,8 +81,6 @@ obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
 obj-$(CONFIG_MINIX_FS)		+= minix/
 obj-$(CONFIG_FAT_FS)		+= fat/
-obj-$(CONFIG_MSDOS_FS)		+= msdos/
-obj-$(CONFIG_VFAT_FS)		+= vfat/
 obj-$(CONFIG_BFS_FS)		+= bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
diff --git a/fs/fat/Makefile b/fs/fat/Makefile
index bfb5f06cf2c8..e06190322c1c 100644
--- a/fs/fat/Makefile
+++ b/fs/fat/Makefile
@@ -3,5 +3,9 @@
 #
 
 obj-$(CONFIG_FAT_FS) += fat.o
+obj-$(CONFIG_VFAT_FS) += vfat.o
+obj-$(CONFIG_MSDOS_FS) += msdos.o
 
-fat-objs := cache.o dir.o fatent.o file.o inode.o misc.o
+fat-y := cache.o dir.o fatent.o file.o inode.o misc.o
+vfat-y := namei_vfat.o
+msdos-y := namei_msdos.o
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
new file mode 100644
index 000000000000..e844b9809d27
--- /dev/null
+++ b/fs/fat/namei_msdos.c
@@ -0,0 +1,702 @@
+/*
+ *  linux/fs/msdos/namei.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
+ *  Rewritten for constant inumbers 1999 by Al Viro
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/buffer_head.h>
+#include <linux/msdos_fs.h>
+#include <linux/smp_lock.h>
+
+/* Characters that are undesirable in an MS-DOS file name */
+static unsigned char bad_chars[] = "*?<>|\"";
+static unsigned char bad_if_strict[] = "+=,; ";
+
+/***** Formats an MS-DOS file name. Rejects invalid names. */
+static int msdos_format_name(const unsigned char *name, int len,
+			     unsigned char *res, struct fat_mount_options *opts)
+	/*
+	 * name is the proposed name, len is its length, res is
+	 * the resulting name, opts->name_check is either (r)elaxed,
+	 * (n)ormal or (s)trict, opts->dotsOK allows dots at the
+	 * beginning of name (for hidden files)
+	 */
+{
+	unsigned char *walk;
+	unsigned char c;
+	int space;
+
+	if (name[0] == '.') {	/* dotfile because . and .. already done */
+		if (opts->dotsOK) {
+			/* Get rid of dot - test for it elsewhere */
+			name++;
+			len--;
+		} else
+			return -EINVAL;
+	}
+	/*
+	 * disallow names that _really_ start with a dot
+	 */
+	space = 1;
+	c = 0;
+	for (walk = res; len && walk - res < 8; walk++) {
+		c = *name++;
+		len--;
+		if (opts->name_check != 'r' && strchr(bad_chars, c))
+			return -EINVAL;
+		if (opts->name_check == 's' && strchr(bad_if_strict, c))
+			return -EINVAL;
+		if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
+			return -EINVAL;
+		if (c < ' ' || c == ':' || c == '\\')
+			return -EINVAL;
+	/*
+	 * 0xE5 is legal as a first character, but we must substitute
+	 * 0x05 because 0xE5 marks deleted files.  Yes, DOS really
+	 * does this.
+	 * It seems that Microsoft hacked DOS to support non-US
+	 * characters after the 0xE5 character was already in use to
+	 * mark deleted files.
+	 */
+		if ((res == walk) && (c == 0xE5))
+			c = 0x05;
+		if (c == '.')
+			break;
+		space = (c == ' ');
+		*walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c;
+	}
+	if (space)
+		return -EINVAL;
+	if (opts->name_check == 's' && len && c != '.') {
+		c = *name++;
+		len--;
+		if (c != '.')
+			return -EINVAL;
+	}
+	while (c != '.' && len--)
+		c = *name++;
+	if (c == '.') {
+		while (walk - res < 8)
+			*walk++ = ' ';
+		while (len > 0 && walk - res < MSDOS_NAME) {
+			c = *name++;
+			len--;
+			if (opts->name_check != 'r' && strchr(bad_chars, c))
+				return -EINVAL;
+			if (opts->name_check == 's' &&
+			    strchr(bad_if_strict, c))
+				return -EINVAL;
+			if (c < ' ' || c == ':' || c == '\\')
+				return -EINVAL;
+			if (c == '.') {
+				if (opts->name_check == 's')
+					return -EINVAL;
+				break;
+			}
+			if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
+				return -EINVAL;
+			space = c == ' ';
+			if (!opts->nocase && c >= 'a' && c <= 'z')
+				*walk++ = c - 32;
+			else
+				*walk++ = c;
+		}
+		if (space)
+			return -EINVAL;
+		if (opts->name_check == 's' && len)
+			return -EINVAL;
+	}
+	while (walk - res < MSDOS_NAME)
+		*walk++ = ' ';
+
+	return 0;
+}
+
+/***** Locates a directory entry.  Uses unformatted name. */
+static int msdos_find(struct inode *dir, const unsigned char *name, int len,
+		      struct fat_slot_info *sinfo)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	unsigned char msdos_name[MSDOS_NAME];
+	int err;
+
+	err = msdos_format_name(name, len, msdos_name, &sbi->options);
+	if (err)
+		return -ENOENT;
+
+	err = fat_scan(dir, msdos_name, sinfo);
+	if (!err && sbi->options.dotsOK) {
+		if (name[0] == '.') {
+			if (!(sinfo->de->attr & ATTR_HIDDEN))
+				err = -ENOENT;
+		} else {
+			if (sinfo->de->attr & ATTR_HIDDEN)
+				err = -ENOENT;
+		}
+		if (err)
+			brelse(sinfo->bh);
+	}
+	return err;
+}
+
+/*
+ * Compute the hash for the msdos name corresponding to the dentry.
+ * Note: if the name is invalid, we leave the hash code unchanged so
+ * that the existing dentry can be used. The msdos fs routines will
+ * return ENOENT or EINVAL as appropriate.
+ */
+static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
+{
+	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+	unsigned char msdos_name[MSDOS_NAME];
+	int error;
+
+	error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
+	if (!error)
+		qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
+	return 0;
+}
+
+/*
+ * Compare two msdos names. If either of the names are invalid,
+ * we fall back to doing the standard name comparison.
+ */
+static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
+	unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
+	int error;
+
+	error = msdos_format_name(a->name, a->len, a_msdos_name, options);
+	if (error)
+		goto old_compare;
+	error = msdos_format_name(b->name, b->len, b_msdos_name, options);
+	if (error)
+		goto old_compare;
+	error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
+out:
+	return error;
+
+old_compare:
+	error = 1;
+	if (a->len == b->len)
+		error = memcmp(a->name, b->name, a->len);
+	goto out;
+}
+
+static struct dentry_operations msdos_dentry_operations = {
+	.d_hash		= msdos_hash,
+	.d_compare	= msdos_cmp,
+};
+
+/*
+ * AV. Wrappers for FAT sb operations. Is it wise?
+ */
+
+/***** Get inode using directory and name */
+static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode = NULL;
+	int res;
+
+	dentry->d_op = &msdos_dentry_operations;
+
+	lock_super(sb);
+	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (res == -ENOENT)
+		goto add;
+	if (res < 0)
+		goto out;
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		res = PTR_ERR(inode);
+		goto out;
+	}
+add:
+	res = 0;
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry)
+		dentry->d_op = &msdos_dentry_operations;
+out:
+	unlock_super(sb);
+	if (!res)
+		return dentry;
+	return ERR_PTR(res);
+}
+
+/***** Creates a directory entry (name is already formatted). */
+static int msdos_add_entry(struct inode *dir, const unsigned char *name,
+			   int is_dir, int is_hid, int cluster,
+			   struct timespec *ts, struct fat_slot_info *sinfo)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	struct msdos_dir_entry de;
+	__le16 time, date;
+	int err;
+
+	memcpy(de.name, name, MSDOS_NAME);
+	de.attr = is_dir ? ATTR_DIR : ATTR_ARCH;
+	if (is_hid)
+		de.attr |= ATTR_HIDDEN;
+	de.lcase = 0;
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	de.cdate = de.adate = 0;
+	de.ctime = 0;
+	de.ctime_cs = 0;
+	de.time = time;
+	de.date = date;
+	de.start = cpu_to_le16(cluster);
+	de.starthi = cpu_to_le16(cluster >> 16);
+	de.size = 0;
+
+	err = fat_add_entries(dir, &de, 1, sinfo);
+	if (err)
+		return err;
+
+	dir->i_ctime = dir->i_mtime = *ts;
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	return 0;
+}
+
+/***** Create a file */
+static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
+			struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = NULL;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	unsigned char msdos_name[MSDOS_NAME];
+	int err, is_hid;
+
+	lock_super(sb);
+
+	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
+				msdos_name, &MSDOS_SB(sb)->options);
+	if (err)
+		goto out;
+	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
+	/* Have to do it due to foo vs. .foo conflicts */
+	if (!fat_scan(dir, msdos_name, &sinfo)) {
+		brelse(sinfo.bh);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ts = CURRENT_TIME_SEC;
+	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
+	if (err)
+		goto out;
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+	return err;
+}
+
+/***** Remove a directory */
+static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode = dentry->d_inode;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+	/*
+	 * Check whether the directory is not in use, then check
+	 * whether it is empty.
+	 */
+	err = fat_dir_empty(inode);
+	if (err)
+		goto out;
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+
+	return err;
+}
+
+/***** Make a directory */
+static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode;
+	unsigned char msdos_name[MSDOS_NAME];
+	struct timespec ts;
+	int err, is_hid, cluster;
+
+	lock_super(sb);
+
+	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
+				msdos_name, &MSDOS_SB(sb)->options);
+	if (err)
+		goto out;
+	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
+	/* foo vs .foo situation */
+	if (!fat_scan(dir, msdos_name, &sinfo)) {
+		brelse(sinfo.bh);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ts = CURRENT_TIME_SEC;
+	cluster = fat_alloc_new_dir(dir, &ts);
+	if (cluster < 0) {
+		err = cluster;
+		goto out;
+	}
+	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
+	if (err)
+		goto out_free;
+	inc_nlink(dir);
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		/* the directory was completed, just return a error */
+		goto out;
+	}
+	inode->i_nlink = 2;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	d_instantiate(dentry, inode);
+
+	unlock_super(sb);
+	fat_flush_inodes(sb, dir, inode);
+	return 0;
+
+out_free:
+	fat_free_clusters(dir, cluster);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+/***** Unlink a file */
+static int msdos_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb= inode->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	clear_nlink(inode);
+	inode->i_ctime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
+
+	return err;
+}
+
+static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
+			   struct dentry *old_dentry,
+			   struct inode *new_dir, unsigned char *new_name,
+			   struct dentry *new_dentry, int is_hid)
+{
+	struct buffer_head *dotdot_bh;
+	struct msdos_dir_entry *dotdot_de;
+	struct inode *old_inode, *new_inode;
+	struct fat_slot_info old_sinfo, sinfo;
+	struct timespec ts;
+	loff_t dotdot_i_pos, new_i_pos;
+	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
+
+	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
+	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
+
+	err = fat_scan(old_dir, old_name, &old_sinfo);
+	if (err) {
+		err = -EIO;
+		goto out;
+	}
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+	update_dotdot = (is_dir && old_dir != new_dir);
+	if (update_dotdot) {
+		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
+					 &dotdot_i_pos) < 0) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	old_attrs = MSDOS_I(old_inode)->i_attrs;
+	err = fat_scan(new_dir, new_name, &sinfo);
+	if (!err) {
+		if (!new_inode) {
+			/* "foo" -> ".foo" case. just change the ATTR_HIDDEN */
+			if (sinfo.de != old_sinfo.de) {
+				err = -EINVAL;
+				goto out;
+			}
+			if (is_hid)
+				MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
+			else
+				MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
+			if (IS_DIRSYNC(old_dir)) {
+				err = fat_sync_inode(old_inode);
+				if (err) {
+					MSDOS_I(old_inode)->i_attrs = old_attrs;
+					goto out;
+				}
+			} else
+				mark_inode_dirty(old_inode);
+
+			old_dir->i_version++;
+			old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+			if (IS_DIRSYNC(old_dir))
+				(void)fat_sync_inode(old_dir);
+			else
+				mark_inode_dirty(old_dir);
+			goto out;
+		}
+	}
+
+	ts = CURRENT_TIME_SEC;
+	if (new_inode) {
+		if (err)
+			goto out;
+		if (is_dir) {
+			err = fat_dir_empty(new_inode);
+			if (err)
+				goto out;
+		}
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
+		fat_detach(new_inode);
+	} else {
+		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
+				      &ts, &sinfo);
+		if (err)
+			goto out;
+		new_i_pos = sinfo.i_pos;
+	}
+	new_dir->i_version++;
+
+	fat_detach(old_inode);
+	fat_attach(old_inode, new_i_pos);
+	if (is_hid)
+		MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
+	else
+		MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
+	if (IS_DIRSYNC(new_dir)) {
+		err = fat_sync_inode(old_inode);
+		if (err)
+			goto error_inode;
+	} else
+		mark_inode_dirty(old_inode);
+
+	if (update_dotdot) {
+		int start = MSDOS_I(new_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		if (IS_DIRSYNC(new_dir)) {
+			err = sync_dirty_buffer(dotdot_bh);
+			if (err)
+				goto error_dotdot;
+		}
+		drop_nlink(old_dir);
+		if (!new_inode)
+			inc_nlink(new_dir);
+	}
+
+	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
+	old_sinfo.bh = NULL;
+	if (err)
+		goto error_dotdot;
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void)fat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		if (is_dir)
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+out:
+	brelse(sinfo.bh);
+	brelse(dotdot_bh);
+	brelse(old_sinfo.bh);
+	return err;
+
+error_dotdot:
+	/* data cluster is shared, serious corruption */
+	corrupt = 1;
+
+	if (update_dotdot) {
+		int start = MSDOS_I(old_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		corrupt |= sync_dirty_buffer(dotdot_bh);
+	}
+error_inode:
+	fat_detach(old_inode);
+	fat_attach(old_inode, old_sinfo.i_pos);
+	MSDOS_I(old_inode)->i_attrs = old_attrs;
+	if (new_inode) {
+		fat_attach(new_inode, new_i_pos);
+		if (corrupt)
+			corrupt |= fat_sync_inode(new_inode);
+	} else {
+		/*
+		 * If new entry was not sharing the data cluster, it
+		 * shouldn't be serious corruption.
+		 */
+		int err2 = fat_remove_entries(new_dir, &sinfo);
+		if (corrupt)
+			corrupt |= err2;
+		sinfo.bh = NULL;
+	}
+	if (corrupt < 0) {
+		fat_fs_panic(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld)",
+			     __func__, sinfo.i_pos);
+	}
+	goto out;
+}
+
+/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
+static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct super_block *sb = old_dir->i_sb;
+	unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
+	int err, is_hid;
+
+	lock_super(sb);
+
+	err = msdos_format_name(old_dentry->d_name.name,
+				old_dentry->d_name.len, old_msdos_name,
+				&MSDOS_SB(old_dir->i_sb)->options);
+	if (err)
+		goto out;
+	err = msdos_format_name(new_dentry->d_name.name,
+				new_dentry->d_name.len, new_msdos_name,
+				&MSDOS_SB(new_dir->i_sb)->options);
+	if (err)
+		goto out;
+
+	is_hid =
+	     (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.');
+
+	err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
+			      new_dir, new_msdos_name, new_dentry, is_hid);
+out:
+	unlock_super(sb);
+	if (!err)
+		err = fat_flush_inodes(sb, old_dir, new_dir);
+	return err;
+}
+
+static const struct inode_operations msdos_dir_inode_operations = {
+	.create		= msdos_create,
+	.lookup		= msdos_lookup,
+	.unlink		= msdos_unlink,
+	.mkdir		= msdos_mkdir,
+	.rmdir		= msdos_rmdir,
+	.rename		= msdos_rename,
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
+
+static int msdos_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int res;
+
+	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
+	if (res)
+		return res;
+
+	sb->s_flags |= MS_NOATIME;
+	sb->s_root->d_op = &msdos_dentry_operations;
+	return 0;
+}
+
+static int msdos_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+			   mnt);
+}
+
+static struct file_system_type msdos_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "msdos",
+	.get_sb		= msdos_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static int __init init_msdos_fs(void)
+{
+	return register_filesystem(&msdos_fs_type);
+}
+
+static void __exit exit_msdos_fs(void)
+{
+	unregister_filesystem(&msdos_fs_type);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Werner Almesberger");
+MODULE_DESCRIPTION("MS-DOS filesystem support");
+
+module_init(init_msdos_fs)
+module_exit(exit_msdos_fs)
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
new file mode 100644
index 000000000000..155c10b4adbd
--- /dev/null
+++ b/fs/fat/namei_vfat.c
@@ -0,0 +1,1055 @@
+/*
+ *  linux/fs/vfat/namei.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  Windows95/Windows NT compatible extended MSDOS filesystem
+ *    by Gordon Chaffee Copyright (C) 1995.  Send bug reports for the
+ *    VFAT filesystem to <chaffee@cs.berkeley.edu>.  Specify
+ *    what file operation caused you trouble and if you can duplicate
+ *    the problem, send a script that demonstrates it.
+ *
+ *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
+ *
+ *  Support Multibyte characters and cleanup by
+ *				OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
+ */
+
+#include <linux/module.h>
+
+#include <linux/jiffies.h>
+#include <linux/msdos_fs.h>
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/buffer_head.h>
+#include <linux/namei.h>
+
+/*
+ * d_revalidate callback.  Returns 1 if the cached dentry may still be
+ * used and 0 if it must be dropped.
+ */
+static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	int valid;
+
+	/*
+	 * A negative dentry on the final component of a create is dropped,
+	 * so that the name is created exactly as the user spelled it.
+	 */
+	if (!dentry->d_inode && nd &&
+	    !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
+		return 0;
+
+	/* Otherwise valid only while the parent's i_version is unchanged. */
+	spin_lock(&dentry->d_lock);
+	valid = (dentry->d_time == dentry->d_parent->d_inode->i_version);
+	spin_unlock(&dentry->d_lock);
+
+	return valid;
+}
+
+/* Length of the name in @qstr once every trailing '.' has been dropped. */
+static unsigned int vfat_striptail_len(struct qstr *qstr)
+{
+	unsigned int n;
+
+	for (n = qstr->len; n > 0; n--) {
+		if (qstr->name[n - 1] != '.')
+			break;
+	}
+	return n;
+}
+
+/*
+ * Case-sensitive d_hash callback: hash the name with its trailing dots
+ * ignored, so that "foo" and "foo." land in the same bucket.
+ * Always returns 0.
+ */
+static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
+{
+	unsigned int len = vfat_striptail_len(qstr);
+
+	qstr->hash = full_name_hash(qstr->name, len);
+	return 0;
+}
+
+/*
+ * Case-insensitive d_hash callback: every byte of the name (trailing dots
+ * ignored) is lowercased through the nls_io table before it is folded
+ * into the hash.  Always returns 0.
+ */
+static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_inode->i_sb);
+	struct nls_table *t = sbi->nls_io;
+	unsigned int len = vfat_striptail_len(qstr);
+	unsigned long hash = init_name_hash();
+	unsigned int i;
+
+	for (i = 0; i < len; i++)
+		hash = partial_name_hash(nls_tolower(t, qstr->name[i]), hash);
+
+	qstr->hash = end_name_hash(hash);
+	return 0;
+}
+
+/*
+ * Case-insensitive d_compare callback.  Trailing dots are ignored on both
+ * names.  Returns 0 when the names match, 1 otherwise.
+ */
+static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
+	unsigned int alen = vfat_striptail_len(a);
+	unsigned int blen = vfat_striptail_len(b);
+
+	/* Different stripped lengths can never match. */
+	if (alen != blen)
+		return 1;
+
+	return nls_strnicmp(t, a->name, b->name, alen) != 0;
+}
+
+/*
+ * Case-sensitive d_compare callback.  Trailing dots are ignored on both
+ * names.  Returns 0 when the names match, 1 otherwise.
+ */
+static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
+{
+	unsigned int alen = vfat_striptail_len(a);
+	unsigned int blen = vfat_striptail_len(b);
+
+	/* Different stripped lengths can never match. */
+	if (alen != blen)
+		return 1;
+
+	return strncmp(a->name, b->name, alen) != 0;
+}
+
+/*
+ * Dentry operation tables, indexed as vfat_lookup() computes "table":
+ *   [0] case-insensitive hash/compare
+ *   [1] as [0], plus d_revalidate (used when the lookup found nothing)
+ *   [2] case-sensitive hash/compare (name_check == 's')
+ *   [3] as [2], plus d_revalidate (used when the lookup found nothing)
+ */
+static struct dentry_operations vfat_dentry_ops[4] = {
+	{
+		.d_hash		= vfat_hashi,
+		.d_compare	= vfat_cmpi,
+	},
+	{
+		.d_revalidate	= vfat_revalidate,
+		.d_hash		= vfat_hashi,
+		.d_compare	= vfat_cmpi,
+	},
+	{
+		.d_hash		= vfat_hash,
+		.d_compare	= vfat_cmp,
+	},
+	{
+		.d_revalidate	= vfat_revalidate,
+		.d_hash		= vfat_hash,
+		.d_compare	= vfat_cmp,
+	}
+};
+
+/*
+ * Nonzero if @w may not appear in a file name at all: any code below
+ * 0x20, or one of  * ? < > | " : / \
+ */
+static inline wchar_t vfat_bad_char(wchar_t w)
+{
+	if (w < 0x0020)
+		return 1;
+
+	switch (w) {
+	case '*':
+	case '?':
+	case '<':
+	case '>':
+	case '|':
+	case '"':
+	case ':':
+	case '/':
+	case '\\':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Nonzero if @w is one of  [ ] ; , + =  (to_shortname_char() stores '_'
+ * in the short name for such a character).
+ */
+static inline wchar_t vfat_replace_char(wchar_t w)
+{
+	switch (w) {
+	case '[':
+	case ']':
+	case ';':
+	case ',':
+	case '+':
+	case '=':
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Nonzero if @w is a dot or a space (left out of the short name). */
+static wchar_t vfat_skip_char(wchar_t w)
+{
+	if (w == ' ')
+		return 1;
+	return w == '.';
+}
+
+/*
+ * Check a converted long name for characters that may not be stored.
+ *
+ * @s:   name as an array of wide characters
+ * @len: number of characters in @s
+ *
+ * Returns -EINVAL if the name is empty, contains a character rejected by
+ * vfat_bad_char(), or ends in a space; returns 0 otherwise.
+ */
+static inline int vfat_is_used_badchars(const wchar_t *s, int len)
+{
+	int i;
+
+	/*
+	 * Reject an empty name up front: the trailing-space test below reads
+	 * s[len - 1], which is out of bounds for len <= 0.  -EINVAL matches
+	 * what vfat_create_shortname() returns for an empty base name.
+	 */
+	if (len <= 0)
+		return -EINVAL;
+
+	for (i = 0; i < len; i++)
+		if (vfat_bad_char(s[i]))
+			return -EINVAL;
+
+	if (s[len - 1] == ' ') /* last character cannot be space */
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Probe @dir for an entry with the given 8.3 name.
+ * Returns 0 if fat_scan() finds one and -ENOENT for any fat_scan() failure.
+ */
+static int vfat_find_form(struct inode *dir, unsigned char *name)
+{
+	struct fat_slot_info sinfo;
+
+	if (fat_scan(dir, name, &sinfo) != 0)
+		return -ENOENT;
+
+	/* Only existence matters here; drop the buffer reference. */
+	brelse(sinfo.bh);
+	return 0;
+}
+
+/*
+ * 1) Valid characters for the 8.3 format alias are any combination of
+ * letters, uppercase alphabets, digits, any of the
+ * following special characters:
+ *     $ % ' ` - @ { } ~ ! # ( ) & _ ^
+ * In this case the long filename is not stored on disk.
+ *
+ * WinNT's Extension:
+ * The file name and the extension each contain only uppercase or only
+ * lowercase characters.  This is expressed by CASE_LOWER_BASE and
+ * CASE_LOWER_EXT.
+ *
+ * 2) The file name is in 8.3 format, but it contains both uppercase and
+ * lowercase characters, multibyte characters, etc.  In this case no
+ * numtail is added, but the long filename is stored.
+ *
+ * 3) In every other case, or when any of the following special
+ * characters is contained:
+ *        .   [ ] ; , + =
+ * a numtail is added, and the long filename must be stored on disk.
+ */
+struct shortname_info {
+	/*
+	 * Flags maintained by to_shortname_char() while a name part is
+	 * converted:
+	 *   lower - still set while no uppercase letter, byte >= 0x7F or
+	 *           multibyte character has been seen
+	 *   upper - still set while no lowercase letter, byte >= 0x7F or
+	 *           multibyte character has been seen
+	 *   valid - still set while no character had to be skipped,
+	 *           replaced by '_' or failed conversion
+	 */
+	unsigned char lower:1,
+		      upper:1,
+		      valid:1;
+};
+/* Start with all three flags set; conversion only ever clears them. */
+#define INIT_SHORTNAME_INFO(x)	do {		\
+	(x)->lower = 1;				\
+	(x)->upper = 1;				\
+	(x)->valid = 1;				\
+} while (0)
+
+/*
+ * Convert one wide character of a long name into short-name bytes.
+ *
+ * @nls:      table whose uni2char() and uppercase mapping are used
+ * @buf:      output buffer for the converted byte(s)
+ * @buf_size: size of @buf, passed on to uni2char()
+ * @src:      the character to convert
+ * @info:     flags updated to describe what has been seen so far
+ *
+ * Returns the number of bytes stored in @buf; 0 means the character is
+ * skipped and contributes nothing to the short name.
+ */
+static inline int to_shortname_char(struct nls_table *nls,
+				    unsigned char *buf, int buf_size,
+				    wchar_t *src, struct shortname_info *info)
+{
+	int len;
+
+	/* dots and spaces are dropped; the result is no longer "valid" */
+	if (vfat_skip_char(*src)) {
+		info->valid = 0;
+		return 0;
+	}
+	/* [ ] ; , + = are stored as '_' */
+	if (vfat_replace_char(*src)) {
+		info->valid = 0;
+		buf[0] = '_';
+		return 1;
+	}
+
+	len = nls->uni2char(*src, buf, buf_size);
+	if (len <= 0) {
+		/* not representable in this charset: substitute '_' */
+		info->valid = 0;
+		buf[0] = '_';
+		len = 1;
+	} else if (len == 1) {
+		unsigned char prev = buf[0];
+
+		if (buf[0] >= 0x7F) {
+			info->lower = 0;
+			info->upper = 0;
+		}
+
+		buf[0] = nls_toupper(nls, buf[0]);
+		if (isalpha(buf[0])) {
+			/* unchanged by toupper: the source was uppercase */
+			if (buf[0] == prev)
+				info->lower = 0;
+			else
+				info->upper = 0;
+		}
+	} else {
+		/* multibyte result: neither all-lower nor all-upper */
+		info->lower = 0;
+		info->upper = 0;
+	}
+
+	return len;
+}
+
+/*
+ * Given a valid longname, create a unique shortname.  Make sure the
+ * shortname does not exist.
+ *
+ * @dir:      directory the name will be created in
+ * @nls:      table used to convert the wide characters to bytes
+ * @uname:    long name as wide characters
+ * @ulen:     number of characters in @uname
+ * @name_res: receives the space-padded MSDOS_NAME-byte 8.3 name
+ * @lcase:    receives the CASE_LOWER_BASE / CASE_LOWER_EXT flags
+ *
+ * Returns negative number on error (-EINVAL when no base name character
+ * survives conversion, -EEXIST when the plain 8.3 form already exists),
+ * 0 for a normal return, and 1 for valid shortname.
+ */
+static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
+				 wchar_t *uname, int ulen,
+				 unsigned char *name_res, unsigned char *lcase)
+{
+	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
+	wchar_t *ip, *ext_start, *end, *name_start;
+	unsigned char base[9], ext[4], buf[8], *p;
+	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
+	int chl, chi;
+	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
+	int is_shortname;
+	struct shortname_info base_info, ext_info;
+
+	is_shortname = 1;
+	INIT_SHORTNAME_INFO(&base_info);
+	INIT_SHORTNAME_INFO(&ext_info);
+
+	/* Now, we need to create a shortname from the long name */
+	/* Scan backwards for the last '.'; a trailing '.' is no extension. */
+	ext_start = end = &uname[ulen];
+	while (--ext_start >= uname) {
+		if (*ext_start == 0x002E) {	/* is `.' */
+			if (ext_start == end - 1) {
+				sz = ulen;
+				ext_start = NULL;
+			}
+			break;
+		}
+	}
+
+	if (ext_start == uname - 1) {
+		/* no '.' at all: the whole name is the base */
+		sz = ulen;
+		ext_start = NULL;
+	} else if (ext_start) {
+		/*
+		 * Names which start with a dot could be just
+		 * an extension eg. "...test".  In this case Win95
+		 * uses the extension as the name and sets no extension.
+		 */
+		name_start = &uname[0];
+		while (name_start < ext_start) {
+			if (!vfat_skip_char(*name_start))
+				break;
+			name_start++;
+		}
+		if (name_start != ext_start) {
+			sz = ext_start - uname;
+			ext_start++;
+		} else {
+			sz = ulen;
+			ext_start = NULL;
+		}
+	}
+
+	/*
+	 * Build the base.  numtail_baselen / numtail2_baselen remember where
+	 * the base can be cut (at most 6 resp. 2 bytes) without splitting a
+	 * multibyte character, for the numeric tails generated below.
+	 */
+	numtail_baselen = 6;
+	numtail2_baselen = 2;
+	for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) {
+		chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
+					ip, &base_info);
+		if (chl == 0)
+			continue;
+
+		if (baselen < 2 && (baselen + chl) > 2)
+			numtail2_baselen = baselen;
+		if (baselen < 6 && (baselen + chl) > 6)
+			numtail_baselen = baselen;
+		for (chi = 0; chi < chl; chi++) {
+			*p++ = charbuf[chi];
+			baselen++;
+			if (baselen >= 8)
+				break;
+		}
+		if (baselen >= 8) {
+			/* anything left over means the name was truncated */
+			if ((chi < chl - 1) || (ip + 1) - uname < sz)
+				is_shortname = 0;
+			break;
+		}
+	}
+	if (baselen == 0) {
+		return -EINVAL;
+	}
+
+	extlen = 0;
+	if (ext_start) {
+		for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) {
+			chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
+						ip, &ext_info);
+			if (chl == 0)
+				continue;
+
+			if ((extlen + chl) > 3) {
+				is_shortname = 0;
+				break;
+			}
+			for (chi = 0; chi < chl; chi++) {
+				*p++ = charbuf[chi];
+				extlen++;
+			}
+			if (extlen >= 3) {
+				if (ip + 1 != end)
+					is_shortname = 0;
+				break;
+			}
+		}
+	}
+	ext[extlen] = '\0';
+	base[baselen] = '\0';
+
+	/* Yes, it can happen. ".\xe5" would do it. */
+	if (base[0] == DELETED_FLAG)
+		base[0] = 0x05;
+
+	/* OK, at this point we know that base is not longer than 8 symbols,
+	 * ext is not longer than 3, base is nonempty, both don't contain
+	 * any bad symbols (lowercase transformed to uppercase).
+	 */
+
+	memset(name_res, ' ', MSDOS_NAME);
+	memcpy(name_res, base, baselen);
+	memcpy(name_res + 8, ext, extlen);
+	*lcase = 0;
+	if (is_shortname && base_info.valid && ext_info.valid) {
+		if (vfat_find_form(dir, name_res) == 0)
+			return -EEXIST;
+
+		if (opts->shortname & VFAT_SFN_CREATE_WIN95) {
+			return (base_info.upper && ext_info.upper);
+		} else if (opts->shortname & VFAT_SFN_CREATE_WINNT) {
+			if ((base_info.upper || base_info.lower) &&
+			    (ext_info.upper || ext_info.lower)) {
+				if (!base_info.upper && base_info.lower)
+					*lcase |= CASE_LOWER_BASE;
+				if (!ext_info.upper && ext_info.lower)
+					*lcase |= CASE_LOWER_EXT;
+				return 1;
+			}
+			return 0;
+		} else {
+			BUG();
+		}
+	}
+
+	/* Without numeric tails, use the plain alias if it is still free. */
+	if (opts->numtail == 0)
+		if (vfat_find_form(dir, name_res) < 0)
+			return 0;
+
+	/*
+	 * Try to find a unique extension.  This used to
+	 * iterate through all possibilities sequentially,
+	 * but that gave extremely bad performance.  Windows
+	 * only tries a few cases before using random
+	 * values for part of the base.
+	 */
+
+	if (baselen > 6) {
+		baselen = numtail_baselen;
+		name_res[7] = ' ';
+	}
+	name_res[baselen] = '~';
+	for (i = 1; i < 10; i++) {
+		name_res[baselen + 1] = i + '0';
+		if (vfat_find_form(dir, name_res) < 0)
+			return 0;
+	}
+
+	/* "~1".."~9" are all taken: fall back to BBxxxx~N with hex digits. */
+	i = jiffies & 0xffff;
+	sz = (jiffies >> 16) & 0x7;
+	if (baselen > 2) {
+		baselen = numtail2_baselen;
+		name_res[7] = ' ';
+	}
+	name_res[baselen + 4] = '~';
+	name_res[baselen + 5] = '1' + sz;
+	while (1) {
+		/*
+		 * i is decremented below and goes negative after enough
+		 * collisions; "%04X" would then print eight digits and
+		 * overrun buf.  Mask to 16 bits and bound the write so
+		 * exactly four digits are produced.
+		 */
+		snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
+		memcpy(&name_res[baselen], buf, 4);
+		if (vfat_find_form(dir, name_res) < 0)
+			break;
+		i -= 11;
+	}
+	return 0;
+}
+
+/*
+ * Translate a string, including coded sequences into Unicode
+ *
+ * @name:    input name
+ * @len:     number of bytes of @name to translate
+ * @outname: output buffer; receives two bytes per character in the
+ *           non-utf8 paths
+ * @longlen: receives the number of characters before padding
+ * @outlen:  receives the number of characters after padding
+ * @escape:  if set, ":" followed by four hex digits is decoded into one
+ *           16-bit character (nls path only)
+ * @utf8:    if set, convert with utf8_mbstowcs() and ignore @nls
+ * @nls:     table whose char2uni() is used; if NULL each input byte
+ *           becomes one character with a zero high byte
+ *
+ * Returns 0 on success, -ENAMETOOLONG if the name does not fit the
+ * 255-character checks below, or -EINVAL on a malformed escape or a
+ * char2uni() failure.
+ */
+static int
+xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
+	     int *longlen, int *outlen, int escape, int utf8,
+	     struct nls_table *nls)
+{
+	const unsigned char *ip;
+	unsigned char nc;
+	unsigned char *op;
+	unsigned int ec;
+	int i, k, fill;
+	int charlen;
+
+	if (utf8) {
+		int name_len = strlen(name);
+
+		*outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+
+		/*
+		 * We stripped '.'s before and set len appropriately,
+		 * but utf8_mbstowcs doesn't care about len
+		 */
+		*outlen -= (name_len - len);
+
+		if (*outlen > 255)
+			return -ENAMETOOLONG;
+
+		op = &outname[*outlen * sizeof(wchar_t)];
+	} else {
+		if (nls) {
+			for (i = 0, ip = name, op = outname, *outlen = 0;
+			     i < len && *outlen <= 255;
+			     *outlen += 1)
+			{
+				if (escape && (*ip == ':')) {
+					/* need ':' plus four hex digits */
+					if (i > len - 5)
+						return -EINVAL;
+					ec = 0;
+					for (k = 1; k < 5; k++) {
+						nc = ip[k];
+						ec <<= 4;
+						if (nc >= '0' && nc <= '9') {
+							ec |= nc - '0';
+							continue;
+						}
+						if (nc >= 'a' && nc <= 'f') {
+							ec |= nc - ('a' - 10);
+							continue;
+						}
+						if (nc >= 'A' && nc <= 'F') {
+							ec |= nc - ('A' - 10);
+							continue;
+						}
+						return -EINVAL;
+					}
+					/* store the value low byte first */
+					*op++ = ec & 0xFF;
+					*op++ = ec >> 8;
+					ip += 5;
+					i += 5;
+				} else {
+					if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
+						return -EINVAL;
+					ip += charlen;
+					i += charlen;
+					op += 2;
+				}
+			}
+			/* input left over: the character limit was hit */
+			if (i < len)
+				return -ENAMETOOLONG;
+		} else {
+			for (i = 0, ip = name, op = outname, *outlen = 0;
+			     i < len && *outlen <= 255;
+			     i++, *outlen += 1)
+			{
+				*op++ = *ip++;
+				*op++ = 0;
+			}
+			if (i < len)
+				return -ENAMETOOLONG;
+		}
+	}
+
+	/*
+	 * Pad to a multiple of 13 characters (the per-slot count used by
+	 * vfat_build_slots()): one 0x0000 terminator, then 0xffff fill.
+	 */
+	*longlen = *outlen;
+	if (*outlen % 13) {
+		*op++ = 0;
+		*op++ = 0;
+		*outlen += 1;
+		if (*outlen % 13) {
+			fill = 13 - (*outlen % 13);
+			for (i = 0; i < fill; i++) {
+				*op++ = 0xff;
+				*op++ = 0xff;
+			}
+			*outlen += fill;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Build the on-disk directory slots for a new name: the long-name slots
+ * (if any) followed by the 8.3 alias entry.
+ *
+ * @dir:      directory the entry is created in
+ * @name:     name to create
+ * @len:      length of @name in bytes
+ * @is_dir:   nonzero to mark the entry ATTR_DIR, otherwise ATTR_ARCH
+ * @cluster:  start cluster stored in the entry
+ * @ts:       timestamp converted into the entry's time/date fields
+ * @slots:    output array of slots
+ * @nr_slots: receives the number of slots filled in
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int vfat_build_slots(struct inode *dir, const unsigned char *name,
+			    int len, int is_dir, int cluster,
+			    struct timespec *ts,
+			    struct msdos_dir_slot *slots, int *nr_slots)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
+	struct fat_mount_options *opts = &sbi->options;
+	struct msdos_dir_slot *ps;
+	struct msdos_dir_entry *de;
+	unsigned char cksum, lcase;
+	unsigned char msdos_name[MSDOS_NAME];
+	wchar_t *uname;
+	__le16 time, date;
+	int err, ulen, usize, i;
+	loff_t offset;
+
+	*nr_slots = 0;
+
+	/* scratch buffer for the wide-character form of the name */
+	uname = __getname();
+	if (!uname)
+		return -ENOMEM;
+
+	err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize,
+			   opts->unicode_xlate, opts->utf8, sbi->nls_io);
+	if (err)
+		goto out_free;
+
+	err = vfat_is_used_badchars(uname, ulen);
+	if (err)
+		goto out_free;
+
+	err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen,
+				    msdos_name, &lcase);
+	if (err < 0)
+		goto out_free;
+	else if (err == 1) {
+		/* the name is a valid shortname: no long-name slots needed */
+		de = (struct msdos_dir_entry *)slots;
+		err = 0;
+		goto shortname;
+	}
+
+	/* build the entry of long file name */
+	cksum = fat_checksum(msdos_name);
+
+	/*
+	 * usize is the padded length, 13 characters per slot.  Slots are
+	 * written highest id first; slot id i carries characters
+	 * (i - 1) * 13 onwards.
+	 */
+	*nr_slots = usize / 13;
+	for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
+		ps->id = i;
+		ps->attr = ATTR_EXT;
+		ps->reserved = 0;
+		ps->alias_checksum = cksum;
+		ps->start = 0;
+		offset = (i - 1) * 13;
+		fatwchar_to16(ps->name0_4, uname + offset, 5);
+		fatwchar_to16(ps->name5_10, uname + offset + 5, 6);
+		fatwchar_to16(ps->name11_12, uname + offset + 11, 2);
+	}
+	/* flag the first stored slot, which has the highest id */
+	slots[0].id |= 0x40;
+	de = (struct msdos_dir_entry *)ps;
+
+shortname:
+	/* build the entry of 8.3 alias name */
+	(*nr_slots)++;
+	memcpy(de->name, msdos_name, MSDOS_NAME);
+	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
+	de->lcase = lcase;
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	de->time = de->ctime = time;
+	de->date = de->cdate = de->adate = date;
+	de->ctime_cs = 0;
+	/* the cluster number is split into low and high 16-bit halves */
+	de->start = cpu_to_le16(cluster);
+	de->starthi = cpu_to_le16(cluster >> 16);
+	de->size = 0;
+out_free:
+	__putname(uname);
+	return err;
+}
+
+/*
+ * Create the directory slots for @qname in @dir and write them.
+ *
+ * @dir:     directory to add the entry to
+ * @qname:   name of the new entry (trailing dots are ignored)
+ * @is_dir:  nonzero if the new entry is a directory
+ * @cluster: start cluster recorded in the entry
+ * @ts:      timestamp for the entry and for @dir
+ * @sinfo:   filled in by fat_add_entries()
+ *
+ * Returns 0 on success, -ENOENT if the name is empty once trailing dots
+ * are stripped, -ENOMEM on allocation failure, or the error from
+ * vfat_build_slots() / fat_add_entries().
+ */
+static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
+			  int cluster, struct timespec *ts,
+			  struct fat_slot_info *sinfo)
+{
+	struct msdos_dir_slot *slots;
+	unsigned int len;
+	int err, nr_slots;
+
+	len = vfat_striptail_len(qname);
+	if (len == 0)
+		return -ENOENT;
+
+	slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
+	if (slots == NULL)
+		return -ENOMEM;
+
+	err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts,
+			       slots, &nr_slots);
+	if (err)
+		goto cleanup;
+
+	err = fat_add_entries(dir, slots, nr_slots, sinfo);
+	if (err)
+		goto cleanup;
+
+	/* update timestamp */
+	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
+	/* a sync failure here is deliberately ignored */
+	if (IS_DIRSYNC(dir))
+		(void)fat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+cleanup:
+	kfree(slots);
+	return err;
+}
+
+/*
+ * Look up @qname in @dir, ignoring trailing dots.  Returns -ENOENT for a
+ * name that is empty after stripping, else the fat_search_long() result.
+ */
+static int vfat_find(struct inode *dir, struct qstr *qname,
+		     struct fat_slot_info *sinfo)
+{
+	unsigned int len;
+
+	len = vfat_striptail_len(qname);
+	if (!len)
+		return -ENOENT;
+
+	return fat_search_long(dir, qname->name, len, sinfo);
+}
+
+/*
+ * .lookup inode operation: find @dentry's name in @dir and attach the
+ * inode, or leave a negative dentry if vfat_find() fails.
+ *
+ * Returns the dentry from d_splice_alias() (possibly NULL), an existing
+ * alias that could not be invalidated, or an ERR_PTR if building the
+ * inode failed.
+ */
+static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
+				  struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	struct inode *inode = NULL;
+	struct dentry *alias;
+	int err, table;
+
+	lock_super(sb);
+	/* index into vfat_dentry_ops: 2 = case-sensitive, 0 = insensitive */
+	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
+	dentry->d_op = &vfat_dentry_ops[table];
+
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err) {
+		/* not found: use the variant that has d_revalidate */
+		table++;
+		goto error;
+	}
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		unlock_super(sb);
+		return ERR_CAST(inode);
+	}
+	alias = d_find_alias(inode);
+	if (alias) {
+		if (d_invalidate(alias) == 0)
+			dput(alias);
+		else {
+			/* alias is still in use: hand it back instead */
+			iput(inode);
+			unlock_super(sb);
+			return alias;
+		}
+
+	}
+error:
+	unlock_super(sb);
+	dentry->d_op = &vfat_dentry_ops[table];
+	/* remember the parent's version for vfat_revalidate() */
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry) {
+		dentry->d_op = &vfat_dentry_ops[table];
+		dentry->d_time = dentry->d_parent->d_inode->i_version;
+	}
+	return dentry;
+}
+
+/*
+ * .create inode operation: add a regular-file entry for @dentry to @dir
+ * and instantiate the dentry with the new inode.  Runs under
+ * lock_super().  Returns 0 on success or a negative errno.
+ */
+static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	int err;
+
+	lock_super(sb);
+
+	ts = CURRENT_TIME_SEC;
+	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
+	if (err)
+		goto out;
+	/* vfat_revalidate() compares dentries against this counter */
+	dir->i_version++;
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+/*
+ * .rmdir inode operation: remove the directory @dentry from @dir.  Runs
+ * under lock_super().  Returns 0 on success, or the error from
+ * fat_dir_empty(), vfat_find() or fat_remove_entries().
+ */
+static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+
+	err = fat_dir_empty(inode);
+	if (err)
+		goto out;
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	/* the parent loses one link (vfat_mkdir() added it) */
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+
+	return err;
+}
+
+/*
+ * .unlink inode operation: remove the entry for @dentry from @dir.  Runs
+ * under lock_super().  Returns 0 on success, or the error from
+ * vfat_find() or fat_remove_entries().
+ */
+static int vfat_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct fat_slot_info sinfo;
+	int err;
+
+	lock_super(sb);
+
+	err = vfat_find(dir, &dentry->d_name, &sinfo);
+	if (err)
+		goto out;
+
+	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
+	if (err)
+		goto out;
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
+	fat_detach(inode);
+out:
+	unlock_super(sb);
+
+	return err;
+}
+
+/*
+ * .mkdir inode operation: allocate a new directory, add its entry to
+ * @dir and instantiate @dentry.  Runs under lock_super().
+ * Returns 0 on success or a negative errno.
+ */
+static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct fat_slot_info sinfo;
+	struct timespec ts;
+	int err, cluster;
+
+	lock_super(sb);
+
+	ts = CURRENT_TIME_SEC;
+	/* a negative return value is an error code, not a cluster */
+	cluster = fat_alloc_new_dir(dir, &ts);
+	if (cluster < 0) {
+		err = cluster;
+		goto out;
+	}
+	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
+	if (err)
+		goto out_free;
+	dir->i_version++;
+	inc_nlink(dir);
+
+	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
+	brelse(sinfo.bh);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		/* the directory was completed, just return a error */
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_nlink = 2;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+	unlock_super(sb);
+	return 0;
+
+out_free:
+	/* no entry was added, so give the allocated cluster back */
+	fat_free_clusters(dir, cluster);
+out:
+	unlock_super(sb);
+	return err;
+}
+
+/*
+ * .rename inode operation: move @old_dentry in @old_dir to @new_dentry
+ * in @new_dir.  Runs under lock_super().
+ *
+ * If the target already exists its directory position is reused (a
+ * directory being renamed requires the target to be empty); otherwise a
+ * new entry is added to @new_dir.  When a directory moves to a different
+ * parent, its ".." entry is pointed at @new_dir and the parents' link
+ * counts are adjusted.
+ *
+ * The error_dotdot / error_inode labels undo the steps already taken.
+ * When reached through error_dotdot, a failure of any undo step ends in
+ * fat_fs_panic().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct buffer_head *dotdot_bh;
+	struct msdos_dir_entry *dotdot_de;
+	struct inode *old_inode, *new_inode;
+	struct fat_slot_info old_sinfo, sinfo;
+	struct timespec ts;
+	loff_t dotdot_i_pos, new_i_pos;
+	int err, is_dir, update_dotdot, corrupt = 0;
+	struct super_block *sb = old_dir->i_sb;
+
+	/* NULL so the brelse() calls at "out" are safe on every path */
+	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
+	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
+	lock_super(sb);
+	err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
+	if (err)
+		goto out;
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+	/* ".." only changes when a directory gets a new parent */
+	update_dotdot = (is_dir && old_dir != new_dir);
+	if (update_dotdot) {
+		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
+					 &dotdot_i_pos) < 0) {
+			err = -EIO;
+			goto out;
+		}
+	}
+
+	ts = CURRENT_TIME_SEC;
+	if (new_inode) {
+		if (is_dir) {
+			err = fat_dir_empty(new_inode);
+			if (err)
+				goto out;
+		}
+		/* take over the existing target's directory position */
+		new_i_pos = MSDOS_I(new_inode)->i_pos;
+		fat_detach(new_inode);
+	} else {
+		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
+				     &ts, &sinfo);
+		if (err)
+			goto out;
+		new_i_pos = sinfo.i_pos;
+	}
+	new_dir->i_version++;
+
+	/* re-home the inode at the new directory position */
+	fat_detach(old_inode);
+	fat_attach(old_inode, new_i_pos);
+	if (IS_DIRSYNC(new_dir)) {
+		err = fat_sync_inode(old_inode);
+		if (err)
+			goto error_inode;
+	} else
+		mark_inode_dirty(old_inode);
+
+	if (update_dotdot) {
+		int start = MSDOS_I(new_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		if (IS_DIRSYNC(new_dir)) {
+			err = sync_dirty_buffer(dotdot_bh);
+			if (err)
+				goto error_dotdot;
+		}
+		drop_nlink(old_dir);
+		if (!new_inode)
+ 			inc_nlink(new_dir);
+	}
+
+	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
+	old_sinfo.bh = NULL;
+	if (err)
+		goto error_dotdot;
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void)fat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		/* the replaced target loses its link (two for a directory) */
+		drop_nlink(new_inode);
+		if (is_dir)
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+out:
+	brelse(sinfo.bh);
+	brelse(dotdot_bh);
+	brelse(old_sinfo.bh);
+	unlock_super(sb);
+
+	return err;
+
+error_dotdot:
+	/* data cluster is shared, serious corruption */
+	corrupt = 1;
+
+	if (update_dotdot) {
+		/* point ".." back at the old parent */
+		int start = MSDOS_I(old_dir)->i_logstart;
+		dotdot_de->start = cpu_to_le16(start);
+		dotdot_de->starthi = cpu_to_le16(start >> 16);
+		mark_buffer_dirty(dotdot_bh);
+		/* OR-ing in a negative errno makes corrupt negative */
+		corrupt |= sync_dirty_buffer(dotdot_bh);
+	}
+error_inode:
+	fat_detach(old_inode);
+	fat_attach(old_inode, old_sinfo.i_pos);
+	if (new_inode) {
+		fat_attach(new_inode, new_i_pos);
+		if (corrupt)
+			corrupt |= fat_sync_inode(new_inode);
+	} else {
+		/*
+		 * If new entry was not sharing the data cluster, it
+		 * shouldn't be serious corruption.
+		 */
+		int err2 = fat_remove_entries(new_dir, &sinfo);
+		if (corrupt)
+			corrupt |= err2;
+		sinfo.bh = NULL;
+	}
+	/* negative only if one of the undo steps above failed */
+	if (corrupt < 0) {
+		fat_fs_panic(new_dir->i_sb,
+			     "%s: Filesystem corrupted (i_pos %lld)",
+			     __func__, sinfo.i_pos);
+	}
+	goto out;
+}
+
+/*
+ * Directory inode operations for "vfat" mounts.  The table is handed to
+ * fat_fill_super() by vfat_fill_super().
+ */
+static const struct inode_operations vfat_dir_inode_operations = {
+	.create		= vfat_create,
+	.lookup		= vfat_lookup,
+	.unlink		= vfat_unlink,
+	.mkdir		= vfat_mkdir,
+	.rmdir		= vfat_rmdir,
+	.rename		= vfat_rename,
+	.setattr	= fat_setattr,
+	.getattr	= fat_getattr,
+};
+
+/*
+ * Superblock fill callback for "vfat" mounts: let the common FAT code set
+ * up the superblock, then give the root dentry the case-sensitive ops
+ * (vfat_dentry_ops[2]) when name_check is 's', and the case-insensitive
+ * ops (vfat_dentry_ops[0]) otherwise.
+ *
+ * Returns 0 on success, otherwise the error reported by fat_fill_super().
+ */
+static int vfat_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int table;
+	int err = fat_fill_super(sb, data, silent,
+				 &vfat_dir_inode_operations, 1);
+
+	if (err != 0)
+		return err;
+
+	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
+	sb->s_root->d_op = &vfat_dentry_ops[table];
+
+	return 0;
+}
+
+/*
+ * .get_sb callback of vfat_fs_type: forwards to get_sb_bdev() with
+ * vfat_fill_super() as the superblock fill routine and returns its result.
+ */
+static int vfat_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+			   mnt);
+}
+
+/*
+ * Filesystem type descriptor registered under the name "vfat" by
+ * init_vfat_fs() and unregistered by exit_vfat_fs().
+ */
+static struct file_system_type vfat_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "vfat",
+	.get_sb		= vfat_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+/* Module init: register the "vfat" filesystem type; returns its result. */
+static int __init init_vfat_fs(void)
+{
+	return register_filesystem(&vfat_fs_type);
+}
+
+/* Module exit: unregister the "vfat" filesystem type. */
+static void __exit exit_vfat_fs(void)
+{
+	unregister_filesystem(&vfat_fs_type);
+}
+
+/* Module metadata, followed by the init/exit hooks defined above. */
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("VFAT filesystem support");
+MODULE_AUTHOR("Gordon Chaffee");
+
+module_init(init_vfat_fs)
+module_exit(exit_vfat_fs)
diff --git a/fs/msdos/Makefile b/fs/msdos/Makefile
deleted file mode 100644
index ea67646fcb95..000000000000
--- a/fs/msdos/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the Linux msdos filesystem routines.
-#
-
-obj-$(CONFIG_MSDOS_FS) += msdos.o
-
-msdos-y := namei.o
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
deleted file mode 100644
index e844b9809d27..000000000000
--- a/fs/msdos/namei.c
+++ /dev/null
@@ -1,702 +0,0 @@
-/*
- *  linux/fs/msdos/namei.c
- *
- *  Written 1992,1993 by Werner Almesberger
- *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
- *  Rewritten for constant inumbers 1999 by Al Viro
- */
-
-#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
-#include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
-
-/* Characters that are undesirable in an MS-DOS file name */
-static unsigned char bad_chars[] = "*?<>|\"";
-static unsigned char bad_if_strict[] = "+=,; ";
-
-/***** Formats an MS-DOS file name. Rejects invalid names. */
-static int msdos_format_name(const unsigned char *name, int len,
-			     unsigned char *res, struct fat_mount_options *opts)
-	/*
-	 * name is the proposed name, len is its length, res is
-	 * the resulting name, opts->name_check is either (r)elaxed,
-	 * (n)ormal or (s)trict, opts->dotsOK allows dots at the
-	 * beginning of name (for hidden files)
-	 */
-{
-	unsigned char *walk;
-	unsigned char c;
-	int space;
-
-	if (name[0] == '.') {	/* dotfile because . and .. already done */
-		if (opts->dotsOK) {
-			/* Get rid of dot - test for it elsewhere */
-			name++;
-			len--;
-		} else
-			return -EINVAL;
-	}
-	/*
-	 * disallow names that _really_ start with a dot
-	 */
-	space = 1;
-	c = 0;
-	for (walk = res; len && walk - res < 8; walk++) {
-		c = *name++;
-		len--;
-		if (opts->name_check != 'r' && strchr(bad_chars, c))
-			return -EINVAL;
-		if (opts->name_check == 's' && strchr(bad_if_strict, c))
-			return -EINVAL;
-		if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
-			return -EINVAL;
-		if (c < ' ' || c == ':' || c == '\\')
-			return -EINVAL;
-	/*
-	 * 0xE5 is legal as a first character, but we must substitute
-	 * 0x05 because 0xE5 marks deleted files.  Yes, DOS really
-	 * does this.
-	 * It seems that Microsoft hacked DOS to support non-US
-	 * characters after the 0xE5 character was already in use to
-	 * mark deleted files.
-	 */
-		if ((res == walk) && (c == 0xE5))
-			c = 0x05;
-		if (c == '.')
-			break;
-		space = (c == ' ');
-		*walk = (!opts->nocase && c >= 'a' && c <= 'z') ? c - 32 : c;
-	}
-	if (space)
-		return -EINVAL;
-	if (opts->name_check == 's' && len && c != '.') {
-		c = *name++;
-		len--;
-		if (c != '.')
-			return -EINVAL;
-	}
-	while (c != '.' && len--)
-		c = *name++;
-	if (c == '.') {
-		while (walk - res < 8)
-			*walk++ = ' ';
-		while (len > 0 && walk - res < MSDOS_NAME) {
-			c = *name++;
-			len--;
-			if (opts->name_check != 'r' && strchr(bad_chars, c))
-				return -EINVAL;
-			if (opts->name_check == 's' &&
-			    strchr(bad_if_strict, c))
-				return -EINVAL;
-			if (c < ' ' || c == ':' || c == '\\')
-				return -EINVAL;
-			if (c == '.') {
-				if (opts->name_check == 's')
-					return -EINVAL;
-				break;
-			}
-			if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
-				return -EINVAL;
-			space = c == ' ';
-			if (!opts->nocase && c >= 'a' && c <= 'z')
-				*walk++ = c - 32;
-			else
-				*walk++ = c;
-		}
-		if (space)
-			return -EINVAL;
-		if (opts->name_check == 's' && len)
-			return -EINVAL;
-	}
-	while (walk - res < MSDOS_NAME)
-		*walk++ = ' ';
-
-	return 0;
-}
-
-/***** Locates a directory entry.  Uses unformatted name. */
-static int msdos_find(struct inode *dir, const unsigned char *name, int len,
-		      struct fat_slot_info *sinfo)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	unsigned char msdos_name[MSDOS_NAME];
-	int err;
-
-	err = msdos_format_name(name, len, msdos_name, &sbi->options);
-	if (err)
-		return -ENOENT;
-
-	err = fat_scan(dir, msdos_name, sinfo);
-	if (!err && sbi->options.dotsOK) {
-		if (name[0] == '.') {
-			if (!(sinfo->de->attr & ATTR_HIDDEN))
-				err = -ENOENT;
-		} else {
-			if (sinfo->de->attr & ATTR_HIDDEN)
-				err = -ENOENT;
-		}
-		if (err)
-			brelse(sinfo->bh);
-	}
-	return err;
-}
-
-/*
- * Compute the hash for the msdos name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The msdos fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int msdos_hash(struct dentry *dentry, struct qstr *qstr)
-{
-	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
-	unsigned char msdos_name[MSDOS_NAME];
-	int error;
-
-	error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
-	if (!error)
-		qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
-	return 0;
-}
-
-/*
- * Compare two msdos names. If either of the names are invalid,
- * we fall back to doing the standard name comparison.
- */
-static int msdos_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
-	unsigned char a_msdos_name[MSDOS_NAME], b_msdos_name[MSDOS_NAME];
-	int error;
-
-	error = msdos_format_name(a->name, a->len, a_msdos_name, options);
-	if (error)
-		goto old_compare;
-	error = msdos_format_name(b->name, b->len, b_msdos_name, options);
-	if (error)
-		goto old_compare;
-	error = memcmp(a_msdos_name, b_msdos_name, MSDOS_NAME);
-out:
-	return error;
-
-old_compare:
-	error = 1;
-	if (a->len == b->len)
-		error = memcmp(a->name, b->name, a->len);
-	goto out;
-}
-
-static struct dentry_operations msdos_dentry_operations = {
-	.d_hash		= msdos_hash,
-	.d_compare	= msdos_cmp,
-};
-
-/*
- * AV. Wrappers for FAT sb operations. Is it wise?
- */
-
-/***** Get inode using directory and name */
-static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
-				   struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	int res;
-
-	dentry->d_op = &msdos_dentry_operations;
-
-	lock_super(sb);
-	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (res == -ENOENT)
-		goto add;
-	if (res < 0)
-		goto out;
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		res = PTR_ERR(inode);
-		goto out;
-	}
-add:
-	res = 0;
-	dentry = d_splice_alias(inode, dentry);
-	if (dentry)
-		dentry->d_op = &msdos_dentry_operations;
-out:
-	unlock_super(sb);
-	if (!res)
-		return dentry;
-	return ERR_PTR(res);
-}
-
-/***** Creates a directory entry (name is already formatted). */
-static int msdos_add_entry(struct inode *dir, const unsigned char *name,
-			   int is_dir, int is_hid, int cluster,
-			   struct timespec *ts, struct fat_slot_info *sinfo)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	struct msdos_dir_entry de;
-	__le16 time, date;
-	int err;
-
-	memcpy(de.name, name, MSDOS_NAME);
-	de.attr = is_dir ? ATTR_DIR : ATTR_ARCH;
-	if (is_hid)
-		de.attr |= ATTR_HIDDEN;
-	de.lcase = 0;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
-	de.cdate = de.adate = 0;
-	de.ctime = 0;
-	de.ctime_cs = 0;
-	de.time = time;
-	de.date = date;
-	de.start = cpu_to_le16(cluster);
-	de.starthi = cpu_to_le16(cluster >> 16);
-	de.size = 0;
-
-	err = fat_add_entries(dir, &de, 1, sinfo);
-	if (err)
-		return err;
-
-	dir->i_ctime = dir->i_mtime = *ts;
-	if (IS_DIRSYNC(dir))
-		(void)fat_sync_inode(dir);
-	else
-		mark_inode_dirty(dir);
-
-	return 0;
-}
-
-/***** Create a file */
-static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
-			struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode = NULL;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	unsigned char msdos_name[MSDOS_NAME];
-	int err, is_hid;
-
-	lock_super(sb);
-
-	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
-				msdos_name, &MSDOS_SB(sb)->options);
-	if (err)
-		goto out;
-	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
-	/* Have to do it due to foo vs. .foo conflicts */
-	if (!fat_scan(dir, msdos_name, &sinfo)) {
-		brelse(sinfo.bh);
-		err = -EINVAL;
-		goto out;
-	}
-
-	ts = CURRENT_TIME_SEC;
-	err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo);
-	if (err)
-		goto out;
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out;
-	}
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	d_instantiate(dentry, inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-	return err;
-}
-
-/***** Remove a directory */
-static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode = dentry->d_inode;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-	/*
-	 * Check whether the directory is not in use, then check
-	 * whether it is empty.
-	 */
-	err = fat_dir_empty(inode);
-	if (err)
-		goto out;
-	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	drop_nlink(dir);
-
-	clear_nlink(inode);
-	inode->i_ctime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-
-	return err;
-}
-
-/***** Make a directory */
-static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode;
-	unsigned char msdos_name[MSDOS_NAME];
-	struct timespec ts;
-	int err, is_hid, cluster;
-
-	lock_super(sb);
-
-	err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
-				msdos_name, &MSDOS_SB(sb)->options);
-	if (err)
-		goto out;
-	is_hid = (dentry->d_name.name[0] == '.') && (msdos_name[0] != '.');
-	/* foo vs .foo situation */
-	if (!fat_scan(dir, msdos_name, &sinfo)) {
-		brelse(sinfo.bh);
-		err = -EINVAL;
-		goto out;
-	}
-
-	ts = CURRENT_TIME_SEC;
-	cluster = fat_alloc_new_dir(dir, &ts);
-	if (cluster < 0) {
-		err = cluster;
-		goto out;
-	}
-	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
-	if (err)
-		goto out_free;
-	inc_nlink(dir);
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		/* the directory was completed, just return a error */
-		goto out;
-	}
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	d_instantiate(dentry, inode);
-
-	unlock_super(sb);
-	fat_flush_inodes(sb, dir, inode);
-	return 0;
-
-out_free:
-	fat_free_clusters(dir, cluster);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-/***** Unlink a file */
-static int msdos_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb= inode->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	clear_nlink(inode);
-	inode->i_ctime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, dir, inode);
-
-	return err;
-}
-
-static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
-			   struct dentry *old_dentry,
-			   struct inode *new_dir, unsigned char *new_name,
-			   struct dentry *new_dentry, int is_hid)
-{
-	struct buffer_head *dotdot_bh;
-	struct msdos_dir_entry *dotdot_de;
-	struct inode *old_inode, *new_inode;
-	struct fat_slot_info old_sinfo, sinfo;
-	struct timespec ts;
-	loff_t dotdot_i_pos, new_i_pos;
-	int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
-
-	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
-	old_inode = old_dentry->d_inode;
-	new_inode = new_dentry->d_inode;
-
-	err = fat_scan(old_dir, old_name, &old_sinfo);
-	if (err) {
-		err = -EIO;
-		goto out;
-	}
-
-	is_dir = S_ISDIR(old_inode->i_mode);
-	update_dotdot = (is_dir && old_dir != new_dir);
-	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-					 &dotdot_i_pos) < 0) {
-			err = -EIO;
-			goto out;
-		}
-	}
-
-	old_attrs = MSDOS_I(old_inode)->i_attrs;
-	err = fat_scan(new_dir, new_name, &sinfo);
-	if (!err) {
-		if (!new_inode) {
-			/* "foo" -> ".foo" case. just change the ATTR_HIDDEN */
-			if (sinfo.de != old_sinfo.de) {
-				err = -EINVAL;
-				goto out;
-			}
-			if (is_hid)
-				MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
-			else
-				MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
-			if (IS_DIRSYNC(old_dir)) {
-				err = fat_sync_inode(old_inode);
-				if (err) {
-					MSDOS_I(old_inode)->i_attrs = old_attrs;
-					goto out;
-				}
-			} else
-				mark_inode_dirty(old_inode);
-
-			old_dir->i_version++;
-			old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
-			if (IS_DIRSYNC(old_dir))
-				(void)fat_sync_inode(old_dir);
-			else
-				mark_inode_dirty(old_dir);
-			goto out;
-		}
-	}
-
-	ts = CURRENT_TIME_SEC;
-	if (new_inode) {
-		if (err)
-			goto out;
-		if (is_dir) {
-			err = fat_dir_empty(new_inode);
-			if (err)
-				goto out;
-		}
-		new_i_pos = MSDOS_I(new_inode)->i_pos;
-		fat_detach(new_inode);
-	} else {
-		err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0,
-				      &ts, &sinfo);
-		if (err)
-			goto out;
-		new_i_pos = sinfo.i_pos;
-	}
-	new_dir->i_version++;
-
-	fat_detach(old_inode);
-	fat_attach(old_inode, new_i_pos);
-	if (is_hid)
-		MSDOS_I(old_inode)->i_attrs |= ATTR_HIDDEN;
-	else
-		MSDOS_I(old_inode)->i_attrs &= ~ATTR_HIDDEN;
-	if (IS_DIRSYNC(new_dir)) {
-		err = fat_sync_inode(old_inode);
-		if (err)
-			goto error_inode;
-	} else
-		mark_inode_dirty(old_inode);
-
-	if (update_dotdot) {
-		int start = MSDOS_I(new_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		if (IS_DIRSYNC(new_dir)) {
-			err = sync_dirty_buffer(dotdot_bh);
-			if (err)
-				goto error_dotdot;
-		}
-		drop_nlink(old_dir);
-		if (!new_inode)
-			inc_nlink(new_dir);
-	}
-
-	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
-	old_sinfo.bh = NULL;
-	if (err)
-		goto error_dotdot;
-	old_dir->i_version++;
-	old_dir->i_ctime = old_dir->i_mtime = ts;
-	if (IS_DIRSYNC(old_dir))
-		(void)fat_sync_inode(old_dir);
-	else
-		mark_inode_dirty(old_dir);
-
-	if (new_inode) {
-		drop_nlink(new_inode);
-		if (is_dir)
-			drop_nlink(new_inode);
-		new_inode->i_ctime = ts;
-	}
-out:
-	brelse(sinfo.bh);
-	brelse(dotdot_bh);
-	brelse(old_sinfo.bh);
-	return err;
-
-error_dotdot:
-	/* data cluster is shared, serious corruption */
-	corrupt = 1;
-
-	if (update_dotdot) {
-		int start = MSDOS_I(old_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		corrupt |= sync_dirty_buffer(dotdot_bh);
-	}
-error_inode:
-	fat_detach(old_inode);
-	fat_attach(old_inode, old_sinfo.i_pos);
-	MSDOS_I(old_inode)->i_attrs = old_attrs;
-	if (new_inode) {
-		fat_attach(new_inode, new_i_pos);
-		if (corrupt)
-			corrupt |= fat_sync_inode(new_inode);
-	} else {
-		/*
-		 * If new entry was not sharing the data cluster, it
-		 * shouldn't be serious corruption.
-		 */
-		int err2 = fat_remove_entries(new_dir, &sinfo);
-		if (corrupt)
-			corrupt |= err2;
-		sinfo.bh = NULL;
-	}
-	if (corrupt < 0) {
-		fat_fs_panic(new_dir->i_sb,
-			     "%s: Filesystem corrupted (i_pos %lld)",
-			     __func__, sinfo.i_pos);
-	}
-	goto out;
-}
-
-/***** Rename, a wrapper for rename_same_dir & rename_diff_dir */
-static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
-			struct inode *new_dir, struct dentry *new_dentry)
-{
-	struct super_block *sb = old_dir->i_sb;
-	unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
-	int err, is_hid;
-
-	lock_super(sb);
-
-	err = msdos_format_name(old_dentry->d_name.name,
-				old_dentry->d_name.len, old_msdos_name,
-				&MSDOS_SB(old_dir->i_sb)->options);
-	if (err)
-		goto out;
-	err = msdos_format_name(new_dentry->d_name.name,
-				new_dentry->d_name.len, new_msdos_name,
-				&MSDOS_SB(new_dir->i_sb)->options);
-	if (err)
-		goto out;
-
-	is_hid =
-	     (new_dentry->d_name.name[0] == '.') && (new_msdos_name[0] != '.');
-
-	err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
-			      new_dir, new_msdos_name, new_dentry, is_hid);
-out:
-	unlock_super(sb);
-	if (!err)
-		err = fat_flush_inodes(sb, old_dir, new_dir);
-	return err;
-}
-
-static const struct inode_operations msdos_dir_inode_operations = {
-	.create		= msdos_create,
-	.lookup		= msdos_lookup,
-	.unlink		= msdos_unlink,
-	.mkdir		= msdos_mkdir,
-	.rmdir		= msdos_rmdir,
-	.rename		= msdos_rename,
-	.setattr	= fat_setattr,
-	.getattr	= fat_getattr,
-};
-
-static int msdos_fill_super(struct super_block *sb, void *data, int silent)
-{
-	int res;
-
-	res = fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, 0);
-	if (res)
-		return res;
-
-	sb->s_flags |= MS_NOATIME;
-	sb->s_root->d_op = &msdos_dentry_operations;
-	return 0;
-}
-
-static int msdos_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
-			   mnt);
-}
-
-static struct file_system_type msdos_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "msdos",
-	.get_sb		= msdos_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-static int __init init_msdos_fs(void)
-{
-	return register_filesystem(&msdos_fs_type);
-}
-
-static void __exit exit_msdos_fs(void)
-{
-	unregister_filesystem(&msdos_fs_type);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Werner Almesberger");
-MODULE_DESCRIPTION("MS-DOS filesystem support");
-
-module_init(init_msdos_fs)
-module_exit(exit_msdos_fs)
diff --git a/fs/vfat/Makefile b/fs/vfat/Makefile
deleted file mode 100644
index 40f2798a4f08..000000000000
--- a/fs/vfat/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Makefile for the linux vfat-filesystem routines.
-#
-
-obj-$(CONFIG_VFAT_FS) += vfat.o
-
-vfat-y := namei.o
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
deleted file mode 100644
index 155c10b4adbd..000000000000
--- a/fs/vfat/namei.c
+++ /dev/null
@@ -1,1055 +0,0 @@
-/*
- *  linux/fs/vfat/namei.c
- *
- *  Written 1992,1993 by Werner Almesberger
- *
- *  Windows95/Windows NT compatible extended MSDOS filesystem
- *    by Gordon Chaffee Copyright (C) 1995.  Send bug reports for the
- *    VFAT filesystem to <chaffee@cs.berkeley.edu>.  Specify
- *    what file operation caused you trouble and if you can duplicate
- *    the problem, send a script that demonstrates it.
- *
- *  Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
- *
- *  Support Multibyte characters and cleanup by
- *				OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
- */
-
-#include <linux/module.h>
-
-#include <linux/jiffies.h>
-#include <linux/msdos_fs.h>
-#include <linux/ctype.h>
-#include <linux/slab.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/namei.h>
-
-static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	int ret = 1;
-
-	if (!dentry->d_inode &&
-	    nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
-		/*
-		 * negative dentry is dropped, in order to make sure
-		 * to use the name which a user desires if this is
-		 * create path.
-		 */
-		ret = 0;
-	else {
-		spin_lock(&dentry->d_lock);
-		if (dentry->d_time != dentry->d_parent->d_inode->i_version)
-			ret = 0;
-		spin_unlock(&dentry->d_lock);
-	}
-	return ret;
-}
-
-/* returns the length of a struct qstr, ignoring trailing dots */
-static unsigned int vfat_striptail_len(struct qstr *qstr)
-{
-	unsigned int len = qstr->len;
-
-	while (len && qstr->name[len - 1] == '.')
-		len--;
-	return len;
-}
-
-/*
- * Compute the hash for the vfat name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The vfat fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int vfat_hash(struct dentry *dentry, struct qstr *qstr)
-{
-	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
-	return 0;
-}
-
-/*
- * Compute the hash for the vfat name corresponding to the dentry.
- * Note: if the name is invalid, we leave the hash code unchanged so
- * that the existing dentry can be used. The vfat fs routines will
- * return ENOENT or EINVAL as appropriate.
- */
-static int vfat_hashi(struct dentry *dentry, struct qstr *qstr)
-{
-	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
-	const unsigned char *name;
-	unsigned int len;
-	unsigned long hash;
-
-	name = qstr->name;
-	len = vfat_striptail_len(qstr);
-
-	hash = init_name_hash();
-	while (len--)
-		hash = partial_name_hash(nls_tolower(t, *name++), hash);
-	qstr->hash = end_name_hash(hash);
-
-	return 0;
-}
-
-/*
- * Case insensitive compare of two vfat names.
- */
-static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	struct nls_table *t = MSDOS_SB(dentry->d_inode->i_sb)->nls_io;
-	unsigned int alen, blen;
-
-	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = vfat_striptail_len(a);
-	blen = vfat_striptail_len(b);
-	if (alen == blen) {
-		if (nls_strnicmp(t, a->name, b->name, alen) == 0)
-			return 0;
-	}
-	return 1;
-}
-
-/*
- * Case sensitive compare of two vfat names.
- */
-static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
-{
-	unsigned int alen, blen;
-
-	/* A filename cannot end in '.' or we treat it like it has none */
-	alen = vfat_striptail_len(a);
-	blen = vfat_striptail_len(b);
-	if (alen == blen) {
-		if (strncmp(a->name, b->name, alen) == 0)
-			return 0;
-	}
-	return 1;
-}
-
-static struct dentry_operations vfat_dentry_ops[4] = {
-	{
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	}
-};
-
-/* Characters that are undesirable in an MS-DOS file name */
-
-static inline wchar_t vfat_bad_char(wchar_t w)
-{
-	return (w < 0x0020)
-	    || (w == '*') || (w == '?') || (w == '<') || (w == '>')
-	    || (w == '|') || (w == '"') || (w == ':') || (w == '/')
-	    || (w == '\\');
-}
-
-static inline wchar_t vfat_replace_char(wchar_t w)
-{
-	return (w == '[') || (w == ']') || (w == ';') || (w == ',')
-	    || (w == '+') || (w == '=');
-}
-
-static wchar_t vfat_skip_char(wchar_t w)
-{
-	return (w == '.') || (w == ' ');
-}
-
-static inline int vfat_is_used_badchars(const wchar_t *s, int len)
-{
-	int i;
-
-	for (i = 0; i < len; i++)
-		if (vfat_bad_char(s[i]))
-			return -EINVAL;
-
-	if (s[i - 1] == ' ') /* last character cannot be space */
-		return -EINVAL;
-
-	return 0;
-}
-
-static int vfat_find_form(struct inode *dir, unsigned char *name)
-{
-	struct fat_slot_info sinfo;
-	int err = fat_scan(dir, name, &sinfo);
-	if (err)
-		return -ENOENT;
-	brelse(sinfo.bh);
-	return 0;
-}
-
-/*
- * 1) Valid characters for the 8.3 format alias are any combination of
- * letters, uppercase alphabets, digits, any of the
- * following special characters:
- *     $ % ' ` - @ { } ~ ! # ( ) & _ ^
- * In this case Longfilename is not stored in disk.
- *
- * WinNT's Extension:
- * File name and extension name is contain uppercase/lowercase
- * only. And it is expressed by CASE_LOWER_BASE and CASE_LOWER_EXT.
- *
- * 2) File name is 8.3 format, but it contain the uppercase and
- * lowercase char, muliti bytes char, etc. In this case numtail is not
- * added, but Longfilename is stored.
- *
- * 3) When the one except for the above, or the following special
- * character are contained:
- *        .   [ ] ; , + =
- * numtail is added, and Longfilename must be stored in disk .
- */
-struct shortname_info {
-	unsigned char lower:1,
-		      upper:1,
-		      valid:1;
-};
-#define INIT_SHORTNAME_INFO(x)	do {		\
-	(x)->lower = 1;				\
-	(x)->upper = 1;				\
-	(x)->valid = 1;				\
-} while (0)
-
-static inline int to_shortname_char(struct nls_table *nls,
-				    unsigned char *buf, int buf_size,
-				    wchar_t *src, struct shortname_info *info)
-{
-	int len;
-
-	if (vfat_skip_char(*src)) {
-		info->valid = 0;
-		return 0;
-	}
-	if (vfat_replace_char(*src)) {
-		info->valid = 0;
-		buf[0] = '_';
-		return 1;
-	}
-
-	len = nls->uni2char(*src, buf, buf_size);
-	if (len <= 0) {
-		info->valid = 0;
-		buf[0] = '_';
-		len = 1;
-	} else if (len == 1) {
-		unsigned char prev = buf[0];
-
-		if (buf[0] >= 0x7F) {
-			info->lower = 0;
-			info->upper = 0;
-		}
-
-		buf[0] = nls_toupper(nls, buf[0]);
-		if (isalpha(buf[0])) {
-			if (buf[0] == prev)
-				info->lower = 0;
-			else
-				info->upper = 0;
-		}
-	} else {
-		info->lower = 0;
-		info->upper = 0;
-	}
-
-	return len;
-}
-
-/*
- * Given a valid longname, create a unique shortname.  Make sure the
- * shortname does not exist
- * Returns negative number on error, 0 for a normal
- * return, and 1 for valid shortname
- */
-static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
-				 wchar_t *uname, int ulen,
-				 unsigned char *name_res, unsigned char *lcase)
-{
-	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
-	wchar_t *ip, *ext_start, *end, *name_start;
-	unsigned char base[9], ext[4], buf[8], *p;
-	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
-	int chl, chi;
-	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
-	int is_shortname;
-	struct shortname_info base_info, ext_info;
-
-	is_shortname = 1;
-	INIT_SHORTNAME_INFO(&base_info);
-	INIT_SHORTNAME_INFO(&ext_info);
-
-	/* Now, we need to create a shortname from the long name */
-	ext_start = end = &uname[ulen];
-	while (--ext_start >= uname) {
-		if (*ext_start == 0x002E) {	/* is `.' */
-			if (ext_start == end - 1) {
-				sz = ulen;
-				ext_start = NULL;
-			}
-			break;
-		}
-	}
-
-	if (ext_start == uname - 1) {
-		sz = ulen;
-		ext_start = NULL;
-	} else if (ext_start) {
-		/*
-		 * Names which start with a dot could be just
-		 * an extension eg. "...test".  In this case Win95
-		 * uses the extension as the name and sets no extension.
-		 */
-		name_start = &uname[0];
-		while (name_start < ext_start) {
-			if (!vfat_skip_char(*name_start))
-				break;
-			name_start++;
-		}
-		if (name_start != ext_start) {
-			sz = ext_start - uname;
-			ext_start++;
-		} else {
-			sz = ulen;
-			ext_start = NULL;
-		}
-	}
-
-	numtail_baselen = 6;
-	numtail2_baselen = 2;
-	for (baselen = i = 0, p = base, ip = uname; i < sz; i++, ip++) {
-		chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
-					ip, &base_info);
-		if (chl == 0)
-			continue;
-
-		if (baselen < 2 && (baselen + chl) > 2)
-			numtail2_baselen = baselen;
-		if (baselen < 6 && (baselen + chl) > 6)
-			numtail_baselen = baselen;
-		for (chi = 0; chi < chl; chi++) {
-			*p++ = charbuf[chi];
-			baselen++;
-			if (baselen >= 8)
-				break;
-		}
-		if (baselen >= 8) {
-			if ((chi < chl - 1) || (ip + 1) - uname < sz)
-				is_shortname = 0;
-			break;
-		}
-	}
-	if (baselen == 0) {
-		return -EINVAL;
-	}
-
-	extlen = 0;
-	if (ext_start) {
-		for (p = ext, ip = ext_start; extlen < 3 && ip < end; ip++) {
-			chl = to_shortname_char(nls, charbuf, sizeof(charbuf),
-						ip, &ext_info);
-			if (chl == 0)
-				continue;
-
-			if ((extlen + chl) > 3) {
-				is_shortname = 0;
-				break;
-			}
-			for (chi = 0; chi < chl; chi++) {
-				*p++ = charbuf[chi];
-				extlen++;
-			}
-			if (extlen >= 3) {
-				if (ip + 1 != end)
-					is_shortname = 0;
-				break;
-			}
-		}
-	}
-	ext[extlen] = '\0';
-	base[baselen] = '\0';
-
-	/* Yes, it can happen. ".\xe5" would do it. */
-	if (base[0] == DELETED_FLAG)
-		base[0] = 0x05;
-
-	/* OK, at this point we know that base is not longer than 8 symbols,
-	 * ext is not longer than 3, base is nonempty, both don't contain
-	 * any bad symbols (lowercase transformed to uppercase).
-	 */
-
-	memset(name_res, ' ', MSDOS_NAME);
-	memcpy(name_res, base, baselen);
-	memcpy(name_res + 8, ext, extlen);
-	*lcase = 0;
-	if (is_shortname && base_info.valid && ext_info.valid) {
-		if (vfat_find_form(dir, name_res) == 0)
-			return -EEXIST;
-
-		if (opts->shortname & VFAT_SFN_CREATE_WIN95) {
-			return (base_info.upper && ext_info.upper);
-		} else if (opts->shortname & VFAT_SFN_CREATE_WINNT) {
-			if ((base_info.upper || base_info.lower) &&
-			    (ext_info.upper || ext_info.lower)) {
-				if (!base_info.upper && base_info.lower)
-					*lcase |= CASE_LOWER_BASE;
-				if (!ext_info.upper && ext_info.lower)
-					*lcase |= CASE_LOWER_EXT;
-				return 1;
-			}
-			return 0;
-		} else {
-			BUG();
-		}
-	}
-
-	if (opts->numtail == 0)
-		if (vfat_find_form(dir, name_res) < 0)
-			return 0;
-
-	/*
-	 * Try to find a unique extension.  This used to
-	 * iterate through all possibilities sequentially,
-	 * but that gave extremely bad performance.  Windows
-	 * only tries a few cases before using random
-	 * values for part of the base.
-	 */
-
-	if (baselen > 6) {
-		baselen = numtail_baselen;
-		name_res[7] = ' ';
-	}
-	name_res[baselen] = '~';
-	for (i = 1; i < 10; i++) {
-		name_res[baselen + 1] = i + '0';
-		if (vfat_find_form(dir, name_res) < 0)
-			return 0;
-	}
-
-	i = jiffies & 0xffff;
-	sz = (jiffies >> 16) & 0x7;
-	if (baselen > 2) {
-		baselen = numtail2_baselen;
-		name_res[7] = ' ';
-	}
-	name_res[baselen + 4] = '~';
-	name_res[baselen + 5] = '1' + sz;
-	while (1) {
-		sprintf(buf, "%04X", i);
-		memcpy(&name_res[baselen], buf, 4);
-		if (vfat_find_form(dir, name_res) < 0)
-			break;
-		i -= 11;
-	}
-	return 0;
-}
-
-/* Translate a string, including coded sequences into Unicode */
-static int
-xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
-	     int *longlen, int *outlen, int escape, int utf8,
-	     struct nls_table *nls)
-{
-	const unsigned char *ip;
-	unsigned char nc;
-	unsigned char *op;
-	unsigned int ec;
-	int i, k, fill;
-	int charlen;
-
-	if (utf8) {
-		int name_len = strlen(name);
-
-		*outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
-
-		/*
-		 * We stripped '.'s before and set len appropriately,
-		 * but utf8_mbstowcs doesn't care about len
-		 */
-		*outlen -= (name_len - len);
-
-		if (*outlen > 255)
-			return -ENAMETOOLONG;
-
-		op = &outname[*outlen * sizeof(wchar_t)];
-	} else {
-		if (nls) {
-			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
-			     *outlen += 1)
-			{
-				if (escape && (*ip == ':')) {
-					if (i > len - 5)
-						return -EINVAL;
-					ec = 0;
-					for (k = 1; k < 5; k++) {
-						nc = ip[k];
-						ec <<= 4;
-						if (nc >= '0' && nc <= '9') {
-							ec |= nc - '0';
-							continue;
-						}
-						if (nc >= 'a' && nc <= 'f') {
-							ec |= nc - ('a' - 10);
-							continue;
-						}
-						if (nc >= 'A' && nc <= 'F') {
-							ec |= nc - ('A' - 10);
-							continue;
-						}
-						return -EINVAL;
-					}
-					*op++ = ec & 0xFF;
-					*op++ = ec >> 8;
-					ip += 5;
-					i += 5;
-				} else {
-					if ((charlen = nls->char2uni(ip, len - i, (wchar_t *)op)) < 0)
-						return -EINVAL;
-					ip += charlen;
-					i += charlen;
-					op += 2;
-				}
-			}
-			if (i < len)
-				return -ENAMETOOLONG;
-		} else {
-			for (i = 0, ip = name, op = outname, *outlen = 0;
-			     i < len && *outlen <= 255;
-			     i++, *outlen += 1)
-			{
-				*op++ = *ip++;
-				*op++ = 0;
-			}
-			if (i < len)
-				return -ENAMETOOLONG;
-		}
-	}
-
-	*longlen = *outlen;
-	if (*outlen % 13) {
-		*op++ = 0;
-		*op++ = 0;
-		*outlen += 1;
-		if (*outlen % 13) {
-			fill = 13 - (*outlen % 13);
-			for (i = 0; i < fill; i++) {
-				*op++ = 0xff;
-				*op++ = 0xff;
-			}
-			*outlen += fill;
-		}
-	}
-
-	return 0;
-}
-
-static int vfat_build_slots(struct inode *dir, const unsigned char *name,
-			    int len, int is_dir, int cluster,
-			    struct timespec *ts,
-			    struct msdos_dir_slot *slots, int *nr_slots)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
-	struct fat_mount_options *opts = &sbi->options;
-	struct msdos_dir_slot *ps;
-	struct msdos_dir_entry *de;
-	unsigned char cksum, lcase;
-	unsigned char msdos_name[MSDOS_NAME];
-	wchar_t *uname;
-	__le16 time, date;
-	int err, ulen, usize, i;
-	loff_t offset;
-
-	*nr_slots = 0;
-
-	uname = __getname();
-	if (!uname)
-		return -ENOMEM;
-
-	err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize,
-			   opts->unicode_xlate, opts->utf8, sbi->nls_io);
-	if (err)
-		goto out_free;
-
-	err = vfat_is_used_badchars(uname, ulen);
-	if (err)
-		goto out_free;
-
-	err = vfat_create_shortname(dir, sbi->nls_disk, uname, ulen,
-				    msdos_name, &lcase);
-	if (err < 0)
-		goto out_free;
-	else if (err == 1) {
-		de = (struct msdos_dir_entry *)slots;
-		err = 0;
-		goto shortname;
-	}
-
-	/* build the entry of long file name */
-	cksum = fat_checksum(msdos_name);
-
-	*nr_slots = usize / 13;
-	for (ps = slots, i = *nr_slots; i > 0; i--, ps++) {
-		ps->id = i;
-		ps->attr = ATTR_EXT;
-		ps->reserved = 0;
-		ps->alias_checksum = cksum;
-		ps->start = 0;
-		offset = (i - 1) * 13;
-		fatwchar_to16(ps->name0_4, uname + offset, 5);
-		fatwchar_to16(ps->name5_10, uname + offset + 5, 6);
-		fatwchar_to16(ps->name11_12, uname + offset + 11, 2);
-	}
-	slots[0].id |= 0x40;
-	de = (struct msdos_dir_entry *)ps;
-
-shortname:
-	/* build the entry of 8.3 alias name */
-	(*nr_slots)++;
-	memcpy(de->name, msdos_name, MSDOS_NAME);
-	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
-	de->lcase = lcase;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
-	de->time = de->ctime = time;
-	de->date = de->cdate = de->adate = date;
-	de->ctime_cs = 0;
-	de->start = cpu_to_le16(cluster);
-	de->starthi = cpu_to_le16(cluster >> 16);
-	de->size = 0;
-out_free:
-	__putname(uname);
-	return err;
-}
-
-static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
-			  int cluster, struct timespec *ts,
-			  struct fat_slot_info *sinfo)
-{
-	struct msdos_dir_slot *slots;
-	unsigned int len;
-	int err, nr_slots;
-
-	len = vfat_striptail_len(qname);
-	if (len == 0)
-		return -ENOENT;
-
-	slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
-	if (slots == NULL)
-		return -ENOMEM;
-
-	err = vfat_build_slots(dir, qname->name, len, is_dir, cluster, ts,
-			       slots, &nr_slots);
-	if (err)
-		goto cleanup;
-
-	err = fat_add_entries(dir, slots, nr_slots, sinfo);
-	if (err)
-		goto cleanup;
-
-	/* update timestamp */
-	dir->i_ctime = dir->i_mtime = dir->i_atime = *ts;
-	if (IS_DIRSYNC(dir))
-		(void)fat_sync_inode(dir);
-	else
-		mark_inode_dirty(dir);
-cleanup:
-	kfree(slots);
-	return err;
-}
-
-static int vfat_find(struct inode *dir, struct qstr *qname,
-		     struct fat_slot_info *sinfo)
-{
-	unsigned int len = vfat_striptail_len(qname);
-	if (len == 0)
-		return -ENOENT;
-	return fat_search_long(dir, qname->name, len, sinfo);
-}
-
-static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
-				  struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	struct dentry *alias;
-	int err, table;
-
-	lock_super(sb);
-	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
-	dentry->d_op = &vfat_dentry_ops[table];
-
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err) {
-		table++;
-		goto error;
-	}
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		unlock_super(sb);
-		return ERR_CAST(inode);
-	}
-	alias = d_find_alias(inode);
-	if (alias) {
-		if (d_invalidate(alias) == 0)
-			dput(alias);
-		else {
-			iput(inode);
-			unlock_super(sb);
-			return alias;
-		}
-
-	}
-error:
-	unlock_super(sb);
-	dentry->d_op = &vfat_dentry_ops[table];
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	dentry = d_splice_alias(inode, dentry);
-	if (dentry) {
-		dentry->d_op = &vfat_dentry_ops[table];
-		dentry->d_time = dentry->d_parent->d_inode->i_version;
-	}
-	return dentry;
-}
-
-static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
-		       struct nameidata *nd)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	int err;
-
-	lock_super(sb);
-
-	ts = CURRENT_TIME_SEC;
-	err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
-	if (err)
-		goto out;
-	dir->i_version++;
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out;
-	}
-	inode->i_version++;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	d_instantiate(dentry, inode);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-
-	err = fat_dir_empty(inode);
-	if (err)
-		goto out;
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	drop_nlink(dir);
-
-	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-
-	return err;
-}
-
-static int vfat_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	struct super_block *sb = dir->i_sb;
-	struct fat_slot_info sinfo;
-	int err;
-
-	lock_super(sb);
-
-	err = vfat_find(dir, &dentry->d_name, &sinfo);
-	if (err)
-		goto out;
-
-	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
-	if (err)
-		goto out;
-	clear_nlink(inode);
-	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
-	fat_detach(inode);
-out:
-	unlock_super(sb);
-
-	return err;
-}
-
-static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
-	struct fat_slot_info sinfo;
-	struct timespec ts;
-	int err, cluster;
-
-	lock_super(sb);
-
-	ts = CURRENT_TIME_SEC;
-	cluster = fat_alloc_new_dir(dir, &ts);
-	if (cluster < 0) {
-		err = cluster;
-		goto out;
-	}
-	err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo);
-	if (err)
-		goto out_free;
-	dir->i_version++;
-	inc_nlink(dir);
-
-	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
-	brelse(sinfo.bh);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		/* the directory was completed, just return a error */
-		goto out;
-	}
-	inode->i_version++;
-	inode->i_nlink = 2;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
-	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
-
-	dentry->d_time = dentry->d_parent->d_inode->i_version;
-	d_instantiate(dentry, inode);
-
-	unlock_super(sb);
-	return 0;
-
-out_free:
-	fat_free_clusters(dir, cluster);
-out:
-	unlock_super(sb);
-	return err;
-}
-
-static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
-		       struct inode *new_dir, struct dentry *new_dentry)
-{
-	struct buffer_head *dotdot_bh;
-	struct msdos_dir_entry *dotdot_de;
-	struct inode *old_inode, *new_inode;
-	struct fat_slot_info old_sinfo, sinfo;
-	struct timespec ts;
-	loff_t dotdot_i_pos, new_i_pos;
-	int err, is_dir, update_dotdot, corrupt = 0;
-	struct super_block *sb = old_dir->i_sb;
-
-	old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
-	old_inode = old_dentry->d_inode;
-	new_inode = new_dentry->d_inode;
-	lock_super(sb);
-	err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
-	if (err)
-		goto out;
-
-	is_dir = S_ISDIR(old_inode->i_mode);
-	update_dotdot = (is_dir && old_dir != new_dir);
-	if (update_dotdot) {
-		if (fat_get_dotdot_entry(old_inode, &dotdot_bh, &dotdot_de,
-					 &dotdot_i_pos) < 0) {
-			err = -EIO;
-			goto out;
-		}
-	}
-
-	ts = CURRENT_TIME_SEC;
-	if (new_inode) {
-		if (is_dir) {
-			err = fat_dir_empty(new_inode);
-			if (err)
-				goto out;
-		}
-		new_i_pos = MSDOS_I(new_inode)->i_pos;
-		fat_detach(new_inode);
-	} else {
-		err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0,
-				     &ts, &sinfo);
-		if (err)
-			goto out;
-		new_i_pos = sinfo.i_pos;
-	}
-	new_dir->i_version++;
-
-	fat_detach(old_inode);
-	fat_attach(old_inode, new_i_pos);
-	if (IS_DIRSYNC(new_dir)) {
-		err = fat_sync_inode(old_inode);
-		if (err)
-			goto error_inode;
-	} else
-		mark_inode_dirty(old_inode);
-
-	if (update_dotdot) {
-		int start = MSDOS_I(new_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		if (IS_DIRSYNC(new_dir)) {
-			err = sync_dirty_buffer(dotdot_bh);
-			if (err)
-				goto error_dotdot;
-		}
-		drop_nlink(old_dir);
-		if (!new_inode)
- 			inc_nlink(new_dir);
-	}
-
-	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
-	old_sinfo.bh = NULL;
-	if (err)
-		goto error_dotdot;
-	old_dir->i_version++;
-	old_dir->i_ctime = old_dir->i_mtime = ts;
-	if (IS_DIRSYNC(old_dir))
-		(void)fat_sync_inode(old_dir);
-	else
-		mark_inode_dirty(old_dir);
-
-	if (new_inode) {
-		drop_nlink(new_inode);
-		if (is_dir)
-			drop_nlink(new_inode);
-		new_inode->i_ctime = ts;
-	}
-out:
-	brelse(sinfo.bh);
-	brelse(dotdot_bh);
-	brelse(old_sinfo.bh);
-	unlock_super(sb);
-
-	return err;
-
-error_dotdot:
-	/* data cluster is shared, serious corruption */
-	corrupt = 1;
-
-	if (update_dotdot) {
-		int start = MSDOS_I(old_dir)->i_logstart;
-		dotdot_de->start = cpu_to_le16(start);
-		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
-		corrupt |= sync_dirty_buffer(dotdot_bh);
-	}
-error_inode:
-	fat_detach(old_inode);
-	fat_attach(old_inode, old_sinfo.i_pos);
-	if (new_inode) {
-		fat_attach(new_inode, new_i_pos);
-		if (corrupt)
-			corrupt |= fat_sync_inode(new_inode);
-	} else {
-		/*
-		 * If new entry was not sharing the data cluster, it
-		 * shouldn't be serious corruption.
-		 */
-		int err2 = fat_remove_entries(new_dir, &sinfo);
-		if (corrupt)
-			corrupt |= err2;
-		sinfo.bh = NULL;
-	}
-	if (corrupt < 0) {
-		fat_fs_panic(new_dir->i_sb,
-			     "%s: Filesystem corrupted (i_pos %lld)",
-			     __func__, sinfo.i_pos);
-	}
-	goto out;
-}
-
-static const struct inode_operations vfat_dir_inode_operations = {
-	.create		= vfat_create,
-	.lookup		= vfat_lookup,
-	.unlink		= vfat_unlink,
-	.mkdir		= vfat_mkdir,
-	.rmdir		= vfat_rmdir,
-	.rename		= vfat_rename,
-	.setattr	= fat_setattr,
-	.getattr	= fat_getattr,
-};
-
-static int vfat_fill_super(struct super_block *sb, void *data, int silent)
-{
-	int res;
-
-	res = fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, 1);
-	if (res)
-		return res;
-
-	if (MSDOS_SB(sb)->options.name_check != 's')
-		sb->s_root->d_op = &vfat_dentry_ops[0];
-	else
-		sb->s_root->d_op = &vfat_dentry_ops[2];
-
-	return 0;
-}
-
-static int vfat_get_sb(struct file_system_type *fs_type,
-		       int flags, const char *dev_name,
-		       void *data, struct vfsmount *mnt)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
-			   mnt);
-}
-
-static struct file_system_type vfat_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "vfat",
-	.get_sb		= vfat_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-static int __init init_vfat_fs(void)
-{
-	return register_filesystem(&vfat_fs_type);
-}
-
-static void __exit exit_vfat_fs(void)
-{
-	unregister_filesystem(&vfat_fs_type);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("VFAT filesystem support");
-MODULE_AUTHOR("Gordon Chaffee");
-
-module_init(init_vfat_fs)
-module_exit(exit_vfat_fs)
-- 
cgit v1.2.3


From 9e975dae2970d22557662761c8505ce9fd165684 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:46 -0800
Subject: fat: split include/msdos_fs.h

This splits the __KERNEL__ stuff in include/linux/msdos_fs.h out into fs/fat/fat.h.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/cache.c           |   2 +-
 fs/fat/dir.c             |   2 +-
 fs/fat/fat.h             | 274 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/fat/fatent.c          |   1 +
 fs/fat/file.c            |   2 +-
 fs/fat/inode.c           |   2 +-
 fs/fat/misc.c            |   2 +-
 fs/fat/namei_msdos.c     |   2 +-
 fs/fat/namei_vfat.c      |   3 +-
 include/linux/msdos_fs.h | 276 +----------------------------------------------
 10 files changed, 284 insertions(+), 282 deletions(-)
 create mode 100644 fs/fat/fat.h

(limited to 'fs')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3222f51c41cf..589edde9053c 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,8 +9,8 @@
  */
 
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 
 /* this must be > 0. */
 #define FAT_MAX_CACHE	8
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index bae1c3292522..08b23ad25f1c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -16,11 +16,11 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include "fat.h"
 
 static inline loff_t fat_make_i_pos(struct super_block *sb,
 				    struct buffer_head *bh,
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
new file mode 100644
index 000000000000..51f1c42ca5e3
--- /dev/null
+++ b/fs/fat/fat.h
@@ -0,0 +1,274 @@
+#ifndef _FAT_H
+#define _FAT_H
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/msdos_fs.h>
+
+/*
+ * vfat shortname flags
+ */
+#define VFAT_SFN_DISPLAY_LOWER	0x0001 /* convert to lowercase for display */
+#define VFAT_SFN_DISPLAY_WIN95	0x0002 /* emulate win95 rule for display */
+#define VFAT_SFN_DISPLAY_WINNT	0x0004 /* emulate winnt rule for display */
+#define VFAT_SFN_CREATE_WIN95	0x0100 /* emulate win95 rule for create */
+#define VFAT_SFN_CREATE_WINNT	0x0200 /* emulate winnt rule for create */
+
+struct fat_mount_options {
+	uid_t fs_uid;
+	gid_t fs_gid;
+	unsigned short fs_fmask;
+	unsigned short fs_dmask;
+	unsigned short codepage;  /* Codepage for shortname conversions */
+	char *iocharset;          /* Charset used for filename input/display */
+	unsigned short shortname; /* flags for shortname display/create rule */
+	unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+	unsigned short allow_utime;/* permission for setting the [am]time */
+	unsigned quiet:1,         /* set = fake successful chmods and chowns */
+		 showexec:1,      /* set = only set x bit for com/exe/bat */
+		 sys_immutable:1, /* set = system files are immutable */
+		 dotsOK:1,        /* set = hidden and system files are named '.filename' */
+		 isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
+		 utf8:1,	  /* Use of UTF-8 character set (Default) */
+		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
+		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
+		 flush:1,	  /* write things quickly */
+		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
+		 usefree:1,	  /* Use free_clusters for FAT32 */
+		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
+};
+
+#define FAT_HASH_BITS	8
+#define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
+#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
+
+/*
+ * MS-DOS file system in-core superblock data
+ */
+struct msdos_sb_info {
+	unsigned short sec_per_clus; /* sectors/cluster */
+	unsigned short cluster_bits; /* log2(cluster_size) */
+	unsigned int cluster_size;   /* cluster size */
+	unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */
+	unsigned short fat_start;
+	unsigned long fat_length;    /* FAT start & length (sec.) */
+	unsigned long dir_start;
+	unsigned short dir_entries;  /* root dir start & entries */
+	unsigned long data_start;    /* first data sector */
+	unsigned long max_cluster;   /* maximum cluster number */
+	unsigned long root_cluster;  /* first cluster of the root directory */
+	unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
+	struct mutex fat_lock;
+	unsigned int prev_free;      /* previously allocated cluster number */
+	unsigned int free_clusters;  /* -1 if undefined */
+	unsigned int free_clus_valid; /* is free_clusters valid? */
+	struct fat_mount_options options;
+	struct nls_table *nls_disk;  /* Codepage used on disk */
+	struct nls_table *nls_io;    /* Charset used for input and display */
+	const void *dir_ops;		     /* Opaque; default directory operations */
+	int dir_per_block;	     /* dir entries per block */
+	int dir_per_block_bits;	     /* log2(dir_per_block) */
+
+	int fatent_shift;
+	struct fatent_operations *fatent_ops;
+
+	spinlock_t inode_hash_lock;
+	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
+};
+
+#define FAT_CACHE_VALID	0	/* special case for valid cache */
+
+/*
+ * MS-DOS file system inode data in memory
+ */
+struct msdos_inode_info {
+	spinlock_t cache_lru_lock;
+	struct list_head cache_lru;
+	int nr_caches;
+	/* for avoiding the race between fat_free() and fat_get_cluster() */
+	unsigned int cache_valid_id;
+
+	loff_t mmu_private;
+	int i_start;		/* first cluster or 0 */
+	int i_logstart;		/* logical first cluster */
+	int i_attrs;		/* unused attribute bits */
+	loff_t i_pos;		/* on-disk position of directory entry or 0 */
+	struct hlist_node i_fat_hash;	/* hash by i_location */
+	struct inode vfs_inode;
+};
+
+struct fat_slot_info {
+	loff_t i_pos;		/* on-disk position of directory entry */
+	loff_t slot_off;	/* offset for slot or de start */
+	int nr_slots;		/* number of slots + 1(de) in filename */
+	struct msdos_dir_entry *de;
+	struct buffer_head *bh;
+};
+
+static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
+{
+	return container_of(inode, struct msdos_inode_info, vfs_inode);
+}
+
+/* Return the FAT attribute byte for this inode */
+static inline u8 fat_attr(struct inode *inode)
+{
+	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
+		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
+		MSDOS_I(inode)->i_attrs;
+}
+
+static inline unsigned char fat_checksum(const __u8 *name)
+{
+	unsigned char s = name[0];
+	s = (s<<7) + (s>>1) + name[1];	s = (s<<7) + (s>>1) + name[2];
+	s = (s<<7) + (s>>1) + name[3];	s = (s<<7) + (s>>1) + name[4];
+	s = (s<<7) + (s>>1) + name[5];	s = (s<<7) + (s>>1) + name[6];
+	s = (s<<7) + (s>>1) + name[7];	s = (s<<7) + (s>>1) + name[8];
+	s = (s<<7) + (s>>1) + name[9];	s = (s<<7) + (s>>1) + name[10];
+	return s;
+}
+
+static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
+{
+	return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
+		+ sbi->data_start;
+}
+
+static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		*dst++ = src[0] | (src[1] << 8);
+		src += 2;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
+{
+#ifdef __BIG_ENDIAN
+	while (len--) {
+		dst[0] = *src & 0x00FF;
+		dst[1] = (*src & 0xFF00) >> 8;
+		dst += 2;
+		src++;
+	}
+#else
+	memcpy(dst, src, len * 2);
+#endif
+}
+
+/* fat/cache.c */
+extern void fat_cache_inval_inode(struct inode *inode);
+extern int fat_get_cluster(struct inode *inode, int cluster,
+			   int *fclus, int *dclus);
+extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+		    unsigned long *mapped_blocks);
+
+/* fat/dir.c */
+extern const struct file_operations fat_dir_operations;
+extern int fat_search_long(struct inode *inode, const unsigned char *name,
+			   int name_len, struct fat_slot_info *sinfo);
+extern int fat_dir_empty(struct inode *dir);
+extern int fat_subdirs(struct inode *dir);
+extern int fat_scan(struct inode *dir, const unsigned char *name,
+		    struct fat_slot_info *sinfo);
+extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
+				struct msdos_dir_entry **de, loff_t *i_pos);
+extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
+extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
+			   struct fat_slot_info *sinfo);
+extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
+
+/* fat/fatent.c */
+struct fat_entry {
+	int entry;
+	union {
+		u8 *ent12_p[2];
+		__le16 *ent16_p;
+		__le32 *ent32_p;
+	} u;
+	int nr_bhs;
+	struct buffer_head *bhs[2];
+};
+
+static inline void fatent_init(struct fat_entry *fatent)
+{
+	fatent->nr_bhs = 0;
+	fatent->entry = 0;
+	fatent->u.ent32_p = NULL;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+
+static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
+{
+	fatent->entry = entry;
+	fatent->u.ent32_p = NULL;
+}
+
+static inline void fatent_brelse(struct fat_entry *fatent)
+{
+	int i;
+	fatent->u.ent32_p = NULL;
+	for (i = 0; i < fatent->nr_bhs; i++)
+		brelse(fatent->bhs[i]);
+	fatent->nr_bhs = 0;
+	fatent->bhs[0] = fatent->bhs[1] = NULL;
+}
+
+extern void fat_ent_access_init(struct super_block *sb);
+extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
+			int entry);
+extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
+			 int new, int wait);
+extern int fat_alloc_clusters(struct inode *inode, int *cluster,
+			      int nr_cluster);
+extern int fat_free_clusters(struct inode *inode, int cluster);
+extern int fat_count_free_clusters(struct super_block *sb);
+
+/* fat/file.c */
+extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
+			     unsigned int cmd, unsigned long arg);
+extern const struct file_operations fat_file_operations;
+extern const struct inode_operations fat_file_inode_operations;
+extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
+extern void fat_truncate(struct inode *inode);
+extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat);
+
+/* fat/inode.c */
+extern void fat_attach(struct inode *inode, loff_t i_pos);
+extern void fat_detach(struct inode *inode);
+extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
+extern struct inode *fat_build_inode(struct super_block *sb,
+			struct msdos_dir_entry *de, loff_t i_pos);
+extern int fat_sync_inode(struct inode *inode);
+extern int fat_fill_super(struct super_block *sb, void *data, int silent,
+			const struct inode_operations *fs_dir_inode_ops, int isvfat);
+
+extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
+		            struct inode *i2);
+/* fat/misc.c */
+extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
+extern void fat_clusters_flush(struct super_block *sb);
+extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
+extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
+extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
+			      int tz_utc);
+extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
+
+int fat_cache_init(void);
+void fat_cache_destroy(void);
+
+#endif /* !_FAT_H */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index fb98b3d847ed..5b5f49061b7c 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
 #include <linux/blkdev.h>
+#include "fat.h"
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index ddde37025ca6..b21973f266a1 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -10,13 +10,13 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/time.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
+#include "fat.h"
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 2b2eec1283bf..3921de2013a4 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -16,7 +16,6 @@
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/seq_file.h>
-#include <linux/msdos_fs.h>
 #include <linux/pagemap.h>
 #include <linux/mpage.h>
 #include <linux/buffer_head.h>
@@ -28,6 +27,7 @@
 #include <linux/writeback.h>
 #include <linux/log2.h>
 #include <asm/unaligned.h>
+#include "fat.h"
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
 /* if user don't select VFAT, this is undefined. */
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 79fb98ad36d4..91ad9be18ff9 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -8,8 +8,8 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
-#include <linux/msdos_fs.h>
 #include <linux/buffer_head.h>
+#include "fat.h"
 
 /*
  * fat_fs_panic reports a severe file system problem and sets the file system
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e844b9809d27..c0a4d5cd99b2 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -9,8 +9,8 @@
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
-#include <linux/msdos_fs.h>
 #include <linux/smp_lock.h>
+#include "fat.h"
 
 /* Characters that are undesirable in an MS-DOS file name */
 static unsigned char bad_chars[] = "*?<>|\"";
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 155c10b4adbd..facf3bf0211a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -16,14 +16,13 @@
  */
 
 #include <linux/module.h>
-
 #include <linux/jiffies.h>
-#include <linux/msdos_fs.h>
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/namei.h>
+#include "fat.h"
 
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index ba63858056c7..0982fb47a90d 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -167,282 +167,10 @@ struct msdos_dir_slot {
 };
 
 #ifdef __KERNEL__
-
-#include <linux/buffer_head.h>
-#include <linux/string.h>
-#include <linux/nls.h>
-#include <linux/fs.h>
-#include <linux/mutex.h>
-
-/*
- * vfat shortname flags
- */
-#define VFAT_SFN_DISPLAY_LOWER	0x0001 /* convert to lowercase for display */
-#define VFAT_SFN_DISPLAY_WIN95	0x0002 /* emulate win95 rule for display */
-#define VFAT_SFN_DISPLAY_WINNT	0x0004 /* emulate winnt rule for display */
-#define VFAT_SFN_CREATE_WIN95	0x0100 /* emulate win95 rule for create */
-#define VFAT_SFN_CREATE_WINNT	0x0200 /* emulate winnt rule for create */
-
-struct fat_mount_options {
-	uid_t fs_uid;
-	gid_t fs_gid;
-	unsigned short fs_fmask;
-	unsigned short fs_dmask;
-	unsigned short codepage;  /* Codepage for shortname conversions */
-	char *iocharset;          /* Charset used for filename input/display */
-	unsigned short shortname; /* flags for shortname display/create rule */
-	unsigned char name_check; /* r = relaxed, n = normal, s = strict */
-	unsigned short allow_utime;/* permission for setting the [am]time */
-	unsigned quiet:1,         /* set = fake successful chmods and chowns */
-		 showexec:1,      /* set = only set x bit for com/exe/bat */
-		 sys_immutable:1, /* set = system files are immutable */
-		 dotsOK:1,        /* set = hidden and system files are named '.filename' */
-		 isvfat:1,        /* 0=no vfat long filename support, 1=vfat support */
-		 utf8:1,	  /* Use of UTF-8 character set (Default) */
-		 unicode_xlate:1, /* create escape sequences for unhandled Unicode */
-		 numtail:1,       /* Does first alias have a numeric '~1' type tail? */
-		 flush:1,	  /* write things quickly */
-		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
-		 usefree:1,	  /* Use free_clusters for FAT32 */
-		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
-};
-
-#define FAT_HASH_BITS	8
-#define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
-#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
-
-/*
- * MS-DOS file system in-core superblock data
- */
-struct msdos_sb_info {
-	unsigned short sec_per_clus; /* sectors/cluster */
-	unsigned short cluster_bits; /* log2(cluster_size) */
-	unsigned int cluster_size;   /* cluster size */
-	unsigned char fats,fat_bits; /* number of FATs, FAT bits (12 or 16) */
-	unsigned short fat_start;
-	unsigned long fat_length;    /* FAT start & length (sec.) */
-	unsigned long dir_start;
-	unsigned short dir_entries;  /* root dir start & entries */
-	unsigned long data_start;    /* first data sector */
-	unsigned long max_cluster;   /* maximum cluster number */
-	unsigned long root_cluster;  /* first cluster of the root directory */
-	unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */
-	struct mutex fat_lock;
-	unsigned int prev_free;      /* previously allocated cluster number */
-	unsigned int free_clusters;  /* -1 if undefined */
-	unsigned int free_clus_valid; /* is free_clusters valid? */
-	struct fat_mount_options options;
-	struct nls_table *nls_disk;  /* Codepage used on disk */
-	struct nls_table *nls_io;    /* Charset used for input and display */
-	const void *dir_ops;		     /* Opaque; default directory operations */
-	int dir_per_block;	     /* dir entries per block */
-	int dir_per_block_bits;	     /* log2(dir_per_block) */
-
-	int fatent_shift;
-	struct fatent_operations *fatent_ops;
-
-	spinlock_t inode_hash_lock;
-	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
-};
-
-#define FAT_CACHE_VALID	0	/* special case for valid cache */
-
-/*
- * MS-DOS file system inode data in memory
- */
-struct msdos_inode_info {
-	spinlock_t cache_lru_lock;
-	struct list_head cache_lru;
-	int nr_caches;
-	/* for avoiding the race between fat_free() and fat_get_cluster() */
-	unsigned int cache_valid_id;
-
-	loff_t mmu_private;
-	int i_start;		/* first cluster or 0 */
-	int i_logstart;		/* logical first cluster */
-	int i_attrs;		/* unused attribute bits */
-	loff_t i_pos;		/* on-disk position of directory entry or 0 */
-	struct hlist_node i_fat_hash;	/* hash by i_location */
-	struct inode vfs_inode;
-};
-
-struct fat_slot_info {
-	loff_t i_pos;		/* on-disk position of directory entry */
-	loff_t slot_off;	/* offset for slot or de start */
-	int nr_slots;		/* number of slots + 1(de) in filename */
-	struct msdos_dir_entry *de;
-	struct buffer_head *bh;
-};
-
-static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
-{
-	return container_of(inode, struct msdos_inode_info, vfs_inode);
-}
-
-/* Return the FAT attribute byte for this inode */
-static inline u8 fat_attr(struct inode *inode)
-{
-	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
-		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
-		MSDOS_I(inode)->i_attrs;
-}
-
-static inline unsigned char fat_checksum(const __u8 *name)
-{
-	unsigned char s = name[0];
-	s = (s<<7) + (s>>1) + name[1];	s = (s<<7) + (s>>1) + name[2];
-	s = (s<<7) + (s>>1) + name[3];	s = (s<<7) + (s>>1) + name[4];
-	s = (s<<7) + (s>>1) + name[5];	s = (s<<7) + (s>>1) + name[6];
-	s = (s<<7) + (s>>1) + name[7];	s = (s<<7) + (s>>1) + name[8];
-	s = (s<<7) + (s>>1) + name[9];	s = (s<<7) + (s>>1) + name[10];
-	return s;
-}
-
-static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus)
-{
-	return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus
-		+ sbi->data_start;
-}
-
-static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len)
-{
-#ifdef __BIG_ENDIAN
-	while (len--) {
-		*dst++ = src[0] | (src[1] << 8);
-		src += 2;
-	}
-#else
-	memcpy(dst, src, len * 2);
-#endif
-}
-
-static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
-{
-#ifdef __BIG_ENDIAN
-	while (len--) {
-		dst[0] = *src & 0x00FF;
-		dst[1] = (*src & 0xFF00) >> 8;
-		dst += 2;
-		src++;
-	}
-#else
-	memcpy(dst, src, len * 2);
-#endif
-}
-
 /* media of boot sector */
 static inline int fat_valid_media(u8 media)
 {
 	return 0xf8 <= media || media == 0xf0;
 }
-
-/* fat/cache.c */
-extern void fat_cache_inval_inode(struct inode *inode);
-extern int fat_get_cluster(struct inode *inode, int cluster,
-			   int *fclus, int *dclus);
-extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-		    unsigned long *mapped_blocks);
-
-/* fat/dir.c */
-extern const struct file_operations fat_dir_operations;
-extern int fat_search_long(struct inode *inode, const unsigned char *name,
-			   int name_len, struct fat_slot_info *sinfo);
-extern int fat_dir_empty(struct inode *dir);
-extern int fat_subdirs(struct inode *dir);
-extern int fat_scan(struct inode *dir, const unsigned char *name,
-		    struct fat_slot_info *sinfo);
-extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh,
-				struct msdos_dir_entry **de, loff_t *i_pos);
-extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts);
-extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
-			   struct fat_slot_info *sinfo);
-extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo);
-
-/* fat/fatent.c */
-struct fat_entry {
-	int entry;
-	union {
-		u8 *ent12_p[2];
-		__le16 *ent16_p;
-		__le32 *ent32_p;
-	} u;
-	int nr_bhs;
-	struct buffer_head *bhs[2];
-};
-
-static inline void fatent_init(struct fat_entry *fatent)
-{
-	fatent->nr_bhs = 0;
-	fatent->entry = 0;
-	fatent->u.ent32_p = NULL;
-	fatent->bhs[0] = fatent->bhs[1] = NULL;
-}
-
-static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
-{
-	fatent->entry = entry;
-	fatent->u.ent32_p = NULL;
-}
-
-static inline void fatent_brelse(struct fat_entry *fatent)
-{
-	int i;
-	fatent->u.ent32_p = NULL;
-	for (i = 0; i < fatent->nr_bhs; i++)
-		brelse(fatent->bhs[i]);
-	fatent->nr_bhs = 0;
-	fatent->bhs[0] = fatent->bhs[1] = NULL;
-}
-
-extern void fat_ent_access_init(struct super_block *sb);
-extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent,
-			int entry);
-extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
-			 int new, int wait);
-extern int fat_alloc_clusters(struct inode *inode, int *cluster,
-			      int nr_cluster);
-extern int fat_free_clusters(struct inode *inode, int cluster);
-extern int fat_count_free_clusters(struct super_block *sb);
-
-/* fat/file.c */
-extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
-			     unsigned int cmd, unsigned long arg);
-extern const struct file_operations fat_file_operations;
-extern const struct inode_operations fat_file_inode_operations;
-extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern void fat_truncate(struct inode *inode);
-extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		       struct kstat *stat);
-
-/* fat/inode.c */
-extern void fat_attach(struct inode *inode, loff_t i_pos);
-extern void fat_detach(struct inode *inode);
-extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos);
-extern struct inode *fat_build_inode(struct super_block *sb,
-			struct msdos_dir_entry *de, loff_t i_pos);
-extern int fat_sync_inode(struct inode *inode);
-extern int fat_fill_super(struct super_block *sb, void *data, int silent,
-			const struct inode_operations *fs_dir_inode_ops, int isvfat);
-
-extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
-		            struct inode *i2);
-/* fat/misc.c */
-extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
-extern void fat_clusters_flush(struct super_block *sb);
-extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
-extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
-extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
-			      int tz_utc);
-extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
-
-int fat_cache_init(void);
-void fat_cache_destroy(void);
-
-#endif /* __KERNEL__ */
-
-#endif
+#endif /* !__KERNEL__ */
+#endif /* !_LINUX_MSDOS_FS_H */
-- 
cgit v1.2.3


From 7decd1cb0305b97243f283fa7f4baf5fe613edeb Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:47 -0800
Subject: fat: Fix and cleanup timestamp conversion

This cleans date_dos2unix()/fat_date_unix2dos() up. New code should be
much more readable.

This also fixes those old functions. They don't handle the year 2100
correctly: 2100 isn't a leap year, but the old code treats it as one.
Also, with this change, the centisecond field is handled and fixed.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c         |   6 ++-
 fs/fat/fat.h         |   7 +--
 fs/fat/inode.c       |  34 ++++--------
 fs/fat/misc.c        | 148 +++++++++++++++++++++++++++++++++++++--------------
 fs/fat/namei_msdos.c |   2 +-
 fs/fat/namei_vfat.c  |   5 +-
 6 files changed, 130 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 08b23ad25f1c..a601c6d45bc0 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1089,6 +1089,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 	struct msdos_dir_entry *de;
 	sector_t blknr;
 	__le16 date, time;
+	u8 time_cs;
 	int err, cluster;
 
 	err = fat_alloc_clusters(dir, &cluster, 1);
@@ -1102,7 +1103,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 		goto error_free;
 	}
 
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
 
 	de = (struct msdos_dir_entry *)bhs[0]->b_data;
 	/* filling the new directory slots ("." and ".." entries) */
@@ -1112,13 +1113,14 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 	de[0].lcase = de[1].lcase = 0;
 	de[0].time = de[1].time = time;
 	de[0].date = de[1].date = date;
-	de[0].ctime_cs = de[1].ctime_cs = 0;
 	if (sbi->options.isvfat) {
 		/* extra timestamps */
 		de[0].ctime = de[1].ctime = time;
+		de[0].ctime_cs = de[1].ctime_cs = time_cs;
 		de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date;
 	} else {
 		de[0].ctime = de[1].ctime = 0;
+		de[0].ctime_cs = de[1].ctime_cs = 0;
 		de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0;
 	}
 	de[0].start = cpu_to_le16(cluster);
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 51f1c42ca5e3..a2a570f81719 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -263,9 +263,10 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
-extern int date_dos2unix(unsigned short time, unsigned short date, int tz_utc);
-extern void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date,
-			      int tz_utc);
+extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+			      __le16 __time, __le16 __date, u8 time_cs);
+extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+			      __le16 *time, __le16 *date, u8 *time_cs);
 extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 
 int fat_cache_init(void);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3921de2013a4..079d9d5e0d36 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -381,22 +381,12 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
-	inode->i_mtime.tv_sec =
-		date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date),
-			      sbi->options.tz_utc);
-	inode->i_mtime.tv_nsec = 0;
+
+	fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
 	if (sbi->options.isvfat) {
-		int secs = de->ctime_cs / 100;
-		int csecs = de->ctime_cs % 100;
-		inode->i_ctime.tv_sec  =
-			date_dos2unix(le16_to_cpu(de->ctime),
-				      le16_to_cpu(de->cdate),
-				      sbi->options.tz_utc) + secs;
-		inode->i_ctime.tv_nsec = csecs * 10000000;
-		inode->i_atime.tv_sec =
-			date_dos2unix(0, le16_to_cpu(de->adate),
-				      sbi->options.tz_utc);
-		inode->i_atime.tv_nsec = 0;
+		fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime,
+				  de->cdate, de->ctime_cs);
+		fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
 	} else
 		inode->i_ctime = inode->i_atime = inode->i_mtime;
 
@@ -591,16 +581,14 @@ retry:
 	raw_entry->attr = fat_attr(inode);
 	raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
-	fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time,
-			  &raw_entry->date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
+			  &raw_entry->date, NULL);
 	if (sbi->options.isvfat) {
 		__le16 atime;
-		fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime,
-				  &raw_entry->cdate, sbi->options.tz_utc);
-		fat_date_unix2dos(inode->i_atime.tv_sec, &atime,
-				  &raw_entry->adate, sbi->options.tz_utc);
-		raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 +
-			inode->i_ctime.tv_nsec / 10000000;
+		fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime,
+				  &raw_entry->cdate, &raw_entry->ctime_cs);
+		fat_time_unix2fat(sbi, &inode->i_atime, &atime,
+				  &raw_entry->adate, NULL);
 	}
 	spin_unlock(&sbi->inode_hash_lock);
 	mark_buffer_dirty(bh);
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 91ad9be18ff9..a191e79e66a9 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -135,65 +135,131 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 
 extern struct timezone sys_tz;
 
+/*
+ * The epoch of FAT timestamp is 1980.
+ *     :  bits :     value
+ * date:  0 -  4: day	(1 -  31)
+ * date:  5 -  8: month	(1 -  12)
+ * date:  9 - 15: year	(0 - 127) from 1980
+ * time:  0 -  4: sec	(0 -  29) 2sec counts
+ * time:  5 - 10: min	(0 -  59)
+ * time: 11 - 15: hour	(0 -  23)
+ */
+#define SECS_PER_MIN	60
+#define SECS_PER_HOUR	(60 * 60)
+#define SECS_PER_DAY	(SECS_PER_HOUR * 24)
+#define UNIX_SECS_1980	315532800L
+#if BITS_PER_LONG == 64
+#define UNIX_SECS_2108	4354819200L
+#endif
+/* days between 1.1.70 and 1.1.80 (2 leap days) */
+#define DAYS_DELTA	(365 * 10 + 2)
+/* 120 (2100 - 1980) isn't leap year */
+#define YEAR_2100	120
+#define IS_LEAP_YEAR(y)	(!((y) & 3) && (y) != YEAR_2100)
+
 /* Linear day numbers of the respective 1sts in non-leap years. */
-static int day_n[] = {
-   /* Jan  Feb  Mar  Apr   May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
-	0,  31,  59,  90,  120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0
+static time_t days_in_year[] = {
+	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
 };
 
-/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-int date_dos2unix(unsigned short time, unsigned short date, int tz_utc)
+/* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */
+void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
+		       __le16 __time, __le16 __date, u8 time_cs)
 {
-	int month, year, secs;
+	u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date);
+	time_t second, day, leap_day, month, year;
 
-	/*
-	 * first subtract and mask after that... Otherwise, if
-	 * date == 0, bad things happen
-	 */
-	month = ((date >> 5) - 1) & 15;
-	year = date >> 9;
-	secs = (time & 31)*2+60*((time >> 5) & 63)+(time >> 11)*3600+86400*
-	    ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 &&
-	    month < 2 ? 1 : 0)+3653);
-			/* days since 1.1.70 plus 80's leap day */
-	if (!tz_utc)
-		secs += sys_tz.tz_minuteswest*60;
-	return secs;
+	year  = date >> 9;
+	month = max(1, (date >> 5) & 0xf);
+	day   = max(1, date & 0x1f) - 1;
+
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	if (IS_LEAP_YEAR(year) && month > 2)
+		leap_day++;
+
+	second =  (time & 0x1f) << 1;
+	second += ((time >> 5) & 0x3f) * SECS_PER_MIN;
+	second += (time >> 11) * SECS_PER_HOUR;
+	second += (year * 365 + leap_day
+		   + days_in_year[month] + day
+		   + DAYS_DELTA) * SECS_PER_DAY;
+
+	if (!sbi->options.tz_utc)
+		second += sys_tz.tz_minuteswest * SECS_PER_MIN;
+
+	if (time_cs) {
+		ts->tv_sec = second + (time_cs / 100);
+		ts->tv_nsec = (time_cs % 100) * 10000000;
+	} else {
+		ts->tv_sec = second;
+		ts->tv_nsec = 0;
+	}
 }
 
-/* Convert linear UNIX date to a MS-DOS time/date pair. */
-void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc)
+/* Convert linear UNIX date to a FAT time/date pair. */
+void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
+		       __le16 *time, __le16 *date, u8 *time_cs)
 {
-	int day, year, nl_day, month;
+	time_t second = ts->tv_sec;
+	time_t day, leap_day, month, year;
 
-	if (!tz_utc)
-		unix_date -= sys_tz.tz_minuteswest*60;
+	if (!sbi->options.tz_utc)
+		second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
 
 	/* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
-	if (unix_date < 315532800)
-		unix_date = 315532800;
-
-	*time = cpu_to_le16((unix_date % 60)/2+(((unix_date/60) % 60) << 5)+
-	    (((unix_date/3600) % 24) << 11));
-	day = unix_date/86400-3652;
-	year = day/365;
-	if ((year+3)/4+365*year > day)
+	if (second < UNIX_SECS_1980) {
+		*time = 0;
+		*date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
+		if (time_cs)
+			*time_cs = 0;
+		return;
+	}
+#if BITS_PER_LONG == 64
+	if (second >= UNIX_SECS_2108) {
+		*time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
+		*date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
+		if (time_cs)
+			*time_cs = 199;
+		return;
+	}
+#endif
+
+	day = second / SECS_PER_DAY - DAYS_DELTA;
+	year = day / 365;
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	if (year * 365 + leap_day > day)
 		year--;
-	day -= (year+3)/4+365*year;
-	if (day == 59 && !(year & 3)) {
-		nl_day = day;
+	leap_day = (year + 3) / 4;
+	if (year > YEAR_2100)		/* 2100 isn't leap year */
+		leap_day--;
+	day -= year * 365 + leap_day;
+
+	if (IS_LEAP_YEAR(year) && day == days_in_year[3]) {
 		month = 2;
 	} else {
-		nl_day = (year & 3) || day <= 59 ? day : day-1;
-		for (month = 0; month < 12; month++) {
-			if (day_n[month] > nl_day)
+		if (IS_LEAP_YEAR(year) && day > days_in_year[3])
+			day--;
+		for (month = 1; month < 12; month++) {
+			if (days_in_year[month + 1] > day)
 				break;
 		}
 	}
-	*date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9));
-}
+	day -= days_in_year[month];
 
-EXPORT_SYMBOL_GPL(fat_date_unix2dos);
+	*time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11
+			    | ((second / SECS_PER_MIN) % 60) << 5
+			    | (second % SECS_PER_MIN) >> 1);
+	*date = cpu_to_le16((year << 9) | (month << 5) | (day + 1));
+	if (time_cs)
+		*time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
+}
+EXPORT_SYMBOL_GPL(fat_time_unix2fat);
 
 int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index c0a4d5cd99b2..e92e8158ebaf 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -247,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 	if (is_hid)
 		de.attr |= ATTR_HIDDEN;
 	de.lcase = 0;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, NULL);
 	de.cdate = de.adate = 0;
 	de.ctime = 0;
 	de.ctime_cs = 0;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index facf3bf0211a..1536bc3ca0f0 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -568,6 +568,7 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name,
 	unsigned char msdos_name[MSDOS_NAME];
 	wchar_t *uname;
 	__le16 time, date;
+	u8 time_cs;
 	int err, ulen, usize, i;
 	loff_t offset;
 
@@ -620,10 +621,10 @@ shortname:
 	memcpy(de->name, msdos_name, MSDOS_NAME);
 	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
 	de->lcase = lcase;
-	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
+	fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
 	de->time = de->ctime = time;
 	de->date = de->cdate = de->adate = date;
-	de->ctime_cs = 0;
+	de->ctime_cs = time_cs;
 	de->start = cpu_to_le16(cluster);
 	de->starthi = cpu_to_le16(cluster >> 16);
 	de->size = 0;
-- 
cgit v1.2.3


From 53472bc8f810d2fb507593ea03703670506a668d Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:47 -0800
Subject: fat: use generic_file_llseek() for directory

fat_dir_ioctl() was already fixed (i.e. it is called under ->i_mutex),
and __fat_readdir() doesn't take the BKL anymore. So taking the BKL for
->llseek() is pointless, and we should use generic_file_llseek().

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index a601c6d45bc0..931dd28b5289 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -832,6 +832,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
 #endif /* CONFIG_COMPAT */
 
 const struct file_operations fat_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= fat_readdir,
 	.ioctl		= fat_dir_ioctl,
-- 
cgit v1.2.3


From 52e9d9f4b32a3bec91feb76c84e37b7dcffe5040 Mon Sep 17 00:00:00 2001
From: Darren Jenkins <darrenrjenkins@gmail.com>
Date: Thu, 6 Nov 2008 12:53:48 -0800
Subject: fat: cleanup fat_parse_long() error handling

Coverity CID 2332 & 2333 RESOURCE_LEAK

In fat_search_long(), if fat_parse_long() returns a negative value we
return without first freeing unicode.  This patch frees it on this error
path.

The above was a false positive on the current tree, but this change is
cleaner, so apply it as a cleanup.

[hirofumi@mail.parknet.co.jp: fix coding style]
Signed-off-by: Darren Jenkins <darrenrjenkins@gmail.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 931dd28b5289..140fc39e2307 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -373,9 +373,10 @@ parse_record:
 		if (de->attr == ATTR_EXT) {
 			int status = fat_parse_long(inode, &cpos, &bh, &de,
 						    &unicode, &nr_slots);
-			if (status < 0)
-				return status;
-			else if (status == PARSE_INVALID)
+			if (status < 0) {
+				err = status;
+				goto end_of_dir;
+			} else if (status == PARSE_INVALID)
 				continue;
 			else if (status == PARSE_NOT_LONGNAME)
 				goto parse_record;
-- 
cgit v1.2.3


From d3dfa8228f87ab9960ab8b4718013d68e3c25a43 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:49 -0800
Subject: fat: improve fat_hash()

fat_hash() uses an algorithm that is known to be bad. This replaces it
with hash_32(). The following is a summary of the test results.

old hash:
	hash func (1000 times): 33489 cycles
	total inodes in hash table: 70926
	largest bucket contains: 696
	smallest bucket contains: 54

new hash:
	hash func (1000 times): 33129 cycles
	total inodes in hash table: 70926
	largest bucket contains: 315
	smallest bucket contains: 236

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h   |  1 -
 fs/fat/inode.c | 18 +++++++-----------
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a2a570f81719..2b8e94c3eef4 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -43,7 +43,6 @@ struct fat_mount_options {
 
 #define FAT_HASH_BITS	8
 #define FAT_HASH_SIZE	(1UL << FAT_HASH_BITS)
-#define FAT_HASH_MASK	(FAT_HASH_SIZE-1)
 
 /*
  * MS-DOS file system in-core superblock data
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 079d9d5e0d36..f58cd48d98b8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -26,6 +26,7 @@
 #include <linux/uio.h>
 #include <linux/writeback.h>
 #include <linux/log2.h>
+#include <linux/hash.h>
 #include <asm/unaligned.h>
 #include "fat.h"
 
@@ -247,25 +248,21 @@ static void fat_hash_init(struct super_block *sb)
 		INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
 }
 
-static inline unsigned long fat_hash(struct super_block *sb, loff_t i_pos)
+static inline unsigned long fat_hash(loff_t i_pos)
 {
-	unsigned long tmp = (unsigned long)i_pos | (unsigned long) sb;
-	tmp = tmp + (tmp >> FAT_HASH_BITS) + (tmp >> FAT_HASH_BITS * 2);
-	return tmp & FAT_HASH_MASK;
+	return hash_32(i_pos, FAT_HASH_BITS);
 }
 
 void fat_attach(struct inode *inode, loff_t i_pos)
 {
-	struct super_block *sb = inode->i_sb;
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
 
 	spin_lock(&sbi->inode_hash_lock);
 	MSDOS_I(inode)->i_pos = i_pos;
-	hlist_add_head(&MSDOS_I(inode)->i_fat_hash,
-			sbi->inode_hashtable + fat_hash(sb, i_pos));
+	hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head);
 	spin_unlock(&sbi->inode_hash_lock);
 }
-
 EXPORT_SYMBOL_GPL(fat_attach);
 
 void fat_detach(struct inode *inode)
@@ -276,13 +273,12 @@ void fat_detach(struct inode *inode)
 	hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
 	spin_unlock(&sbi->inode_hash_lock);
 }
-
 EXPORT_SYMBOL_GPL(fat_detach);
 
 struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct hlist_head *head = sbi->inode_hashtable + fat_hash(sb, i_pos);
+	struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos);
 	struct hlist_node *_p;
 	struct msdos_inode_info *i;
 	struct inode *inode = NULL;
-- 
cgit v1.2.3


From 5e35dd4651002207948f10c576fc7d9bad448815 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:49 -0800
Subject: fat: Fix fat_ent_update_ptr() for FAT12

This fixes the missing update of bhs/nr_bhs in the case where the caller
moves from an entry spanning a block boundary (two buffer heads) to an
entry that lies entirely within the first of those blocks.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fatent.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 5b5f49061b7c..13513992da3c 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -317,10 +317,20 @@ static inline int fat_ent_update_ptr(struct super_block *sb,
 	/* Is this fatent's blocks including this entry? */
 	if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr)
 		return 0;
-	/* Does this entry need the next block? */
-	if (sbi->fat_bits == 12 && (offset + 1) >= sb->s_blocksize) {
-		if (fatent->nr_bhs != 2 || bhs[1]->b_blocknr != (blocknr + 1))
-			return 0;
+	if (sbi->fat_bits == 12) {
+		if ((offset + 1) < sb->s_blocksize) {
+			/* This entry is on bhs[0]. */
+			if (fatent->nr_bhs == 2) {
+				brelse(bhs[1]);
+				fatent->nr_bhs = 1;
+			}
+		} else {
+			/* This entry needs the next block. */
+			if (fatent->nr_bhs != 2)
+				return 0;
+			if (bhs[1]->b_blocknr != (blocknr + 1))
+				return 0;
+		}
 	}
 	ops->ent_set_ptr(fatent, offset);
 	return 1;
-- 
cgit v1.2.3


From a993b542bb4cd3e5a64863b7ef892bbebec2239b Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:50 -0800
Subject: fat: use fat_detach() in fat_clear_inode()

Use fat_detach() instead of opencoding it.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index f58cd48d98b8..8e1b75c63c7f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -429,13 +429,8 @@ static void fat_delete_inode(struct inode *inode)
 
 static void fat_clear_inode(struct inode *inode)
 {
-	struct super_block *sb = inode->i_sb;
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-
-	spin_lock(&sbi->inode_hash_lock);
 	fat_cache_inval_inode(inode);
-	hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
-	spin_unlock(&sbi->inode_hash_lock);
+	fat_detach(inode);
 }
 
 static void fat_write_super(struct super_block *sb)
-- 
cgit v1.2.3


From 068f5ae05c51d2cee6b31cb3da06775dd83bd348 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:51 -0800
Subject: vfat: Fix vfat_find() error path in vfat_lookup()

The current vfat_lookup() blindly creates a negative dentry if
vfat_find() returned an error. That is wrong. If the error isn't
-ENOENT, just return the error.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 1536bc3ca0f0..419deabfb9be 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -683,7 +683,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
+	struct inode *inode;
 	struct dentry *alias;
 	int err, table;
 
@@ -693,14 +693,18 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 
 	err = vfat_find(dir, &dentry->d_name, &sinfo);
 	if (err) {
-		table++;
+		if (err == -ENOENT) {
+			table++;
+			inode = NULL;
+			goto out;
+		}
 		goto error;
 	}
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
-		unlock_super(sb);
-		return ERR_CAST(inode);
+		err = PTR_ERR(inode);
+		goto error;
 	}
 	alias = d_find_alias(inode);
 	if (alias) {
@@ -713,7 +717,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		}
 
 	}
-error:
+out:
 	unlock_super(sb);
 	dentry->d_op = &vfat_dentry_ops[table];
 	dentry->d_time = dentry->d_parent->d_inode->i_version;
@@ -723,6 +727,10 @@ error:
 		dentry->d_time = dentry->d_parent->d_inode->i_version;
 	}
 	return dentry;
+
+error:
+	unlock_super(sb);
+	return ERR_PTR(err);
 }
 
 static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
-- 
cgit v1.2.3


From 1b52467243c7167b3a267ddbcbb14d550f28eb4a Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:51 -0800
Subject: fat: Fix/Cleanup dcache handling for vfat

- Add comments for handling dcache of vfat.

- Separate case-sensitive case and case-insensitive to
  vfat_revalidate() and vfat_ci_revalidate().

  vfat_revalidate() doesn't need to drop case-insensitive negative
  dentry on creation path.

- The current code fails to set ->d_revalidate on the negative dentry
  created by unlink, etc.

  This sets ->d_revalidate always, and returns 1 for positive
  dentry. Now, we don't need to change ->d_op dynamically anymore,
  so this just uses sb->s_root->d_op to set ->d_op.

- d_find_alias() may return DCACHE_DISCONNECTED dentry. It's not
  the interesting dentry there. This checks it.

- Add missing LOOKUP_PARENT check. We don't need to drop the valid
  negative dentry for (LOOKUP_CREATE | LOOKUP_PARENT) lookup.

- For consistent filename on creation path, this drops negative dentry
  if we can't see intent.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 124 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 80 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 419deabfb9be..d585398f9f6b 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -24,27 +24,67 @@
 #include <linux/namei.h>
 #include "fat.h"
 
-static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+/*
+ * If new entry was created in the parent, it could create the 8.3
+ * alias (the shortname of logname).  So, the parent may have the
+ * negative-dentry which matches the created 8.3 alias.
+ *
+ * If it happened, the negative dentry isn't actually negative
+ * anymore.  So, drop it.
+ */
+static int vfat_revalidate_shortname(struct dentry *dentry)
 {
 	int ret = 1;
-
-	if (!dentry->d_inode &&
-	    nd && !(nd->flags & LOOKUP_CONTINUE) && (nd->flags & LOOKUP_CREATE))
-		/*
-		 * negative dentry is dropped, in order to make sure
-		 * to use the name which a user desires if this is
-		 * create path.
-		 */
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_time != dentry->d_parent->d_inode->i_version)
 		ret = 0;
-	else {
-		spin_lock(&dentry->d_lock);
-		if (dentry->d_time != dentry->d_parent->d_inode->i_version)
-			ret = 0;
-		spin_unlock(&dentry->d_lock);
-	}
+	spin_unlock(&dentry->d_lock);
 	return ret;
 }
 
+static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	/* This is not negative dentry. Always valid. */
+	if (dentry->d_inode)
+		return 1;
+	return vfat_revalidate_shortname(dentry);
+}
+
+static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
+{
+	/*
+	 * This is not negative dentry. Always valid.
+	 *
+	 * Note, rename() to existing directory entry will have ->d_inode,
+	 * and will use existing name which isn't specified name by user.
+	 *
+	 * We may be able to drop this positive dentry here. But dropping
+	 * positive dentry isn't good idea. So it's unsupported like
+	 * rename("filename", "FILENAME") for now.
+	 */
+	if (dentry->d_inode)
+		return 1;
+
+	/*
+	 * This may be nfsd (or something), anyway, we can't see the
+	 * intent of this. So, since this can be for creation, drop it.
+	 */
+	if (!nd)
+		return 0;
+
+	/*
+	 * Drop the negative dentry, in order to make sure to use the
+	 * case sensitive name which is specified by user if this is
+	 * for creation.
+	 */
+	if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+		if (nd->flags & LOOKUP_CREATE)
+			return 0;
+	}
+
+	return vfat_revalidate_shortname(dentry);
+}
+
 /* returns the length of a struct qstr, ignoring trailing dots */
 static unsigned int vfat_striptail_len(struct qstr *qstr)
 {
@@ -126,25 +166,16 @@ static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
 	return 1;
 }
 
-static struct dentry_operations vfat_dentry_ops[4] = {
-	{
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hashi,
-		.d_compare	= vfat_cmpi,
-	},
-	{
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	},
-	{
-		.d_revalidate	= vfat_revalidate,
-		.d_hash		= vfat_hash,
-		.d_compare	= vfat_cmp,
-	}
+static struct dentry_operations vfat_ci_dentry_ops = {
+	.d_revalidate	= vfat_revalidate_ci,
+	.d_hash		= vfat_hashi,
+	.d_compare	= vfat_cmpi,
+};
+
+static struct dentry_operations vfat_dentry_ops = {
+	.d_revalidate	= vfat_revalidate,
+	.d_hash		= vfat_hash,
+	.d_compare	= vfat_cmp,
 };
 
 /* Characters that are undesirable in an MS-DOS file name */
@@ -685,29 +716,35 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 	struct fat_slot_info sinfo;
 	struct inode *inode;
 	struct dentry *alias;
-	int err, table;
+	int err;
 
 	lock_super(sb);
-	table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
-	dentry->d_op = &vfat_dentry_ops[table];
 
 	err = vfat_find(dir, &dentry->d_name, &sinfo);
 	if (err) {
 		if (err == -ENOENT) {
-			table++;
 			inode = NULL;
 			goto out;
 		}
 		goto error;
 	}
+
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto error;
 	}
+
 	alias = d_find_alias(inode);
-	if (alias) {
+	if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
+		/*
+		 * This inode has non DCACHE_DISCONNECTED dentry. This
+		 * means, the user did ->lookup() by an another name
+		 * (longname vs 8.3 alias of it) in past.
+		 *
+		 * Switch to new one for reason of locality if possible.
+		 */
 		if (d_invalidate(alias) == 0)
 			dput(alias);
 		else {
@@ -715,15 +752,14 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 			unlock_super(sb);
 			return alias;
 		}
-
 	}
 out:
 	unlock_super(sb);
-	dentry->d_op = &vfat_dentry_ops[table];
+	dentry->d_op = sb->s_root->d_op;
 	dentry->d_time = dentry->d_parent->d_inode->i_version;
 	dentry = d_splice_alias(inode, dentry);
 	if (dentry) {
-		dentry->d_op = &vfat_dentry_ops[table];
+		dentry->d_op = sb->s_root->d_op;
 		dentry->d_time = dentry->d_parent->d_inode->i_version;
 	}
 	return dentry;
@@ -1022,9 +1058,9 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 		return res;
 
 	if (MSDOS_SB(sb)->options.name_check != 's')
-		sb->s_root->d_op = &vfat_dentry_ops[0];
+		sb->s_root->d_op = &vfat_ci_dentry_ops;
 	else
-		sb->s_root->d_op = &vfat_dentry_ops[2];
+		sb->s_root->d_op = &vfat_dentry_ops;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 1c13a243a461dd5b089d29e5d57f260c990e462c Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:52 -0800
Subject: fat: Kill d_invalidate() in vfat_lookup()

d_invalidate() for positive dentry doesn't work in some cases
(vfsmount, nfsd, and maybe others). shrink_dcache_parent() by
d_invalidate() is pointless for vfat usage at all.

So, this kills it and uses d_move() instead.

To preserve the old behavior, this simply returns the alias for a
directory (don't change pwd, etc.). Directory lookup shouldn't be
important for performance.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index d585398f9f6b..bf326d4356a3 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -745,13 +745,12 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
 		 *
 		 * Switch to new one for reason of locality if possible.
 		 */
-		if (d_invalidate(alias) == 0)
-			dput(alias);
-		else {
-			iput(inode);
-			unlock_super(sb);
-			return alias;
-		}
+		BUG_ON(d_unhashed(alias));
+		if (!S_ISDIR(inode->i_mode))
+			d_move(alias, dentry);
+		iput(inode);
+		unlock_super(sb);
+		return alias;
 	}
 out:
 	unlock_super(sb);
-- 
cgit v1.2.3


From 45cfbe354785a5bc9a38354754d6f7322f598001 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:53 -0800
Subject: fat: Cleanup msdos_lookup()

Use the same style as vfat_lookup().

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_msdos.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index e92e8158ebaf..7ba03a4acbe0 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -203,33 +203,37 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fat_slot_info sinfo;
-	struct inode *inode = NULL;
-	int res;
-
-	dentry->d_op = &msdos_dentry_operations;
+	struct inode *inode;
+	int err;
 
 	lock_super(sb);
-	res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
-	if (res == -ENOENT)
-		goto add;
-	if (res < 0)
-		goto out;
+
+	err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
+	if (err) {
+		if (err == -ENOENT) {
+			inode = NULL;
+			goto out;
+		}
+		goto error;
+	}
+
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
 	if (IS_ERR(inode)) {
-		res = PTR_ERR(inode);
-		goto out;
+		err = PTR_ERR(inode);
+		goto error;
 	}
-add:
-	res = 0;
+out:
+	unlock_super(sb);
+	dentry->d_op = &msdos_dentry_operations;
 	dentry = d_splice_alias(inode, dentry);
 	if (dentry)
 		dentry->d_op = &msdos_dentry_operations;
-out:
+	return dentry;
+
+error:
 	unlock_super(sb);
-	if (!res)
-		return dentry;
-	return ERR_PTR(res);
+	return ERR_PTR(err);
 }
 
 /***** Creates a directory entry (name is already formatted). */
-- 
cgit v1.2.3


From 9c0aa1b87bf541affef519eb4879ce7c5a5941ae Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:54 -0800
Subject: fat: Cleanup FAT attribute stuff

This adds three helpers:

fat_make_attrs() - makes FAT attributes from inode.
fat_make_mode()  - makes mode_t from FAT attributes.
fat_save_attrs() - saves FAT attributes to inode.

Then this replaces: MSDOS_MKMODE() by fat_make_mode(), fat_attr() by
fat_make_attrs(), ->i_attrs = attr & ATTR_UNUSED by fat_save_attrs().
And for the root inode, these are used with ATTR_DIR instead of the
bogus ATTR_NONE.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h             | 20 +++++++++++++++++++-
 fs/fat/file.c            | 32 ++++++++++++--------------------
 fs/fat/inode.c           | 19 +++++++++----------
 include/linux/msdos_fs.h |  5 -----
 4 files changed, 40 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 2b8e94c3eef4..3b4753a024e3 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -117,14 +117,32 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
 }
 
+/* Convert attribute bits and a mask to the UNIX mode. */
+static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
+				   u8 attrs, mode_t mode)
+{
+	if (attrs & ATTR_RO)
+		mode &= ~S_IWUGO;
+
+	if (attrs & ATTR_DIR)
+		return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+	else
+		return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+
 /* Return the FAT attribute byte for this inode */
-static inline u8 fat_attr(struct inode *inode)
+static inline u8 fat_make_attrs(struct inode *inode)
 {
 	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
 		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
 		MSDOS_I(inode)->i_attrs;
 }
 
+static inline void fat_save_attrs(struct inode *inode, u8 attrs)
+{
+	MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+}
+
 static inline unsigned char fat_checksum(const __u8 *name)
 {
 	unsigned char s = name[0];
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b21973f266a1..f5a7e907a8fa 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -27,13 +27,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	switch (cmd) {
 	case FAT_IOCTL_GET_ATTRIBUTES:
 	{
-		u32 attr;
-
-		if (inode->i_ino == MSDOS_ROOT_INO)
-			attr = ATTR_DIR;
-		else
-			attr = fat_attr(inode);
-
+		u32 attr = fat_make_attrs(inode);
 		return put_user(attr, user_attr);
 	}
 	case FAT_IOCTL_SET_ATTRIBUTES:
@@ -62,20 +56,16 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		/* Merge in ATTR_VOLUME and ATTR_DIR */
 		attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
 			(is_dir ? ATTR_DIR : 0);
-		oldattr = fat_attr(inode);
+		oldattr = fat_make_attrs(inode);
 
 		/* Equivalent to a chmod() */
 		ia.ia_valid = ATTR_MODE | ATTR_CTIME;
 		ia.ia_ctime = current_fs_time(inode->i_sb);
-		if (is_dir) {
-			ia.ia_mode = MSDOS_MKMODE(attr,
-				S_IRWXUGO & ~sbi->options.fs_dmask)
-				| S_IFDIR;
-		} else {
-			ia.ia_mode = MSDOS_MKMODE(attr,
-				(S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO))
-				& ~sbi->options.fs_fmask)
-				| S_IFREG;
+		if (is_dir)
+			ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
+		else {
+			ia.ia_mode = fat_make_mode(sbi, attr,
+				S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
 		}
 
 		/* The root directory has no attributes */
@@ -115,7 +105,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 				inode->i_flags &= S_IMMUTABLE;
 		}
 
-		MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
+		fat_save_attrs(inode, attr);
 		mark_inode_dirty(inode);
 up:
 		mnt_drop_write(filp->f_path.mnt);
@@ -274,7 +264,7 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
 
 	/*
 	 * Note, the basic check is already done by a caller of
-	 * (attr->ia_mode & ~MSDOS_VALID_MODE)
+	 * (attr->ia_mode & ~FAT_VALID_MODE)
 	 */
 
 	if (S_ISREG(inode->i_mode))
@@ -314,6 +304,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 }
 
 #define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+/* valid file mode bits */
+#define FAT_VALID_MODE	(S_IFREG | S_IFDIR | S_IRWXUGO)
 
 int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -356,7 +348,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 	    ((attr->ia_valid & ATTR_GID) &&
 	     (attr->ia_gid != sbi->options.fs_gid)) ||
 	    ((attr->ia_valid & ATTR_MODE) &&
-	     (attr->ia_mode & ~MSDOS_VALID_MODE)))
+	     (attr->ia_mode & ~FAT_VALID_MODE)))
 		error = -EPERM;
 
 	if (error) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8e1b75c63c7f..7aaa21cf019a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -337,8 +337,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 
 	if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) {
 		inode->i_generation &= ~1;
-		inode->i_mode = MSDOS_MKMODE(de->attr,
-			S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+		inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO);
 		inode->i_op = sbi->dir_ops;
 		inode->i_fop = &fat_dir_operations;
 
@@ -355,10 +354,9 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		inode->i_nlink = fat_subdirs(inode);
 	} else { /* not a directory */
 		inode->i_generation |= 1;
-		inode->i_mode = MSDOS_MKMODE(de->attr,
-		    ((sbi->options.showexec && !is_exec(de->name + 8))
-			? S_IRUGO|S_IWUGO : S_IRWXUGO)
-		    & ~sbi->options.fs_fmask) | S_IFREG;
+		inode->i_mode = fat_make_mode(sbi, de->attr,
+			((sbi->options.showexec && !is_exec(de->name + 8))
+			 ? S_IRUGO|S_IWUGO : S_IRWXUGO));
 		MSDOS_I(inode)->i_start = le16_to_cpu(de->start);
 		if (sbi->fat_bits == 32)
 			MSDOS_I(inode)->i_start |= (le16_to_cpu(de->starthi) << 16);
@@ -374,7 +372,8 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		if (sbi->options.sys_immutable)
 			inode->i_flags |= S_IMMUTABLE;
 	}
-	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
+	fat_save_attrs(inode, de->attr);
+
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 
@@ -569,7 +568,7 @@ retry:
 		raw_entry->size = 0;
 	else
 		raw_entry->size = cpu_to_le32(inode->i_size);
-	raw_entry->attr = fat_attr(inode);
+	raw_entry->attr = fat_make_attrs(inode);
 	raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
 	fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time,
@@ -1105,7 +1104,7 @@ static int fat_read_root(struct inode *inode)
 	inode->i_gid = sbi->options.fs_gid;
 	inode->i_version++;
 	inode->i_generation = 0;
-	inode->i_mode = (S_IRWXUGO & ~sbi->options.fs_dmask) | S_IFDIR;
+	inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO);
 	inode->i_op = sbi->dir_ops;
 	inode->i_fop = &fat_dir_operations;
 	if (sbi->fat_bits == 32) {
@@ -1122,7 +1121,7 @@ static int fat_read_root(struct inode *inode)
 	MSDOS_I(inode)->i_logstart = 0;
 	MSDOS_I(inode)->mmu_private = inode->i_size;
 
-	MSDOS_I(inode)->i_attrs = ATTR_NONE;
+	fat_save_attrs(inode, ATTR_DIR);
 	inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
 	inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
 	inode->i_nlink = fat_subdirs(inode)+2;
diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h
index 0982fb47a90d..e0a9b207920d 100644
--- a/include/linux/msdos_fs.h
+++ b/include/linux/msdos_fs.h
@@ -46,11 +46,6 @@
 #define DELETED_FLAG	0xe5	/* marks file as deleted when in name[0] */
 #define IS_FREE(n)	(!*(n) || *(n) == DELETED_FLAG)
 
-/* valid file mode bits */
-#define MSDOS_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO)
-/* Convert attribute bits and a mask to the UNIX mode. */
-#define MSDOS_MKMODE(a, m) (m & (a & ATTR_RO ? S_IRUGO|S_IXUGO : S_IRWXUGO))
-
 #define MSDOS_NAME	11	/* maximum name length */
 #define MSDOS_LONGNAME	256	/* maximum name length */
 #define MSDOS_SLOTS	21	/* max # of slots for short and long names */
-- 
cgit v1.2.3


From 9183482f5d4a2de00f66641b974e7f351d41b675 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:54 -0800
Subject: fat: Fix ATTR_RO in the case of (~umask & S_WUGO) == 0

If inode->i_mode doesn't have S_IWUGO, the current code assumes it means
ATTR_RO.  However, if (~[ufd]mask & S_IWUGO) == 0, inode->i_mode can't
hold S_IWUGO.  Therefore the updated directory entry will always have
ATTR_RO.

This adds fat_mode_can_hold_ro() to check it.  If inode->i_mode can't
hold the read-only state, ->i_attrs is used to hold ATTR_RO instead.

With this, we don't set ATTR_RO unless users change it via ioctl() if
(~[ufd]mask & S_IWUGO) == 0.

On the FAT_IOCTL_GET_ATTRIBUTES path, this also takes ->i_mutex so that
attributes partially updated by FAT_IOCTL_SET_ATTRIBUTES are not
returned to userland.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h  | 33 +++++++++++++++++++++++++++++----
 fs/fat/file.c |  7 ++++++-
 2 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 3b4753a024e3..313b645b8126 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -117,6 +117,25 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 	return container_of(inode, struct msdos_inode_info, vfs_inode);
 }
 
+/*
+ * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
+ * save ATTR_RO instead of ->i_mode.
+ */
+static inline int fat_mode_can_hold_ro(struct inode *inode)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+	mode_t mask;
+
+	if (S_ISDIR(inode->i_mode))
+		mask = ~sbi->options.fs_dmask;
+	else
+		mask = ~sbi->options.fs_fmask;
+
+	if (!(mask & S_IWUGO))
+		return 0;
+	return 1;
+}
+
 /* Convert attribute bits and a mask to the UNIX mode. */
 static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 				   u8 attrs, mode_t mode)
@@ -133,14 +152,20 @@ static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 /* Return the FAT attribute byte for this inode */
 static inline u8 fat_make_attrs(struct inode *inode)
 {
-	return ((inode->i_mode & S_IWUGO) ? ATTR_NONE : ATTR_RO) |
-		(S_ISDIR(inode->i_mode) ? ATTR_DIR : ATTR_NONE) |
-		MSDOS_I(inode)->i_attrs;
+	u8 attrs = MSDOS_I(inode)->i_attrs;
+	if (S_ISDIR(inode->i_mode))
+		attrs |= ATTR_DIR;
+	if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO))
+		attrs |= ATTR_RO;
+	return attrs;
 }
 
 static inline void fat_save_attrs(struct inode *inode, u8 attrs)
 {
-	MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+	if (fat_mode_can_hold_ro(inode))
+		MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED;
+	else
+		MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO);
 }
 
 static inline unsigned char fat_checksum(const __u8 *name)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f5a7e907a8fa..81b15c623803 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -27,7 +27,12 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	switch (cmd) {
 	case FAT_IOCTL_GET_ATTRIBUTES:
 	{
-		u32 attr = fat_make_attrs(inode);
+		u32 attr;
+
+		mutex_lock(&inode->i_mutex);
+		attr = fat_make_attrs(inode);
+		mutex_unlock(&inode->i_mutex);
+
 		return put_user(attr, user_attr);
 	}
 	case FAT_IOCTL_SET_ATTRIBUTES:
-- 
cgit v1.2.3


From dfc209c0064efef5590f608056a48b61a5cac09c Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:55 -0800
Subject: fat: Fix ATTR_RO for directory

FAT has the ATTR_RO (read-only) attribute. But on Windows, the ATTR_RO
of a directory is actually just ignored, and is used only by
applications as a flag. E.g. it is set on customized folders by
Explorer.

http://msdn2.microsoft.com/en-us/library/aa969337.aspx

This adds the "rodir" option. If the user specifies it, ATTR_RO is used
as the read-only flag even for directories. Otherwise, a directory's
->i_mode is not used to hold ATTR_RO (i.e. fat_mode_can_hold_ro()
returns 0).

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/vfat.txt |  8 ++++++++
 fs/fat/fat.h                       | 14 ++++++++++----
 fs/fat/file.c                      | 16 ++++++++++++----
 fs/fat/inode.c                     | 17 +++++++++++++----
 4 files changed, 43 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index dc9dc73d7d38..3a5ddc96901a 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -124,6 +124,14 @@ sys_immutable -- If set, ATTR_SYS attribute on FAT is handled as
 flush         -- If set, the filesystem will try to flush to disk more
 		 early than normal. Not set by default.
 
+rodir	      -- FAT has the ATTR_RO (read-only) attribute. But on Windows,
+		 the ATTR_RO of a directory is actually just ignored,
+		 and is used only by applications as a flag. E.g. it is
+		 set for a customized folder.
+
+		 If you want to use ATTR_RO as read-only flag even for
+		 the directory, set this option.
+
 <bool>: 0,1,yes,no,true,false
 
 TODO
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 313b645b8126..e9dce5d8e7a7 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -38,7 +38,8 @@ struct fat_mount_options {
 		 flush:1,	  /* write things quickly */
 		 nocase:1,	  /* Does this need case conversion? 0=need case conversion*/
 		 usefree:1,	  /* Use free_clusters for FAT32 */
-		 tz_utc:1;	  /* Filesystem timestamps are in UTC */
+		 tz_utc:1,	  /* Filesystem timestamps are in UTC */
+		 rodir:1;	  /* allow ATTR_RO for directory */
 };
 
 #define FAT_HASH_BITS	8
@@ -120,15 +121,20 @@ static inline struct msdos_inode_info *MSDOS_I(struct inode *inode)
 /*
  * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
  * save ATTR_RO instead of ->i_mode.
+ *
+ * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only
+ * bit, it's just used as flag for app.
  */
 static inline int fat_mode_can_hold_ro(struct inode *inode)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
 	mode_t mask;
 
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode)) {
+		if (!sbi->options.rodir)
+			return 0;
 		mask = ~sbi->options.fs_dmask;
-	else
+	} else
 		mask = ~sbi->options.fs_fmask;
 
 	if (!(mask & S_IWUGO))
@@ -140,7 +146,7 @@ static inline int fat_mode_can_hold_ro(struct inode *inode)
 static inline mode_t fat_make_mode(struct msdos_sb_info *sbi,
 				   u8 attrs, mode_t mode)
 {
-	if (attrs & ATTR_RO)
+	if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir))
 		mode &= ~S_IWUGO;
 
 	if (attrs & ATTR_DIR)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 81b15c623803..f06a4e525ece 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -282,11 +282,18 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
 	/*
 	 * Of the r and x bits, all (subject to umask) must be present. Of the
 	 * w bits, either all (subject to umask) or none must be present.
+	 *
+	 * If fat_mode_can_hold_ro(inode) is false, can't change w bits.
 	 */
 	if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
 		return -EPERM;
-	if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
-		return -EPERM;
+	if (fat_mode_can_hold_ro(inode)) {
+		if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
+			return -EPERM;
+	} else {
+		if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
+			return -EPERM;
+	}
 
 	*mode_ptr &= S_IFMT | perm;
 
@@ -316,8 +323,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 	struct inode *inode = dentry->d_inode;
-	int error = 0;
 	unsigned int ia_valid;
+	int error;
 
 	/*
 	 * Expand the file. Since inode_setattr() updates ->i_size
@@ -371,7 +378,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 			attr->ia_valid &= ~ATTR_MODE;
 	}
 
-	error = inode_setattr(inode, attr);
+	if (attr->ia_valid)
+		error = inode_setattr(inode, attr);
 out:
 	return error;
 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7aaa21cf019a..0da04e6d1e34 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -797,8 +797,10 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 			seq_puts(m, ",uni_xlate");
 		if (!opts->numtail)
 			seq_puts(m, ",nonumtail");
+		if (opts->rodir)
+			seq_puts(m, ",rodir");
 	}
-	if (sbi->options.flush)
+	if (opts->flush)
 		seq_puts(m, ",flush");
 	if (opts->tz_utc)
 		seq_puts(m, ",tz=UTC");
@@ -814,7 +816,7 @@ enum {
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
+	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err,
 };
 
 static const match_table_t fat_tokens = {
@@ -886,6 +888,7 @@ static const match_table_t vfat_tokens = {
 	{Opt_nonumtail_yes, "nonumtail=yes"},
 	{Opt_nonumtail_yes, "nonumtail=true"},
 	{Opt_nonumtail_yes, "nonumtail"},
+	{Opt_rodir, "rodir"},
 	{Opt_err, NULL}
 };
 
@@ -905,10 +908,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 	opts->allow_utime = -1;
 	opts->codepage = fat_default_codepage;
 	opts->iocharset = fat_default_iocharset;
-	if (is_vfat)
+	if (is_vfat) {
 		opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95;
-	else
+		opts->rodir = 0;
+	} else {
 		opts->shortname = 0;
+		opts->rodir = 1;
+	}
 	opts->name_check = 'n';
 	opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK =  0;
 	opts->utf8 = opts->unicode_xlate = 0;
@@ -1059,6 +1065,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 		case Opt_nonumtail_yes:		/* empty or 1 or yes or true */
 			opts->numtail = 0;	/* negated option */
 			break;
+		case Opt_rodir:
+			opts->rodir = 1;
+			break;
 
 		/* obsolete mount options */
 		case Opt_obsolate:
-- 
cgit v1.2.3


From fa93ca18a8b0da4e26bd9491ad144cd14d22f8ec Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:56 -0800
Subject: fat: Fix _fat_bmap() race

fat_get_cluster() assumes the requested blocknr isn't truncated during
read. _fat_bmap() doesn't follow this rule.

This protects it by ->i_mutex.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0da04e6d1e34..be88208b83a6 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -199,7 +199,14 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
-	return generic_block_bmap(mapping, block, fat_get_block);
+	sector_t blocknr;
+
+	/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
+	mutex_lock(&mapping->host->i_mutex);
+	blocknr = generic_block_bmap(mapping, block, fat_get_block);
+	mutex_unlock(&mapping->host->i_mutex);
+
+	return blocknr;
 }
 
 static const struct address_space_operations fat_aops = {
-- 
cgit v1.2.3


From 0e75f5da06c05425f4b375eb981c4489fb2d9787 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:56 -0800
Subject: fat: Add printf attribute to fat_fs_panic()

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/fat.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e9dce5d8e7a7..a69f7f9757c0 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -308,7 +308,8 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 		            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_panic(struct super_block *s, const char *fmt, ...);
+extern void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3))) __cold;
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
-- 
cgit v1.2.3


From 2bdf67eb1631f30e2f3f5d49e4007c76e88877a8 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:57 -0800
Subject: fat: mmu_private race fix

mmu_private is a 64-bit value, hence updating it is not atomic.

So, the access rule for mmu_private is that we must hold ->i_mutex.  But
the fat_get_block() path doesn't follow the rule on the non-allocation
path.

This fixes it by using i_size instead on the non-allocation path.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/cache.c | 23 ++++++++++++++++++-----
 fs/fat/dir.c   |  2 +-
 fs/fat/fat.h   |  6 ++++--
 fs/fat/inode.c |  4 ++--
 4 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 589edde9053c..b42602298087 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -293,10 +293,12 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
 }
 
 int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-	     unsigned long *mapped_blocks)
+	     unsigned long *mapped_blocks, int create)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	const unsigned long blocksize = sb->s_blocksize;
+	const unsigned char blocksize_bits = sb->s_blocksize_bits;
 	sector_t last_block;
 	int cluster, offset;
 
@@ -309,10 +311,21 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
 		}
 		return 0;
 	}
-	last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1))
-		>> sb->s_blocksize_bits;
-	if (sector >= last_block)
-		return 0;
+
+	last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+	if (sector >= last_block) {
+		if (!create)
+			return 0;
+
+		/*
+		 * ->mmu_private can access on only allocation path.
+		 * (caller must hold ->i_mutex)
+		 */
+		last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+			>> blocksize_bits;
+		if (sector >= last_block)
+			return 0;
+	}
 
 	cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
 	offset  = sector & (sbi->sec_per_clus - 1);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 140fc39e2307..2ecaa17acdb5 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -77,7 +77,7 @@ next:
 
 	*bh = NULL;
 	iblock = *pos >> sb->s_blocksize_bits;
-	err = fat_bmap(dir, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
 	if (err || !phys)
 		return -1;	/* beyond EOF or error */
 
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index a69f7f9757c0..4efc5038ed29 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -91,7 +91,9 @@ struct msdos_inode_info {
 	/* for avoiding the race between fat_free() and fat_get_cluster() */
 	unsigned int cache_valid_id;
 
-	loff_t mmu_private;
+	/* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */
+	loff_t mmu_private;	/* physically allocated size */
+
 	int i_start;		/* first cluster or 0 */
 	int i_logstart;		/* logical first cluster */
 	int i_attrs;		/* unused attribute bits */
@@ -222,7 +224,7 @@ extern void fat_cache_inval_inode(struct inode *inode);
 extern int fat_get_cluster(struct inode *inode, int cluster,
 			   int *fclus, int *dclus);
 extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-		    unsigned long *mapped_blocks);
+		    unsigned long *mapped_blocks, int create);
 
 /* fat/dir.c */
 extern const struct file_operations fat_dir_operations;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index be88208b83a6..9e37ad93c730 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -64,7 +64,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 	sector_t phys;
 	int err, offset;
 
-	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
 	if (err)
 		return err;
 	if (phys) {
@@ -94,7 +94,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 	*max_blocks = min(mapped_blocks, *max_blocks);
 	MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
 
-	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 9ca59f4c3d28df14a1545a1e2832f34a0a50e3ed Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:57 -0800
Subject: fat: ->i_pos race fix

i_pos is a 64-bit value, hence updating it is not atomic.

The only important place is fat_write_inode(); the other places that
read it without the lock are just for printk().

This adds lock for "BITS_PER_LONG == 32" kernel.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/inode.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9e37ad93c730..bdd8fb7be2ca 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -542,6 +542,20 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
+				    struct inode *inode)
+{
+	loff_t i_pos;
+#if BITS_PER_LONG == 32
+	spin_lock(&sbi->inode_hash_lock);
+#endif
+	i_pos = MSDOS_I(inode)->i_pos;
+#if BITS_PER_LONG == 32
+	spin_unlock(&sbi->inode_hash_lock);
+#endif
+	return i_pos;
+}
+
 static int fat_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
@@ -551,9 +565,12 @@ static int fat_write_inode(struct inode *inode, int wait)
 	loff_t i_pos;
 	int err;
 
+	if (inode->i_ino == MSDOS_ROOT_INO)
+		return 0;
+
 retry:
-	i_pos = MSDOS_I(inode)->i_pos;
-	if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
+	i_pos = fat_i_pos_read(sbi, inode);
+	if (!i_pos)
 		return 0;
 
 	bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
-- 
cgit v1.2.3


From c3302931db090d87e9015c3a7ce5c97a7dd90f78 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 6 Nov 2008 12:53:58 -0800
Subject: fat: i_blocks warning fix

The blkcnt_t type depends on CONFIG_LSF, so always cast to unsigned long
long for printk().  That is tedious to type, so add an "llu" typedef and
use it.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/dir.c    | 2 +-
 fs/fat/fat.h    | 3 +++
 fs/fat/fatent.c | 5 ++---
 fs/fat/misc.c   | 5 +++--
 4 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 2ecaa17acdb5..67e058357098 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -86,7 +86,7 @@ next:
 	*bh = sb_bread(sb, phys);
 	if (*bh == NULL) {
 		printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
-		       (unsigned long long)phys);
+		       (llu)phys);
 		/* skip this block */
 		*pos = (iblock + 1) << sb->s_blocksize_bits;
 		goto next;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 4efc5038ed29..ea440d65819c 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,4 +323,7 @@ extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
 int fat_cache_init(void);
 void fat_cache_destroy(void);
 
+/* helper for printk */
+typedef unsigned long long	llu;
+
 #endif /* !_FAT_H */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 13513992da3c..da6eea47872f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -93,8 +93,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 err_brelse:
 	brelse(bhs[0]);
 err:
-	printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
-	       (unsigned long long)blocknr);
+	printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr);
 	return -EIO;
 }
 
@@ -107,7 +106,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 	fatent->bhs[0] = sb_bread(sb, blocknr);
 	if (!fatent->bhs[0]) {
 		printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
-		       (unsigned long long)blocknr);
+		       (llu)blocknr);
 		return -EIO;
 	}
 	fatent->nr_bhs = 1;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a191e79e66a9..ac39ebcc1496 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -124,8 +124,9 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 			mark_inode_dirty(inode);
 	}
 	if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
-		fat_fs_panic(sb, "clusters badly computed (%d != %lu)",
-			new_fclus, inode->i_blocks >> (sbi->cluster_bits - 9));
+		fat_fs_panic(sb, "clusters badly computed (%d != %llu)",
+			     new_fclus,
+			     (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
 		fat_cache_inval_inode(inode);
 	}
 	inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9);
-- 
cgit v1.2.3


From 7e2d9bfa4eabee3e1919a40f20d2ef8b569bd07e Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Wed, 5 Nov 2008 16:09:04 +0200
Subject: UBIFS: allow for gaps when dirtying the LPT

The LPT may have gaps in it because initially empty LEBs
are not added by mkfs.ubifs - because it does not know how
many there are.  Then UBIFS allocates empty LEBs in the
reverse order that they are discovered i.e. they are
added to, and removed from, the front of a list.  That
creates a gap in the middle of the LPT.

The function dirtying the LPT tree (for the purpose of
small model garbage collection) assumed that a gap could
only occur at the very end of the LPT and stopped dirtying
prematurely, which in turn resulted in the LPT running
out of space - something that is designed to be impossible.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
---
 fs/ubifs/lpt_commit.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index eed5a0025d63..a41434b42785 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -571,8 +571,6 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
 		/* We assume here that LEB zero is never an LPT LEB */
 		if (nnode->nbranch[iip].lnum)
 			return ubifs_get_pnode(c, nnode, iip);
-		else
-			return NULL;
 	}
 
 	/* Go up while can't go right */
-- 
cgit v1.2.3


From ed9b3e3379731e9f9d2f73f3d7fd9e7d2ce3df4a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 7 Nov 2008 09:06:45 -0500
Subject: ext4: Mark the buffer_heads as dirty and uptodate after prepare_write

We need to make sure we mark the buffer_heads as dirty and uptodate
so that block_write_full_page write them correctly.

This fixes mmap corruptions that can occur in low memory situations.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5a130b56f1cf..be21a5ae33cb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2329,6 +2329,8 @@ static int ext4_da_writepage(struct page *page,
 			unlock_page(page);
 			return 0;
 		}
+		/* now mark the buffer_heads as dirty and uptodate */
+		block_commit_write(page, 0, PAGE_CACHE_SIZE);
 	}
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-- 
cgit v1.2.3


From 23712a9c28b9f80a8cf70c8490358d5f562d2465 Mon Sep 17 00:00:00 2001
From: Frederic Bohe <frederic.bohe@bull.net>
Date: Fri, 7 Nov 2008 09:21:01 -0500
Subject: ext4: add checksum calculation when clearing UNINIT flag in
 ext4_new_inode

When initializing an uninitialized block group in ext4_new_inode(),
its block group checksum must be re-calculated.  This fixes a race
when several threads try to allocate a new inode in an UNINIT'd group.

There is some question whether we need to be initializing the block
bitmap in ext4_new_inode() at all, but for now, if we are going to
init the block group, let's eliminate the race.

Signed-off-by: Frederic Bohe <frederic.bohe@bull.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fe34d74cfb19..2a117e286e54 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -718,6 +718,8 @@ got:
 			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 			free = ext4_free_blocks_after_init(sb, group, gdp);
 			gdp->bg_free_blocks_count = cpu_to_le16(free);
+			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
+								gdp);
 		}
 		spin_unlock(sb_bgl_lock(sbi, group));
 
-- 
cgit v1.2.3


From b726e923ea4d216027e466aa602d914e4b4a63af Mon Sep 17 00:00:00 2001
From: Doug Nazar <nazard@dragoninc.ca>
Date: Wed, 5 Nov 2008 06:16:28 -0500
Subject: Fix nfsd truncation of readdir results

Commit 8d7c4203 "nfsd: fix failure to set eof in readdir in some
situations" introduced a bug: on a directory in an exported ext3
filesystem with dir_index unset, a READDIR will only return about 250
entries, even if the directory was larger.

Bisected it back to this commit; reverting it fixes the problem.

It turns out that in this case ext3 reads a block at a time, then
returns from readdir, which means we can end up with buf.full==0 but
with more entries in the directory still to be read.  Before 8d7c4203
(but after c002a6c797 "Optimise NFS readdir hack slightly"), this would
cause us to return the READDIR result immediately, but with the eof bit
unset.  That could cause a performance regression (because the client
would need more roundtrips to the server to read the whole directory),
but no loss in correctness, since the cleared eof bit caused the client
to send another readdir.  After 8d7c4203, the setting of the eof bit
made this a correctness problem.

So, move nfserr_eof into the loop and remove the buf.full check so that
we loop until buf.used==0.  The following seems to do the right thing
and reduces the network traffic since we don't return a READDIR result
until the buffer is full.

Tested on an empty directory & large directory; eof is properly sent and
there are no more short buffers.

Signed-off-by: Doug Nazar <nazard@dragoninc.ca>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 848a03e83a42..4433c8f00163 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1875,11 +1875,11 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 		return -ENOMEM;
 
 	offset = *offsetp;
-	cdp->err = nfserr_eof; /* will be cleared on successful read */
 
 	while (1) {
 		unsigned int reclen;
 
+		cdp->err = nfserr_eof; /* will be cleared on successful read */
 		buf.used = 0;
 		buf.full = 0;
 
@@ -1912,9 +1912,6 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func,
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
 		offset = vfs_llseek(file, 0, SEEK_CUR);
-		cdp->err = nfserr_eof;
-		if (!buf.full)
-			break;
 	}
 
  done:
-- 
cgit v1.2.3


From 9ccbece546cf836f67f6d9bb4bf2f70f7476cb2c Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 30 Oct 2008 16:53:25 +1100
Subject: [XFS] Fix use-after-free with log and quotas

Destroying the quota stuff on unmount can access the log - ie
XFS_QM_DONE() ends up in xfs_dqunlock() which calls
xfs_trans_unlocked_item() and then xfs_log_move_tail(). By this time the
log has already been destroyed. Just move the cleanup of the quota code
earlier in xfs_unmountfs() before the call to xfs_log_unmount(). Moving
XFS_QM_DONE() up near XFS_QM_DQPURGEALL() seems like a good spot.

SGI-PV: 987086

SGI-Modid: xfs-linux-melb:xfs-kern:32148a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Peter Leckie <pleckie@sgi.com>
---
 fs/xfs/xfs_mount.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a4503f5e9497..15f5dd22fbb2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1245,6 +1245,9 @@ xfs_unmountfs(
 
 	XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
 
+	if (mp->m_quotainfo)
+		XFS_QM_DONE(mp);
+
 	/*
 	 * Flush out the log synchronously so that we know for sure
 	 * that nothing is pinned.  This is important because bflush()
@@ -1297,8 +1300,6 @@ xfs_unmountfs(
 	xfs_errortag_clearall(mp, 0);
 #endif
 	xfs_free_perag(mp);
-	if (mp->m_quotainfo)
-		XFS_QM_DONE(mp);
 }
 
 STATIC void
-- 
cgit v1.2.3


From 2cf7f0da3ae225848a2ee10d4e216448a770fd00 Mon Sep 17 00:00:00 2001
From: Lachlan McIlroy <lachlan@sgi.com>
Date: Thu, 30 Oct 2008 16:59:06 +1100
Subject: [XFS] Wait for all I/O on truncate to zero file size

It's possible to have outstanding xfs_ioend_t's queued when the file size
is zero. This can happen in the direct I/O path when a direct I/O write
fails due to ENOSPC. In this case the xfs_ioend_t will still be queued (ie
xfs_end_io_direct() does not know that the I/O failed so can't force the
xfs_ioend_t to be flushed synchronously).

When we truncate a file on unlink we don't know to wait for these
xfs_ioend_ts and we can have a use-after-free situation if the inode is
reclaimed before the xfs_ioend_t is finally processed.

As was suggested by Dave Chinner, let's wait for all I/Os to complete when
truncating the file size to zero.

SGI-PV: 981668

SGI-Modid: xfs-linux-melb:xfs-kern:32216a

Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/xfs_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dbd9cef852ec..a391b955df01 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1414,7 +1414,7 @@ xfs_itruncate_start(
 	mp = ip->i_mount;
 
 	/* wait for the completion of any pending DIOs */
-	if (new_size < ip->i_size)
+	if (new_size == 0 || new_size < ip->i_size)
 		vn_iowait(ip);
 
 	/*
-- 
cgit v1.2.3


From 6f9f51adb6ac0a49fce49e01c47dcfc2810c6e9d Mon Sep 17 00:00:00 2001
From: David Chinner <david@fromorbit.com>
Date: Thu, 30 Oct 2008 17:38:12 +1100
Subject: [XFS] Account for allocated blocks when expanding directories

When we create a directory, we reserve a number of blocks for the maximum
possible expansion of the directory due to various btree splits,
freespace allocation, etc. Unfortunately, each allocation is not reflected
in the total number of blocks still available to the transaction, so the
maximal reservation is used over and over again.

This leads to problems where an allocation group has only enough blocks
for *some* of the allocations required for the directory modification.
After the first N allocations, the remaining blocks in the allocation
group drops below the total reservation, and subsequent allocations fail
because the allocator will not allow the allocation to proceed if the AG
does not have enough blocks available for the entire allocation total.

This results in an ENOSPC occurring after an allocation has already
occurred. This results in aborting the directory operation (leaving the
directory in an inconsistent state) and cancelling a dirty transaction,
which results in a filesystem shutdown.

Avoid the problem by reflecting the number of blocks allocated in any
directory expansion in the total number of blocks available to the
modification in progress. This prevents a directory modification from
being aborted part way through with an ENOSPC.

SGI-PV: 988144

SGI-Modid: xfs-linux-melb:xfs-kern:32340a

Signed-off-by: David Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_da_btree.c | 5 +++++
 fs/xfs/xfs_dir2.c     | 6 ++++++
 2 files changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9e561a9cefca..a11a8390bf6c 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1566,11 +1566,14 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	int nmap, error, w, count, c, got, i, mapi;
 	xfs_trans_t *tp;
 	xfs_mount_t *mp;
+	xfs_drfsbno_t	nblks;
 
 	dp = args->dp;
 	mp = dp->i_mount;
 	w = args->whichfork;
 	tp = args->trans;
+	nblks = dp->i_d.di_nblocks;
+
 	/*
 	 * For new directories adjust the file offset and block count.
 	 */
@@ -1647,6 +1650,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	}
 	if (mapp != &map)
 		kmem_free(mapp);
+	/* account for newly allocated blocks in reserved blocks total */
+	args->total -= dp->i_d.di_nblocks - nblks;
 	*new_blkno = (xfs_dablk_t)bno;
 	return 0;
 }
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 80e0dc51361c..1afb12278b8d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -525,11 +525,13 @@ xfs_dir2_grow_inode(
 	xfs_mount_t	*mp;
 	int		nmap;		/* number of bmap entries */
 	xfs_trans_t	*tp;
+	xfs_drfsbno_t	nblks;
 
 	xfs_dir2_trace_args_s("grow_inode", args, space);
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
+	nblks = dp->i_d.di_nblocks;
 	/*
 	 * Set lowest possible block in the space requested.
 	 */
@@ -622,7 +624,11 @@ xfs_dir2_grow_inode(
 	 */
 	if (mapp != &map)
 		kmem_free(mapp);
+
+	/* account for newly allocated blocks in reserved blocks total */
+	args->total -= dp->i_d.di_nblocks - nblks;
 	*dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
+
 	/*
 	 * Update file's size if this is the data space and it grew.
 	 */
-- 
cgit v1.2.3


From 8f330f5149ef41ff943b04d914406cc417f62784 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 10 Nov 2008 16:50:24 +1100
Subject: [XFS] handle memory allocation failures during log initialisation

When there is no memory left in the system, xfs_buf_get_noaddr()
can fail. If this happens at mount time during xlog_alloc_log()
we fail to catch the error and oops.

Catch the error from xfs_buf_get_noaddr(), and allow other memory
allocations to fail and catch those errors too. Report the error
to the console and fail the mount with ENOMEM.

Tested by manually injecting errors into xfs_buf_get_noaddr() and
xlog_alloc_log().

Version 2:
o remove unnecessary casts of the returned pointer from kmem_zalloc()

SGI-PV: 987246

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log.c | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0b02c6443551..3608a0f0a5f6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -563,6 +563,11 @@ xfs_log_mount(
 	}
 
 	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
+	if (!mp->m_log) {
+		cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!");
+		error = ENOMEM;
+		goto out;
+	}
 
 	/*
 	 * Initialize the AIL now we have a log.
@@ -601,6 +606,7 @@ xfs_log_mount(
 	return 0;
 error:
 	xfs_log_unmount_dealloc(mp);
+out:
 	return error;
 }	/* xfs_log_mount */
 
@@ -1217,7 +1223,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	int			i;
 	int			iclogsize;
 
-	log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
+	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
+	if (!log)
+		return NULL;
 
 	log->l_mp	   = mp;
 	log->l_targ	   = log_target;
@@ -1249,6 +1257,8 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	xlog_get_iclog_buffer_size(mp, log);
 
 	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
+	if (!bp)
+		goto out_free_log;
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
 	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
 	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
@@ -1275,13 +1285,17 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	iclogsize = log->l_iclog_size;
 	ASSERT(log->l_iclog_size >= 4096);
 	for (i=0; i < log->l_iclog_bufs; i++) {
-		*iclogp = (xlog_in_core_t *)
-			  kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
+		*iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
+		if (!*iclogp)
+			goto out_free_iclog;
+
 		iclog = *iclogp;
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
 
 		bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+		if (!bp)
+			goto out_free_iclog;
 		if (!XFS_BUF_CPSEMA(bp))
 			ASSERT(0);
 		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
@@ -1323,6 +1337,25 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
 
 	return log;
+
+out_free_iclog:
+	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
+		prev_iclog = iclog->ic_next;
+		if (iclog->ic_bp) {
+			sv_destroy(&iclog->ic_force_wait);
+			sv_destroy(&iclog->ic_write_wait);
+			xfs_buf_free(iclog->ic_bp);
+			xlog_trace_iclog_dealloc(iclog);
+		}
+		kmem_free(iclog);
+	}
+	spinlock_destroy(&log->l_icloglock);
+	spinlock_destroy(&log->l_grant_lock);
+	xlog_trace_loggrant_dealloc(log);
+	xfs_buf_free(log->l_xbuf);
+out_free_log:
+	kmem_free(log);
+	return NULL;
 }	/* xlog_alloc_log */
 
 
-- 
cgit v1.2.3


From 220ca310a53200b4bfbc7c4c6e365eea284ec44f Mon Sep 17 00:00:00 2001
From: David Chinner <david@fromorbit.com>
Date: Thu, 30 Oct 2008 17:40:09 +1100
Subject: [XFS] XFS: Check for valid transaction headers in recovery

When we are about to add a new item to a transaction in recovery, we need
to check that it is valid first. Currently we just assert that header
magic number matches, but in production systems that is not present and we
add a corrupted transaction to the list to be processed. This results in a
kernel oops later when processing the corrupted transaction.

Instead, if we detect a corrupted transaction, abort recovery and leave
the user to clean up the mess that has occurred.

SGI-PV: 988145

SGI-Modid: xfs-linux-melb:xfs-kern:32356a

Signed-off-by: David Chinner <david@fromorbit.com>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 82d46ce69d5f..70e3ba32e6be 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1419,7 +1419,13 @@ xlog_recover_add_to_trans(
 		return 0;
 	item = trans->r_itemq;
 	if (item == NULL) {
-		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
+		/* we need to catch log corruptions here */
+		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+			xlog_warn("XFS: xlog_recover_add_to_trans: "
+				  "bad header magic number");
+			ASSERT(0);
+			return XFS_ERROR(EIO);
+		}
 		if (len == sizeof(xfs_trans_header_t))
 			xlog_recover_add_item(&trans->r_itemq);
 		memcpy(&trans->r_theader, dp, len); /* d, s, l */
-- 
cgit v1.2.3


From c3cb6827353102fee62f3b9401a03ee29b297e5b Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Thu, 23 Oct 2008 16:33:03 +0800
Subject: ocfs2: fix license in xattr

This patch fixes the license in xattr.c and xattr.h.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 13 ++++---------
 fs/ocfs2/xattr.h | 12 ++----------
 2 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 802c41492214..2f8952e4e4c1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3,25 +3,20 @@
  *
  * xattr.c
  *
- * Copyright (C) 2008 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
  *
  * CREDITS:
- * Lots of code in this file is taken from ext3.
+ * Lots of code in this file is copy from linux/fs/ext3/xattr.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * License version 2 as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #include <linux/capability.h>
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index c25c7c62a059..e4e45c81a261 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -3,24 +3,16 @@
  *
  * xattr.h
  *
- * Function prototypes
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * License version 2 as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
 
 #ifndef OCFS2_XATTR_H
-- 
cgit v1.2.3


From 0030e001505d2d1503c083c917a747c033eaf8cd Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Thu, 23 Oct 2008 16:33:33 +0800
Subject: ocfs2: fix function declaration and definition in xattr

Because we merged the xattr sources into one file, some functions
no longer belong in the header file.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 28 +++++++++++++++++++++++-----
 fs/ocfs2/xattr.h | 26 ++++----------------------
 2 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2f8952e4e4c1..420d8e30b184 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -132,6 +132,24 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
 					  struct buffer_head *xb_bh);
 
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+	u16 len = sb->s_blocksize -
+		 offsetof(struct ocfs2_xattr_header, xh_entries);
+
+	return len / sizeof(struct ocfs2_xattr_entry);
+}
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -832,11 +850,11 @@ cleanup:
  * Copy an extended attribute into the buffer provided.
  * Buffer is NULL to compute the size of buffer required.
  */
-int ocfs2_xattr_get(struct inode *inode,
-		    int name_index,
-		    const char *name,
-		    void *buffer,
-		    size_t buffer_size)
+static int ocfs2_xattr_get(struct inode *inode,
+			   int name_index,
+			   const char *name,
+			   void *buffer,
+			   size_t buffer_size)
 {
 	int ret;
 	struct ocfs2_dinode *di = NULL;
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index e4e45c81a261..1d8314c7656d 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -32,29 +32,11 @@ enum ocfs2_xattr_type {
 
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
-
-extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
-extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
-extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
-			   size_t, int);
-extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
-static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
-{
-	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
-}
-
-static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
-{
-	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
-}
-
-static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
-{
-	u16 len = sb->s_blocksize -
-		 offsetof(struct ocfs2_xattr_header, xh_entries);
+ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+		    size_t, int);
+int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
 
-	return len / sizeof(struct ocfs2_xattr_entry);
-}
 #endif /* OCFS2_XATTR_H */
-- 
cgit v1.2.3


From ceb1eba3dc2ad94b25764785ff7d2082c6094115 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Thu, 23 Oct 2008 16:34:13 +0800
Subject: ocfs2: remove duplicate definition in xattr

Include/linux/xattr.h already has the definition about xattr prefix,
so remove the duplicate definitions in xattr.c.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 420d8e30b184..a9da45bbb9ed 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4740,14 +4740,11 @@ out:
 /*
  * 'trusted' attributes support
  */
-
-#define XATTR_TRUSTED_PREFIX "trusted."
-
 static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
 				       size_t list_size, const char *name,
 				       size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (list && total_len <= list_size) {
@@ -4784,18 +4781,14 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.set	= ocfs2_xattr_trusted_set,
 };
 
-
 /*
  * 'user' attributes support
  */
-
-#define XATTR_USER_PREFIX "user."
-
 static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
 				    size_t list_size, const char *name,
 				    size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-- 
cgit v1.2.3


From c988fd045f1195e62c0970384903ab9da26a9359 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Thu, 23 Oct 2008 16:34:44 +0800
Subject: ocfs2: add handler_map array bounds checking

Make the handler_map array as large as the possible value range to avoid
a fencepost error.

[ Utilize alternate method -- Joel ]

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index a9da45bbb9ed..e19980a71a3c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -78,7 +78,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
 	NULL
 };
 
-static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
 	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
 };
-- 
cgit v1.2.3


From f6087fb799e097e7c9d912daa75701de9d62dc53 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:20:43 -0700
Subject: ocfs2: Check xattr block signatures properly.

The xattr.c code is currently memcmp()ing naked buffer pointers.
Create the OCFS2_IS_VALID_XATTR_BLOCK() macro to match its peers and use
that.

In addition, failed signature checks were returning -EFAULT, which is
completely wrong.  Return -EIO.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h |  3 +++
 fs/ocfs2/xattr.c | 38 ++++++++++++++++----------------------
 2 files changed, 19 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a21a465490c4..fef7ece32376 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -473,6 +473,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 		(____gd)->bg_signature);				\
 } while (0)
 
+#define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
+	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e19980a71a3c..151ba6257fbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -555,14 +555,12 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
-		goto cleanup;
-	}
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
+		goto cleanup;
+	}
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
@@ -779,15 +777,14 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
 		goto cleanup;
 	}
 
 	xs->xattr_bh = blk_bh;
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		xs->header = &xb->xb_attrs.xb_header;
@@ -1527,10 +1524,9 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		goto out;
 	}
 
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-		ret = -EFAULT;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
 		goto out;
 	}
 
@@ -1540,7 +1536,6 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		goto out;
 	}
 
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	blk = le64_to_cpu(xb->xb_blkno);
 	bit = le16_to_cpu(xb->xb_suballoc_bit);
 	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1784,15 +1779,14 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	/*Verify the signature of xattr block*/
-	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
-		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
-			ret = -EFAULT;
-			goto cleanup;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ret = -EIO;
+		goto cleanup;
 	}
 
 	xs->xattr_bh = blk_bh;
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		xs->header = &xb->xb_attrs.xb_header;
-- 
cgit v1.2.3


From b37c4d84e9d16fd5b6f31197f02ea0a112fc9e99 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:24:03 -0700
Subject: ocfs2: Don't return -EFAULT from a corrupt xattr entry.

If the xattr disk structures are corrupt, return -EIO, not -EFAULT.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 151ba6257fbb..41a6ca004ae3 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1239,7 +1239,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 
 	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
 	if (free < 0)
-		return -EFAULT;
+		return -EIO;
 
 	if (!xs->not_found) {
 		size_t size = 0;
-- 
cgit v1.2.3


From bd60bd37ade4321ecce4ed4442f68c88febd76d5 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:25:56 -0700
Subject: ocfs2: Check errors from ocfs2_xattr_update_xattr_search()

The ocfs2_xattr_update_xattr_search() function can return an error when
trying to read blocks off of disk.  The caller needs to check this error
before using those (possibly invalid) blocks.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 41a6ca004ae3..92df88a41e5d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2825,7 +2825,11 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	if (data_bh)
 		ocfs2_journal_dirty(handle, data_bh);
 
-	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+	ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
 
 	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
 	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
-- 
cgit v1.2.3


From eb6ff2397d1fdfc6a7629c99896338e5b5c508e5 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:32:48 -0700
Subject: ocfs2: Specify appropriate journal access for new xattr buckets.

There are a couple places that get an xattr bucket that may be reading
an existing one or may be allocating a new one.  They should specify the
correct journal access mode depending on which case applies.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 92df88a41e5d..fb450200bc88 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3231,7 +3231,9 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
-					   OCFS2_JOURNAL_ACCESS_CREATE);
+					   new_bucket_head ?
+					   OCFS2_JOURNAL_ACCESS_CREATE :
+					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3393,6 +3395,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   t_is_new ?
+					   OCFS2_JOURNAL_ACCESS_CREATE :
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret)
 			goto out;
-- 
cgit v1.2.3


From 54f443f4e7265a1333886dbace31cb6eb1991c72 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 20 Oct 2008 18:43:07 -0700
Subject: ocfs2: Don't repeat ocfs2_xattr_block_find()

ocfs2_xattr_block_get() looks up the xattr in a startlingly familiar
way; it's identical to the function ocfs2_xattr_block_find().  Let's just
use the latter in the former.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 39 +++++++++------------------------------
 1 file changed, 9 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fb450200bc88..74d1faba23bb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -111,6 +111,10 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
 					     int *block_off,
 					     int *new_offset);
 
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs);
 static int ocfs2_xattr_index_block_find(struct inode *inode,
 					struct buffer_head *root_bh,
 					int name_index,
@@ -760,46 +764,20 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 				 size_t buffer_size,
 				 struct ocfs2_xattr_search *xs)
 {
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	struct buffer_head *blk_bh = NULL;
 	struct ocfs2_xattr_block *xb;
 	struct ocfs2_xattr_value_root *xv;
 	size_t size;
 	int ret = -ENODATA, name_offset, name_len, block_off, i;
 
-	if (!di->i_xattr_loc)
-		return ret;
-
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
-	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
-	if (ret < 0) {
+	ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
+	if (ret) {
 		mlog_errno(ret);
-		return ret;
-	}
-
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-		ret = -EIO;
 		goto cleanup;
 	}
 
-	xs->xattr_bh = blk_bh;
-
-	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
-		xs->header = &xb->xb_attrs.xb_header;
-		xs->base = (void *)xs->header;
-		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
-		xs->here = xs->header->xh_entries;
-
-		ret = ocfs2_xattr_find_entry(name_index, name, xs);
-	} else
-		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
-						   name_index,
-						   name, xs);
-
-	if (ret)
-		goto cleanup;
+	xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 	size = le64_to_cpu(xs->here->xe_value_size);
 	if (buffer) {
 		ret = -ERANGE;
@@ -838,7 +816,8 @@ cleanup:
 		brelse(xs->bucket.bhs[i]);
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
-	brelse(blk_bh);
+	brelse(xs->xattr_bh);
+	xs->xattr_bh = NULL;
 	return ret;
 }
 
-- 
cgit v1.2.3


From 63fd77573723841d5d44a79471258f1b261f4482 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 17 Oct 2008 12:44:36 +0800
Subject: ocfs2: Remove unused ocfs2_restore_xattr_block().

Since now ocfs2 supports empty xattr buckets, we will never remove
the xattr index tree even if all the xattrs are removed, so this
function will never be called. So remove it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 48 ------------------------------------------------
 1 file changed, 48 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d1faba23bb..789fb70462c9 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1791,52 +1791,6 @@ cleanup:
 	return ret;
 }
 
-/*
- * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
- * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
- * re-initialized.
- */
-static int ocfs2_restore_xattr_block(struct inode *inode,
-				     struct ocfs2_xattr_search *xs)
-{
-	int ret;
-	handle_t *handle;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_xattr_block *xb =
-		(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
-	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
-	u16 xb_flags = le16_to_cpu(xb->xb_flags);
-
-	BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
-		le16_to_cpu(el->l_next_free_rec) != 0);
-
-	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
-		goto out;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
-	       offsetof(struct ocfs2_xattr_block, xb_attrs));
-
-	xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
-
-	ocfs2_journal_dirty(handle, xs->xattr_bh);
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-out:
-	return ret;
-}
-
 /*
  * ocfs2_xattr_block_set()
  *
@@ -1947,8 +1901,6 @@ out:
 	}
 
 	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
-	if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
-		ret = ocfs2_restore_xattr_block(inode, xs);
 
 end:
 
-- 
cgit v1.2.3


From 8573f79d30077875e2b6e83849b5245bfbb08685 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 24 Oct 2008 22:24:17 +0800
Subject: ocfs2: Fix some typos in xattr annotations.

Fix some typos in the xattr annotations.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Reported-by: Coly Li <coyli@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2_fs.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f24ce3d3f956..5f180cf7abbd 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -742,12 +742,12 @@ struct ocfs2_group_desc
  */
 struct ocfs2_xattr_entry {
 	__le32	xe_name_hash;    /* hash value of xattr prefix+suffix. */
-	__le16	xe_name_offset;  /* byte offset from the 1st etnry in the local
+	__le16	xe_name_offset;  /* byte offset from the 1st entry in the
 				    local xattr storage(inode, xattr block or
 				    xattr bucket). */
 	__u8	xe_name_len;	 /* xattr name len, does't include prefix. */
-	__u8	xe_type;         /* the low 7 bits indicates the name prefix's
-				  * type and the highest 1 bits indicate whether
+	__u8	xe_type;         /* the low 7 bits indicate the name prefix
+				  * type and the highest bit indicates whether
 				  * the EA is stored in the local storage. */
 	__le64	xe_value_size;	 /* real xattr value length. */
 };
@@ -766,9 +766,10 @@ struct ocfs2_xattr_header {
 						   xattr. */
 	__le16	xh_name_value_len;              /* total length of name/value
 						   length in this bucket. */
-	__le16	xh_num_buckets;                 /* bucket nums in one extent
-						   record, only valid in the
-						   first bucket. */
+	__le16	xh_num_buckets;                 /* Number of xattr buckets
+						   in this extent record,
+						   only valid in the first
+						   bucket. */
 	__le64  xh_csum;
 	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
@@ -776,8 +777,8 @@ struct ocfs2_xattr_header {
 /*
  * On disk structure for xattr value root.
  *
- * It is used when one extended attribute's size is larger, and we will save it
- * in an outside cluster. It will stored in a b-tree like file content.
+ * When an xattr's value is large enough, it is stored in an external
+ * b-tree like file data.  The xattr value root points to this structure.
  */
 struct ocfs2_xattr_value_root {
 /*00*/	__le32	xr_clusters;              /* clusters covered by xattr value. */
-- 
cgit v1.2.3


From fa38e92cb34e27e60d0faf1035934eb9b44aa1d4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Oct 2008 19:23:51 +0200
Subject: ocfs2: Fix check of return value of ocfs2_start_trans()

On failure, ocfs2_start_trans() returns values like ERR_PTR(-ENOMEM).
Thus checks for !handle are wrong. Fix them to use IS_ERR().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/file.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7efe937a415f..3138a385fdbb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -247,8 +247,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	mlog_entry_void();
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -312,8 +312,8 @@ static int ocfs2_simple_size_update(struct inode *inode,
 	handle_t *handle = NULL;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1055,8 +1055,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1259,8 +1259,8 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1352,8 +1352,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 		goto out;
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (handle == NULL) {
-		ret = -ENOMEM;
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		goto out;
 	}
-- 
cgit v1.2.3


From 87cfa004321c62aec681713ea48e0b846336d9f4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Oct 2008 19:23:53 +0200
Subject: ocfs2: Fix checking of return value of new_inode()

new_inode() does not return ERR_PTR() but NULL in case of failure. Correct
checking of the return value.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 485a6aa0ad39..f594f300d4cd 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -378,8 +378,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	}
 
 	inode = new_inode(dir->i_sb);
-	if (IS_ERR(inode)) {
-		status = PTR_ERR(inode);
+	if (!inode) {
+		status = -ENOMEM;
 		mlog(ML_ERROR, "new_inode failed!\n");
 		goto leave;
 	}
-- 
cgit v1.2.3


From b99835c1684918b9975851d71455c5c007d1715b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Oct 2008 19:23:54 +0200
Subject: ocfs2: Let inode be really deleted when ocfs2_mknod_locked() fails

We forgot to set i_nlink to 0 when returning due to error from ocfs2_mknod_locked()
and thus inode was not properly released via ocfs2_delete_inode() (e.g. claimed
space was not released). Fix it.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/namei.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f594f300d4cd..f4967e634ffd 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -491,8 +491,10 @@ leave:
 			brelse(*new_fe_bh);
 			*new_fe_bh = NULL;
 		}
-		if (inode)
+		if (inode) {
+			clear_nlink(inode);
 			iput(inode);
+		}
 	}
 
 	mlog_exit(status);
-- 
cgit v1.2.3


From d32647993c211901fc4819ef3327f62d1859241b Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 24 Oct 2008 07:57:28 +0800
Subject: ocfs2: Fix check of return value of ocfs2_start_trans() in xattr.c.

On failure, ocfs2_start_trans() returns values like ERR_PTR(-ENOMEM),
so checking whether handle is NULL is wrong. Fix those checks to use
IS_ERR(). Jan has made the patch for the other parts of ocfs2 (thanks
to Jan for it), so this is just the fix for fs/ocfs2/xattr.c.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 789fb70462c9..a371c01942b1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4092,7 +4092,7 @@ static int ocfs2_xattr_value_update_size(struct inode *inode,
 	handle_t *handle = NULL;
 
 	handle = ocfs2_start_trans(osb, 1);
-	if (handle == NULL) {
+	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -4259,7 +4259,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 	}
 
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (handle == NULL) {
+	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
-- 
cgit v1.2.3


From ae0dff683076b2798763288c7ac2f09a18c4a998 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Wed, 22 Oct 2008 13:24:29 -0700
Subject: ocfs2: Set journal descriptor to NULL after journal shutdown

Patch sets journal descriptor to NULL after the journal is shut down.
This ensures that jbd2_journal_release_jbd_inode(), which removes the
jbd2 inode from txn lists, can be called safely from ocfs2_clear_inode()
even after the journal has been shut down.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/inode.c   | 6 ++++++
 fs/ocfs2/journal.c | 1 +
 2 files changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4903688f72a9..7aa00d511874 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1106,6 +1106,12 @@ void ocfs2_clear_inode(struct inode *inode)
 	oi->ip_last_trans = 0;
 	oi->ip_dir_start_lookup = 0;
 	oi->ip_blkno = 0ULL;
+
+	/*
+	 * ip_jinode is used to track txns against this inode. We ensure that
+	 * the journal is flushed before journal shutdown. Thus it is safe to
+	 * have inodes get cleaned up after journal shutdown.
+	 */
 	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
 				       &oi->ip_jinode);
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 81e40677eecb..99fe9d584f3c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -690,6 +690,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	/* Shutdown the kernel journal system */
 	jbd2_journal_destroy(journal->j_journal);
+	journal->j_journal = NULL;
 
 	OCFS2_I(inode)->ip_open_count--;
 
-- 
cgit v1.2.3


From 4c1bbf1ba631d7db61ce3462349a3f5d14ae3009 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 6 Oct 2008 16:59:55 +0800
Subject: ocfs2: return 0 in page_mkwrite to let VFS retry.

In ocfs2_page_mkwrite, we return -EINVAL when we find that the page
mapping isn't up to date, which causes the user space program to get
SIGBUS and exit. The reason is that during a racing writeable mmap, we
do unmap_mapping_range in ocfs2_data_downconvert_worker. The good thing
is that if we return 0 in page_mkwrite, VFS will retry the fault and
then call page_mkwrite again, so it is safe to return 0 here.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/mmap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3dc18d67557c..eea1d24713ea 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -113,7 +113,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
 	 * ocfs2_write_begin_nolock().
 	 */
 	if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
-		ret = -EINVAL;
+		/*
+		 * the page has been umapped in ocfs2_data_downconvert_worker.
+		 * So return 0 here and let VFS retry.
+		 */
+		ret = 0;
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 80bcaf3469b8aefd316d4ceb27d9af7cfbb0b913 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 27 Oct 2008 06:06:24 +0800
Subject: ocfs2/xattr: Proper hash collision handle in bucket division

In ocfs2/xattr, we must make sure that xattrs which have the same hash value
exist in the same bucket so that the search scheme can work. But in the old
implementation, when we want to extend a bucket, we just move half of the
xattrs to the new bucket. This works in most cases, but if we are unlucky
enough we will move 2 xattrs with the same hash into 2 different buckets.
This means that an xattr from the previous bucket cannot be found anymore.
This patch fixes the problem by finding the right position when extending
the bucket, and by adding an empty bucket if needed.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Cc: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 144 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 115 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index a371c01942b1..f3ea7efb48c6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3110,25 +3110,73 @@ static int ocfs2_read_xattr_bucket(struct inode *inode,
 }
 
 /*
- * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * Find the suitable pos when we divide a bucket into 2.
+ * We have to make sure the xattrs with the same hash value exist
+ * in the same bucket.
+ *
+ * If this ocfs2_xattr_header covers more than one hash value, find a
+ * place where the hash value changes.  Try to find the most even split.
+ * The most common case is that all entries have different hash values,
+ * and the first check we make will find a place to split.
+ */
+static int ocfs2_xattr_find_divide_pos(struct ocfs2_xattr_header *xh)
+{
+	struct ocfs2_xattr_entry *entries = xh->xh_entries;
+	int count = le16_to_cpu(xh->xh_count);
+	int delta, middle = count / 2;
+
+	/*
+	 * We start at the middle.  Each step gets farther away in both
+	 * directions.  We therefore hit the change in hash value
+	 * nearest to the middle.  Note that this loop does not execute for
+	 * count < 2.
+	 */
+	for (delta = 0; delta < middle; delta++) {
+		/* Let's check delta earlier than middle */
+		if (cmp_xe(&entries[middle - delta - 1],
+			   &entries[middle - delta]))
+			return middle - delta;
+
+		/* For even counts, don't walk off the end */
+		if ((middle + delta + 1) == count)
+			continue;
+
+		/* Now try delta past middle */
+		if (cmp_xe(&entries[middle + delta],
+			   &entries[middle + delta + 1]))
+			return middle + delta + 1;
+	}
+
+	/* Every entry had the same hash */
+	return count;
+}
+
+/*
+ * Move some xattrs in old bucket(blk) to new bucket(new_blk).
  * first_hash will record the 1st hash of the new bucket.
+ *
+ * Normally half of the xattrs will be moved.  But we have to make
+ * sure that the xattrs with the same hash value are stored in the
+ * same bucket. If all the xattrs in this bucket have the same hash
+ * value, the new bucket will be initialized as an empty one and the
+ * first_hash will be initialized as (hash_value+1).
  */
-static int ocfs2_half_xattr_bucket(struct inode *inode,
-				   handle_t *handle,
-				   u64 blk,
-				   u64 new_blk,
-				   u32 *first_hash,
-				   int new_bucket_head)
+static int ocfs2_divide_xattr_bucket(struct inode *inode,
+				    handle_t *handle,
+				    u64 blk,
+				    u64 new_blk,
+				    u32 *first_hash,
+				    int new_bucket_head)
 {
 	int ret, i;
-	u16 count, start, len, name_value_len, xe_len, name_offset;
+	int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	struct buffer_head **s_bhs, **t_bhs = NULL;
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
 	int blocksize = inode->i_sb->s_blocksize;
 
-	mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
 	     blk, new_blk);
 
 	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
@@ -3171,14 +3219,35 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 		}
 	}
 
+	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	count = le16_to_cpu(xh->xh_count);
+	start = ocfs2_xattr_find_divide_pos(xh);
+
+	if (start == count) {
+		xe = &xh->xh_entries[start-1];
+
+		/*
+		 * initialized a new empty bucket here.
+		 * The hash value is set as one larger than
+		 * that of the last entry in the previous bucket.
+		 */
+		for (i = 0; i < blk_per_bucket; i++)
+			memset(t_bhs[i]->b_data, 0, blocksize);
+
+		xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+		xh->xh_free_start = cpu_to_le16(blocksize);
+		xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
+		le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
+
+		goto set_num_buckets;
+	}
+
 	/* copy the whole bucket to the new first. */
 	for (i = 0; i < blk_per_bucket; i++)
 		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
 
 	/* update the new bucket. */
 	xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
-	count = le16_to_cpu(xh->xh_count);
-	start = count / 2;
 
 	/*
 	 * Calculate the total name/value len and xh_free_start for
@@ -3235,6 +3304,7 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 			xh->xh_free_start = xe->xe_name_offset;
 	}
 
+set_num_buckets:
 	/* set xh->xh_num_buckets for the new xh. */
 	if (new_bucket_head)
 		xh->xh_num_buckets = cpu_to_le16(1);
@@ -3252,9 +3322,13 @@ static int ocfs2_half_xattr_bucket(struct inode *inode,
 		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
 
 	/*
-	 * Now only update the 1st block of the old bucket.
-	 * Please note that the entry has been sorted already above.
+	 * Now only update the 1st block of the old bucket.  If we
+	 * just added a new empty bucket, there is no need to modify
+	 * it.
 	 */
+	if (start == count)
+		goto out;
+
 	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
 	memset(&xh->xh_entries[start], 0,
 	       sizeof(struct ocfs2_xattr_entry) * (count - start));
@@ -3439,15 +3513,15 @@ out:
 }
 
 /*
- * Move half of the xattrs in this cluster to the new cluster.
+ * Move some xattrs in this cluster to the new cluster.
  * This function should only be called when bucket size == cluster size.
  * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
  */
-static int ocfs2_half_xattr_cluster(struct inode *inode,
-				    handle_t *handle,
-				    u64 prev_blk,
-				    u64 new_blk,
-				    u32 *first_hash)
+static int ocfs2_divide_xattr_cluster(struct inode *inode,
+				      handle_t *handle,
+				      u64 prev_blk,
+				      u64 new_blk,
+				      u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int ret, credits = 2 * blk_per_bucket;
@@ -3461,8 +3535,8 @@ static int ocfs2_half_xattr_cluster(struct inode *inode,
 	}
 
 	/* Move half of the xattr in start_blk to the next bucket. */
-	return  ocfs2_half_xattr_bucket(inode, handle, prev_blk,
-					new_blk, first_hash, 1);
+	return  ocfs2_divide_xattr_bucket(inode, handle, prev_blk,
+					  new_blk, first_hash, 1);
 }
 
 /*
@@ -3524,9 +3598,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 						     last_blk, new_blk,
 						     v_start);
 		else {
-			ret = ocfs2_half_xattr_cluster(inode, handle,
-						       last_blk, new_blk,
-						       v_start);
+			ret = ocfs2_divide_xattr_cluster(inode, handle,
+							 last_blk, new_blk,
+							 v_start);
 
 			if ((*header_bh)->b_blocknr == last_blk && extend)
 				*extend = 0;
@@ -3743,8 +3817,8 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	}
 
 	/* Move half of the xattr in start_blk to the next bucket. */
-	ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
-				      start_blk + blk_per_bucket, NULL, 0);
+	ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
+					start_blk + blk_per_bucket, NULL, 0);
 
 	le16_add_cpu(&first_xh->xh_num_buckets, 1);
 	ocfs2_journal_dirty(handle, first_bh);
@@ -4435,11 +4509,21 @@ out:
 	return ret;
 }
 
-/* check whether the xattr bucket is filled up with the same hash value. */
+/*
+ * check whether the xattr bucket is filled up with the same hash value.
+ * If we want to insert the xattr with the same hash, return -ENOSPC.
+ * If we want to insert a xattr with different hash value, go ahead
+ * and ocfs2_divide_xattr_bucket will handle this.
+ */
 static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
-					      struct ocfs2_xattr_bucket *bucket)
+					      struct ocfs2_xattr_bucket *bucket,
+					      const char *name)
 {
 	struct ocfs2_xattr_header *xh = bucket->xh;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+	if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
+		return 0;
 
 	if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
 	    xh->xh_entries[0].xe_name_hash) {
@@ -4562,7 +4646,9 @@ try_again:
 		 * one bucket's worth, so check it here whether we need to
 		 * add a new bucket for the insert.
 		 */
-		ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+		ret = ocfs2_check_xattr_bucket_collision(inode,
+							 &xs->bucket,
+							 xi->name);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
-- 
cgit v1.2.3


From c435400140d24fbcb3da6b1e006be831f9056cb6 Mon Sep 17 00:00:00 2001
From: Dmitri Monakhov <dmonakhov@openvz.org>
Date: Mon, 27 Oct 2008 13:01:49 -0700
Subject: ocfs2: truncate outstanding block after direct io failure

Signed-off-by: Dmitri Monakhov <dmonakhov@openvz.org>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Joel Becker <Joel.Becker@oracle.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/file.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3138a385fdbb..e2570a3bc2b2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1866,6 +1866,13 @@ relock:
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
 						    ppos, count, ocount);
 		if (written < 0) {
+			/*
+			 * direct write may have instantiated a few
+			 * blocks outside i_size. Trim these off again.
+			 * Don't need i_size_read because we hold i_mutex.
+			 */
+			if (*ppos + count > inode->i_size)
+				vmtruncate(inode, inode->i_size);
 			ret = written;
 			goto out_dio;
 		}
-- 
cgit v1.2.3


From de29c08528bae45e3fa1171d190f1340e37e0f70 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Wed, 29 Oct 2008 14:45:30 -0700
Subject: ocfs2: fix printk related build warnings in xattr.c

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f3ea7efb48c6..70baffeb1812 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2336,7 +2336,8 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
 	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
 
 	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
-	     "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+	     "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno,
+	     first_hash);
 
 	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
 				      p_blkno, first_hash, num_clusters, xs);
@@ -2360,7 +2361,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 	memset(&bucket, 0, sizeof(bucket));
 
 	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
-	     clusters, blkno);
+	     clusters, (unsigned long long)blkno);
 
 	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
 		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
@@ -2378,7 +2379,8 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		if (i == 0)
 			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
 
-		mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
+		     (unsigned long long)blkno,
 		     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, &bucket, para);
@@ -2714,7 +2716,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	 */
 	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
 
-	mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+	mlog(0, "allocate 1 cluster from %llu to xattr block\n",
+	     (unsigned long long)blkno);
 
 	xh_bh = sb_getblk(inode->i_sb, blkno);
 	if (!xh_bh) {
@@ -2883,8 +2886,8 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 
 	mlog(0, "adjust xattr bucket in %llu, count = %u, "
 	     "xh_free_start = %u, xh_name_value_len = %u.\n",
-	     blkno, le16_to_cpu(xh->xh_count), xh_free_start,
-	     le16_to_cpu(xh->xh_name_value_len));
+	     (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
+	     xh_free_start, le16_to_cpu(xh->xh_name_value_len));
 
 	/*
 	 * sort all the entries by their offset.
@@ -3000,7 +3003,7 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 	prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
 
 	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-	     prev_blkno, new_blkno);
+	     (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
 
 	/*
 	 * We need to update the 1st half of the new cluster and
@@ -3177,7 +3180,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	int blocksize = inode->i_sb->s_blocksize;
 
 	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
-	     blk, new_blk);
+	     (unsigned long long)blk, (unsigned long long)new_blk);
 
 	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
 	if (!s_bhs)
@@ -3376,7 +3379,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	BUG_ON(s_blkno == t_blkno);
 
 	mlog(0, "cp bucket %llu to %llu, target is %d\n",
-	     s_blkno, t_blkno, t_is_new);
+	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
+	     t_is_new);
 
 	s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
 			GFP_NOFS);
@@ -3448,7 +3452,8 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	struct ocfs2_xattr_header *xh;
 	u64 to_blk_start = to_blk;
 
-	mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+	mlog(0, "cp xattrs from cluster %llu to %llu\n",
+	     (unsigned long long)src_blk, (unsigned long long)to_blk);
 
 	/*
 	 * We need to update the new cluster and 1 more for the update of
@@ -3579,7 +3584,8 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 
 	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-	     prev_blk, prev_clusters, new_blk);
+	     (unsigned long long)prev_blk, prev_clusters,
+	     (unsigned long long)new_blk);
 
 	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -3649,7 +3655,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
 	     "previous xattr blkno = %llu\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     prev_cpos, prev_blkno);
+	     prev_cpos, (unsigned long long)prev_blkno);
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
@@ -3736,7 +3742,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		}
 	}
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
-	     num_bits, block, v_start);
+	     num_bits, (unsigned long long)block, v_start);
 	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
 				  num_bits, 0, meta_ac);
 	if (ret < 0) {
@@ -3781,7 +3787,7 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
 
 	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-	     "from %llu, len = %u\n", start_blk,
+	     "from %llu, len = %u\n", (unsigned long long)start_blk,
 	     (unsigned long long)first_bh->b_blocknr, num_clusters);
 
 	BUG_ON(bucket >= num_buckets);
-- 
cgit v1.2.3


From 6c1e183e12dbd78a897a859f13220406296fee31 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Sun, 2 Nov 2008 19:04:21 +0800
Subject: ocfs2: Check search result in ocfs2_xattr_block_get()

ocfs2_xattr_block_get() calls ocfs2_xattr_search() to find an external
xattr, but doesn't check the search result that is passed back via struct
ocfs2_xattr_search. Add a check for search result, and pass back -ENODATA if
the xattr search failed. This avoids a later NULL pointer error.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 70baffeb1812..054e2efb0b7e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -777,6 +777,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 		goto cleanup;
 	}
 
+	if (xs->not_found) {
+		ret = -ENODATA;
+		goto cleanup;
+	}
+
 	xb = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 	size = le64_to_cpu(xs->here->xe_value_size);
 	if (buffer) {
@@ -860,7 +865,7 @@ static int ocfs2_xattr_get(struct inode *inode,
 	down_read(&oi->ip_xattr_sem);
 	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
 				    buffer_size, &xis);
-	if (ret == -ENODATA)
+	if (ret == -ENODATA && di->i_xattr_loc)
 		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
 					    buffer_size, &xbs);
 	up_read(&oi->ip_xattr_sem);
-- 
cgit v1.2.3


From afef80b3d87cae574b8c6b763505f25b74d254ef Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 12 Nov 2008 13:26:54 -0800
Subject: vfs: fix shrink_submounts

In the last refactoring of shrink_submounts a variable was not completely
renamed.  So finish the renaming of mnt to m now.

Without this, if you attempt to unmount an nfs mount that has both automatic
nfs submounts and normal mounts on it, the unmount will succeed when it
should not.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index cce46702d33c..65b3dc844c87 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1815,8 +1815,8 @@ static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 		while (!list_empty(&graveyard)) {
 			m = list_first_entry(&graveyard, struct vfsmount,
 						mnt_expire);
-			touch_mnt_namespace(mnt->mnt_ns);
-			umount_tree(mnt, 1, umounts);
+			touch_mnt_namespace(m->mnt_ns);
+			umount_tree(m, 1, umounts);
 		}
 	}
 }
-- 
cgit v1.2.3


From 6cdfcc275e40b89fb020da1088ead86a61d33115 Mon Sep 17 00:00:00 2001
From: Theodore Tso <tytso@mit.edu>
Date: Wed, 12 Nov 2008 13:27:01 -0800
Subject: ext3: Clean up outdated and incorrect comment for ext3_write_super()

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5dec6d1356c4..f6c94f232ec1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2375,12 +2375,9 @@ int ext3_force_commit(struct super_block *sb)
 /*
  * Ext3 always journals updates to the superblock itself, so we don't
  * have to propagate any other updates to the superblock on disk at this
- * point.  Just start an async writeback to get the buffers on their way
- * to the disk.
- *
- * This implicitly triggers the writebehind on sync().
+ * point.  (We can probably nuke this function altogether, and remove
+ * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
  */
-
 static void ext3_write_super (struct super_block * sb)
 {
 	if (mutex_trylock(&sb->s_lock) != 0)
-- 
cgit v1.2.3


From 278afcbf4fe964230eba67f8fb8235e8b7e63ffb Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 13 Nov 2008 13:22:34 -0600
Subject: dlm: fix shutdown cleanup

Fixes a regression from commit 0f8e0d9a317406612700426fad3efab0b7bbc467,
"dlm: allow multiple lockspace creates".

An extraneous 'else' slipped into a code fragment being moved from
release_lockspace() to dlm_release_lockspace().  The result of the
unwanted 'else' is that dlm threads and structures are not stopped
and cleaned up when the final dlm lockspace is removed.  Trying to
create a new lockspace again afterward will fail with
"kmem_cache_create: duplicate cache dlm_conn" because the cache
was not previously destroyed.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lockspace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d910501de6d2..8d86b7960f0d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -812,7 +812,7 @@ int dlm_release_lockspace(void *lockspace, int force)
 	error = release_lockspace(ls, force);
 	if (!error)
 		ls_count--;
-	else if (!ls_count)
+	if (!ls_count)
 		threads_stop();
 	mutex_unlock(&ls_lock);
 
-- 
cgit v1.2.3


From 3b7952109361c684caf0c50474da8662ecc81019 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 13 Nov 2008 19:45:32 +0000
Subject: [CIFS] Fix cifs reconnection flags

In preparation for Jeff's big umount/mount fixes to remove the possibility of
various races in cifs mount and linked list handling of sessions, sockets and
tree connections, this patch cleans up some repetitive code in cifs_mount,
and addresses a problem with ses->status and tcon->tidStatus in which we
were overloading the "need_reconnect" state with other status in that
field.  So the "need_reconnect" flag has been broken out from those
two state fields (need reconnect was not mutually exclusive from some of the
other possible tid and ses states).  In addition, a few exit cases in
cifs_mount were cleaned up, and a problem with a tcon flag (for lease support)
was not being set consistently for the 2nd mount of the same share

CC: Jeff Layton <jlayton@redhat.com>
CC: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   |   2 +-
 fs/cifs/cifsglob.h |   5 +
 fs/cifs/cifssmb.c  |  40 ++++----
 fs/cifs/connect.c  | 262 ++++++++++++++++++++++++++---------------------------
 fs/cifs/file.c     |   2 +-
 5 files changed, 160 insertions(+), 151 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ac5915d61dca..903bbd6449d1 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1013,7 +1013,7 @@ static int cifs_oplock_thread(void *dummyarg)
 				not bother sending an oplock release if session
 				to server still is disconnected since oplock
 				already released by the server in that case */
-			if (pTcon->tidStatus != CifsNeedReconnect) {
+			if (!pTcon->need_reconnect) {
 				rc = CIFSSMBLock(0, pTcon, netfid,
 						0 /* len */ , 0 /* offset */, 0,
 						0, LOCKING_ANDX_OPLOCK_RELEASE,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1cb1189f24e0..dc0aa140f1bf 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -122,6 +122,8 @@ struct cifs_cred {
  */
 
 struct TCP_Server_Info {
+	struct list_head tcp_ses_list;
+	struct list_head smb_ses_list;
 	/* 15 character server name + 0x20 16th byte indicating type = srv */
 	char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
 	char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
@@ -195,6 +197,7 @@ struct cifsUidInfo {
  */
 struct cifsSesInfo {
 	struct list_head cifsSessionList;
+	struct list_head tcon_list;
 	struct semaphore sesSem;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
@@ -216,6 +219,7 @@ struct cifsSesInfo {
 	char userName[MAX_USERNAME_SIZE + 1];
 	char *domainName;
 	char *password;
+	bool need_reconnect:1; /* connection reset, uid now invalid */
 };
 /* no more than one of the following three session flags may be set */
 #define CIFS_SES_NT4 1
@@ -288,6 +292,7 @@ struct cifsTconInfo {
 	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
 	bool local_lease:1; /* check leases (only) on local system not remote */
+	bool need_reconnect:1; /* connection reset, tid now invalid */
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d5eac48fc415..7f0651b69573 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -190,10 +190,10 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		/* need to prevent multiple threads trying to
 		simultaneously reconnect the same SMB session */
 			down(&tcon->ses->sesSem);
-			if (tcon->ses->status == CifsNeedReconnect)
+			if (tcon->ses->need_reconnect)
 				rc = cifs_setup_session(0, tcon->ses,
 							nls_codepage);
-			if (!rc && (tcon->tidStatus == CifsNeedReconnect)) {
+			if (!rc && (tcon->need_reconnect)) {
 				mark_open_files_invalid(tcon);
 				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
 					      tcon, nls_codepage);
@@ -295,7 +295,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	   check for tcp and smb session status done differently
 	   for those three - in the calling routine */
 	if (tcon) {
-		if (tcon->tidStatus == CifsExiting) {
+		if (tcon->need_reconnect) {
 			/* only tree disconnect, open, and write,
 			  (and ulogoff which does not have tcon)
 			  are allowed as we start force umount */
@@ -337,10 +337,10 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 		/* need to prevent multiple threads trying to
 		simultaneously reconnect the same SMB session */
 			down(&tcon->ses->sesSem);
-			if (tcon->ses->status == CifsNeedReconnect)
+			if (tcon->ses->need_reconnect)
 				rc = cifs_setup_session(0, tcon->ses,
 							nls_codepage);
-			if (!rc && (tcon->tidStatus == CifsNeedReconnect)) {
+			if (!rc && (tcon->need_reconnect)) {
 				mark_open_files_invalid(tcon);
 				rc = CIFSTCon(0, tcon->ses, tcon->treeName,
 					      tcon, nls_codepage);
@@ -759,7 +759,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 
 	/* No need to return error on this operation if tid invalidated and
 	closed on server already e.g. due to tcp session crashing */
-	if (tcon->tidStatus == CifsNeedReconnect) {
+	if (tcon->need_reconnect) {
 		up(&tcon->tconSem);
 		return 0;
 	}
@@ -806,32 +806,36 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 		up(&ses->sesSem);
 		return -EBUSY;
 	}
+
+	if (ses->server == NULL)
+		return -EIO;
+
+	if (ses->need_reconnect)
+		goto session_already_dead; /* no need to send SMBlogoff if uid
+					      already closed due to reconnect */
 	rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
 	if (rc) {
 		up(&ses->sesSem);
 		return rc;
 	}
 
-	if (ses->server) {
-		pSMB->hdr.Mid = GetNextMid(ses->server);
+	pSMB->hdr.Mid = GetNextMid(ses->server);
 
-		if (ses->server->secMode &
+	if (ses->server->secMode &
 		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
 			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-	}
 
 	pSMB->hdr.Uid = ses->Suid;
 
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
-	if (ses->server) {
-		atomic_dec(&ses->server->socketUseCount);
-		if (atomic_read(&ses->server->socketUseCount) == 0) {
-			spin_lock(&GlobalMid_Lock);
-			ses->server->tcpStatus = CifsExiting;
-			spin_unlock(&GlobalMid_Lock);
-			rc = -ESHUTDOWN;
-		}
+session_already_dead:
+	atomic_dec(&ses->server->socketUseCount);
+	if (atomic_read(&ses->server->socketUseCount) == 0) {
+		spin_lock(&GlobalMid_Lock);
+		ses->server->tcpStatus = CifsExiting;
+		spin_unlock(&GlobalMid_Lock);
+		rc = -ESHUTDOWN;
 	}
 	up(&ses->sesSem);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c682be8f2984..c1cd1217c990 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -149,7 +149,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
 		if (ses->server) {
 			if (ses->server == server) {
-				ses->status = CifsNeedReconnect;
+				ses->need_reconnect = true;
 				ses->ipc_tid = 0;
 			}
 		}
@@ -158,7 +158,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	list_for_each(tmp, &GlobalTreeConnectionList) {
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
 		if ((tcon->ses) && (tcon->ses->server == server))
-			tcon->tidStatus = CifsNeedReconnect;
+			tcon->need_reconnect = true;
 	}
 	read_unlock(&GlobalSMBSeslock);
 	/* do not want to be sending data on a socket we are freeing */
@@ -1891,6 +1891,92 @@ kill_cifsd(struct TCP_Server_Info *server)
 		force_sig(SIGKILL, task);
 }
 
+static void setup_cifs_sb(struct smb_vol *pvolume_info,
+			  struct cifs_sb_info *cifs_sb)
+{
+	if (pvolume_info->rsize > CIFSMaxBufSize) {
+		cERROR(1, ("rsize %d too large, using MaxBufSize",
+			pvolume_info->rsize));
+		cifs_sb->rsize = CIFSMaxBufSize;
+	} else if ((pvolume_info->rsize) &&
+			(pvolume_info->rsize <= CIFSMaxBufSize))
+		cifs_sb->rsize = pvolume_info->rsize;
+	else /* default */
+		cifs_sb->rsize = CIFSMaxBufSize;
+
+	if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
+		cERROR(1, ("wsize %d too large, using 4096 instead",
+			  pvolume_info->wsize));
+		cifs_sb->wsize = 4096;
+	} else if (pvolume_info->wsize)
+		cifs_sb->wsize = pvolume_info->wsize;
+	else
+		cifs_sb->wsize = min_t(const int,
+					PAGEVEC_SIZE * PAGE_CACHE_SIZE,
+					127*1024);
+		/* old default of CIFSMaxBufSize was too small now
+		   that SMB Write2 can send multiple pages in kvec.
+		   RFC1001 does not describe what happens when frame
+		   bigger than 128K is sent so use that as max in
+		   conjunction with 52K kvec constraint on arch with 4K
+		   page size  */
+
+	if (cifs_sb->rsize < 2048) {
+		cifs_sb->rsize = 2048;
+		/* Windows ME may prefer this */
+		cFYI(1, ("readsize set to minimum: 2048"));
+	}
+	/* calculate prepath */
+	cifs_sb->prepath = pvolume_info->prepath;
+	if (cifs_sb->prepath) {
+		cifs_sb->prepathlen = strlen(cifs_sb->prepath);
+		/* we can not convert the / to \ in the path
+		separators in the prefixpath yet because we do not
+		know (until reset_cifs_unix_caps is called later)
+		whether POSIX PATH CAP is available. We normalize
+		the / to \ after reset_cifs_unix_caps is called */
+		pvolume_info->prepath = NULL;
+	} else
+		cifs_sb->prepathlen = 0;
+	cifs_sb->mnt_uid = pvolume_info->linux_uid;
+	cifs_sb->mnt_gid = pvolume_info->linux_gid;
+	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
+	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
+	cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
+		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
+
+	if (pvolume_info->noperm)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
+	if (pvolume_info->setuids)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
+	if (pvolume_info->server_ino)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
+	if (pvolume_info->remap)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
+	if (pvolume_info->no_xattr)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
+	if (pvolume_info->sfu_emul)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
+	if (pvolume_info->nobrl)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+	if (pvolume_info->cifs_acl)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
+	if (pvolume_info->override_uid)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
+	if (pvolume_info->override_gid)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
+	if (pvolume_info->dynperm)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
+	if (pvolume_info->direct_io) {
+		cFYI(1, ("mounting share using direct i/o"));
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
+	}
+
+	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
+		cERROR(1, ("mount option dynperm ignored if cifsacl "
+			   "mount option supported"));
+}
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	   char *mount_data, const char *devname)
@@ -1996,9 +2082,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		goto out;
 	}
 
-	if (srvTcp) {
-		cFYI(1, ("Existing tcp session with server found"));
-	} else {	/* create socket */
+	if (!srvTcp) {	/* create socket */
 		if (volume_info.port)
 			sin_server.sin_port = htons(volume_info.port);
 		else
@@ -2074,7 +2158,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		cFYI(1, ("Existing smb sess found (status=%d)",
 			pSesInfo->status));
 		down(&pSesInfo->sesSem);
-		if (pSesInfo->status == CifsNeedReconnect) {
+		if (pSesInfo->need_reconnect) {
 			cFYI(1, ("Session needs reconnect"));
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
@@ -2124,146 +2208,59 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 	/* search for existing tcon to this server share */
 	if (!rc) {
-		if (volume_info.rsize > CIFSMaxBufSize) {
-			cERROR(1, ("rsize %d too large, using MaxBufSize",
-				volume_info.rsize));
-			cifs_sb->rsize = CIFSMaxBufSize;
-		} else if ((volume_info.rsize) &&
-				(volume_info.rsize <= CIFSMaxBufSize))
-			cifs_sb->rsize = volume_info.rsize;
-		else /* default */
-			cifs_sb->rsize = CIFSMaxBufSize;
-
-		if (volume_info.wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
-			cERROR(1, ("wsize %d too large, using 4096 instead",
-				  volume_info.wsize));
-			cifs_sb->wsize = 4096;
-		} else if (volume_info.wsize)
-			cifs_sb->wsize = volume_info.wsize;
-		else
-			cifs_sb->wsize =
-				min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE,
-					127*1024);
-			/* old default of CIFSMaxBufSize was too small now
-			   that SMB Write2 can send multiple pages in kvec.
-			   RFC1001 does not describe what happens when frame
-			   bigger than 128K is sent so use that as max in
-			   conjunction with 52K kvec constraint on arch with 4K
-			   page size  */
-
-		if (cifs_sb->rsize < 2048) {
-			cifs_sb->rsize = 2048;
-			/* Windows ME may prefer this */
-			cFYI(1, ("readsize set to minimum: 2048"));
-		}
-		/* calculate prepath */
-		cifs_sb->prepath = volume_info.prepath;
-		if (cifs_sb->prepath) {
-			cifs_sb->prepathlen = strlen(cifs_sb->prepath);
-			/* we can not convert the / to \ in the path
-			separators in the prefixpath yet because we do not
-			know (until reset_cifs_unix_caps is called later)
-			whether POSIX PATH CAP is available. We normalize
-			the / to \ after reset_cifs_unix_caps is called */
-			volume_info.prepath = NULL;
-		} else
-			cifs_sb->prepathlen = 0;
-		cifs_sb->mnt_uid = volume_info.linux_uid;
-		cifs_sb->mnt_gid = volume_info.linux_gid;
-		cifs_sb->mnt_file_mode = volume_info.file_mode;
-		cifs_sb->mnt_dir_mode = volume_info.dir_mode;
-		cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
-			cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
-
-		if (volume_info.noperm)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
-		if (volume_info.setuids)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID;
-		if (volume_info.server_ino)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
-		if (volume_info.remap)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
-		if (volume_info.no_xattr)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
-		if (volume_info.sfu_emul)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
-		if (volume_info.nobrl)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
-		if (volume_info.cifs_acl)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
-		if (volume_info.override_uid)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
-		if (volume_info.override_gid)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
-		if (volume_info.dynperm)
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
-		if (volume_info.direct_io) {
-			cFYI(1, ("mounting share using direct i/o"));
-			cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
-		}
-
-		if ((volume_info.cifs_acl) && (volume_info.dynperm))
-			cERROR(1, ("mount option dynperm ignored if cifsacl "
-				   "mount option supported"));
+		setup_cifs_sb(&volume_info, cifs_sb);
 
 		tcon =
 		    find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
 			     volume_info.username);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
-			/* we can have only one retry value for a connection
-			   to a share so for resources mounted more than once
-			   to the same server share the last value passed in
-			   for the retry flag is used */
-			tcon->retry = volume_info.retry;
-			tcon->nocase = volume_info.nocase;
-			tcon->local_lease = volume_info.local_lease;
 			if (tcon->seal != volume_info.seal)
 				cERROR(1, ("transport encryption setting "
 					   "conflicts with existing tid"));
 		} else {
 			tcon = tconInfoAlloc();
-			if (tcon == NULL)
+			if (tcon == NULL) {
 				rc = -ENOMEM;
-			else {
-				/* check for null share name ie connecting to
-				 * dfs root */
-
-				/* BB check if this works for exactly length
-				 * three strings */
-				if ((strchr(volume_info.UNC + 3, '\\') == NULL)
-				    && (strchr(volume_info.UNC + 3, '/') ==
-					NULL)) {
-/*					rc = connect_to_dfs_path(xid, pSesInfo,
-						"", cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						  CIFS_MOUNT_MAP_SPECIAL_CHR);*/
-					cFYI(1, ("DFS root not supported"));
-					rc = -ENODEV;
-					goto out;
-				} else {
-					/* BB Do we need to wrap sesSem around
-					 * this TCon call and Unix SetFS as
-					 * we do on SessSetup and reconnect? */
-					rc = CIFSTCon(xid, pSesInfo,
-						volume_info.UNC,
-						tcon, cifs_sb->local_nls);
-					cFYI(1, ("CIFS Tcon rc = %d", rc));
-					if (volume_info.nodfs) {
-						tcon->Flags &=
-							~SMB_SHARE_IS_IN_DFS;
-						cFYI(1, ("DFS disabled (%d)",
-							tcon->Flags));
-					}
-				}
-				if (!rc) {
-					atomic_inc(&pSesInfo->inUse);
-					tcon->retry = volume_info.retry;
-					tcon->nocase = volume_info.nocase;
-					tcon->seal = volume_info.seal;
+				goto mount_fail_check;
+			}
+
+			/* check for null share name ie connect to dfs root */
+
+			/* BB check if works for exactly length 3 strings */
+			if ((strchr(volume_info.UNC + 3, '\\') == NULL)
+			    && (strchr(volume_info.UNC + 3, '/') == NULL)) {
+				/* rc = connect_to_dfs_path(...) */
+				cFYI(1, ("DFS root not supported"));
+				rc = -ENODEV;
+				goto mount_fail_check;
+			} else {
+				/* BB Do we need to wrap sesSem around
+				 * this TCon call and Unix SetFS as
+				 * we do on SessSetup and reconnect? */
+				rc = CIFSTCon(xid, pSesInfo, volume_info.UNC,
+					      tcon, cifs_sb->local_nls);
+				cFYI(1, ("CIFS Tcon rc = %d", rc));
+				if (volume_info.nodfs) {
+					tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
+					cFYI(1, ("DFS disabled (%d)",
+						tcon->Flags));
 				}
 			}
+			if (!rc) {
+				atomic_inc(&pSesInfo->inUse);
+				tcon->seal = volume_info.seal;
+			} else
+				goto mount_fail_check;
 		}
+
+		/* we can have only one retry value for a connection
+		   to a share so for resources mounted more than once
+		   to the same server share the last value passed in
+		   for the retry flag is used */
+		tcon->retry = volume_info.retry;
+		tcon->nocase = volume_info.nocase;
+		tcon->local_lease = volume_info.local_lease;
 	}
 	if (pSesInfo) {
 		if (pSesInfo->capabilities & CAP_LARGE_FILES) {
@@ -2276,6 +2273,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	sb->s_time_gran = 100;
 
 /* on error free sesinfo and tcon struct if needed */
+mount_fail_check:
 	if (rc) {
 		/* if session setup failed, use count is zero but
 		we still need to free cifsd thread */
@@ -3518,6 +3516,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	/* above now done in SendReceive */
 	if ((rc == 0) && (tcon != NULL)) {
 		tcon->tidStatus = CifsGood;
+		tcon->need_reconnect = false;
 		tcon->tid = smb_buffer_response->Tid;
 		bcc_ptr = pByteArea(smb_buffer_response);
 		length = strnlen(bcc_ptr, BCC(smb_buffer_response) - 2);
@@ -3746,6 +3745,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 		cFYI(1, ("CIFS Session Established successfully"));
 			spin_lock(&GlobalMid_Lock);
 			pSesInfo->status = CifsGood;
+			pSesInfo->need_reconnect = false;
 			spin_unlock(&GlobalMid_Lock);
 	}
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ead1a3bb0256..1540adaa593d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -493,7 +493,7 @@ int cifs_close(struct inode *inode, struct file *file)
 		if (pTcon) {
 			/* no sense reconnecting to close a file that is
 			   already closed */
-			if (pTcon->tidStatus != CifsNeedReconnect) {
+			if (!pTcon->need_reconnect) {
 				timeout = 2;
 				while ((atomic_read(&pSMBFile->wrtPending) != 0)
 					&& (timeout <= 2048)) {
-- 
cgit v1.2.3


From fb396016647ae9de5b3bd8c4ee4f7b9cc7148bd5 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 13 Nov 2008 20:04:07 +0000
Subject: [CIFS] remove unused list, add new cifs sock list to prepare for
 mount/umount fix

Also adds two lines missing from the previous patch (for the need reconnect flag in the
/proc/fs/cifs/DebugData handling)

The new global_cifs_sock_list is added, and initialized in init_cifs but not used yet.
Jeff Layton will be adding code in to use that and to remove the GlobalTcon and GlobalSMBSession
lists.

CC: Jeff Layton <jlayton@redhat.com>
CC: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c |  4 ++--
 fs/cifs/cifsfs.c     |  6 +++---
 fs/cifs/cifsglob.h   | 23 ++++++++---------------
 3 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 69a12aae91d3..ba8723d95996 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -204,7 +204,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 		else
 			seq_printf(m, " type: %d ", dev_type);
 
-		if (tcon->tidStatus == CifsNeedReconnect)
+		if (tcon->need_reconnect)
 			seq_puts(m, "\tDISCONNECTED ");
 	}
 	read_unlock(&GlobalSMBSeslock);
@@ -311,7 +311,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 		i++;
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
 		seq_printf(m, "\n%d) %s", i, tcon->treeName);
-		if (tcon->tidStatus == CifsNeedReconnect)
+		if (tcon->need_reconnect)
 			seq_puts(m, "\tDISCONNECTED ");
 		seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
 			atomic_read(&tcon->num_smbs_sent),
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 903bbd6449d1..af16a2406b1c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1059,9 +1059,9 @@ init_cifs(void)
 {
 	int rc = 0;
 	cifs_proc_init();
-/*	INIT_LIST_HEAD(&GlobalServerList);*/	/* BB not implemented yet */
-	INIT_LIST_HEAD(&GlobalSMBSessionList);
-	INIT_LIST_HEAD(&GlobalTreeConnectionList);
+	INIT_LIST_HEAD(&global_cifs_sock_list);
+	INIT_LIST_HEAD(&GlobalSMBSessionList); /* BB to be removed by jl */
+	INIT_LIST_HEAD(&GlobalTreeConnectionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalOplock_Q);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index dc0aa140f1bf..d6357dc1be72 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -592,22 +592,15 @@ require use of the stronger protocol */
 #define GLOBAL_EXTERN extern
 #endif
 
-/*
- * The list of servers that did not respond with NT LM 0.12.
- * This list helps improve performance and eliminate the messages indicating
- * that we had a communications error talking to the server in this list.
- */
-/* Feature not supported */
-/* GLOBAL_EXTERN struct servers_not_supported *NotSuppList; */
-
-/*
- * The following is a hash table of all the users we know about.
- */
-GLOBAL_EXTERN struct smbUidInfo *GlobalUidList[UID_HASH];
 
-/* GLOBAL_EXTERN struct list_head GlobalServerList; BB not implemented yet */
-GLOBAL_EXTERN struct list_head GlobalSMBSessionList;
-GLOBAL_EXTERN struct list_head GlobalTreeConnectionList;
+/* the list of TCP_Server_Info structures, ie each of the sockets
+ * connecting our client to a distinct server (ip address), is
+ * chained together by global_cifs_sock_list. The list of all our SMB
+ * sessions (and from that the tree connections) can be found
+ * by iterating over global_cifs_sock_list */
+GLOBAL_EXTERN struct list_head global_cifs_sock_list;
+GLOBAL_EXTERN struct list_head GlobalSMBSessionList; /* BB to be removed by jl*/
+GLOBAL_EXTERN struct list_head GlobalTreeConnectionList; /* BB to be removed */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
-- 
cgit v1.2.3


From 3ec332ef7a38c2327e18d087d4120a8e3bd3dc6e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 14 Nov 2008 03:35:10 +0000
Subject: [CIFS] clean up server protocol handling

We're currently declaring both a sockaddr_in and sockaddr_in6 on the
stack, but we really only need storage for one of them. Declare a
sockaddr struct and cast it to the proper type. Also, eliminate the
protocolType field in the TCP_Server_Info struct. It's redundant since
we have a sa_family field in the sockaddr anyway.

We may need to revisit this if SCTP is ever implemented, but for now
this will simplify the code.

CIFS over IPv6 also has a number of problems currently. This fixes all
of them that I found. Eventually, it would be nice to move more of the
code to be protocol independent, but this is a start.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c |  4 ++--
 fs/cifs/cifsglob.h    |  3 +--
 fs/cifs/connect.c     | 57 +++++++++++++++++++++++++++------------------------
 3 files changed, 33 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index fcee9298b620..0ab2fb5afef1 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -73,8 +73,8 @@ struct key_type cifs_spnego_key_type = {
  * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN	13
 
-/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
-#define MAX_IPV6_ADDR_LEN	42
+/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
+#define MAX_IPV6_ADDR_LEN	43
 
 /* strlen of "host=" */
 #define HOST_KEY_LEN		5
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d6357dc1be72..13dc48414a78 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -85,8 +85,7 @@ enum securityEnum {
 };
 
 enum protocolEnum {
-	IPV4 = 0,
-	IPV6,
+	TCP = 0,
 	SCTP
 	/* Netbios frames protocol not supported at this time */
 };
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c1cd1217c990..30ab8dc68e17 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -193,7 +193,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	while ((server->tcpStatus != CifsExiting) &&
 	       (server->tcpStatus != CifsGood)) {
 		try_to_freeze();
-		if (server->protocolType == IPV6) {
+		if (server->addr.sockAddr6.sin6_family == AF_INET6) {
 			rc = ipv6_connect(&server->addr.sockAddr6,
 					  &server->ssocket, server->noautotune);
 		} else {
@@ -1983,10 +1983,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 {
 	int rc = 0;
 	int xid;
-	int address_type = AF_INET;
 	struct socket *csocket = NULL;
-	struct sockaddr_in sin_server;
-	struct sockaddr_in6 sin_server6;
+	struct sockaddr addr;
+	struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
+	struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
 	struct smb_vol volume_info;
 	struct cifsSesInfo *pSesInfo = NULL;
 	struct cifsSesInfo *existingCifsSes = NULL;
@@ -1997,6 +1997,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 /* cFYI(1, ("Entering cifs_mount. Xid: %d with: %s", xid, mount_data)); */
 
+	memset(&addr, 0, sizeof(struct sockaddr));
 	memset(&volume_info, 0, sizeof(struct smb_vol));
 	if (cifs_parse_mount_options(mount_data, devname, &volume_info)) {
 		rc = -EINVAL;
@@ -2019,16 +2020,16 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 	if (volume_info.UNCip && volume_info.UNC) {
 		rc = cifs_inet_pton(AF_INET, volume_info.UNCip,
-				    &sin_server.sin_addr.s_addr);
+				    &sin_server->sin_addr.s_addr);
 
 		if (rc <= 0) {
 			/* not ipv4 address, try ipv6 */
 			rc = cifs_inet_pton(AF_INET6, volume_info.UNCip,
-					    &sin_server6.sin6_addr.in6_u);
+					    &sin_server6->sin6_addr.in6_u);
 			if (rc > 0)
-				address_type = AF_INET6;
+				addr.sa_family = AF_INET6;
 		} else {
-			address_type = AF_INET;
+			addr.sa_family = AF_INET;
 		}
 
 		if (rc <= 0) {
@@ -2068,39 +2069,38 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	if (address_type == AF_INET)
-		existingCifsSes = cifs_find_tcp_session(&sin_server.sin_addr,
+	if (addr.sa_family == AF_INET)
+		existingCifsSes = cifs_find_tcp_session(&sin_server->sin_addr,
 			NULL /* no ipv6 addr */,
 			volume_info.username, &srvTcp);
-	else if (address_type == AF_INET6) {
+	else if (addr.sa_family == AF_INET6) {
 		cFYI(1, ("looking for ipv6 address"));
 		existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
-			&sin_server6.sin6_addr,
+			&sin_server6->sin6_addr,
 			volume_info.username, &srvTcp);
 	} else {
 		rc = -EINVAL;
 		goto out;
 	}
 
-	if (!srvTcp) {	/* create socket */
-		if (volume_info.port)
-			sin_server.sin_port = htons(volume_info.port);
-		else
-			sin_server.sin_port = 0;
-		if (address_type == AF_INET6) {
+	if (!srvTcp) {
+		if (addr.sa_family == AF_INET6) {
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
 			/* other OS never observed in Wild doing 139 with v6 */
-			rc = ipv6_connect(&sin_server6, &csocket,
+			sin_server6->sin6_port = htons(volume_info.port);
+			rc = ipv6_connect(sin_server6, &csocket,
 					volume_info.noblocksnd);
-		} else
-			rc = ipv4_connect(&sin_server, &csocket,
+		} else {
+			sin_server->sin_port = htons(volume_info.port);
+			rc = ipv4_connect(sin_server, &csocket,
 				  volume_info.source_rfc1001_name,
 				  volume_info.target_rfc1001_name,
 				  volume_info.noblocksnd,
 				  volume_info.noautotune);
+		}
 		if (rc < 0) {
-			cERROR(1, ("Error connecting to IPv4 socket. "
+			cERROR(1, ("Error connecting to socket. "
 				   "Aborting operation"));
 			if (csocket != NULL)
 				sock_release(csocket);
@@ -2115,12 +2115,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		} else {
 			srvTcp->noblocksnd = volume_info.noblocksnd;
 			srvTcp->noautotune = volume_info.noautotune;
-			memcpy(&srvTcp->addr.sockAddr, &sin_server,
-				sizeof(struct sockaddr_in));
+			if (addr.sa_family == AF_INET6)
+				memcpy(&srvTcp->addr.sockAddr6, sin_server6,
+					sizeof(struct sockaddr_in6));
+			else
+				memcpy(&srvTcp->addr.sockAddr, sin_server,
+					sizeof(struct sockaddr_in));
 			atomic_set(&srvTcp->inFlight, 0);
 			/* BB Add code for ipv6 case too */
 			srvTcp->ssocket = csocket;
-			srvTcp->protocolType = IPV4;
 			srvTcp->hostname = extract_hostname(volume_info.UNC);
 			if (IS_ERR(srvTcp->hostname)) {
 				rc = PTR_ERR(srvTcp->hostname);
@@ -2172,7 +2175,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		else {
 			pSesInfo->server = srvTcp;
 			sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
-				NIPQUAD(sin_server.sin_addr.s_addr));
+				NIPQUAD(sin_server->sin_addr.s_addr));
 		}
 
 		if (!rc) {
@@ -2211,7 +2214,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		setup_cifs_sb(&volume_info, cifs_sb);
 
 		tcon =
-		    find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
+		    find_unc(sin_server->sin_addr.s_addr, volume_info.UNC,
 			     volume_info.username);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
-- 
cgit v1.2.3


From e7ddee9037e7dd43de1ad08b51727e552aedd836 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Nov 2008 13:44:38 -0500
Subject: cifs: disable sharing session and tcon and add new TCP sharing code

The code that allows these structs to be shared is extremely racy.
Disable the sharing of SMB and tcon structs for now until we can
come up with a way to do this that's race free.

We want to continue to share TCP sessions, however, since they are
required for multiuser mounts. For that, implement a new (hopefully
race-free) scheme. Add a new global list of TCP sessions, and take
care to get a reference to it whenever we're dealing with one.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c |   2 +-
 fs/cifs/cifsfs.c     |   3 +-
 fs/cifs/cifsglob.h   |  17 +++--
 fs/cifs/cifsproto.h  |   1 +
 fs/cifs/cifssmb.c    |  18 ++---
 fs/cifs/connect.c    | 205 ++++++++++++++++++---------------------------------
 6 files changed, 96 insertions(+), 150 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ba8723d95996..40b5108fb4f9 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -144,7 +144,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			seq_printf(m, "TCP status: %d\n\tLocal Users To "
 				    "Server: %d SecMode: 0x%x Req On Wire: %d",
 				ses->server->tcpStatus,
-				atomic_read(&ses->server->socketUseCount),
+				ses->server->srv_count,
 				ses->server->secMode,
 				atomic_read(&ses->server->inFlight));
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index af16a2406b1c..2946dab0718f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1059,7 +1059,7 @@ init_cifs(void)
 {
 	int rc = 0;
 	cifs_proc_init();
-	INIT_LIST_HEAD(&global_cifs_sock_list);
+	INIT_LIST_HEAD(&cifs_tcp_ses_list);
 	INIT_LIST_HEAD(&GlobalSMBSessionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalTreeConnectionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalOplock_Q);
@@ -1089,6 +1089,7 @@ init_cifs(void)
 	GlobalMaxActiveXid = 0;
 	memset(Local_System_Name, 0, 15);
 	rwlock_init(&GlobalSMBSeslock);
+	rwlock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&GlobalMid_Lock);
 
 	if (cifs_max_pending < 2) {
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 13dc48414a78..313f7bfedec7 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -123,6 +123,7 @@ struct cifs_cred {
 struct TCP_Server_Info {
 	struct list_head tcp_ses_list;
 	struct list_head smb_ses_list;
+	int srv_count; /* reference counter */
 	/* 15 character server name + 0x20 16th byte indicating type = srv */
 	char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
 	char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
@@ -144,7 +145,6 @@ struct TCP_Server_Info {
 	bool svlocal:1;			/* local server or remote */
 	bool noblocksnd;		/* use blocking sendmsg */
 	bool noautotune;		/* do not autotune send buf sizes */
-	atomic_t socketUseCount; /* number of open cifs sessions on socket */
 	atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
 	atomic_t inSend; /* requests trying to send */
@@ -591,13 +591,18 @@ require use of the stronger protocol */
 #define GLOBAL_EXTERN extern
 #endif
 
-
-/* the list of TCP_Server_Info structures, ie each of the sockets
+/*
+ * the list of TCP_Server_Info structures, ie each of the sockets
  * connecting our client to a distinct server (ip address), is
- * chained together by global_cifs_sock_list. The list of all our SMB
+ * chained together by cifs_tcp_ses_list. The list of all our SMB
  * sessions (and from that the tree connections) can be found
- * by iterating over global_cifs_sock_list */
-GLOBAL_EXTERN struct list_head global_cifs_sock_list;
+ * by iterating over cifs_tcp_ses_list
+ */
+GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
+
+/* protects cifs_tcp_ses_list and srv_count for each tcp session */
+GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
+
 GLOBAL_EXTERN struct list_head GlobalSMBSessionList; /* BB to be removed by jl*/
 GLOBAL_EXTERN struct list_head GlobalTreeConnectionList; /* BB to be removed */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6f21ecb85ce5..0250a994c6e6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -102,6 +102,7 @@ extern void acl_to_uid_mode(struct inode *inode, const char *path,
 			    const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
+extern void cifs_put_tcp_session(struct TCP_Server_Info *server);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7f0651b69573..cd9e9a145e4d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -664,8 +664,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			rc = -EIO;
 			goto neg_err_exit;
 		}
-
-		if (server->socketUseCount.counter > 1) {
+		read_lock(&cifs_tcp_ses_lock);
+		if (server->srv_count > 1) {
+			read_unlock(&cifs_tcp_ses_lock);
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
@@ -674,9 +675,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 					pSMBr->u.extended_response.GUID,
 					16);
 			}
-		} else
+		} else {
+			read_unlock(&cifs_tcp_ses_lock);
 			memcpy(server->server_GUID,
 			       pSMBr->u.extended_response.GUID, 16);
+		}
 
 		if (count == 16) {
 			server->secType = RawNTLMSSP;
@@ -830,12 +833,9 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 session_already_dead:
-	atomic_dec(&ses->server->socketUseCount);
-	if (atomic_read(&ses->server->socketUseCount) == 0) {
-		spin_lock(&GlobalMid_Lock);
-		ses->server->tcpStatus = CifsExiting;
-		spin_unlock(&GlobalMid_Lock);
-		rc = -ESHUTDOWN;
+	if (ses->server) {
+		cifs_put_tcp_session(ses->server);
+		rc = 0;
 	}
 	up(&ses->sesSem);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 30ab8dc68e17..a0314259f94d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -659,6 +659,11 @@ multi_t2_fnd:
 		}
 	} /* end while !EXITING */
 
+	/* take it off the list, if it's not already */
+	write_lock(&cifs_tcp_ses_lock);
+	list_del_init(&server->tcp_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
 	spin_lock(&GlobalMid_Lock);
 	server->tcpStatus = CifsExiting;
 	spin_unlock(&GlobalMid_Lock);
@@ -1357,92 +1362,66 @@ cifs_parse_mount_options(char *options, const char *devname,
 	return 0;
 }
 
-static struct cifsSesInfo *
-cifs_find_tcp_session(struct in_addr *target_ip_addr,
-		      struct in6_addr *target_ip6_addr,
-		      char *userName, struct TCP_Server_Info **psrvTcp)
+static struct TCP_Server_Info *
+cifs_find_tcp_session(struct sockaddr *addr)
 {
 	struct list_head *tmp;
-	struct cifsSesInfo *ses;
-
-	*psrvTcp = NULL;
-
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if (!ses->server)
+	struct TCP_Server_Info *server;
+	struct sockaddr_in *addr4 = (struct sockaddr_in *) addr;
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) addr;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &cifs_tcp_ses_list) {
+		server = list_entry(tmp, struct TCP_Server_Info,
+				    tcp_ses_list);
+
+		/*
+		 * the demux thread can exit on its own while still in CifsNew
+		 * so don't accept any sockets in that state. Since the
+		 * tcpStatus never changes back to CifsNew it's safe to check
+		 * for this without a lock.
+		 */
+		if (server->tcpStatus == CifsNew)
 			continue;
 
-		if (target_ip_addr &&
-		    ses->server->addr.sockAddr.sin_addr.s_addr != target_ip_addr->s_addr)
-				continue;
-		else if (target_ip6_addr &&
-			 memcmp(&ses->server->addr.sockAddr6.sin6_addr,
-				target_ip6_addr, sizeof(*target_ip6_addr)))
-				continue;
-		/* BB lock server and tcp session; increment use count here?? */
-
-		/* found a match on the TCP session */
-		*psrvTcp = ses->server;
+		if (addr->sa_family == AF_INET &&
+		    (addr4->sin_addr.s_addr !=
+		     server->addr.sockAddr.sin_addr.s_addr))
+			continue;
+		else if (addr->sa_family == AF_INET6 &&
+			 memcmp(&server->addr.sockAddr6.sin6_addr,
+				&addr6->sin6_addr, sizeof(addr6->sin6_addr)))
+			continue;
 
-		/* BB check if reconnection needed */
-		if (strncmp(ses->userName, userName, MAX_USERNAME_SIZE) == 0) {
-			read_unlock(&GlobalSMBSeslock);
-			/* Found exact match on both TCP and
-			   SMB sessions */
-			return ses;
-		}
-		/* else tcp and smb sessions need reconnection */
+		++server->srv_count;
+		write_unlock(&cifs_tcp_ses_lock);
+		return server;
 	}
-	read_unlock(&GlobalSMBSeslock);
-
+	write_unlock(&cifs_tcp_ses_lock);
 	return NULL;
 }
 
-static struct cifsTconInfo *
-find_unc(__be32 new_target_ip_addr, char *uncName, char *userName)
+void
+cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
-	struct list_head *tmp;
-	struct cifsTconInfo *tcon;
-	__be32 old_ip;
-
-	read_lock(&GlobalSMBSeslock);
-
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		cFYI(1, ("Next tcon"));
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if (!tcon->ses || !tcon->ses->server)
-			continue;
-
-		old_ip = tcon->ses->server->addr.sockAddr.sin_addr.s_addr;
-		cFYI(1, ("old ip addr: %x == new ip %x ?",
-			old_ip, new_target_ip_addr));
-
-		if (old_ip != new_target_ip_addr)
-			continue;
-
-		/* BB lock tcon, server, tcp session and increment use count? */
-		/* found a match on the TCP session */
-		/* BB check if reconnection needed */
-		cFYI(1, ("IP match, old UNC: %s new: %s",
-			tcon->treeName, uncName));
+	struct task_struct *task;
 
-		if (strncmp(tcon->treeName, uncName, MAX_TREE_SIZE))
-			continue;
+	write_lock(&cifs_tcp_ses_lock);
+	if (--server->srv_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
 
-		cFYI(1, ("and old usr: %s new: %s",
-			tcon->treeName, uncName));
+	list_del_init(&server->tcp_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
 
-		if (strncmp(tcon->ses->userName, userName, MAX_USERNAME_SIZE))
-			continue;
-
-		/* matched smb session (user name) */
-		read_unlock(&GlobalSMBSeslock);
-		return tcon;
-	}
+	spin_lock(&GlobalMid_Lock);
+	server->tcpStatus = CifsExiting;
+	spin_unlock(&GlobalMid_Lock);
 
-	read_unlock(&GlobalSMBSeslock);
-	return NULL;
+	task = xchg(&server->tsk, NULL);
+	if (task)
+		force_sig(SIGKILL, task);
 }
 
 int
@@ -1881,16 +1860,6 @@ convert_delimiter(char *path, char delim)
 	}
 }
 
-static void
-kill_cifsd(struct TCP_Server_Info *server)
-{
-	struct task_struct *task;
-
-	task = xchg(&server->tsk, NULL);
-	if (task)
-		force_sig(SIGKILL, task);
-}
-
 static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			  struct cifs_sb_info *cifs_sb)
 {
@@ -2069,21 +2038,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	if (addr.sa_family == AF_INET)
-		existingCifsSes = cifs_find_tcp_session(&sin_server->sin_addr,
-			NULL /* no ipv6 addr */,
-			volume_info.username, &srvTcp);
-	else if (addr.sa_family == AF_INET6) {
-		cFYI(1, ("looking for ipv6 address"));
-		existingCifsSes = cifs_find_tcp_session(NULL /* no ipv4 addr */,
-			&sin_server6->sin6_addr,
-			volume_info.username, &srvTcp);
-	} else {
-		rc = -EINVAL;
-		goto out;
-	}
-
-	if (!srvTcp) {
+	srvTcp = cifs_find_tcp_session(&addr);
+	if (srvTcp) {
+		cFYI(1, ("Existing tcp session with server found"));
+	} else {	/* create socket */
 		if (addr.sa_family == AF_INET6) {
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
@@ -2153,6 +2111,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			memcpy(srvTcp->server_RFC1001_name,
 				volume_info.target_rfc1001_name, 16);
 			srvTcp->sequence_number = 0;
+			INIT_LIST_HEAD(&srvTcp->tcp_ses_list);
+			++srvTcp->srv_count;
+			write_lock(&cifs_tcp_ses_lock);
+			list_add(&srvTcp->tcp_ses_list,
+				 &cifs_tcp_ses_list);
+			write_unlock(&cifs_tcp_ses_lock);
 		}
 	}
 
@@ -2204,8 +2168,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
 			up(&pSesInfo->sesSem);
-			if (!rc)
-				atomic_inc(&srvTcp->socketUseCount);
 		}
 	}
 
@@ -2213,9 +2175,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (!rc) {
 		setup_cifs_sb(&volume_info, cifs_sb);
 
-		tcon =
-		    find_unc(sin_server->sin_addr.s_addr, volume_info.UNC,
-			     volume_info.username);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
 			if (tcon->seal != volume_info.seal)
@@ -2278,35 +2237,21 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 /* on error free sesinfo and tcon struct if needed */
 mount_fail_check:
 	if (rc) {
-		/* if session setup failed, use count is zero but
-		we still need to free cifsd thread */
-		if (atomic_read(&srvTcp->socketUseCount) == 0) {
-			spin_lock(&GlobalMid_Lock);
-			srvTcp->tcpStatus = CifsExiting;
-			spin_unlock(&GlobalMid_Lock);
-			kill_cifsd(srvTcp);
-		}
-		 /* If find_unc succeeded then rc == 0 so we can not end */
-		if (tcon)  /* up accidently freeing someone elses tcon struct */
+		/* If find_unc succeeded then rc == 0 so we can not end */
+		/* up accidently freeing someone elses tcon struct */
+		if (tcon)
 			tconInfoFree(tcon);
+
 		if (existingCifsSes == NULL) {
 			if (pSesInfo) {
 				if ((pSesInfo->server) &&
-				    (pSesInfo->status == CifsGood)) {
-					int temp_rc;
-					temp_rc = CIFSSMBLogoff(xid, pSesInfo);
-					/* if the socketUseCount is now zero */
-					if ((temp_rc == -ESHUTDOWN) &&
-					    (pSesInfo->server))
-						kill_cifsd(pSesInfo->server);
-				} else {
+				    (pSesInfo->status == CifsGood))
+					CIFSSMBLogoff(xid, pSesInfo);
+				else {
 					cFYI(1, ("No session or bad tcon"));
-					if (pSesInfo->server) {
-						spin_lock(&GlobalMid_Lock);
-						srvTcp->tcpStatus = CifsExiting;
-						spin_unlock(&GlobalMid_Lock);
-						kill_cifsd(pSesInfo->server);
-					}
+					if (pSesInfo->server)
+						cifs_put_tcp_session(
+							pSesInfo->server);
 				}
 				sesInfoFree(pSesInfo);
 				/* pSesInfo = NULL; */
@@ -3613,13 +3558,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 			if (rc == -EBUSY) {
 				FreeXid(xid);
 				return 0;
-			} else if (rc == -ESHUTDOWN) {
-				cFYI(1, ("Waking up socket by sending signal"));
-				if (ses->server)
-					kill_cifsd(ses->server);
-				rc = 0;
-			} /* else - we have an smb session
-				left on this socket do not kill cifsd */
+			}
 		} else
 			cFYI(1, ("No session or bad tcon"));
 	}
-- 
cgit v1.2.3


From 14fbf50d695207754daeb96270b3027a3821121f Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Nov 2008 13:53:46 -0500
Subject: cifs: reinstate sharing of SMB sessions sans races

We do this by abandoning the global list of SMB sessions and instead
moving to a per-server list. This entails adding a new list head to the
TCP_Server_Info struct. The refcounting for the cifsSesInfo is moved to
a non-atomic variable. We have to protect it by a lock anyway, so there's
no benefit to making it an atomic. The list and refcount are protected
by the global cifs_tcp_ses_lock.

The patch also adds new routines to find and put SMB sessions
that properly take and put references under the lock.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c |  53 ++++++------
 fs/cifs/cifsfs.c     |  17 ++--
 fs/cifs/cifsglob.h   |   6 +-
 fs/cifs/cifsproto.h  |   1 -
 fs/cifs/cifssmb.c    |  22 ++---
 fs/cifs/connect.c    | 226 ++++++++++++++++++++++++++++-----------------------
 fs/cifs/misc.c       |  16 ++--
 7 files changed, 175 insertions(+), 166 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 40b5108fb4f9..59841a68b0b6 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -107,9 +107,9 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 #ifdef CONFIG_PROC_FS
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
-	struct list_head *tmp;
-	struct list_head *tmp1;
+	struct list_head *tmp, *tmp2, *tmp3;
 	struct mid_q_entry *mid_entry;
+	struct TCP_Server_Info *server;
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 	int i;
@@ -122,43 +122,45 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "Servers:");
 
 	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &cifs_tcp_ses_list) {
+		server = list_entry(tmp, struct TCP_Server_Info,
+				    tcp_ses_list);
 		i++;
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) ||
-		   (ses->serverNOS == NULL)) {
-			seq_printf(m, "\nentry for %s not fully "
-					"displayed\n\t", ses->serverName);
-		} else {
-			seq_printf(m,
+		list_for_each(tmp2, &server->smb_ses_list) {
+			ses = list_entry(tmp2, struct cifsSesInfo,
+					 smb_ses_list);
+			if ((ses->serverDomain == NULL) ||
+				(ses->serverOS == NULL) ||
+				(ses->serverNOS == NULL)) {
+				seq_printf(m, "\nentry for %s not fully "
+					   "displayed\n\t", ses->serverName);
+			} else {
+				seq_printf(m,
 				    "\n%d) Name: %s  Domain: %s Mounts: %d OS:"
 				    " %s  \n\tNOS: %s\tCapability: 0x%x\n\tSMB"
 				    " session status: %d\t",
 				i, ses->serverName, ses->serverDomain,
-				atomic_read(&ses->inUse),
-				ses->serverOS, ses->serverNOS,
+				ses->ses_count, ses->serverOS, ses->serverNOS,
 				ses->capabilities, ses->status);
-		}
-		if (ses->server) {
+			}
 			seq_printf(m, "TCP status: %d\n\tLocal Users To "
-				    "Server: %d SecMode: 0x%x Req On Wire: %d",
-				ses->server->tcpStatus,
-				ses->server->srv_count,
-				ses->server->secMode,
-				atomic_read(&ses->server->inFlight));
+				   "Server: %d SecMode: 0x%x Req On Wire: %d",
+				   server->tcpStatus, server->srv_count,
+				   server->secMode,
+				   atomic_read(&server->inFlight));
 
 #ifdef CONFIG_CIFS_STATS2
 			seq_printf(m, " In Send: %d In MaxReq Wait: %d",
-				atomic_read(&ses->server->inSend),
-				atomic_read(&ses->server->num_waiters));
+				atomic_read(&server->inSend),
+				atomic_read(&server->num_waiters));
 #endif
 
 			seq_puts(m, "\nMIDs:\n");
 
 			spin_lock(&GlobalMid_Lock);
-			list_for_each(tmp1, &ses->server->pending_mid_q) {
-				mid_entry = list_entry(tmp1, struct
+			list_for_each(tmp3, &server->pending_mid_q) {
+				mid_entry = list_entry(tmp3, struct
 					mid_q_entry,
 					qhead);
 				seq_printf(m, "State: %d com: %d pid:"
@@ -171,9 +173,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			}
 			spin_unlock(&GlobalMid_Lock);
 		}
-
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 	seq_putc(m, '\n');
 
 	seq_puts(m, "Shares:");
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 2946dab0718f..a1e96620b097 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1031,24 +1031,24 @@ static int cifs_oplock_thread(void *dummyarg)
 static int cifs_dnotify_thread(void *dummyarg)
 {
 	struct list_head *tmp;
-	struct cifsSesInfo *ses;
+	struct TCP_Server_Info *server;
 
 	do {
 		if (try_to_freeze())
 			continue;
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(15*HZ);
-		read_lock(&GlobalSMBSeslock);
 		/* check if any stuck requests that need
 		   to be woken up and wakeq so the
 		   thread can wake up and error out */
-		list_for_each(tmp, &GlobalSMBSessionList) {
-			ses = list_entry(tmp, struct cifsSesInfo,
-				cifsSessionList);
-			if (ses->server && atomic_read(&ses->server->inFlight))
-				wake_up_all(&ses->server->response_q);
+		read_lock(&cifs_tcp_ses_lock);
+		list_for_each(tmp, &cifs_tcp_ses_list) {
+			server = list_entry(tmp, struct TCP_Server_Info,
+					 tcp_ses_list);
+			if (atomic_read(&server->inFlight))
+				wake_up_all(&server->response_q);
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	} while (!kthread_should_stop());
 
 	return 0;
@@ -1060,7 +1060,6 @@ init_cifs(void)
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-	INIT_LIST_HEAD(&GlobalSMBSessionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalTreeConnectionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalOplock_Q);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 313f7bfedec7..631a99f72f22 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -195,14 +195,14 @@ struct cifsUidInfo {
  * Session structure.  One of these for each uid session with a particular host
  */
 struct cifsSesInfo {
-	struct list_head cifsSessionList;
+	struct list_head smb_ses_list;
 	struct list_head tcon_list;
 	struct semaphore sesSem;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
 #endif
 	struct TCP_Server_Info *server;	/* pointer to server info */
-	atomic_t inUse; /* # of mounts (tree connections) on this ses */
+	int ses_count;		/* reference counter */
 	enum statusEnum status;
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
@@ -602,8 +602,6 @@ GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
 
 /* protects cifs_tcp_ses_list and srv_count for each tcp session */
 GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
-
-GLOBAL_EXTERN struct list_head GlobalSMBSessionList; /* BB to be removed by jl*/
 GLOBAL_EXTERN struct list_head GlobalTreeConnectionList; /* BB to be removed */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 0250a994c6e6..6f21ecb85ce5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -102,7 +102,6 @@ extern void acl_to_uid_mode(struct inode *inode, const char *path,
 			    const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
-extern void cifs_put_tcp_session(struct TCP_Server_Info *server);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
 			const char *);
 extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index cd9e9a145e4d..9c95617baa4d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -799,20 +799,16 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	int rc = 0;
 
 	cFYI(1, ("In SMBLogoff for session disconnect"));
-	if (ses)
-		down(&ses->sesSem);
-	else
-		return -EIO;
-
-	atomic_dec(&ses->inUse);
-	if (atomic_read(&ses->inUse) > 0) {
-		up(&ses->sesSem);
-		return -EBUSY;
-	}
 
-	if (ses->server == NULL)
+	/*
+	 * BB: do we need to check validity of ses and server? They should
+	 * always be valid since we have an active reference. If not, that
+	 * should probably be a BUG()
+	 */
+	if (!ses || !ses->server)
 		return -EIO;
 
+	down(&ses->sesSem);
 	if (ses->need_reconnect)
 		goto session_already_dead; /* no need to send SMBlogoff if uid
 					      already closed due to reconnect */
@@ -833,10 +829,6 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 session_already_dead:
-	if (ses->server) {
-		cifs_put_tcp_session(ses->server);
-		rc = 0;
-	}
 	up(&ses->sesSem);
 
 	/* if session dead then we do not need to do ulogoff,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a0314259f94d..44130e052e0b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -144,23 +144,18 @@ cifs_reconnect(struct TCP_Server_Info *server)
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
-		if (ses->server) {
-			if (ses->server == server) {
-				ses->need_reconnect = true;
-				ses->ipc_tid = 0;
-			}
-		}
-		/* else tcp and smb sessions need reconnection */
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		ses->need_reconnect = true;
+		ses->ipc_tid = 0;
 	}
+	read_unlock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &GlobalTreeConnectionList) {
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
 		if ((tcon->ses) && (tcon->ses->server == server))
 			tcon->need_reconnect = true;
 	}
-	read_unlock(&GlobalSMBSeslock);
 	/* do not want to be sending data on a socket we are freeing */
 	down(&server->tcpSem);
 	if (server->ssocket) {
@@ -696,29 +691,29 @@ multi_t2_fnd:
 	if (smallbuf) /* no sense logging a debug message if NULL */
 		cifs_small_buf_release(smallbuf);
 
-	read_lock(&GlobalSMBSeslock);
+	/*
+	 * BB: we shouldn't have to do any of this. It shouldn't be
+	 * possible to exit from the thread with active SMB sessions
+	 */
+	read_lock(&cifs_tcp_ses_lock);
 	if (list_empty(&server->pending_mid_q)) {
 		/* loop through server session structures attached to this and
 		    mark them dead */
-		list_for_each(tmp, &GlobalSMBSessionList) {
-			ses =
-			    list_entry(tmp, struct cifsSesInfo,
-				       cifsSessionList);
-			if (ses->server == server) {
-				ses->status = CifsExiting;
-				ses->server = NULL;
-			}
+		list_for_each(tmp, &server->smb_ses_list) {
+			ses = list_entry(tmp, struct cifsSesInfo,
+					 smb_ses_list);
+			ses->status = CifsExiting;
+			ses->server = NULL;
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	} else {
 		/* although we can not zero the server struct pointer yet,
 		since there are active requests which may depnd on them,
 		mark the corresponding SMB sessions as exiting too */
-		list_for_each(tmp, &GlobalSMBSessionList) {
+		list_for_each(tmp, &server->smb_ses_list) {
 			ses = list_entry(tmp, struct cifsSesInfo,
-					 cifsSessionList);
-			if (ses->server == server)
-				ses->status = CifsExiting;
+					 smb_ses_list);
+			ses->status = CifsExiting;
 		}
 
 		spin_lock(&GlobalMid_Lock);
@@ -733,7 +728,7 @@ multi_t2_fnd:
 			}
 		}
 		spin_unlock(&GlobalMid_Lock);
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 		/* 1/8th of sec is more than enough time for them to exit */
 		msleep(125);
 	}
@@ -755,14 +750,13 @@ multi_t2_fnd:
 	if there are any pointing to this (e.g
 	if a crazy root user tried to kill cifsd
 	kernel thread explicitly this might happen) */
-	write_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalSMBSessionList) {
-		ses = list_entry(tmp, struct cifsSesInfo,
-				cifsSessionList);
-		if (ses->server == server)
-			ses->server = NULL;
+	/* BB: This shouldn't be necessary, see above */
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		ses->server = NULL;
 	}
-	write_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	kfree(server->hostname);
 	task_to_wake = xchg(&server->tsk, NULL);
@@ -1401,7 +1395,7 @@ cifs_find_tcp_session(struct sockaddr *addr)
 	return NULL;
 }
 
-void
+static void
 cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
 	struct task_struct *task;
@@ -1424,6 +1418,50 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 		force_sig(SIGKILL, task);
 }
 
+static struct cifsSesInfo *
+cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
+{
+	struct list_head *tmp;
+	struct cifsSesInfo *ses;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &server->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		if (strncmp(ses->userName, username, MAX_USERNAME_SIZE))
+			continue;
+
+		++ses->ses_count;
+		write_unlock(&cifs_tcp_ses_lock);
+		return ses;
+	}
+	write_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
+
+static void
+cifs_put_smb_ses(struct cifsSesInfo *ses)
+{
+	int xid;
+	struct TCP_Server_Info *server = ses->server;
+
+	write_lock(&cifs_tcp_ses_lock);
+	if (--ses->ses_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+
+	list_del_init(&ses->smb_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	if (ses->status == CifsGood) {
+		xid = GetXid();
+		CIFSSMBLogoff(xid, ses);
+		_FreeXid(xid);
+	}
+	sesInfoFree(ses);
+	cifs_put_tcp_session(server);
+}
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	     const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -1958,7 +1996,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
 	struct smb_vol volume_info;
 	struct cifsSesInfo *pSesInfo = NULL;
-	struct cifsSesInfo *existingCifsSes = NULL;
 	struct cifsTconInfo *tcon = NULL;
 	struct TCP_Server_Info *srvTcp = NULL;
 
@@ -2112,6 +2149,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 				volume_info.target_rfc1001_name, 16);
 			srvTcp->sequence_number = 0;
 			INIT_LIST_HEAD(&srvTcp->tcp_ses_list);
+			INIT_LIST_HEAD(&srvTcp->smb_ses_list);
 			++srvTcp->srv_count;
 			write_lock(&cifs_tcp_ses_lock);
 			list_add(&srvTcp->tcp_ses_list,
@@ -2120,10 +2158,16 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 		}
 	}
 
-	if (existingCifsSes) {
-		pSesInfo = existingCifsSes;
+	pSesInfo = cifs_find_smb_ses(srvTcp, volume_info.username);
+	if (pSesInfo) {
 		cFYI(1, ("Existing smb sess found (status=%d)",
 			pSesInfo->status));
+		/*
+		 * The existing SMB session already has a reference to srvTcp,
+		 * so we can put back the extra one we got before
+		 */
+		cifs_put_tcp_session(srvTcp);
+
 		down(&pSesInfo->sesSem);
 		if (pSesInfo->need_reconnect) {
 			cFYI(1, ("Session needs reconnect"));
@@ -2134,41 +2178,44 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
-		if (pSesInfo == NULL)
+		if (pSesInfo == NULL) {
 			rc = -ENOMEM;
-		else {
-			pSesInfo->server = srvTcp;
-			sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
-				NIPQUAD(sin_server->sin_addr.s_addr));
+			goto mount_fail_check;
 		}
 
-		if (!rc) {
-			/* volume_info.password freed at unmount */
-			if (volume_info.password) {
-				pSesInfo->password = volume_info.password;
-				/* set to NULL to prevent freeing on exit */
-				volume_info.password = NULL;
-			}
-			if (volume_info.username)
-				strncpy(pSesInfo->userName,
-					volume_info.username,
-					MAX_USERNAME_SIZE);
-			if (volume_info.domainname) {
-				int len = strlen(volume_info.domainname);
-				pSesInfo->domainName =
-					kmalloc(len + 1, GFP_KERNEL);
-				if (pSesInfo->domainName)
-					strcpy(pSesInfo->domainName,
-						volume_info.domainname);
-			}
-			pSesInfo->linux_uid = volume_info.linux_uid;
-			pSesInfo->overrideSecFlg = volume_info.secFlg;
-			down(&pSesInfo->sesSem);
-			/* BB FIXME need to pass vol->secFlgs BB */
-			rc = cifs_setup_session(xid, pSesInfo,
-						cifs_sb->local_nls);
-			up(&pSesInfo->sesSem);
-		}
+		/* new SMB session uses our srvTcp ref */
+		pSesInfo->server = srvTcp;
+		sprintf(pSesInfo->serverName, "%u.%u.%u.%u",
+			NIPQUAD(sin_server->sin_addr.s_addr));
+
+		write_lock(&cifs_tcp_ses_lock);
+		list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
+		write_unlock(&cifs_tcp_ses_lock);
+
+		/* volume_info.password freed at unmount */
+		if (volume_info.password) {
+			pSesInfo->password = volume_info.password;
+			/* set to NULL to prevent freeing on exit */
+			volume_info.password = NULL;
+		}
+		if (volume_info.username)
+			strncpy(pSesInfo->userName, volume_info.username,
+				MAX_USERNAME_SIZE);
+		if (volume_info.domainname) {
+			int len = strlen(volume_info.domainname);
+			pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
+			if (pSesInfo->domainName)
+				strcpy(pSesInfo->domainName,
+					volume_info.domainname);
+		}
+		pSesInfo->linux_uid = volume_info.linux_uid;
+		pSesInfo->overrideSecFlg = volume_info.secFlg;
+		down(&pSesInfo->sesSem);
+
+		/* BB FIXME need to pass vol->secFlgs BB */
+		rc = cifs_setup_session(xid, pSesInfo,
+					cifs_sb->local_nls);
+		up(&pSesInfo->sesSem);
 	}
 
 	/* search for existing tcon to this server share */
@@ -2209,11 +2256,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 						tcon->Flags));
 				}
 			}
-			if (!rc) {
-				atomic_inc(&pSesInfo->inUse);
-				tcon->seal = volume_info.seal;
-			} else
+			if (rc)
 				goto mount_fail_check;
+			tcon->seal = volume_info.seal;
 		}
 
 		/* we can have only one retry value for a connection
@@ -2234,29 +2279,19 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
 	sb->s_time_gran = 100;
 
-/* on error free sesinfo and tcon struct if needed */
 mount_fail_check:
+	/* on error free sesinfo and tcon struct if needed */
 	if (rc) {
 		/* If find_unc succeeded then rc == 0 so we can not end */
 		/* up accidently freeing someone elses tcon struct */
 		if (tcon)
 			tconInfoFree(tcon);
 
-		if (existingCifsSes == NULL) {
-			if (pSesInfo) {
-				if ((pSesInfo->server) &&
-				    (pSesInfo->status == CifsGood))
-					CIFSSMBLogoff(xid, pSesInfo);
-				else {
-					cFYI(1, ("No session or bad tcon"));
-					if (pSesInfo->server)
-						cifs_put_tcp_session(
-							pSesInfo->server);
-				}
-				sesInfoFree(pSesInfo);
-				/* pSesInfo = NULL; */
-			}
-		}
+		/* should also end up putting our tcp session ref if needed */
+		if (pSesInfo)
+			cifs_put_smb_ses(pSesInfo);
+		else
+			cifs_put_tcp_session(srvTcp);
 	} else {
 		atomic_inc(&tcon->useCount);
 		cifs_sb->tcon = tcon;
@@ -3551,16 +3586,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 		}
 		DeleteTconOplockQEntries(cifs_sb->tcon);
 		tconInfoFree(cifs_sb->tcon);
-		if ((ses) && (ses->server)) {
-			/* save off task so we do not refer to ses later */
-			cFYI(1, ("About to do SMBLogoff "));
-			rc = CIFSSMBLogoff(xid, ses);
-			if (rc == -EBUSY) {
-				FreeXid(xid);
-				return 0;
-			}
-		} else
-			cFYI(1, ("No session or bad tcon"));
+		cifs_put_smb_ses(ses);
 	}
 
 	cifs_sb->tcon = NULL;
@@ -3568,8 +3594,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	cifs_sb->prepathlen = 0;
 	cifs_sb->prepath = NULL;
 	kfree(tmp);
-	if (ses)
-		sesInfoFree(ses);
 
 	FreeXid(xid);
 	return rc;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 88786ba02d27..46c8c7baccba 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -75,12 +75,11 @@ sesInfoAlloc(void)
 
 	ret_buf = kzalloc(sizeof(struct cifsSesInfo), GFP_KERNEL);
 	if (ret_buf) {
-		write_lock(&GlobalSMBSeslock);
 		atomic_inc(&sesInfoAllocCount);
 		ret_buf->status = CifsNew;
-		list_add(&ret_buf->cifsSessionList, &GlobalSMBSessionList);
+		++ret_buf->ses_count;
+		INIT_LIST_HEAD(&ret_buf->smb_ses_list);
 		init_MUTEX(&ret_buf->sesSem);
-		write_unlock(&GlobalSMBSeslock);
 	}
 	return ret_buf;
 }
@@ -93,10 +92,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
 		return;
 	}
 
-	write_lock(&GlobalSMBSeslock);
 	atomic_dec(&sesInfoAllocCount);
-	list_del(&buf_to_free->cifsSessionList);
-	write_unlock(&GlobalSMBSeslock);
 	kfree(buf_to_free->serverOS);
 	kfree(buf_to_free->serverDomain);
 	kfree(buf_to_free->serverNOS);
@@ -350,9 +346,9 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 				if (current->fsuid != treeCon->ses->linux_uid) {
 					cFYI(1, ("Multiuser mode and UID "
 						 "did not match tcon uid"));
-					read_lock(&GlobalSMBSeslock);
-					list_for_each(temp_item, &GlobalSMBSessionList) {
-						ses = list_entry(temp_item, struct cifsSesInfo, cifsSessionList);
+					read_lock(&cifs_tcp_ses_lock);
+					list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
+						ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
 						if (ses->linux_uid == current->fsuid) {
 							if (ses->server == treeCon->ses->server) {
 								cFYI(1, ("found matching uid substitute right smb_uid"));
@@ -364,7 +360,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 							}
 						}
 					}
-					read_unlock(&GlobalSMBSeslock);
+					read_unlock(&cifs_tcp_ses_lock);
 				}
 			}
 		}
-- 
cgit v1.2.3


From d82c2df54e2f7e447476350848d8eccc8d2fe46a Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 15 Nov 2008 00:07:26 +0000
Subject: [CIFS] minor cleanup to cifs_mount

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 74 +++++++++++++++++++++++++------------------------------
 1 file changed, 34 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 44130e052e0b..a3dc0d7cafc3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1368,7 +1368,6 @@ cifs_find_tcp_session(struct sockaddr *addr)
 	list_for_each(tmp, &cifs_tcp_ses_list) {
 		server = list_entry(tmp, struct TCP_Server_Info,
 				    tcp_ses_list);
-
 		/*
 		 * the demux thread can exit on its own while still in CifsNew
 		 * so don't accept any sockets in that state. Since the
@@ -1389,6 +1388,7 @@ cifs_find_tcp_session(struct sockaddr *addr)
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
+		cFYI(1, ("Existing tcp session with server found"));
 		return server;
 	}
 	write_unlock(&cifs_tcp_ses_lock);
@@ -2076,9 +2076,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	}
 
 	srvTcp = cifs_find_tcp_session(&addr);
-	if (srvTcp) {
-		cFYI(1, ("Existing tcp session with server found"));
-	} else {	/* create socket */
+	if (!srvTcp) { /* create socket */
 		if (addr.sa_family == AF_INET6) {
 			cFYI(1, ("attempting ipv6 connect"));
 			/* BB should we allow ipv6 on port 139? */
@@ -2292,45 +2290,41 @@ mount_fail_check:
 			cifs_put_smb_ses(pSesInfo);
 		else
 			cifs_put_tcp_session(srvTcp);
-	} else {
-		atomic_inc(&tcon->useCount);
-		cifs_sb->tcon = tcon;
-		tcon->ses = pSesInfo;
-
-		/* do not care if following two calls succeed - informational */
-		if (!tcon->ipc) {
-			CIFSSMBQFSDeviceInfo(xid, tcon);
-			CIFSSMBQFSAttributeInfo(xid, tcon);
-		}
-
-		/* tell server which Unix caps we support */
-		if (tcon->ses->capabilities & CAP_UNIX)
-			/* reset of caps checks mount to see if unix extensions
-			   disabled for just this mount */
-			reset_cifs_unix_caps(xid, tcon, sb, &volume_info);
-		else
-			tcon->unix_ext = 0; /* server does not support them */
-
-		/* convert forward to back slashes in prepath here if needed */
-		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
-			convert_delimiter(cifs_sb->prepath,
-					  CIFS_DIR_SEP(cifs_sb));
+		goto out;
+	}
+	atomic_inc(&tcon->useCount);
+	cifs_sb->tcon = tcon;
+	tcon->ses = pSesInfo;
 
-		if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
-			cifs_sb->rsize = 1024 * 127;
-			cFYI(DBG2,
-				("no very large read support, rsize now 127K"));
-		}
-		if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
-			cifs_sb->wsize = min(cifs_sb->wsize,
-					     (tcon->ses->server->maxBuf -
-					      MAX_CIFS_HDR_SIZE));
-		if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
-			cifs_sb->rsize = min(cifs_sb->rsize,
-					     (tcon->ses->server->maxBuf -
-					      MAX_CIFS_HDR_SIZE));
+	/* do not care if following two calls succeed - informational */
+	if (!tcon->ipc) {
+		CIFSSMBQFSDeviceInfo(xid, tcon);
+		CIFSSMBQFSAttributeInfo(xid, tcon);
 	}
 
+	/* tell server which Unix caps we support */
+	if (tcon->ses->capabilities & CAP_UNIX)
+		/* reset of caps checks mount to see if unix extensions
+		   disabled for just this mount */
+		reset_cifs_unix_caps(xid, tcon, sb, &volume_info);
+	else
+		tcon->unix_ext = 0; /* server does not support them */
+
+	/* convert forward to back slashes in prepath here if needed */
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
+		convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb));
+
+	if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
+		cifs_sb->rsize = 1024 * 127;
+		cFYI(DBG2, ("no very large read support, rsize now 127K"));
+	}
+	if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
+		cifs_sb->wsize = min(cifs_sb->wsize,
+			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+	if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
+		cifs_sb->rsize = min(cifs_sb->rsize,
+			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
+
 	/* volume_info.password is freed above when existing session found
 	(in which case it is not needed anymore) but when new sesion is created
 	the password ptr is put in the new session structure (in which case the
-- 
cgit v1.2.3


From 8f7b0ba1c853919b85b54774775f567f30006107 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sat, 15 Nov 2008 01:15:43 +0000
Subject: Fix inotify watch removal/umount races

Inotify watch removals suck violently.

To kick the watch out we need (in this order) inode->inotify_mutex and
ih->mutex.  That's fine if we have a hold on inode; however, for all
other cases we need to make damn sure we don't race with umount.  We can
*NOT* just grab a reference to a watch - inotify_unmount_inodes() will
happily sail past it and we'll end with reference to inode potentially
outliving its superblock.

Ideally we just want to grab an active reference to superblock if we
can; that will make sure we won't go into inotify_unmount_inodes() until
we are done.  Cleanup is just deactivate_super().

However, that leaves a messy case - what if we *are* racing with
umount() and active references to superblock can't be acquired anymore?
We can bump ->s_count, grab ->s_umount, which will almost certainly wait
until the superblock is shut down and the watch in question is pining
for fjords.  That's fine, but there is a problem - we might have hit the
window between ->s_active getting to 0 / ->s_count - below S_BIAS (i.e.
the moment when superblock is past the point of no return and is heading
for shutdown) and the moment when deactivate_super() acquires
->s_umount.

We could just do drop_super(), yield() and retry, but that's rather
antisocial and this stuff is luser-triggerable.  OTOH, having grabbed
->s_umount and having found that we'd got there first (i.e.  that
->s_root is non-NULL) we know that we won't race with
inotify_unmount_inodes().

So we could grab a reference to watch and do the rest as above, just
with drop_super() instead of deactivate_super(), right? Wrong.  We had
to drop ih->mutex before we could grab ->s_umount.  So the watch
could've been gone already.

That still can be dealt with - we need to save watch->wd, do idr_find()
and compare its result with our pointer.  If they match, we either have
the damn thing still alive or we'd lost not one but two races at once,
the watch had been killed and a new one got created with the same ->wd
at the same address.  That couldn't have happened in inotify_destroy(),
but inotify_rm_wd() could run into that.  Still, "new one got created"
is not a problem - we have every right to kill it or leave it alone,
whatever's more convenient.

So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
"grab it and kill it" check.  If it's been our original watch, we are
fine, if it's a newcomer - nevermind, just pretend that we'd won the
race and kill the fscker anyway; we are safe since we know that its
superblock won't be going away.

And yes, this is far beyond mere "not very pretty"; so's the entire
concept of inotify to start with.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: Greg KH <greg@kroah.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inotify.c            | 150 ++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/inotify.h |  11 ++++
 kernel/audit_tree.c     |  91 +++++++++++++++++------------
 kernel/auditfilter.c    |  14 +++--
 4 files changed, 218 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index 690e72595e6e..7bbed1b89825 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(get_inotify_watch);
 
+int pin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		atomic_inc(&watch->count);
+		return 1;
+	}
+	spin_unlock(&sb_lock);
+	return 0;
+}
+
 /**
  * put_inotify_watch - decrements the ref count on a given watch.  cleans up
  * watch references if the count reaches zero.  inotify_watch is freed by
@@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(put_inotify_watch);
 
+void unpin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	deactivate_super(sb);
+}
+
 /*
  * inotify_handle_get_wd - returns the next WD for use by the given handle
  *
@@ -479,6 +500,112 @@ void inotify_init_watch(struct inotify_watch *watch)
 }
 EXPORT_SYMBOL_GPL(inotify_init_watch);
 
+/*
+ * Watch removals suck violently.  To kick the watch out we need (in this
+ * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
+ * a hold on inode; however, for all other cases we need to make damn sure
+ * we don't race with umount.  We can *NOT* just grab a reference to a
+ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
+ * with reference to inode potentially outliving its superblock.  Ideally
+ * we just want to grab an active reference to superblock if we can; that
+ * will make sure we won't go into inotify_unmount_inodes() until we are
+ * done.  Cleanup is just deactivate_super().  However, that leaves a messy
+ * case - what if we *are* racing with umount() and active references to
+ * superblock can't be acquired anymore?  We can bump ->s_count, grab
+ * ->s_umount, which will almost certainly wait until the superblock is shut
+ * down and the watch in question is pining for fjords.  That's fine, but
+ * there is a problem - we might have hit the window between ->s_active
+ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
+ * is past the point of no return and is heading for shutdown) and the
+ * moment when deactivate_super() acquires ->s_umount.  We could just do
+ * drop_super(), yield() and retry, but that's rather antisocial and this
+ * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
+ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
+ * that we won't race with inotify_unmount_inodes().  So we could grab a
+ * reference to watch and do the rest as above, just with drop_super() instead
+ * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
+ * could grab ->s_umount.  So the watch could've been gone already.
+ *
+ * That still can be dealt with - we need to save watch->wd, do idr_find()
+ * and compare its result with our pointer.  If they match, we either have
+ * the damn thing still alive or we'd lost not one but two races at once,
+ * the watch had been killed and a new one got created with the same ->wd
+ * at the same address.  That couldn't have happened in inotify_destroy(),
+ * but inotify_rm_wd() could run into that.  Still, "new one got created"
+ * is not a problem - we have every right to kill it or leave it alone,
+ * whatever's more convenient.
+ *
+ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ * "grab it and kill it" check.  If it's been our original watch, we are
+ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ * race and kill the fscker anyway; we are safe since we know that its
+ * superblock won't be going away.
+ *
+ * And yes, this is far beyond mere "not very pretty"; so's the entire
+ * concept of inotify to start with.
+ */
+
+/**
+ * pin_to_kill - pin the watch down for removal
+ * @ih: inotify handle
+ * @watch: watch to kill
+ *
+ * Called with ih->mutex held, drops it.  Possible return values:
+ * 0 - nothing to do, it has died
+ * 1 - remove it, drop the reference and deactivate_super()
+ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
+ * that variant, since it involved a lot of PITA, but that's the best that
+ * could've been done.
+ */
+static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	s32 wd = watch->wd;
+
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		get_inotify_watch(watch);
+		mutex_unlock(&ih->mutex);
+		return 1;	/* the best outcome */
+	}
+	sb->s_count++;
+	spin_unlock(&sb_lock);
+	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
+	down_read(&sb->s_umount);
+	if (likely(!sb->s_root)) {
+		/* fs is already shut down; the watch is dead */
+		drop_super(sb);
+		return 0;
+	}
+	/* raced with the final deactivate_super() */
+	mutex_lock(&ih->mutex);
+	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
+		/* the watch is dead */
+		mutex_unlock(&ih->mutex);
+		drop_super(sb);
+		return 0;
+	}
+	/* still alive or freed and reused with the same sb and wd; kill */
+	get_inotify_watch(watch);
+	mutex_unlock(&ih->mutex);
+	return 2;
+}
+
+static void unpin_and_kill(struct inotify_watch *watch, int how)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	put_inotify_watch(watch);
+	switch (how) {
+	case 1:
+		deactivate_super(sb);
+		break;
+	case 2:
+		drop_super(sb);
+	}
+}
+
 /**
  * inotify_destroy - clean up and destroy an inotify instance
  * @ih: inotify handle
@@ -490,11 +617,15 @@ void inotify_destroy(struct inotify_handle *ih)
 	 * pretty.  We cannot do a simple iteration over the list, because we
 	 * do not know the inode until we iterate to the watch.  But we need to
 	 * hold inode->inotify_mutex before ih->mutex.  The following works.
+	 *
+	 * AV: it had to become even uglier to start working ;-/
 	 */
 	while (1) {
 		struct inotify_watch *watch;
 		struct list_head *watches;
+		struct super_block *sb;
 		struct inode *inode;
+		int how;
 
 		mutex_lock(&ih->mutex);
 		watches = &ih->watches;
@@ -503,8 +634,10 @@ void inotify_destroy(struct inotify_handle *ih)
 			break;
 		}
 		watch = list_first_entry(watches, struct inotify_watch, h_list);
-		get_inotify_watch(watch);
-		mutex_unlock(&ih->mutex);
+		sb = watch->inode->i_sb;
+		how = pin_to_kill(ih, watch);
+		if (!how)
+			continue;
 
 		inode = watch->inode;
 		mutex_lock(&inode->inotify_mutex);
@@ -518,7 +651,7 @@ void inotify_destroy(struct inotify_handle *ih)
 
 		mutex_unlock(&ih->mutex);
 		mutex_unlock(&inode->inotify_mutex);
-		put_inotify_watch(watch);
+		unpin_and_kill(watch, how);
 	}
 
 	/* free this handle: the put matching the get in inotify_init() */
@@ -719,7 +852,9 @@ void inotify_evict_watch(struct inotify_watch *watch)
 int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
 	struct inotify_watch *watch;
+	struct super_block *sb;
 	struct inode *inode;
+	int how;
 
 	mutex_lock(&ih->mutex);
 	watch = idr_find(&ih->idr, wd);
@@ -727,9 +862,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 		mutex_unlock(&ih->mutex);
 		return -EINVAL;
 	}
-	get_inotify_watch(watch);
+	sb = watch->inode->i_sb;
+	how = pin_to_kill(ih, watch);
+	if (!how)
+		return 0;
+
 	inode = watch->inode;
-	mutex_unlock(&ih->mutex);
 
 	mutex_lock(&inode->inotify_mutex);
 	mutex_lock(&ih->mutex);
@@ -740,7 +878,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 
 	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(watch);
+	unpin_and_kill(watch, how);
 
 	return 0;
 }
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index bd578578a8b9..37ea2894b3c0 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -134,6 +134,8 @@ extern void inotify_remove_watch_locked(struct inotify_handle *,
 					struct inotify_watch *);
 extern void get_inotify_watch(struct inotify_watch *);
 extern void put_inotify_watch(struct inotify_watch *);
+extern int pin_inotify_watch(struct inotify_watch *);
+extern void unpin_inotify_watch(struct inotify_watch *);
 
 #else
 
@@ -228,6 +230,15 @@ static inline void put_inotify_watch(struct inotify_watch *watch)
 {
 }
 
+extern inline int pin_inotify_watch(struct inotify_watch *watch)
+{
+	return 0;
+}
+
+extern inline void unpin_inotify_watch(struct inotify_watch *watch)
+{
+}
+
 #endif	/* CONFIG_INOTIFY */
 
 #endif	/* __KERNEL __ */
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ba0e0d934f2..8b509441f49a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
 	struct list_head trees;		/* with root here */
 	int dead;
 	int count;
+	atomic_long_t refs;
 	struct rcu_head head;
 	struct node {
 		struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
  * tree is refcounted; one reference for "some rules on rules_list refer to
  * it", one for each chunk with pointer to it.
  *
- * chunk is refcounted by embedded inotify_watch.
+ * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * of watch contributes 1 to .refs).
  *
  * node.index allows to get from node.list to containing chunk.
  * MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
 	INIT_LIST_HEAD(&chunk->hash);
 	INIT_LIST_HEAD(&chunk->trees);
 	chunk->count = count;
+	atomic_long_set(&chunk->refs, 1);
 	for (i = 0; i < count; i++) {
 		INIT_LIST_HEAD(&chunk->owners[i].list);
 		chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
 	return chunk;
 }
 
-static void __free_chunk(struct rcu_head *rcu)
+static void free_chunk(struct audit_chunk *chunk)
 {
-	struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
 	int i;
 
 	for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
 	kfree(chunk);
 }
 
-static inline void free_chunk(struct audit_chunk *chunk)
+void audit_put_chunk(struct audit_chunk *chunk)
 {
-	call_rcu(&chunk->head, __free_chunk);
+	if (atomic_long_dec_and_test(&chunk->refs))
+		free_chunk(chunk);
 }
 
-void audit_put_chunk(struct audit_chunk *chunk)
+static void __put_chunk(struct rcu_head *rcu)
 {
-	put_inotify_watch(&chunk->watch);
+	struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+	audit_put_chunk(chunk);
 }
 
 enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
 
 	list_for_each_entry_rcu(p, list, hash) {
 		if (p->watch.inode == inode) {
-			get_inotify_watch(&p->watch);
+			atomic_long_inc(&p->refs);
 			return p;
 		}
 	}
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
 
 /* tagging and untagging inodes with trees */
 
-static void untag_chunk(struct audit_chunk *chunk, struct node *p)
+static struct audit_chunk *find_chunk(struct node *p)
+{
+	int index = p->index & ~(1U<<31);
+	p -= index;
+	return container_of(p, struct audit_chunk, owners[0]);
+}
+
+static void untag_chunk(struct node *p)
 {
+	struct audit_chunk *chunk = find_chunk(p);
 	struct audit_chunk *new;
 	struct audit_tree *owner;
 	int size = chunk->count - 1;
 	int i, j;
 
+	if (!pin_inotify_watch(&chunk->watch)) {
+		/*
+		 * Filesystem is shutting down; all watches are getting
+		 * evicted, just take it off the node list for this
+		 * tree and let the eviction logics take care of the
+		 * rest.
+		 */
+		owner = p->owner;
+		if (owner->root == chunk) {
+			list_del_init(&owner->same_root);
+			owner->root = NULL;
+		}
+		list_del_init(&p->list);
+		p->owner = NULL;
+		put_tree(owner);
+		return;
+	}
+
+	spin_unlock(&hash_lock);
+
+	/*
+	 * pin_inotify_watch() succeeded, so the watch won't go away
+	 * from under us.
+	 */
 	mutex_lock(&chunk->watch.inode->inotify_mutex);
 	if (chunk->dead) {
 		mutex_unlock(&chunk->watch.inode->inotify_mutex);
-		return;
+		goto out;
 	}
 
 	owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
 		inotify_evict_watch(&chunk->watch);
 		mutex_unlock(&chunk->watch.inode->inotify_mutex);
 		put_inotify_watch(&chunk->watch);
-		return;
+		goto out;
 	}
 
 	new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
 	inotify_evict_watch(&chunk->watch);
 	mutex_unlock(&chunk->watch.inode->inotify_mutex);
 	put_inotify_watch(&chunk->watch);
-	return;
+	goto out;
 
 Fallback:
 	// do the best we can
@@ -277,6 +313,9 @@ Fallback:
 	put_tree(owner);
 	spin_unlock(&hash_lock);
 	mutex_unlock(&chunk->watch.inode->inotify_mutex);
+out:
+	unpin_inotify_watch(&chunk->watch);
+	spin_lock(&hash_lock);
 }
 
 static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	return 0;
 }
 
-static struct audit_chunk *find_chunk(struct node *p)
-{
-	int index = p->index & ~(1U<<31);
-	p -= index;
-	return container_of(p, struct audit_chunk, owners[0]);
-}
-
 static void kill_rules(struct audit_tree *tree)
 {
 	struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
 	spin_lock(&hash_lock);
 	while (!list_empty(&victim->chunks)) {
 		struct node *p;
-		struct audit_chunk *chunk;
 
 		p = list_entry(victim->chunks.next, struct node, list);
-		chunk = find_chunk(p);
-		get_inotify_watch(&chunk->watch);
-		spin_unlock(&hash_lock);
-
-		untag_chunk(chunk, p);
 
-		put_inotify_watch(&chunk->watch);
-		spin_lock(&hash_lock);
+		untag_chunk(p);
 	}
 	spin_unlock(&hash_lock);
 	put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
 
 	while (!list_empty(&tree->chunks)) {
 		struct node *node;
-		struct audit_chunk *chunk;
 
 		node = list_entry(tree->chunks.next, struct node, list);
 
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
 		if (!(node->index & (1U<<31)))
 			break;
 
-		chunk = find_chunk(node);
-		get_inotify_watch(&chunk->watch);
-		spin_unlock(&hash_lock);
-
-		untag_chunk(chunk, node);
-
-		put_inotify_watch(&chunk->watch);
-		spin_lock(&hash_lock);
+		untag_chunk(node);
 	}
 	if (!tree->root && !tree->goner) {
 		tree->goner = 1;
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
 static void destroy_watch(struct inotify_watch *watch)
 {
 	struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
-	free_chunk(chunk);
+	call_rcu(&chunk->head, __put_chunk);
 }
 
 static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b7d354e2b0ef..9fd85a4640a0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
 	list_for_each_entry_safe(p, n, in_list, ilist) {
 		list_del(&p->ilist);
 		inotify_rm_watch(audit_ih, &p->wdata);
-		/* the put matching the get in audit_do_del_rule() */
-		put_inotify_watch(&p->wdata);
+		/* the unpin matching the pin in audit_do_del_rule() */
+		unpin_inotify_watch(&p->wdata);
 	}
 }
 
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
 				/* Put parent on the inotify un-registration
 				 * list.  Grab a reference before releasing
 				 * audit_filter_mutex, to be released in
-				 * audit_inotify_unregister(). */
-				list_add(&parent->ilist, &inotify_list);
-				get_inotify_watch(&parent->wdata);
+				 * audit_inotify_unregister().
+				 * If filesystem is going away, just leave
+				 * the sucker alone, eviction will take
+				 * care of it.
+				 */
+				if (pin_inotify_watch(&parent->wdata))
+					list_add(&parent->ilist, &inotify_list);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 5c06fe772da43db63b053addcd2c267f76d0be91 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 16 Nov 2008 22:19:10 +0000
Subject: Fix broken ownership of /proc/sys/ files

D'oh...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-and-tested-by: Peter Palfrader <peter@palfrader.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 94fcfff6863a..06ed10b7da9e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,6 +31,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
 	inode->i_mode = table->mode;
+	inode->i_uid = inode->i_gid = 0;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
-- 
cgit v1.2.3


From f1987b44f642e96176adc88b7ce23a1d74806f89 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 15 Nov 2008 11:12:47 -0500
Subject: cifs: reinstate sharing of tree connections

Use a similar approach to the SMB session sharing. Add a list of tcons
attached to each SMB session. Move the refcount to non-atomic. Protect
all of the above with the cifs_tcp_ses_lock. Add functions to
properly find and put references to the tcons.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c | 236 ++++++++++++++++++++++++++++-----------------------
 fs/cifs/cifsfs.c     |   8 +-
 fs/cifs/cifsglob.h   |  13 +--
 fs/cifs/cifssmb.c    |  43 +++-------
 fs/cifs/connect.c    |  93 +++++++++++++-------
 fs/cifs/misc.c       |  74 ++++++++--------
 6 files changed, 249 insertions(+), 218 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 59841a68b0b6..1d6dfa8923ca 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -107,12 +107,13 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 #ifdef CONFIG_PROC_FS
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
-	struct list_head *tmp, *tmp2, *tmp3;
+	struct list_head *tmp1, *tmp2, *tmp3;
 	struct mid_q_entry *mid_entry;
 	struct TCP_Server_Info *server;
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
-	int i;
+	int i, j;
+	__u32 dev_type;
 
 	seq_puts(m,
 		    "Display Internal CIFS Data Structures for Debugging\n"
@@ -123,8 +124,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 
 	i = 0;
 	read_lock(&cifs_tcp_ses_lock);
-	list_for_each(tmp, &cifs_tcp_ses_list) {
-		server = list_entry(tmp, struct TCP_Server_Info,
+	list_for_each(tmp1, &cifs_tcp_ses_list) {
+		server = list_entry(tmp1, struct TCP_Server_Info,
 				    tcp_ses_list);
 		i++;
 		list_for_each(tmp2, &server->smb_ses_list) {
@@ -133,12 +134,12 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			if ((ses->serverDomain == NULL) ||
 				(ses->serverOS == NULL) ||
 				(ses->serverNOS == NULL)) {
-				seq_printf(m, "\nentry for %s not fully "
-					   "displayed\n\t", ses->serverName);
+				seq_printf(m, "\n%d) entry for %s not fully "
+					   "displayed\n\t", i, ses->serverName);
 			} else {
 				seq_printf(m,
-				    "\n%d) Name: %s  Domain: %s Mounts: %d OS:"
-				    " %s  \n\tNOS: %s\tCapability: 0x%x\n\tSMB"
+				    "\n%d) Name: %s  Domain: %s Uses: %d OS:"
+				    " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB"
 				    " session status: %d\t",
 				i, ses->serverName, ses->serverDomain,
 				ses->ses_count, ses->serverOS, ses->serverNOS,
@@ -156,14 +157,44 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 				atomic_read(&server->num_waiters));
 #endif
 
-			seq_puts(m, "\nMIDs:\n");
+			seq_puts(m, "\n\tShares:");
+			j = 0;
+			list_for_each(tmp3, &ses->tcon_list) {
+				tcon = list_entry(tmp3, struct cifsTconInfo,
+						  tcon_list);
+				++j;
+				dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+				seq_printf(m, "\n\t%d) %s Mounts: %d ", j,
+					   tcon->treeName, tcon->tc_count);
+				if (tcon->nativeFileSystem) {
+					seq_printf(m, "Type: %s ",
+						   tcon->nativeFileSystem);
+				}
+				seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
+					"\nPathComponentMax: %d Status: 0x%d",
+					le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
+					le32_to_cpu(tcon->fsAttrInfo.Attributes),
+					le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
+					tcon->tidStatus);
+				if (dev_type == FILE_DEVICE_DISK)
+					seq_puts(m, " type: DISK ");
+				else if (dev_type == FILE_DEVICE_CD_ROM)
+					seq_puts(m, " type: CDROM ");
+				else
+					seq_printf(m, " type: %d ", dev_type);
+
+				if (tcon->need_reconnect)
+					seq_puts(m, "\tDISCONNECTED ");
+				seq_putc(m, '\n');
+			}
+
+			seq_puts(m, "\n\tMIDs:\n");
 
 			spin_lock(&GlobalMid_Lock);
 			list_for_each(tmp3, &server->pending_mid_q) {
-				mid_entry = list_entry(tmp3, struct
-					mid_q_entry,
+				mid_entry = list_entry(tmp3, struct mid_q_entry,
 					qhead);
-				seq_printf(m, "State: %d com: %d pid:"
+				seq_printf(m, "\tState: %d com: %d pid:"
 						" %d tsk: %p mid %d\n",
 						mid_entry->midState,
 						(int)mid_entry->command,
@@ -177,41 +208,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 	read_unlock(&cifs_tcp_ses_lock);
 	seq_putc(m, '\n');
 
-	seq_puts(m, "Shares:");
-
-	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		__u32 dev_type;
-		i++;
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
-		seq_printf(m, "\n%d) %s Uses: %d ", i,
-				 tcon->treeName, atomic_read(&tcon->useCount));
-		if (tcon->nativeFileSystem) {
-			seq_printf(m, "Type: %s ",
-					 tcon->nativeFileSystem);
-		}
-		seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
-				 "\nPathComponentMax: %d Status: %d",
-			    le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
-			    le32_to_cpu(tcon->fsAttrInfo.Attributes),
-			    le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
-			    tcon->tidStatus);
-		if (dev_type == FILE_DEVICE_DISK)
-			seq_puts(m, " type: DISK ");
-		else if (dev_type == FILE_DEVICE_CD_ROM)
-			seq_puts(m, " type: CDROM ");
-		else
-			seq_printf(m, " type: %d ", dev_type);
-
-		if (tcon->need_reconnect)
-			seq_puts(m, "\tDISCONNECTED ");
-	}
-	read_unlock(&GlobalSMBSeslock);
-
-	seq_putc(m, '\n');
-
 	/* BB add code to dump additional info such as TCP session info now */
 	return 0;
 }
@@ -235,7 +231,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 {
 	char c;
 	int rc;
-	struct list_head *tmp;
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct TCP_Server_Info *server;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 
 	rc = get_user(c, buffer);
@@ -243,33 +241,42 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 		return rc;
 
 	if (c == '1' || c == 'y' || c == 'Y' || c == '0') {
-		read_lock(&GlobalSMBSeslock);
 #ifdef CONFIG_CIFS_STATS2
 		atomic_set(&totBufAllocCount, 0);
 		atomic_set(&totSmBufAllocCount, 0);
 #endif /* CONFIG_CIFS_STATS2 */
-		list_for_each(tmp, &GlobalTreeConnectionList) {
-			tcon = list_entry(tmp, struct cifsTconInfo,
-					cifsConnectionList);
-			atomic_set(&tcon->num_smbs_sent, 0);
-			atomic_set(&tcon->num_writes, 0);
-			atomic_set(&tcon->num_reads, 0);
-			atomic_set(&tcon->num_oplock_brks, 0);
-			atomic_set(&tcon->num_opens, 0);
-			atomic_set(&tcon->num_closes, 0);
-			atomic_set(&tcon->num_deletes, 0);
-			atomic_set(&tcon->num_mkdirs, 0);
-			atomic_set(&tcon->num_rmdirs, 0);
-			atomic_set(&tcon->num_renames, 0);
-			atomic_set(&tcon->num_t2renames, 0);
-			atomic_set(&tcon->num_ffirst, 0);
-			atomic_set(&tcon->num_fnext, 0);
-			atomic_set(&tcon->num_fclose, 0);
-			atomic_set(&tcon->num_hardlinks, 0);
-			atomic_set(&tcon->num_symlinks, 0);
-			atomic_set(&tcon->num_locks, 0);
+		read_lock(&cifs_tcp_ses_lock);
+		list_for_each(tmp1, &cifs_tcp_ses_list) {
+			server = list_entry(tmp1, struct TCP_Server_Info,
+					    tcp_ses_list);
+			list_for_each(tmp2, &server->smb_session_list) {
+				ses = list_entry(tmp2, struct cifsSesInfo,
+						 smb_session_list);
+				list_for_each(tmp3, &ses->tcon_list) {
+					tcon = list_entry(tmp3,
+							  struct cifsTconInfo,
+							  tcon_list);
+					atomic_set(&tcon->num_smbs_sent, 0);
+					atomic_set(&tcon->num_writes, 0);
+					atomic_set(&tcon->num_reads, 0);
+					atomic_set(&tcon->num_oplock_brks, 0);
+					atomic_set(&tcon->num_opens, 0);
+					atomic_set(&tcon->num_closes, 0);
+					atomic_set(&tcon->num_deletes, 0);
+					atomic_set(&tcon->num_mkdirs, 0);
+					atomic_set(&tcon->num_rmdirs, 0);
+					atomic_set(&tcon->num_renames, 0);
+					atomic_set(&tcon->num_t2renames, 0);
+					atomic_set(&tcon->num_ffirst, 0);
+					atomic_set(&tcon->num_fnext, 0);
+					atomic_set(&tcon->num_fclose, 0);
+					atomic_set(&tcon->num_hardlinks, 0);
+					atomic_set(&tcon->num_symlinks, 0);
+					atomic_set(&tcon->num_locks, 0);
+				}
+			}
 		}
-		read_unlock(&GlobalSMBSeslock);
+		read_unlock(&cifs_tcp_ses_lock);
 	}
 
 	return count;
@@ -278,7 +285,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 static int cifs_stats_proc_show(struct seq_file *m, void *v)
 {
 	int i;
-	struct list_head *tmp;
+	struct list_head *tmp1, *tmp2, *tmp3;
+	struct TCP_Server_Info *server;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 
 	seq_printf(m,
@@ -307,44 +316,55 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 		GlobalCurrentXid, GlobalMaxActiveXid);
 
 	i = 0;
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		i++;
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		seq_printf(m, "\n%d) %s", i, tcon->treeName);
-		if (tcon->need_reconnect)
-			seq_puts(m, "\tDISCONNECTED ");
-		seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
-			atomic_read(&tcon->num_smbs_sent),
-			atomic_read(&tcon->num_oplock_brks));
-		seq_printf(m, "\nReads:  %d Bytes: %lld",
-			atomic_read(&tcon->num_reads),
-			(long long)(tcon->bytes_read));
-		seq_printf(m, "\nWrites: %d Bytes: %lld",
-			atomic_read(&tcon->num_writes),
-			(long long)(tcon->bytes_written));
-		seq_printf(m,
-			"\nLocks: %d HardLinks: %d Symlinks: %d",
-			atomic_read(&tcon->num_locks),
-			atomic_read(&tcon->num_hardlinks),
-			atomic_read(&tcon->num_symlinks));
-
-		seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d",
-			atomic_read(&tcon->num_opens),
-			atomic_read(&tcon->num_closes),
-			atomic_read(&tcon->num_deletes));
-		seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
-			atomic_read(&tcon->num_mkdirs),
-			atomic_read(&tcon->num_rmdirs));
-		seq_printf(m, "\nRenames: %d T2 Renames %d",
-			atomic_read(&tcon->num_renames),
-			atomic_read(&tcon->num_t2renames));
-		seq_printf(m, "\nFindFirst: %d FNext %d FClose %d",
-			atomic_read(&tcon->num_ffirst),
-			atomic_read(&tcon->num_fnext),
-			atomic_read(&tcon->num_fclose));
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp1, &cifs_tcp_ses_list) {
+		server = list_entry(tmp1, struct TCP_Server_Info,
+				    tcp_ses_list);
+		list_for_each(tmp2, &server->smb_ses_list) {
+			ses = list_entry(tmp2, struct cifsSesInfo,
+					 smb_ses_list);
+			list_for_each(tmp3, &ses->tcon_list) {
+				tcon = list_entry(tmp3,
+						  struct cifsTconInfo,
+						  tcon_list);
+				i++;
+				seq_printf(m, "\n%d) %s", i, tcon->treeName);
+				if (tcon->need_reconnect)
+					seq_puts(m, "\tDISCONNECTED ");
+				seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
+					atomic_read(&tcon->num_smbs_sent),
+					atomic_read(&tcon->num_oplock_brks));
+				seq_printf(m, "\nReads:  %d Bytes: %lld",
+					atomic_read(&tcon->num_reads),
+					(long long)(tcon->bytes_read));
+				seq_printf(m, "\nWrites: %d Bytes: %lld",
+					atomic_read(&tcon->num_writes),
+					(long long)(tcon->bytes_written));
+				seq_printf(m, "\nLocks: %d HardLinks: %d "
+					      "Symlinks: %d",
+					atomic_read(&tcon->num_locks),
+					atomic_read(&tcon->num_hardlinks),
+					atomic_read(&tcon->num_symlinks));
+				seq_printf(m, "\nOpens: %d Closes: %d"
+					      "Deletes: %d",
+					atomic_read(&tcon->num_opens),
+					atomic_read(&tcon->num_closes),
+					atomic_read(&tcon->num_deletes));
+				seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
+					atomic_read(&tcon->num_mkdirs),
+					atomic_read(&tcon->num_rmdirs));
+				seq_printf(m, "\nRenames: %d T2 Renames %d",
+					atomic_read(&tcon->num_renames),
+					atomic_read(&tcon->num_t2renames));
+				seq_printf(m, "\nFindFirst: %d FNext %d "
+					      "FClose %d",
+					atomic_read(&tcon->num_ffirst),
+					atomic_read(&tcon->num_fnext),
+					atomic_read(&tcon->num_fclose));
+			}
+		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	seq_putc(m, '\n');
 	return 0;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a1e96620b097..d9cf467309e8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -514,10 +514,11 @@ static void cifs_umount_begin(struct super_block *sb)
 	tcon = cifs_sb->tcon;
 	if (tcon == NULL)
 		return;
-	down(&tcon->tconSem);
-	if (atomic_read(&tcon->useCount) == 1)
+
+	read_lock(&cifs_tcp_ses_lock);
+	if (tcon->tc_count == 1)
 		tcon->tidStatus = CifsExiting;
-	up(&tcon->tconSem);
+	read_unlock(&cifs_tcp_ses_lock);
 
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
@@ -1060,7 +1061,6 @@ init_cifs(void)
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-	INIT_LIST_HEAD(&GlobalTreeConnectionList); /* BB to be removed by jl */
 	INIT_LIST_HEAD(&GlobalOplock_Q);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 631a99f72f22..f1ae1f57c30d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -233,16 +233,15 @@ struct cifsSesInfo {
  * session
  */
 struct cifsTconInfo {
-	struct list_head cifsConnectionList;
+	struct list_head tcon_list;
+	int tc_count;
 	struct list_head openFileList;
-	struct semaphore tconSem;
 	struct cifsSesInfo *ses;	/* pointer to session associated with */
 	char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
 	char *nativeFileSystem;
 	__u16 tid;		/* The 2 byte tree id */
 	__u16 Flags;		/* optional support bits */
 	enum statusEnum tidStatus;
-	atomic_t useCount;	/* how many explicit/implicit mounts to share */
 #ifdef CONFIG_CIFS_STATS
 	atomic_t num_smbs_sent;
 	atomic_t num_writes;
@@ -600,9 +599,13 @@ require use of the stronger protocol */
  */
 GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
 
-/* protects cifs_tcp_ses_list and srv_count for each tcp session */
+/*
+ * This lock protects the cifs_tcp_ses_list, the list of smb sessions per
+ * tcp session, and the list of tcon's per smb session. It also protects
+ * the reference counters for the server, smb session, and tcon. Finally,
+ * changes to the tcon->tidStatus should be done while holding this lock.
+ */
 GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
-GLOBAL_EXTERN struct list_head GlobalTreeConnectionList; /* BB to be removed */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9c95617baa4d..e6bb2d9d5b09 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -742,50 +742,31 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	int rc = 0;
 
 	cFYI(1, ("In tree disconnect"));
-	/*
-	 *  If last user of the connection and
-	 *  connection alive - disconnect it
-	 *  If this is the last connection on the server session disconnect it
-	 *  (and inside session disconnect we should check if tcp socket needs
-	 *  to be freed and kernel thread woken up).
-	 */
-	if (tcon)
-		down(&tcon->tconSem);
-	else
-		return -EIO;
 
-	atomic_dec(&tcon->useCount);
-	if (atomic_read(&tcon->useCount) > 0) {
-		up(&tcon->tconSem);
-		return -EBUSY;
-	}
+	/* BB: do we need to check this? These should never be NULL. */
+	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
+		return -EIO;
 
-	/* No need to return error on this operation if tid invalidated and
-	closed on server already e.g. due to tcp session crashing */
-	if (tcon->need_reconnect) {
-		up(&tcon->tconSem);
+	/*
+	 * No need to return error on this operation if tid invalidated and
+	 * closed on server already e.g. due to tcp session crashing. Also,
+	 * the tcon is no longer on the list, so no need to take lock before
+	 * checking this.
+	 */
+	if (tcon->need_reconnect)
 		return 0;
-	}
 
-	if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) {
-		up(&tcon->tconSem);
-		return -EIO;
-	}
 	rc = small_smb_init(SMB_COM_TREE_DISCONNECT, 0, tcon,
 			    (void **)&smb_buffer);
-	if (rc) {
-		up(&tcon->tconSem);
+	if (rc)
 		return rc;
-	}
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
 	if (rc)
 		cFYI(1, ("Tree disconnect failed %d", rc));
 
-	up(&tcon->tconSem);
-
 	/* No need to return error on this operation if tid invalidated and
-	closed on server already e.g. due to tcp session crashing */
+	   closed on server already e.g. due to tcp session crashing */
 	if (rc == -EAGAIN)
 		rc = 0;
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a3dc0d7cafc3..2f2be8faabb3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -124,7 +124,7 @@ static int
 cifs_reconnect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
-	struct list_head *tmp;
+	struct list_head *tmp, *tmp2;
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 	struct mid_q_entry *mid_entry;
@@ -149,13 +149,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
 		ses->need_reconnect = true;
 		ses->ipc_tid = 0;
-	}
-	read_unlock(&cifs_tcp_ses_lock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if ((tcon->ses) && (tcon->ses->server == server))
+		list_for_each(tmp2, &ses->tcon_list) {
+			tcon = list_entry(tmp2, struct cifsTconInfo, tcon_list);
 			tcon->need_reconnect = true;
+		}
 	}
+	read_unlock(&cifs_tcp_ses_lock);
 	/* do not want to be sending data on a socket we are freeing */
 	down(&server->tcpSem);
 	if (server->ssocket) {
@@ -1462,6 +1461,52 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	cifs_put_tcp_session(server);
 }
 
+static struct cifsTconInfo *
+cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
+{
+	struct list_head *tmp;
+	struct cifsTconInfo *tcon;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &ses->tcon_list) {
+		tcon = list_entry(tmp, struct cifsTconInfo, tcon_list);
+		if (tcon->tidStatus == CifsExiting)
+			continue;
+		if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
+			continue;
+
+		++tcon->tc_count;
+		write_unlock(&cifs_tcp_ses_lock);
+		return tcon;
+	}
+	write_unlock(&cifs_tcp_ses_lock);
+	return NULL;
+}
+
+static void
+cifs_put_tcon(struct cifsTconInfo *tcon)
+{
+	int xid;
+	struct cifsSesInfo *ses = tcon->ses;
+
+	write_lock(&cifs_tcp_ses_lock);
+	if (--tcon->tc_count > 0) {
+		write_unlock(&cifs_tcp_ses_lock);
+		return;
+	}
+
+	list_del_init(&tcon->tcon_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	xid = GetXid();
+	CIFSSMBTDis(xid, tcon);
+	_FreeXid(xid);
+
+	DeleteTconOplockQEntries(tcon);
+	tconInfoFree(tcon);
+	cifs_put_smb_ses(ses);
+}
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	     const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -2220,11 +2265,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	if (!rc) {
 		setup_cifs_sb(&volume_info, cifs_sb);
 
+		tcon = cifs_find_tcon(pSesInfo, volume_info.UNC);
 		if (tcon) {
 			cFYI(1, ("Found match on UNC path"));
-			if (tcon->seal != volume_info.seal)
-				cERROR(1, ("transport encryption setting "
-					   "conflicts with existing tid"));
+			/* existing tcon already has a reference */
+			cifs_put_smb_ses(pSesInfo);
 		} else {
 			tcon = tconInfoAlloc();
 			if (tcon == NULL) {
@@ -2257,6 +2302,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			if (rc)
 				goto mount_fail_check;
 			tcon->seal = volume_info.seal;
+			tcon->ses = pSesInfo;
+			write_lock(&cifs_tcp_ses_lock);
+			list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
+			write_unlock(&cifs_tcp_ses_lock);
 		}
 
 		/* we can have only one retry value for a connection
@@ -2283,18 +2332,14 @@ mount_fail_check:
 		/* If find_unc succeeded then rc == 0 so we can not end */
 		/* up accidently freeing someone elses tcon struct */
 		if (tcon)
-			tconInfoFree(tcon);
-
-		/* should also end up putting our tcp session ref if needed */
-		if (pSesInfo)
+			cifs_put_tcon(tcon);
+		else if (pSesInfo)
 			cifs_put_smb_ses(pSesInfo);
 		else
 			cifs_put_tcp_session(srvTcp);
 		goto out;
 	}
-	atomic_inc(&tcon->useCount);
 	cifs_sb->tcon = tcon;
-	tcon->ses = pSesInfo;
 
 	/* do not care if following two calls succeed - informational */
 	if (!tcon->ipc) {
@@ -3565,23 +3610,10 @@ int
 cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 {
 	int rc = 0;
-	int xid;
-	struct cifsSesInfo *ses = NULL;
 	char *tmp;
 
-	xid = GetXid();
-
-	if (cifs_sb->tcon) {
-		ses = cifs_sb->tcon->ses; /* save ptr to ses before delete tcon!*/
-		rc = CIFSSMBTDis(xid, cifs_sb->tcon);
-		if (rc == -EBUSY) {
-			FreeXid(xid);
-			return 0;
-		}
-		DeleteTconOplockQEntries(cifs_sb->tcon);
-		tconInfoFree(cifs_sb->tcon);
-		cifs_put_smb_ses(ses);
-	}
+	if (cifs_sb->tcon)
+		cifs_put_tcon(cifs_sb->tcon);
 
 	cifs_sb->tcon = NULL;
 	tmp = cifs_sb->prepath;
@@ -3589,7 +3621,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	cifs_sb->prepath = NULL;
 	kfree(tmp);
 
-	FreeXid(xid);
 	return rc;
 }
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 46c8c7baccba..addd1dcc2d79 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,6 +79,7 @@ sesInfoAlloc(void)
 		ret_buf->status = CifsNew;
 		++ret_buf->ses_count;
 		INIT_LIST_HEAD(&ret_buf->smb_ses_list);
+		INIT_LIST_HEAD(&ret_buf->tcon_list);
 		init_MUTEX(&ret_buf->sesSem);
 	}
 	return ret_buf;
@@ -107,17 +108,14 @@ tconInfoAlloc(void)
 	struct cifsTconInfo *ret_buf;
 	ret_buf = kzalloc(sizeof(struct cifsTconInfo), GFP_KERNEL);
 	if (ret_buf) {
-		write_lock(&GlobalSMBSeslock);
 		atomic_inc(&tconInfoAllocCount);
-		list_add(&ret_buf->cifsConnectionList,
-			 &GlobalTreeConnectionList);
 		ret_buf->tidStatus = CifsNew;
+		++ret_buf->tc_count;
 		INIT_LIST_HEAD(&ret_buf->openFileList);
-		init_MUTEX(&ret_buf->tconSem);
+		INIT_LIST_HEAD(&ret_buf->tcon_list);
 #ifdef CONFIG_CIFS_STATS
 		spin_lock_init(&ret_buf->stat_lock);
 #endif
-		write_unlock(&GlobalSMBSeslock);
 	}
 	return ret_buf;
 }
@@ -129,10 +127,7 @@ tconInfoFree(struct cifsTconInfo *buf_to_free)
 		cFYI(1, ("Null buffer passed to tconInfoFree"));
 		return;
 	}
-	write_lock(&GlobalSMBSeslock);
 	atomic_dec(&tconInfoAllocCount);
-	list_del(&buf_to_free->cifsConnectionList);
-	write_unlock(&GlobalSMBSeslock);
 	kfree(buf_to_free->nativeFileSystem);
 	kfree(buf_to_free);
 }
@@ -493,9 +488,10 @@ bool
 is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 {
 	struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)buf;
-	struct list_head *tmp;
-	struct list_head *tmp1;
+	struct list_head *tmp, *tmp1, *tmp2;
+	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
+	struct cifsInodeInfo *pCifsInode;
 	struct cifsFileInfo *netfile;
 
 	cFYI(1, ("Checking for oplock break or dnotify response"));
@@ -550,42 +546,42 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 		return false;
 
 	/* look up tcon based on tid & uid */
-	read_lock(&GlobalSMBSeslock);
-	list_for_each(tmp, &GlobalTreeConnectionList) {
-		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if ((tcon->tid == buf->Tid) && (srv == tcon->ses->server)) {
+	read_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &srv->smb_ses_list) {
+		ses = list_entry(tmp, struct cifsSesInfo, smb_ses_list);
+		list_for_each(tmp1, &ses->tcon_list) {
+			tcon = list_entry(tmp1, struct cifsTconInfo, tcon_list);
+			if (tcon->tid != buf->Tid)
+				continue;
+
 			cifs_stats_inc(&tcon->num_oplock_brks);
-			list_for_each(tmp1, &tcon->openFileList) {
-				netfile = list_entry(tmp1, struct cifsFileInfo,
+			list_for_each(tmp2, &tcon->openFileList) {
+				netfile = list_entry(tmp2, struct cifsFileInfo,
 						     tlist);
-				if (pSMB->Fid == netfile->netfid) {
-					struct cifsInodeInfo *pCifsInode;
-					read_unlock(&GlobalSMBSeslock);
-					cFYI(1,
-					    ("file id match, oplock break"));
-					pCifsInode =
-						CIFS_I(netfile->pInode);
-					pCifsInode->clientCanCacheAll = false;
-					if (pSMB->OplockLevel == 0)
-						pCifsInode->clientCanCacheRead
-							= false;
-					pCifsInode->oplockPending = true;
-					AllocOplockQEntry(netfile->pInode,
-							  netfile->netfid,
-							  tcon);
-					cFYI(1,
-					    ("about to wake up oplock thread"));
-					if (oplockThread)
-					    wake_up_process(oplockThread);
-					return true;
-				}
+				if (pSMB->Fid != netfile->netfid)
+					continue;
+
+				read_unlock(&cifs_tcp_ses_lock);
+				cFYI(1, ("file id match, oplock break"));
+				pCifsInode = CIFS_I(netfile->pInode);
+				pCifsInode->clientCanCacheAll = false;
+				if (pSMB->OplockLevel == 0)
+					pCifsInode->clientCanCacheRead = false;
+				pCifsInode->oplockPending = true;
+				AllocOplockQEntry(netfile->pInode,
+						  netfile->netfid, tcon);
+				cFYI(1, ("about to wake up oplock thread"));
+				if (oplockThread)
+					wake_up_process(oplockThread);
+
+				return true;
 			}
-			read_unlock(&GlobalSMBSeslock);
+			read_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, ("No matching file for oplock break"));
 			return true;
 		}
 	}
-	read_unlock(&GlobalSMBSeslock);
+	read_unlock(&cifs_tcp_ses_lock);
 	cFYI(1, ("Can not process oplock break for non-existent connection"));
 	return true;
 }
-- 
cgit v1.2.3


From c2b3382cd4d6c6adef1347e81f20e16c93a39feb Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 17 Nov 2008 03:57:13 +0000
Subject: [CIFS] Fix build break

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 1d6dfa8923ca..490e34bbf27a 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -249,9 +249,9 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 		list_for_each(tmp1, &cifs_tcp_ses_list) {
 			server = list_entry(tmp1, struct TCP_Server_Info,
 					    tcp_ses_list);
-			list_for_each(tmp2, &server->smb_session_list) {
+			list_for_each(tmp2, &server->smb_ses_list) {
 				ses = list_entry(tmp2, struct cifsSesInfo,
-						 smb_session_list);
+						 smb_ses_list);
 				list_for_each(tmp3, &ses->tcon_list) {
 					tcon = list_entry(tmp3,
 							  struct cifsTconInfo,
-- 
cgit v1.2.3


From ab3f992983062440b4f37c666dac66d987902d91 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 17 Nov 2008 16:03:00 +0000
Subject: [CIFS] Fix check for tcon seal setting and fix oops on failed mount
 from earlier patch

set tcon->ses earlier

If the initial tree connect fails, we'll end up calling cifs_put_smb_ses
with a NULL pointer. Fix it by setting the tcon->ses earlier.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2f2be8faabb3..c7d341714586 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2270,16 +2270,18 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			cFYI(1, ("Found match on UNC path"));
 			/* existing tcon already has a reference */
 			cifs_put_smb_ses(pSesInfo);
+			if (tcon->seal != volume_info.seal)
+				cERROR(1, ("transport encryption setting "
+					   "conflicts with existing tid"));
 		} else {
 			tcon = tconInfoAlloc();
 			if (tcon == NULL) {
 				rc = -ENOMEM;
 				goto mount_fail_check;
 			}
+			tcon->ses = pSesInfo;
 
 			/* check for null share name ie connect to dfs root */
-
-			/* BB check if works for exactly length 3 strings */
 			if ((strchr(volume_info.UNC + 3, '\\') == NULL)
 			    && (strchr(volume_info.UNC + 3, '/') == NULL)) {
 				/* rc = connect_to_dfs_path(...) */
@@ -2302,7 +2304,6 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			if (rc)
 				goto mount_fail_check;
 			tcon->seal = volume_info.seal;
-			tcon->ses = pSesInfo;
 			write_lock(&cifs_tcp_ses_lock);
 			list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
 			write_unlock(&cifs_tcp_ses_lock);
-- 
cgit v1.2.3


From 2c55608f28444c3f33b10312881384c470ceed56 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <niallain@gmail.com>
Date: Thu, 23 Oct 2008 13:58:42 +0400
Subject: Fixed parsing of mount options when doing DFS submount

Since these hit the same routines, and are relatively small, it is easier to review
them as one patch.

Fixed incorrect handling of the last option in some cases
Fixed prefixpath handling: convert path_consumed into a host-dependent string length (in bytes)
Use non default separator if it is provided in the original mount options

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Igor Mammedov <niallain@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 71 +++++++++++++++++++++++++++++++++-----------------
 fs/cifs/cifssmb.c      | 39 ++++++++++++++++++++++++---
 2 files changed, 83 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d2c8eef84f3c..e1c18362ba46 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -106,7 +106,8 @@ static char *cifs_get_share_name(const char *node_name)
 /**
  * compose_mount_options	-	creates mount options for refferral
  * @sb_mountdata:	parent/root DFS mount options (template)
- * @ref_unc:		refferral server UNC
+ * @dentry:		point where we are going to mount
+ * @ref:		server's referral
  * @devname:		pointer for saving device name
  *
  * creates mount options for submount based on template options sb_mountdata
@@ -116,7 +117,8 @@ static char *cifs_get_share_name(const char *node_name)
  * Caller is responcible for freeing retunrned value if it is not error.
  */
 static char *compose_mount_options(const char *sb_mountdata,
-				   const char *ref_unc,
+				   struct dentry *dentry,
+				   const struct dfs_info3_param *ref,
 				   char **devname)
 {
 	int rc;
@@ -126,11 +128,12 @@ static char *compose_mount_options(const char *sb_mountdata,
 	char *srvIP = NULL;
 	char sep = ',';
 	int off, noff;
+	char *fullpath;
 
 	if (sb_mountdata == NULL)
 		return ERR_PTR(-EINVAL);
 
-	*devname = cifs_get_share_name(ref_unc);
+	*devname = cifs_get_share_name(ref->node_name);
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
 	if (rc != 0) {
 		cERROR(1, ("%s: Failed to resolve server part of %s to IP",
@@ -138,7 +141,12 @@ static char *compose_mount_options(const char *sb_mountdata,
 		mountdata = ERR_PTR(rc);
 		goto compose_mount_options_out;
 	}
-	md_len = strlen(sb_mountdata) + strlen(srvIP) + strlen(ref_unc) + 3;
+	/* md_len = strlen(...) + 12 for 'sep+prefixpath='
+	 * assuming that we have 'unc=' and 'ip=' in
+	 * the original sb_mountdata
+	 */
+	md_len = strlen(sb_mountdata) + strlen(srvIP) +
+		strlen(ref->node_name) + 12;
 	mountdata = kzalloc(md_len+1, GFP_KERNEL);
 	if (mountdata == NULL) {
 		mountdata = ERR_PTR(-ENOMEM);
@@ -152,41 +160,56 @@ static char *compose_mount_options(const char *sb_mountdata,
 			strncpy(mountdata, sb_mountdata, 5);
 			off += 5;
 	}
-	while ((tkn_e = strchr(sb_mountdata+off, sep))) {
-		noff = (tkn_e - (sb_mountdata+off)) + 1;
-		if (strnicmp(sb_mountdata+off, "unc=", 4) == 0) {
+
+	do {
+		tkn_e = strchr(sb_mountdata + off, sep);
+		if (tkn_e == NULL)
+			noff = strlen(sb_mountdata + off);
+		else
+			noff = tkn_e - (sb_mountdata + off) + 1;
+
+		if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) {
 			off += noff;
 			continue;
 		}
-		if (strnicmp(sb_mountdata+off, "ip=", 3) == 0) {
+		if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) {
 			off += noff;
 			continue;
 		}
-		if (strnicmp(sb_mountdata+off, "prefixpath=", 3) == 0) {
+		if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
 			off += noff;
 			continue;
 		}
-		strncat(mountdata, sb_mountdata+off, noff);
+		strncat(mountdata, sb_mountdata + off, noff);
 		off += noff;
-	}
-	strcat(mountdata, sb_mountdata+off);
+	} while (tkn_e);
+	strcat(mountdata, sb_mountdata + off);
 	mountdata[md_len] = '\0';
 
 	/* copy new IP and ref share name */
-	strcat(mountdata, ",ip=");
+	if (mountdata[strlen(mountdata) - 1] != sep)
+		strncat(mountdata, &sep, 1);
+	strcat(mountdata, "ip=");
 	strcat(mountdata, srvIP);
-	strcat(mountdata, ",unc=");
+	strncat(mountdata, &sep, 1);
+	strcat(mountdata, "unc=");
 	strcat(mountdata, *devname);
 
 	/* find & copy prefixpath */
-	tkn_e = strchr(ref_unc+2, '\\');
-	if (tkn_e) {
-		tkn_e = strchr(tkn_e+1, '\\');
-		if (tkn_e) {
-			strcat(mountdata, ",prefixpath=");
-			strcat(mountdata, tkn_e+1);
-		}
+	tkn_e = strchr(ref->node_name + 2, '\\');
+	if (tkn_e == NULL) /* invalid unc, missing share name*/
+		goto compose_mount_options_out;
+
+	fullpath = build_path_from_dentry(dentry);
+	tkn_e = strchr(tkn_e + 1, '\\');
+	if (tkn_e || strlen(fullpath) - (ref->path_consumed)) {
+		strncat(mountdata, &sep, 1);
+		strcat(mountdata, "prefixpath=");
+		if (tkn_e)
+			strcat(mountdata, tkn_e + 1);
+		strcat(mountdata, fullpath + (ref->path_consumed));
 	}
+	kfree(fullpath);
 
 	/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
 	/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -198,7 +221,7 @@ compose_mount_options_out:
 
 
 static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
-		struct dentry *dentry, char *ref_unc)
+		struct dentry *dentry, const struct dfs_info3_param *ref)
 {
 	struct cifs_sb_info *cifs_sb;
 	struct vfsmount *mnt;
@@ -207,7 +230,7 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
 
 	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
 	mountdata = compose_mount_options(cifs_sb->mountdata,
-						ref_unc, &devname);
+						dentry, ref, &devname);
 
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
@@ -310,7 +333,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 			}
 			mnt = cifs_dfs_do_refmount(nd->path.mnt,
 						nd->path.dentry,
-						referrals[i].node_name);
+						referrals + i);
 			cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p",
 					 __func__,
 					referrals[i].node_name, mnt));
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index e6bb2d9d5b09..bdda46dd435a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3899,6 +3899,27 @@ GetInodeNumOut:
 	return rc;
 }
 
+/* computes length in bytes of a UCS string converted to host codepage
+ * @src:		UCS string (conversion stops early at a NUL character)
+ * @maxlen:		maximum number of UCS characters (not bytes) to convert
+ * @nls_codepage:	codepage whose uni2char() performs the conversion
+ *
+ * return:	byte length in host codepage; unconvertible chars count as 1
+ */
+static int hostlen_fromUCS(const __le16 *src, const int maxlen,
+		const struct nls_table *nls_codepage) {
+	int i;
+	int hostlen = 0;
+	char to[NLS_MAX_CHARSET_SIZE];	/* must match bound given to uni2char */
+	int charlen;
+	for (i = 0; (i < maxlen) && src[i]; ++i) {
+		charlen = nls_codepage->uni2char(le16_to_cpu(src[i]),
+				to, NLS_MAX_CHARSET_SIZE);
+		hostlen += charlen > 0 ? charlen : 1;
+	}
+	return hostlen;
+}
+
 /* parses DFS refferal V3 structure
  * caller is responsible for freeing target_nodes
  * returns:
@@ -3909,7 +3930,8 @@ static int
 parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		unsigned int *num_of_nodes,
 		struct dfs_info3_param **target_nodes,
-		const struct nls_table *nls_codepage)
+		const struct nls_table *nls_codepage, int remap,
+		const char *searchName)
 {
 	int i, rc = 0;
 	char *data_end;
@@ -3960,7 +3982,17 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 		struct dfs_info3_param *node = (*target_nodes)+i;
 
 		node->flags = le16_to_cpu(pSMBr->DFSFlags);
-		node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+		if (is_unicode) {
+			__le16 *tmp = kmalloc(strlen(searchName)*2 + 2, GFP_KERNEL);
+			cifsConvertToUCS((__le16 *) tmp, searchName,
+					PATH_MAX, nls_codepage, remap);
+			node->path_consumed = hostlen_fromUCS(tmp,
+					le16_to_cpu(pSMBr->PathConsumed)/2,
+					nls_codepage);
+			kfree(tmp);
+		} else
+			node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+
 		node->server_type = le16_to_cpu(ref->ServerType);
 		node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags);
 
@@ -4093,7 +4125,8 @@ getDFSRetry:
 
 	/* parse returned result into more usable form */
 	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
-				 target_nodes, nls_codepage);
+				 target_nodes, nls_codepage, remap,
+				 searchName);
 
 GetDFSRefExit:
 	cifs_buf_release(pSMB);
-- 
cgit v1.2.3


From b066a48c9532243894f93a06ca5a0ee2cc21a8dc Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Tue, 18 Nov 2008 03:49:05 +0000
Subject: prevent cifs_writepages() from skipping unwritten pages

Fixes a data corruption under heavy stress in which pages could be left
dirty after all open instances of an inode have been closed.

In order to write contiguous pages whenever possible, cifs_writepages()
asks pagevec_lookup_tag() for more pages than it may write at one time.
Normally, it then resets index just past the last page written before calling
pagevec_lookup_tag() again.

If cifs_writepages() can't write the first page returned, it wasn't resetting
index, and the next call to pagevec_lookup_tag() resulted in skipping all of
the pages it previously returned, even though cifs_writepages() did nothing
with them.  This can result in data loss when the file descriptor is about
to be closed.

This patch ensures that index gets set back to the next returned page so
that none get skipped.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Cc: Shirish S Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1540adaa593d..6449e1aae621 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1404,7 +1404,10 @@ retry:
 			if ((wbc->nr_to_write -= n_iov) <= 0)
 				done = 1;
 			index = next;
-		}
+		} else
+			/* Need to re-find the pages we skipped */
+			index = pvec.pages[0]->index + 1;
+
 		pagevec_release(&pvec);
 	}
 	if (!scanned && !done) {
-- 
cgit v1.2.3


From eb60fa1066622ddb2278732cf61e0c4544e82c6f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 10 Nov 2008 15:28:59 +0900
Subject: block: fix add_partition() error path

Partition stats structure was not freed on devt allocation failure
path.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 633f7a0ebb2c..90bcf136a9de 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -395,7 +395,7 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_free;
+		goto out_free_stats;
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
@@ -426,6 +426,8 @@ int add_partition(struct gendisk *disk, int partno,
 
 	return 0;
 
+out_free_stats:
+	free_part_stats(p);
 out_free:
 	kfree(p);
 	return err;
-- 
cgit v1.2.3


From ba32929a91fe2c0628f5be62d1597b379c8d3062 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 10 Nov 2008 15:29:58 +0900
Subject: block: make add_partition() return pointer to hd_struct

Make add_partition() return pointer to the new hd_struct on success
and ERR_PTR() value on failure.  This change will be used to fix md
autodetection bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ioctl.c         |  7 +++----
 fs/partitions/check.c | 25 +++++++++++++------------
 include/linux/genhd.h |  4 +++-
 3 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/block/ioctl.c b/block/ioctl.c
index c832d639b6e2..d03985b04d67 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -18,7 +18,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	struct disk_part_iter piter;
 	long long start, length;
 	int partno;
-	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -61,10 +60,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			disk_part_iter_exit(&piter);
 
 			/* all seems OK */
-			err = add_partition(disk, partno, start, length,
-					    ADDPART_FLAG_NONE);
+			part = add_partition(disk, partno, start, length,
+					     ADDPART_FLAG_NONE);
 			mutex_unlock(&bdev->bd_mutex);
-			return err;
+			return IS_ERR(part) ? PTR_ERR(part) : 0;
 		case BLKPG_DEL_PARTITION:
 			part = disk_get_part(disk, partno);
 			if (!part)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 90bcf136a9de..633025340239 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -348,8 +348,8 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
-int add_partition(struct gendisk *disk, int partno,
-		  sector_t start, sector_t len, int flags)
+struct hd_struct *add_partition(struct gendisk *disk, int partno,
+				sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
@@ -361,15 +361,15 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = disk_expand_part_tbl(disk, partno);
 	if (err)
-		return err;
+		return ERR_PTR(err);
 	ptbl = disk->part_tbl;
 
 	if (ptbl->part[partno])
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	if (!init_part_stats(p)) {
 		err = -ENOMEM;
@@ -424,20 +424,20 @@ int add_partition(struct gendisk *disk, int partno,
 	if (!ddev->uevent_suppress)
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
-	return 0;
+	return p;
 
 out_free_stats:
 	free_part_stats(p);
 out_free:
 	kfree(p);
-	return err;
+	return ERR_PTR(err);
 out_del:
 	kobject_put(p->holder_dir);
 	device_del(pdev);
 out_put:
 	put_device(pdev);
 	blk_free_devt(devt);
-	return err;
+	return ERR_PTR(err);
 }
 
 /* Not exported, helper to add_disk(). */
@@ -568,10 +568,11 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 			       disk->disk_name, p, (unsigned long long) size);
 			size = get_capacity(disk) - from;
 		}
-		res = add_partition(disk, p, from, size, state->parts[p].flags);
-		if (res) {
-			printk(KERN_ERR " %s: p%d could not be added: %d\n",
-				disk->disk_name, p, -res);
+		part = add_partition(disk, p, from, size,
+				     state->parts[p].flags);
+		if (IS_ERR(part)) {
+			printk(KERN_ERR " %s: p%d could not be added: %ld\n",
+			       disk->disk_name, p, -PTR_ERR(part));
 			continue;
 		}
 #ifdef CONFIG_BLK_DEV_MD
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e439e6aed832..3df7742ce246 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -522,7 +522,9 @@ extern char *disk_name (struct gendisk *hd, int partno, char *buf);
 
 extern int disk_expand_part_tbl(struct gendisk *disk, int target);
 extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
-extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int);
+extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
+						     int partno, sector_t start,
+						     sector_t len, int flags);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
-- 
cgit v1.2.3


From 55e8e30c382d25c34f8aafcc78efec948571a941 Mon Sep 17 00:00:00 2001
From: Tejun Heo <teheo@suse.de>
Date: Mon, 10 Nov 2008 15:30:47 +0900
Subject: block/md: fix md autodetection

Block ext devt conversion missed md_autodetect_dev() call in
rescan_partitions() leaving md autodetect unable to see partitions.
Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 633025340239..6d5b213b8a9b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -577,7 +577,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		}
 #ifdef CONFIG_BLK_DEV_MD
 		if (state->parts[p].flags & ADDPART_FLAG_RAID)
-			md_autodetect_dev(bdev->bd_dev+p);
+			md_autodetect_dev(part_to_dev(part)->devt);
 #endif
 	}
 	kfree(state);
-- 
cgit v1.2.3


From bfb59820ee46616a7bdb4af6b8f7e109646de6ec Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 18 Nov 2008 16:33:48 +0000
Subject: [CIFS] fix check for dead tcon in smb_init

This was recently changed to check for need_reconnect, but should
actually be a check for a tidStatus of CifsExiting.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   | 6 +++++-
 fs/cifs/cifssmb.c | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8855331b2fba..e078b7aea143 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -8,7 +8,11 @@ handling fcntl(F_SETLEASE).  Convert cifs to using blocking tcp
 sends, and also let tcp autotune the socket send and receive buffers.
 This reduces the number of EAGAIN errors returned by TCP/IP in
 high stress workloads (and the number of retries on socket writes
-when sending large SMBWriteX requests).
+when sending large SMBWriteX requests).  Fix case in which a portion of
+data can in some cases not get written to the file on the server before the
+file is closed.  Fix DFS parsing to properly handle path consumed field,
+and to handle certain codepage conversions better.  Fix mount and
+umount race that can cause oops in mount or umount or reconnect.
 
 Version 1.54
 ------------
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index bdda46dd435a..2af8626ced43 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -295,7 +295,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	   check for tcp and smb session status done differently
 	   for those three - in the calling routine */
 	if (tcon) {
-		if (tcon->need_reconnect) {
+		if (tcon->tidStatus == CifsExiting) {
 			/* only tree disconnect, open, and write,
 			  (and ulogoff which does not have tcon)
 			  are allowed as we start force umount */
-- 
cgit v1.2.3


From ac97b9f9a2d0b83488e0bbcb8517b229d5c9b142 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Wed, 19 Nov 2008 15:36:28 -0800
Subject: eCryptfs: Allocate up to two scatterlists for crypto ops on keys

I have received some reports of out-of-memory errors on some older AMD
architectures.  These errors are what I would expect to see if
crypt_stat->key were split between two separate pages.  eCryptfs should
not assume that any of the memory sent through virt_to_scatterlist() is
all contained in a single page, and so this patch allocates two
scatterlist structs instead of one when processing keys.  I have received
confirmation from one person affected by this bug that this patch resolves
the issue for him, and so I am submitting it for inclusion in a future
stable release.

Note that virt_to_scatterlist() runs sg_init_table() on the scatterlist
structs passed to it, so the calls to sg_init_table() in
decrypt_passphrase_encrypted_session_key() are redundant.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Reported-by: Paulo J. S. Silva <pjssilva@ime.usp.br>
Cc: "Leon Woestenberg" <leon.woestenberg@gmail.com>
Cc: Tim Gardner <tim.gardner@canonical.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/keystore.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e22bc3961345..0d713b691941 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1037,17 +1037,14 @@ static int
 decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 					 struct ecryptfs_crypt_stat *crypt_stat)
 {
-	struct scatterlist dst_sg;
-	struct scatterlist src_sg;
+	struct scatterlist dst_sg[2];
+	struct scatterlist src_sg[2];
 	struct mutex *tfm_mutex;
 	struct blkcipher_desc desc = {
 		.flags = CRYPTO_TFM_REQ_MAY_SLEEP
 	};
 	int rc = 0;
 
-	sg_init_table(&dst_sg, 1);
-	sg_init_table(&src_sg, 1);
-
 	if (unlikely(ecryptfs_verbosity > 0)) {
 		ecryptfs_printk(
 			KERN_DEBUG, "Session key encryption key (size [%d]):\n",
@@ -1066,8 +1063,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	}
 	rc = virt_to_scatterlist(auth_tok->session_key.encrypted_key,
 				 auth_tok->session_key.encrypted_key_size,
-				 &src_sg, 1);
-	if (rc != 1) {
+				 src_sg, 2);
+	if (rc < 1 || rc > 2) {
 		printk(KERN_ERR "Internal error whilst attempting to convert "
 			"auth_tok->session_key.encrypted_key to scatterlist; "
 			"expected rc = 1; got rc = [%d]. "
@@ -1079,8 +1076,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 		auth_tok->session_key.encrypted_key_size;
 	rc = virt_to_scatterlist(auth_tok->session_key.decrypted_key,
 				 auth_tok->session_key.decrypted_key_size,
-				 &dst_sg, 1);
-	if (rc != 1) {
+				 dst_sg, 2);
+	if (rc < 1 || rc > 2) {
 		printk(KERN_ERR "Internal error whilst attempting to convert "
 			"auth_tok->session_key.decrypted_key to scatterlist; "
 			"expected rc = 1; got rc = [%d]\n", rc);
@@ -1096,7 +1093,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 		rc = -EINVAL;
 		goto out;
 	}
-	rc = crypto_blkcipher_decrypt(&desc, &dst_sg, &src_sg,
+	rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg,
 				      auth_tok->session_key.encrypted_key_size);
 	mutex_unlock(tfm_mutex);
 	if (unlikely(rc)) {
@@ -1539,8 +1536,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	size_t i;
 	size_t encrypted_session_key_valid = 0;
 	char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
-	struct scatterlist dst_sg;
-	struct scatterlist src_sg;
+	struct scatterlist dst_sg[2];
+	struct scatterlist src_sg[2];
 	struct mutex *tfm_mutex = NULL;
 	u8 cipher_code;
 	size_t packet_size_length;
@@ -1619,8 +1616,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		ecryptfs_dump_hex(session_key_encryption_key, 16);
 	}
 	rc = virt_to_scatterlist(crypt_stat->key, key_rec->enc_key_size,
-				 &src_sg, 1);
-	if (rc != 1) {
+				 src_sg, 2);
+	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat session key; expected rc = 1; "
 				"got rc = [%d]. key_rec->enc_key_size = [%d]\n",
@@ -1629,8 +1626,8 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 		goto out;
 	}
 	rc = virt_to_scatterlist(key_rec->enc_key, key_rec->enc_key_size,
-				 &dst_sg, 1);
-	if (rc != 1) {
+				 dst_sg, 2);
+	if (rc < 1 || rc > 2) {
 		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
 				"for crypt_stat encrypted session key; "
 				"expected rc = 1; got rc = [%d]. "
@@ -1651,7 +1648,7 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
 	rc = 0;
 	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
 			crypt_stat->key_size);
-	rc = crypto_blkcipher_encrypt(&desc, &dst_sg, &src_sg,
+	rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
 				      (*key_rec).enc_key_size);
 	mutex_unlock(tfm_mutex);
 	if (rc) {
-- 
cgit v1.2.3


From f9454548e17cd56bad081bd7d55a09b001950cbb Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 19 Nov 2008 15:36:38 -0800
Subject: don't unlink an active swapfile

Peter Cordes is sorry that he rm'ed his swapfiles while they were in use,
he then had no pathname to swapoff.  It's a curious little oversight, but
not one worth a lot of hackery.  Kudos to Willy Tarreau for turning this
around from a discussion of synthetic pathnames to how to prevent unlink.
Mimic immutable: prohibit unlinking an active swapfile in may_delete()
(and don't worry my little head over the tiny race window).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Willy Tarreau <w@1wt.eu>
Acked-by: Christoph Hellwig <hch@infradead.org>
Cc: Peter Cordes <peter@cordes.ca>
Cc: Bodo Eggert <7eggert@gmx.de>
Cc: David Newall <davidn@davidnewall.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 09ce58e49e72..d34e0f9681c6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1378,7 +1378,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 	if (IS_APPEND(dir))
 		return -EPERM;
 	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
-	    IS_IMMUTABLE(victim->d_inode))
+	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
 		return -EPERM;
 	if (isdir) {
 		if (!S_ISDIR(victim->d_inode->i_mode))
-- 
cgit v1.2.3


From ea7e743e49b94749fc739baaf160809ed279aeda Mon Sep 17 00:00:00 2001
From: WANG Cong <wangcong@zeuux.org>
Date: Wed, 19 Nov 2008 15:36:46 -0800
Subject: hostfs: fix a duplicated global function name

fs/hostfs/hostfs_user.c defines do_readlink() as non-static, and so does
fs/xfs/linux-2.6/xfs_ioctl.c when CONFIG_XFS_DEBUG=y.  So rename
do_readlink() in hostfs to hostfs_do_readlink().

I think it's better if XFS guys will also rename their do_readlink(),
it's not necessary to use such a general name.

Signed-off-by: WANG Cong <wangcong@zeuux.org>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hostfs/hostfs.h      | 2 +-
 fs/hostfs/hostfs_kern.c | 4 ++--
 fs/hostfs/hostfs_user.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 6ae9011b95eb..2f34f8f2134b 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -81,7 +81,7 @@ extern int do_rmdir(const char *file);
 extern int do_mknod(const char *file, int mode, unsigned int major,
 		    unsigned int minor);
 extern int link_file(const char *from, const char *to);
-extern int do_readlink(char *file, char *buf, int size);
+extern int hostfs_do_readlink(char *file, char *buf, int size);
 extern int rename_file(char *from, char *to);
 extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
 		     long long *bfree_out, long long *bavail_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7f34f4385de0..3a31451ac170 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -168,7 +168,7 @@ static char *follow_link(char *link)
 		if (name == NULL)
 			goto out;
 
-		n = do_readlink(link, name, len);
+		n = hostfs_do_readlink(link, name, len);
 		if (n < len)
 			break;
 		len *= 2;
@@ -943,7 +943,7 @@ int hostfs_link_readpage(struct file *file, struct page *page)
 	name = inode_name(page->mapping->host, 0);
 	if (name == NULL)
 		return -ENOMEM;
-	err = do_readlink(name, buffer, PAGE_CACHE_SIZE);
+	err = hostfs_do_readlink(name, buffer, PAGE_CACHE_SIZE);
 	kfree(name);
 	if (err == PAGE_CACHE_SIZE)
 		err = -E2BIG;
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 53fd0a67c11a..b79424f93282 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -377,7 +377,7 @@ int link_file(const char *to, const char *from)
 	return 0;
 }
 
-int do_readlink(char *file, char *buf, int size)
+int hostfs_do_readlink(char *file, char *buf, int size)
 {
 	int n;
 
-- 
cgit v1.2.3


From ddb4cbfc53aa0913ee8da059fcbf628d14f40f63 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 20 Nov 2008 20:00:44 +0000
Subject: [CIFS] Do not attempt to close invalidated file handles

If a connection with open file handles has gone down
and come back up and reconnected without reopening
the file handle yet, do not attempt to send an SMB close
request for this handle in cifs_close.  We were
checking for the connection being invalid in cifs_close
but since the connection may have been reconnected
we also need to check whether the file handle
was marked invalid (otherwise we could close the
wrong file handle by accident).

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 10 +++++++++-
 fs/cifs/file.c     | 21 ++++++++++++++-------
 fs/cifs/misc.c     |  3 +++
 fs/cifs/readdir.c  |  5 ++++-
 4 files changed, 30 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f1ae1f57c30d..c57c0565547f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -606,7 +606,15 @@ GLOBAL_EXTERN struct list_head		cifs_tcp_ses_list;
  * changes to the tcon->tidStatus should be done while holding this lock.
  */
 GLOBAL_EXTERN rwlock_t		cifs_tcp_ses_lock;
-GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
+
+/*
+ * This lock protects the cifs_file->llist and cifs_file->flist
+ * list operations, and updates to some flags (cifs_file->invalidHandle)
+ * It will be moved to either use the tcon->stat_lock or equivalent later.
+ * If cifs_tcp_ses_lock and the lock below are both needed to be held, then
+ * the cifs_tcp_ses_lock must be grabbed first and released last.
+ */
+GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
 
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6449e1aae621..b691b893a848 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -488,12 +488,13 @@ int cifs_close(struct inode *inode, struct file *file)
 	pTcon = cifs_sb->tcon;
 	if (pSMBFile) {
 		struct cifsLockInfo *li, *tmp;
-
+		write_lock(&GlobalSMBSeslock);
 		pSMBFile->closePend = true;
 		if (pTcon) {
 			/* no sense reconnecting to close a file that is
 			   already closed */
 			if (!pTcon->need_reconnect) {
+				write_unlock(&GlobalSMBSeslock);
 				timeout = 2;
 				while ((atomic_read(&pSMBFile->wrtPending) != 0)
 					&& (timeout <= 2048)) {
@@ -510,12 +511,15 @@ int cifs_close(struct inode *inode, struct file *file)
 					timeout *= 4;
 				}
 				if (atomic_read(&pSMBFile->wrtPending))
-					cERROR(1,
-						("close with pending writes"));
-				rc = CIFSSMBClose(xid, pTcon,
+					cERROR(1, ("close with pending write"));
+				if (!pTcon->need_reconnect &&
+				    !pSMBFile->invalidHandle)
+					rc = CIFSSMBClose(xid, pTcon,
 						  pSMBFile->netfid);
-			}
-		}
+			} else
+				write_unlock(&GlobalSMBSeslock);
+		} else
+			write_unlock(&GlobalSMBSeslock);
 
 		/* Delete any outstanding lock records.
 		   We'll lose them when the file is closed anyway. */
@@ -587,15 +591,18 @@ int cifs_closedir(struct inode *inode, struct file *file)
 		pTcon = cifs_sb->tcon;
 
 		cFYI(1, ("Freeing private data in close dir"));
+		write_lock(&GlobalSMBSeslock);
 		if (!pCFileStruct->srch_inf.endOfSearch &&
 		    !pCFileStruct->invalidHandle) {
 			pCFileStruct->invalidHandle = true;
+			write_unlock(&GlobalSMBSeslock);
 			rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
 			cFYI(1, ("Closing uncompleted readdir with rc %d",
 				 rc));
 			/* not much we can do if it fails anyway, ignore rc */
 			rc = 0;
-		}
+		} else
+			write_unlock(&GlobalSMBSeslock);
 		ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 		if (ptmp) {
 			cFYI(1, ("closedir free smb buf in srch struct"));
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index addd1dcc2d79..9ee3f689c2b0 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -555,12 +555,14 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				continue;
 
 			cifs_stats_inc(&tcon->num_oplock_brks);
+			write_lock(&GlobalSMBSeslock);
 			list_for_each(tmp2, &tcon->openFileList) {
 				netfile = list_entry(tmp2, struct cifsFileInfo,
 						     tlist);
 				if (pSMB->Fid != netfile->netfid)
 					continue;
 
+				write_unlock(&GlobalSMBSeslock);
 				read_unlock(&cifs_tcp_ses_lock);
 				cFYI(1, ("file id match, oplock break"));
 				pCifsInode = CIFS_I(netfile->pInode);
@@ -576,6 +578,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 
 				return true;
 			}
+			write_unlock(&GlobalSMBSeslock);
 			read_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, ("No matching file for oplock break"));
 			return true;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 58d57299f2a0..9f51f9bf0292 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -741,11 +741,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
 		cFYI(1, ("search backing up - close and restart search"));
+		write_lock(&GlobalSMBSeslock);
 		if (!cifsFile->srch_inf.endOfSearch &&
 		    !cifsFile->invalidHandle) {
 			cifsFile->invalidHandle = true;
+			write_unlock(&GlobalSMBSeslock);
 			CIFSFindClose(xid, pTcon, cifsFile->netfid);
-		}
+		} else
+			write_unlock(&GlobalSMBSeslock);
 		if (cifsFile->srch_inf.ntwrk_buf_start) {
 			cFYI(1, ("freeing SMB ff cache buf on search rewind"));
 			if (cifsFile->srch_inf.smallBuf)
-- 
cgit v1.2.3


From 39ce81ce7168aa7226fb9f182c3a2b57060d0905 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 18 Nov 2008 18:09:49 +0200
Subject: UBIFS: do not print scary memory allocation warnings

Bulk-read allocates a lot of memory with 'kmalloc()', and when memory
is/gets fragmented 'kmalloc()' fails with a scary warning. But
because bulk-read is just an optimization, UBIFS keeps working fine.
Suppress the warning by passing the __GFP_NOWARN option to 'kmalloc()'.

This patch also introduces a macro for the magic 128KiB constant.
This is just neater.

Note, this does not really fix the problem we had, but just hides
the warnings. Further patches fix the problem.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/file.c  |  4 ++--
 fs/ubifs/super.c | 17 ++++++++++++-----
 fs/ubifs/ubifs.h |  2 +-
 3 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 9124eee73aea..8be827cc7078 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -705,12 +705,12 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
 	int err, page_idx, page_cnt, ret = 0, n = 0;
 	loff_t isize;
 
-	bu = kmalloc(sizeof(struct bu_info), GFP_NOFS);
+	bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
 	if (!bu)
 		return 0;
 
 	bu->buf_len = c->bulk_read_buf_size;
-	bu->buf = kmalloc(bu->buf_len, GFP_NOFS);
+	bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN);
 	if (!bu->buf)
 		goto out_free;
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 8780efbf40ac..ea493e6f2652 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,12 @@
 #include <linux/mount.h>
 #include "ubifs.h"
 
+/*
+ * Maximum amount of memory we may 'kmalloc()' without worrying that we are
+ * allocating too much.
+ */
+#define UBIFS_KMALLOC_OK (128*1024)
+
 /* Slab cache for UBIFS inodes */
 struct kmem_cache *ubifs_inode_slab;
 
@@ -561,17 +567,18 @@ static int init_constants_early(struct ubifs_info *c)
 	 * calculations when reporting free space.
 	 */
 	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
+
 	/* Buffer size for bulk-reads */
 	c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
 	if (c->bulk_read_buf_size > c->leb_size)
 		c->bulk_read_buf_size = c->leb_size;
-	if (c->bulk_read_buf_size > 128 * 1024) {
-		/* Check if we can kmalloc more than 128KiB */
-		void *try = kmalloc(c->bulk_read_buf_size, GFP_KERNEL);
-
+	if (c->bulk_read_buf_size > UBIFS_KMALLOC_OK) {
+		/* Check if we can kmalloc that much */
+		void *try = kmalloc(c->bulk_read_buf_size,
+				    GFP_KERNEL | __GFP_NOWARN);
 		kfree(try);
 		if (!try)
-			c->bulk_read_buf_size = 128 * 1024;
+			c->bulk_read_buf_size = UBIFS_KMALLOC_OK;
 	}
 	return 0;
 }
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a7bd32fa15b9..06ba51efd65d 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -753,7 +753,7 @@ struct ubifs_znode {
 };
 
 /**
- * struct bu_info - bulk-read information
+ * struct bu_info - bulk-read information.
  * @key: first data node key
  * @zbranch: zbranches of data nodes to bulk read
  * @buf: buffer to read into
-- 
cgit v1.2.3


From 6c0c42cdfd73fb161417403d8d077cb136e10bbf Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 18 Nov 2008 20:20:05 +0200
Subject: UBIFS: do not allocate too much

Bulk-read allocates 128KiB or more using kmalloc. The allocation
starts failing often when the memory gets fragmented. UBIFS still
works fine in this case, because it falls-back to standard
(non-optimized) read method, though. This patch teaches bulk-read
to allocate exactly the amount of memory it needs, instead of
allocating 128KiB every time.

This patch is also a preparation for the further fix where we'll
have a pre-allocated bulk-read buffer as well. For example, now
the @bu object is prepared in 'ubifs_bulk_read()', so we could
pass either pre-allocated or allocated information to
'ubifs_do_bulk_read()' later. It also teaches 'ubifs_do_bulk_read()'
not to allocate 'bu->buf' if it is already there.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/file.c  | 70 +++++++++++++++++++++++++++++++++++++-------------------
 fs/ubifs/super.c | 12 +++++-----
 fs/ubifs/tnc.c   |  7 +++++-
 fs/ubifs/ubifs.h |  4 ++--
 4 files changed, 60 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 8be827cc7078..0c5c27d63f6e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -691,32 +691,22 @@ out_err:
 /**
  * ubifs_do_bulk_read - do bulk-read.
  * @c: UBIFS file-system description object
- * @page1: first page
+ * @bu: bulk-read information
+ * @page1: first page to read
  *
  * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
  */
-static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
+static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
+			      struct page *page1)
 {
 	pgoff_t offset = page1->index, end_index;
 	struct address_space *mapping = page1->mapping;
 	struct inode *inode = mapping->host;
 	struct ubifs_inode *ui = ubifs_inode(inode);
-	struct bu_info *bu;
 	int err, page_idx, page_cnt, ret = 0, n = 0;
+	int allocate = bu->buf ? 0 : 1;
 	loff_t isize;
 
-	bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
-	if (!bu)
-		return 0;
-
-	bu->buf_len = c->bulk_read_buf_size;
-	bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN);
-	if (!bu->buf)
-		goto out_free;
-
-	data_key_init(c, &bu->key, inode->i_ino,
-		      offset << UBIFS_BLOCKS_PER_PAGE_SHIFT);
-
 	err = ubifs_tnc_get_bu_keys(c, bu);
 	if (err)
 		goto out_warn;
@@ -735,12 +725,25 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
 		 * together. If all the pages were like this, bulk-read would
 		 * reduce performance, so we turn it off for a while.
 		 */
-		ui->read_in_a_row = 0;
-		ui->bulk_read = 0;
-		goto out_free;
+		goto out_bu_off;
 	}
 
 	if (bu->cnt) {
+		if (allocate) {
+			/*
+			 * Allocate bulk-read buffer depending on how many data
+			 * nodes we are going to read.
+			 */
+			bu->buf_len = bu->zbranch[bu->cnt - 1].offs +
+				      bu->zbranch[bu->cnt - 1].len -
+				      bu->zbranch[0].offs;
+			ubifs_assert(bu->buf_len > 0);
+			ubifs_assert(bu->buf_len <= c->leb_size);
+			bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN);
+			if (!bu->buf)
+				goto out_bu_off;
+		}
+
 		err = ubifs_tnc_bulk_read(c, bu);
 		if (err)
 			goto out_warn;
@@ -779,13 +782,17 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
 	ui->last_page_read = offset + page_idx - 1;
 
 out_free:
-	kfree(bu->buf);
-	kfree(bu);
+	if (allocate)
+		kfree(bu->buf);
 	return ret;
 
 out_warn:
 	ubifs_warn("ignoring error %d and skipping bulk-read", err);
 	goto out_free;
+
+out_bu_off:
+	ui->read_in_a_row = ui->bulk_read = 0;
+	goto out_free;
 }
 
 /**
@@ -803,18 +810,20 @@ static int ubifs_bulk_read(struct page *page)
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	pgoff_t index = page->index, last_page_read = ui->last_page_read;
-	int ret = 0;
+	struct bu_info *bu;
+	int err = 0;
 
 	ui->last_page_read = index;
-
 	if (!c->bulk_read)
 		return 0;
+
 	/*
 	 * Bulk-read is protected by ui_mutex, but it is an optimization, so
 	 * don't bother if we cannot lock the mutex.
 	 */
 	if (!mutex_trylock(&ui->ui_mutex))
 		return 0;
+
 	if (index != last_page_read + 1) {
 		/* Turn off bulk-read if we stop reading sequentially */
 		ui->read_in_a_row = 1;
@@ -822,6 +831,7 @@ static int ubifs_bulk_read(struct page *page)
 			ui->bulk_read = 0;
 		goto out_unlock;
 	}
+
 	if (!ui->bulk_read) {
 		ui->read_in_a_row += 1;
 		if (ui->read_in_a_row < 3)
@@ -829,10 +839,22 @@ static int ubifs_bulk_read(struct page *page)
 		/* Three reads in a row, so switch on bulk-read */
 		ui->bulk_read = 1;
 	}
-	ret = ubifs_do_bulk_read(c, page);
+
+	bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
+	if (!bu)
+		return 0;
+
+	bu->buf = NULL;
+	bu->buf_len = c->max_bu_buf_len;
+	data_key_init(c, &bu->key, inode->i_ino,
+		      page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+
+	err = ubifs_do_bulk_read(c, bu, page);
+	kfree(bu);
+
 out_unlock:
 	mutex_unlock(&ui->ui_mutex);
-	return ret;
+	return err;
 }
 
 static int ubifs_readpage(struct file *file, struct page *page)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ea493e6f2652..1d511569c035 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -569,16 +569,16 @@ static int init_constants_early(struct ubifs_info *c)
 	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
 
 	/* Buffer size for bulk-reads */
-	c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
-	if (c->bulk_read_buf_size > c->leb_size)
-		c->bulk_read_buf_size = c->leb_size;
-	if (c->bulk_read_buf_size > UBIFS_KMALLOC_OK) {
+	c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
+	if (c->max_bu_buf_len > c->leb_size)
+		c->max_bu_buf_len = c->leb_size;
+	if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) {
 		/* Check if we can kmalloc that much */
-		void *try = kmalloc(c->bulk_read_buf_size,
+		void *try = kmalloc(c->max_bu_buf_len,
 				    GFP_KERNEL | __GFP_NOWARN);
 		kfree(try);
 		if (!try)
-			c->bulk_read_buf_size = UBIFS_KMALLOC_OK;
+			c->max_bu_buf_len = UBIFS_KMALLOC_OK;
 	}
 	return 0;
 }
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 99e9a744cfd0..6eef5344a145 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1501,7 +1501,12 @@ out:
  * @bu: bulk-read parameters and results
  *
  * Lookup consecutive data node keys for the same inode that reside
- * consecutively in the same LEB.
+ * consecutively in the same LEB. This function returns zero in case of success
+ * and a negative error code in case of failure.
+ *
+ * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
+ * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
+ * maxumum possible amount of nodes for bulk-read.
  */
 int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
 {
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 06ba51efd65d..870b5c479e95 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -969,7 +969,7 @@ struct ubifs_mount_opts {
  * @mst_node: master node
  * @mst_offs: offset of valid master node
  * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
- * @bulk_read_buf_size: buffer size for bulk-reads
+ * @max_bu_buf_len: maximum bulk-read buffer length
  *
  * @log_lebs: number of logical eraseblocks in the log
  * @log_bytes: log size in bytes
@@ -1217,7 +1217,7 @@ struct ubifs_info {
 	struct ubifs_mst_node *mst_node;
 	int mst_offs;
 	struct mutex mst_mutex;
-	int bulk_read_buf_size;
+	int max_bu_buf_len;
 
 	int log_lebs;
 	long long log_bytes;
-- 
cgit v1.2.3


From 3477d204658733aa3a87d3ae03b0327c1e599517 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 19 Nov 2008 11:53:15 +0200
Subject: UBIFS: pre-allocate bulk-read buffer

To avoid memory allocation failure during bulk-read, pre-allocate
a bulk-read buffer, so that if there is only one bulk-reader at
a time, it would just use the pre-allocated buffer and would not
do any memory allocation. However, if there is more than one bulk-
reader, then only one reader would use the pre-allocated buffer,
while the other readers would allocate buffers for themselves.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/file.c  | 31 +++++++++++++++++++++---------
 fs/ubifs/super.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++---------
 fs/ubifs/ubifs.h |  6 ++++++
 3 files changed, 76 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0c5c27d63f6e..2624411d9758 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -811,15 +811,15 @@ static int ubifs_bulk_read(struct page *page)
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	pgoff_t index = page->index, last_page_read = ui->last_page_read;
 	struct bu_info *bu;
-	int err = 0;
+	int err = 0, allocated = 0;
 
 	ui->last_page_read = index;
 	if (!c->bulk_read)
 		return 0;
 
 	/*
-	 * Bulk-read is protected by ui_mutex, but it is an optimization, so
-	 * don't bother if we cannot lock the mutex.
+	 * Bulk-read is protected by @ui->ui_mutex, but it is an optimization,
+	 * so don't bother if we cannot lock the mutex.
 	 */
 	if (!mutex_trylock(&ui->ui_mutex))
 		return 0;
@@ -840,17 +840,30 @@ static int ubifs_bulk_read(struct page *page)
 		ui->bulk_read = 1;
 	}
 
-	bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
-	if (!bu)
-		return 0;
+	/*
+	 * If possible, try to use pre-allocated bulk-read information, which
+	 * is protected by @c->bu_mutex.
+	 */
+	if (mutex_trylock(&c->bu_mutex))
+		bu = &c->bu;
+	else {
+		bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
+		if (!bu)
+			goto out_unlock;
+
+		bu->buf = NULL;
+		allocated = 1;
+	}
 
-	bu->buf = NULL;
 	bu->buf_len = c->max_bu_buf_len;
 	data_key_init(c, &bu->key, inode->i_ino,
 		      page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
-
 	err = ubifs_do_bulk_read(c, bu, page);
-	kfree(bu);
+
+	if (!allocated)
+		mutex_unlock(&c->bu_mutex);
+	else
+		kfree(bu);
 
 out_unlock:
 	mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1d511569c035..d80b2aef42b6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -572,14 +572,6 @@ static int init_constants_early(struct ubifs_info *c)
 	c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
 	if (c->max_bu_buf_len > c->leb_size)
 		c->max_bu_buf_len = c->leb_size;
-	if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) {
-		/* Check if we can kmalloc that much */
-		void *try = kmalloc(c->max_bu_buf_len,
-				    GFP_KERNEL | __GFP_NOWARN);
-		kfree(try);
-		if (!try)
-			c->max_bu_buf_len = UBIFS_KMALLOC_OK;
-	}
 	return 0;
 }
 
@@ -998,6 +990,34 @@ static void destroy_journal(struct ubifs_info *c)
 	free_buds(c);
 }
 
+/**
+ * bu_init - initialize bulk-read information.
+ * @c: UBIFS file-system description object
+ */
+static void bu_init(struct ubifs_info *c)
+{
+	ubifs_assert(c->bulk_read == 1);
+
+	if (c->bu.buf)
+		return; /* Already initialized */
+
+again:
+	c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN);
+	if (!c->bu.buf) {
+		if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) {
+			c->max_bu_buf_len = UBIFS_KMALLOC_OK;
+			goto again;
+		}
+
+		/* Just disable bulk-read */
+		ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, "
+			   "disabling it", c->max_bu_buf_len);
+		c->mount_opts.bulk_read = 1;
+		c->bulk_read = 0;
+		return;
+	}
+}
+
 /**
  * mount_ubifs - mount UBIFS file-system.
  * @c: UBIFS file-system description object
@@ -1066,6 +1086,13 @@ static int mount_ubifs(struct ubifs_info *c)
 			goto out_free;
 	}
 
+	if (c->bulk_read == 1)
+		bu_init(c);
+
+	/*
+	 * We have to check all CRCs, even for data nodes, when we mount the FS
+	 * (specifically, when we are replaying).
+	 */
 	c->always_chk_crc = 1;
 
 	err = ubifs_read_superblock(c);
@@ -1296,6 +1323,7 @@ out_cbuf:
 out_dereg:
 	dbg_failure_mode_deregistration(c);
 out_free:
+	kfree(c->bu.buf);
 	vfree(c->ileb_buf);
 	vfree(c->sbuf);
 	kfree(c->bottom_up_buf);
@@ -1332,10 +1360,11 @@ static void ubifs_umount(struct ubifs_info *c)
 	kfree(c->cbuf);
 	kfree(c->rcvrd_mst_node);
 	kfree(c->mst_node);
+	kfree(c->bu.buf);
+	vfree(c->ileb_buf);
 	vfree(c->sbuf);
 	kfree(c->bottom_up_buf);
 	UBIFS_DBG(vfree(c->dbg_buf));
-	vfree(c->ileb_buf);
 	dbg_failure_mode_deregistration(c);
 }
 
@@ -1633,6 +1662,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		ubifs_err("invalid or unknown remount parameter");
 		return err;
 	}
+
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		err = ubifs_remount_rw(c);
 		if (err)
@@ -1640,6 +1670,14 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
 		ubifs_remount_ro(c);
 
+	if (c->bulk_read == 1)
+		bu_init(c);
+	else {
+		dbg_gen("disable bulk-read");
+		kfree(c->bu.buf);
+		c->bu.buf = NULL;
+	}
+
 	return 0;
 }
 
@@ -1730,6 +1768,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	mutex_init(&c->log_mutex);
 	mutex_init(&c->mst_mutex);
 	mutex_init(&c->umount_mutex);
+	mutex_init(&c->bu_mutex);
 	init_waitqueue_head(&c->cmt_wq);
 	c->buds = RB_ROOT;
 	c->old_idx = RB_ROOT;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 870b5c479e95..46b172560a06 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -969,7 +969,10 @@ struct ubifs_mount_opts {
  * @mst_node: master node
  * @mst_offs: offset of valid master node
  * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
+ *
  * @max_bu_buf_len: maximum bulk-read buffer length
+ * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
+ * @bu: pre-allocated bulk-read information
  *
  * @log_lebs: number of logical eraseblocks in the log
  * @log_bytes: log size in bytes
@@ -1217,7 +1220,10 @@ struct ubifs_info {
 	struct ubifs_mst_node *mst_node;
 	int mst_offs;
 	struct mutex mst_mutex;
+
 	int max_bu_buf_len;
+	struct mutex bu_mutex;
+	struct bu_info bu;
 
 	int log_lebs;
 	long long log_bytes;
-- 
cgit v1.2.3


From 2c5e76158fcea6e3b9536a74efa7b5e2e846d374 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 20 Nov 2008 14:36:17 -0600
Subject: nfsd: clean up grace period on early exit

If nfsd was shut down before the grace period ended, we could end up
with a freed object still on grace_list.  Thanks to Jeff Moyer for
reporting the resulting list corruption warnings.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Tested-by: Jeff Moyer <jmoyer@redhat.com>
---
 fs/lockd/svc.c      | 1 +
 fs/nfsd/nfs4state.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index c631a83931ce..56b076736b56 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -181,6 +181,7 @@ lockd(void *vrqstp)
 	}
 	flush_signals(current);
 	cancel_delayed_work_sync(&grace_period_end);
+	locks_end_grace(&lockd_manager);
 	if (nlmsvc_ops)
 		nlmsvc_invalidate_all();
 	nlm_shutdown_hosts();
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b0bebc552a11..1a052ac2bde9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3261,6 +3261,7 @@ nfs4_state_shutdown(void)
 {
 	cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work);
 	destroy_workqueue(laundry_wq);
+	locks_end_grace(&nfsd4_manager);
 	nfs4_lock_state();
 	nfs4_release_reclaim();
 	__nfs4_state_shutdown();
-- 
cgit v1.2.3


From e4625eb826de4f6774ee602c442ba23b686bdcc7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 24 Nov 2008 10:32:46 -0600
Subject: nfsd: use of uninitialized list head on error exit in nfs4recover.c

Thanks to Matthew Dodd for this bug report:

A file label issue while running SELinux in MLS mode provoked the
following bug, which is a result of use before init on a 'struct list_head'.

In nfsd4_list_rec_dir() if the call to dentry_open() fails the 'goto
out' skips INIT_LIST_HEAD() which results in the normally improbable
case where list_entry() returns NULL.

Trace follows.

NFSD: Using /var/lib/nfs/v4recovery as the NFSv4 state recovery directory
SELinux:  Context unconfined_t:object_r:var_lib_nfs_t:s0 is not valid
(left unmapped).
type=1400 audit(1227298063.609:282): avc:  denied  { read } for
pid=1890 comm="rpc.nfsd" name="v4recovery" dev=dm-0 ino=148726
scontext=system_u:system_r:nfsd_t:s0-s15:c0.c1023
tcontext=system_u:object_r:unlabeled_t:s15:c0.c1023 tclass=dir
BUG: unable to handle kernel NULL pointer dereference at 00000004
IP: [<c050894e>] list_del+0x6/0x60
*pde = 0d9ce067 *pte = 00000000
Oops: 0000 [#1] SMP
Modules linked in: nfsd lockd nfs_acl auth_rpcgss exportfs autofs4
sunrpc ipv6 dm_multipath scsi_dh ppdev parport_pc sg parport floppy
ata_piix pata_acpi ata_generic libata pcnet32 i2c_piix4 mii pcspkr
i2c_core dm_snapshot dm_zero dm_mirror dm_log dm_mod BusLogic sd_mod
scsi_mod crc_t10dif ext3 jbd mbcache uhci_hcd ohci_hcd ehci_hcd [last
unloaded: microcode]

Pid: 1890, comm: rpc.nfsd Not tainted (2.6.27.5-37.fc9.i686 #1)
EIP: 0060:[<c050894e>] EFLAGS: 00010217 CPU: 0
EIP is at list_del+0x6/0x60
EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: cd99e480
ESI: cf9caed8 EDI: 00000000 EBP: cf9caebc ESP: cf9caeb8
  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
Process rpc.nfsd (pid: 1890, ti=cf9ca000 task=cf4de580 task.ti=cf9ca000)
Stack: 00000000 cf9caef0 d0a9f139 c0496d04 d0a9f217 fffffff3 00000000
00000000
        00000000 00000000 cf32b220 00000000 00000008 00000801 cf9caefc
d0a9f193
        00000000 cf9caf08 d0a9b6ea 00000000 cf9caf1c d0a874f2 cf9c3004
00000008
Call Trace:
  [<d0a9f139>] ? nfsd4_list_rec_dir+0xf3/0x13a [nfsd]
  [<c0496d04>] ? do_path_lookup+0x12d/0x175
  [<d0a9f217>] ? load_recdir+0x0/0x26 [nfsd]
  [<d0a9f193>] ? nfsd4_recdir_load+0x13/0x34 [nfsd]
  [<d0a9b6ea>] ? nfs4_state_start+0x2a/0xc5 [nfsd]
  [<d0a874f2>] ? nfsd_svc+0x51/0xff [nfsd]
  [<d0a87f2d>] ? write_svc+0x0/0x1e [nfsd]
  [<d0a87f48>] ? write_svc+0x1b/0x1e [nfsd]
  [<d0a87854>] ? nfsctl_transaction_write+0x3a/0x61 [nfsd]
  [<c04b6a4e>] ? sys_nfsservctl+0x116/0x154
  [<c04975c1>] ? putname+0x24/0x2f
  [<c04975c1>] ? putname+0x24/0x2f
  [<c048d49f>] ? do_sys_open+0xad/0xb7
  [<c048d337>] ? filp_close+0x50/0x5a
  [<c048d4eb>] ? sys_open+0x1e/0x26
  [<c0403cca>] ? syscall_call+0x7/0xb
  [<c064007b>] ? init_cyrix+0x185/0x490
  =======================
Code: 75 e1 8b 53 08 8d 4b 04 8d 46 04 e8 75 00 00 00 8b 53 10 8d 4b 0c
8d 46 0c e8 67 00 00 00 5b 5e 5f 5d c3 90 90 55 89 e5 53 89 c3 <8b> 40
04 8b 00 39 d8 74 16 50 53 68 3e d6 6f c0 6a 30 68 78 d6
EIP: [<c050894e>] list_del+0x6/0x60 SS:ESP 0068:cf9caeb8
---[ end trace a89c4ad091c4ad53 ]---

Cc: Matthew N. Dodd <Matthew.Dodd@spart.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4recover.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index bb93946ace22..b79ec930d9f1 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -225,12 +225,12 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 		return 0;
 
 	nfs4_save_user(&uid, &gid);
+	INIT_LIST_HEAD(dentries);
 
 	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY);
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
-	INIT_LIST_HEAD(dentries);
 	status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
 	fput(filp);
 	while (!list_empty(dentries)) {
-- 
cgit v1.2.3


From a8d82d9b950213b66b22c9e7c63a058841de2394 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Nov 2008 12:51:55 -0500
Subject: NLM: client-side nlm_lookup_host() should avoid matching on srcaddr

Since commit c98451bd, the loop in nlm_lookup_host() unconditionally
compares the host's h_srcaddr field to the incoming source address.
For client-side nlm_host entries, both are always AF_UNSPEC, so this
check is unnecessary.

Since commit 781b61a6, which added support for AF_INET6 addresses to
nlm_cmp_addr(), nlm_cmp_addr() now returns FALSE for AF_UNSPEC
addresses, which causes nlm_lookup_host() to create a fresh nlm_host
entry every time it is called on the client.

These extra entries will eventually expire once the server is
unmounted, so the impact of this regression, introduced with lockd
IPv6 support in 2.6.28, should be minor.

We could fix this by adding an arm in nlm_cmp_addr() for AF_UNSPEC
addresses, but really, nlm_lookup_host() shouldn't be matching on the
srcaddr field for client-side nlm_host lookups.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 9fd8889097b7..70fc63a1727b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -167,7 +167,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 			continue;
 		if (host->h_server != ni->server)
 			continue;
-		if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
+		if (ni->server &&
+		    !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
 			continue;
 
 		/* Move to head of hash chain. */
-- 
cgit v1.2.3


From a98ee8c1c707fe3210b00ef9f806ba8e2bf35504 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 26 Nov 2008 19:32:33 +0000
Subject: [CIFS] fix regression in cifs_write_begin/cifs_write_end

The conversion to write_begin/write_end interfaces had a bug where we
were passing a bad parameter to cifs_readpage_worker. Rather than
passing the page offset of the start of the write, we needed to pass the
offset of the beginning of the page. This was reliably showing up as
data corruption in the fsx-linux test from LTP.

It also became evident that this code was occasionally doing unnecessary
read calls. Optimize those away by using the PG_checked flag to indicate
that the unwritten part of the page has been initialized.

CC: Nick Piggin <npiggin@suse.de>
Acked-by: Dave Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 77 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 56 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b691b893a848..f0a81e631ae6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1475,7 +1475,11 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
 		 page, pos, copied));
 
-	if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+	if (PageChecked(page)) {
+		if (copied == len)
+			SetPageUptodate(page);
+		ClearPageChecked(page);
+	} else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
 		SetPageUptodate(page);
 
 	if (!PageUptodate(page)) {
@@ -2062,39 +2066,70 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+	loff_t page_start = pos & PAGE_MASK;
+	loff_t i_size;
+	struct page *page;
+	int rc = 0;
 
 	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
 
-	*pagep = __grab_cache_page(mapping, index);
-	if (!*pagep)
-		return -ENOMEM;
-
-	if (PageUptodate(*pagep))
-		return 0;
+	page = __grab_cache_page(mapping, index);
+	if (!page) {
+		rc = -ENOMEM;
+		goto out;
+	}
 
-	/* If we are writing a full page it will be up to date,
-	   no need to read from the server */
-	if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
-		return 0;
+	if (PageUptodate(page))
+		goto out;
 
-	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
-		int rc;
+	/*
+	 * If we write a full page it will be up to date, no need to read from
+	 * the server. If the write is short, we'll end up doing a sync write
+	 * instead.
+	 */
+	if (len == PAGE_CACHE_SIZE)
+		goto out;
 
-		/* might as well read a page, it is fast enough */
-		rc = cifs_readpage_worker(file, *pagep, &offset);
+	/*
+	 * optimize away the read when we have an oplock, and we're not
+	 * expecting to use any of the data we'd be reading in. That
+	 * is, when the page lies beyond the EOF, or straddles the EOF
+	 * and the write will cover all of the existing data.
+	 */
+	if (CIFS_I(mapping->host)->clientCanCacheRead) {
+		i_size = i_size_read(mapping->host);
+		if (page_start >= i_size ||
+		    (offset == 0 && (pos + len) >= i_size)) {
+			zero_user_segments(page, 0, offset,
+					   offset + len,
+					   PAGE_CACHE_SIZE);
+			/*
+			 * PageChecked means that the parts of the page
+			 * to which we're not writing are considered up
+			 * to date. Once the data is copied to the
+			 * page, it can be set uptodate.
+			 */
+			SetPageChecked(page);
+			goto out;
+		}
+	}
 
-		/* we do not need to pass errors back
-		   e.g. if we do not have read access to the file
-		   because cifs_write_end will attempt synchronous writes
-		   -- shaggy */
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		/*
+		 * might as well read a page, it is fast enough. If we get
+		 * an error, we don't need to return it. cifs_write_end will
+		 * do a sync write instead since PG_uptodate isn't set.
+		 */
+		cifs_readpage_worker(file, page, &page_start);
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
 		   racing with this read? In any case
 		   this will be written out by write_end so is fine */
 	}
-
-	return 0;
+out:
+	*pagep = page;
+	return rc;
 }
 
 const struct address_space_operations cifs_addr_ops = {
-- 
cgit v1.2.3


From 52b19ac993f1aeadbce15b55302be9a35346e235 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 23 Sep 2008 18:24:08 +0200
Subject: udf: Fix BUG_ON() in destroy_inode()

udf_clear_inode() can leave behind buffers on mapping's i_private list (when
we truncated preallocation). Call invalidate_inode_buffers() so that the list
is properly cleaned up before we return from udf_clear_inode(). This is ugly
and suggests that we should clean up preallocation earlier than in clear_inode(),
but currently there's no such call available since drop_inode() is called under
inode lock and thus is unusable for disk operations.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/buffer.c    | 1 +
 fs/udf/inode.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 6569fda5cfed..10179cfa1152 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -878,6 +878,7 @@ void invalidate_inode_buffers(struct inode *inode)
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 }
+EXPORT_SYMBOL(invalidate_inode_buffers);
 
 /*
  * Remove any clean buffers from the inode's buffer list.  This is called
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 6e74b117aaf0..30ebde490f7f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -106,6 +106,7 @@ void udf_clear_inode(struct inode *inode)
 		udf_truncate_tail_extent(inode);
 		unlock_kernel();
 		write_inode_now(inode, 0);
+		invalidate_inode_buffers(inode);
 	}
 	iinfo = UDF_I(inode);
 	kfree(iinfo->i_ext.i_data);
-- 
cgit v1.2.3


From 3b5da0189c93160e44b878d2c72e9552d642497c Mon Sep 17 00:00:00 2001
From: Coly Li <coyli@suse.de>
Date: Wed, 5 Nov 2008 15:16:24 +0800
Subject: ocfs2: comments typo fix

This patch fixes two typos in comments of ocfs2.

Signed-off-by: Coly Li <coyli@suse.de>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/userdlm.h | 2 +-
 fs/ocfs2/ocfs2.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
index 39ec27738499..0c3cc03c61fa 100644
--- a/fs/ocfs2/dlm/userdlm.h
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -33,7 +33,7 @@
 #include <linux/workqueue.h>
 
 /* user_lock_res->l_flags flags. */
-#define USER_LOCK_ATTACHED      (0x00000001) /* have we initialized
+#define USER_LOCK_ATTACHED      (0x00000001) /* we have initialized
 					       * the lvb */
 #define USER_LOCK_BUSY          (0x00000002) /* we are currently in
 					       * dlm_lock */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index fef7ece32376..3fed9e3d8992 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -85,7 +85,7 @@ enum ocfs2_unlock_action {
 };
 
 /* ocfs2_lock_res->l_flags flags. */
-#define OCFS2_LOCK_ATTACHED      (0x00000001) /* have we initialized
+#define OCFS2_LOCK_ATTACHED      (0x00000001) /* we have initialized
 					       * the lvb */
 #define OCFS2_LOCK_BUSY          (0x00000002) /* we are currently in
 					       * dlm_lock */
-- 
cgit v1.2.3


From 66f502a416f18cd36179290746aa53736c6b2828 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 10 Nov 2008 16:24:57 -0600
Subject: ocfs2: initialize stack_user lvbptr

The locking_state dump, ocfs2_dlm_seq_show, reads the lvb on locks where it
has not yet been initialized by a lock call.

Signed-off-by: David Teigland <teigland@redhat.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/stack_user.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index faec2d879357..9b76d41a8ac6 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -740,6 +740,9 @@ static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
 
 static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 {
+	if (!lksb->lksb_fsdlm.sb_lvbptr)
+		lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
+					     sizeof(struct dlm_lksb);
 	return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
 }
 
-- 
cgit v1.2.3


From 07f9eebcdfaeefc8f807fa1bcce1d7c3ae6661b1 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 17 Nov 2008 12:28:48 -0600
Subject: ocfs2: fix wake_up in unlock_ast

In ocfs2_unlock_ast(), call wake_up() on lockres before releasing
the spin lock on it.  As soon as the spin lock is released, the
lockres can be freed.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index ec684426034b..6e6cc0a2e5f7 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2841,9 +2841,8 @@ static void ocfs2_unlock_ast(void *opaque, int error)
 
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
 	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
-	spin_unlock_irqrestore(&lockres->l_lock, flags);
-
 	wake_up(&lockres->l_event);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	mlog_exit_void();
 }
-- 
cgit v1.2.3


From 07d9a3954a68764aefe16855bcd0f86deeb5c825 Mon Sep 17 00:00:00 2001
From: Coly Li <coyli@suse.de>
Date: Mon, 17 Nov 2008 12:38:22 +0800
Subject: ocfs2: fix return value set in init_dlmfs_fs()

In init_dlmfs_fs(), if the call to kmem_cache_create() fails, the code returns the
status left over from the earlier bdi_init() call. The correct behavior is to set status to -ENOMEM before going to "bail:".

Signed-off-by: Coly Li <coyli@suse.de>
Acked-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 533a789c3ef8..ba962d71b34d 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -608,8 +608,10 @@ static int __init init_dlmfs_fs(void)
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 					SLAB_MEM_SPREAD),
 				dlmfs_init_once);
-	if (!dlmfs_inode_cache)
+	if (!dlmfs_inode_cache) {
+		status = -ENOMEM;
 		goto bail;
+	}
 	cleanup_inode = 1;
 
 	user_dlm_worker = create_singlethread_workqueue("user_dlm");
-- 
cgit v1.2.3


From d6b58f89f7257c8099c2260e2bea042a917d6cdf Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Fri, 21 Nov 2008 14:06:55 -0800
Subject: ocfs2: fix regression in ocfs2_read_blocks_sync()

We're panicking in ocfs2_read_blocks_sync() if a jbd-managed buffer is seen.
At first glance, this seems ok but in reality it can happen. My test case
was to just run 'exorcist'. A struct inode is being pushed out of memory but
is then re-read at a later time, before the buffer has been checkpointed by
jbd. This causes a BUG to be hit in ocfs2_read_blocks_sync().

Reviewed-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/buffer_head_io.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 7e947c672469..3a178ec48d7c 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -112,7 +112,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 		bh = bhs[i];
 
 		if (buffer_jbd(bh)) {
-			mlog(ML_ERROR,
+			mlog(ML_BH_IO,
 			     "trying to sync read a jbd "
 			     "managed bh (blocknr = %llu), skipping\n",
 			     (unsigned long long)bh->b_blocknr);
@@ -147,15 +147,10 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 	for (i = nr; i > 0; i--) {
 		bh = bhs[i - 1];
 
-		if (buffer_jbd(bh)) {
-			mlog(ML_ERROR,
-			     "the journal got the buffer while it was "
-			     "locked for io! (blocknr = %llu)\n",
-			     (unsigned long long)bh->b_blocknr);
-			BUG();
-		}
+		/* No need to wait on the buffer if it's managed by JBD. */
+		if (!buffer_jbd(bh))
+			wait_on_buffer(bh);
 
-		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* Status won't be cleared from here on out,
 			 * so we can safely record this and loop back
@@ -251,8 +246,6 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 			ignore_cache = 1;
 		}
 
-		/* XXX: Can we ever get this and *not* have the cached
-		 * flag set? */
 		if (buffer_jbd(bh)) {
 			if (ignore_cache)
 				mlog(ML_BH_IO, "trying to sync read a jbd "
-- 
cgit v1.2.3


From 7ef9964e6d1b911b78709f144000aacadd0ebc21 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Mon, 1 Dec 2008 13:13:55 -0800
Subject: epoll: introduce resource usage limits

It has been thought that the per-user file descriptors limit would also
limit the resources that a normal user can request via the epoll
interface.  Vegard Nossum reported a very simple program (a modified
version attached) that lets a normal user request a pretty large
amount of kernel memory, well within its maximum number of fds.  To
solve such problem, default limits are now imposed, and /proc based
configuration has been introduced.  A new directory has been created,
named /proc/sys/fs/epoll/ and inside there, there are two configuration
points:

  max_user_instances = Maximum number of devices - per user

  max_user_watches   = Maximum number of "watched" fds - per user

The current default for "max_user_watches" limits the memory used by epoll
to store "watches", to 1/32 of the amount of the low RAM.  As an example, a
256MB 32bit machine, will have "max_user_watches" set to roughly 90000.
That should be enough to not break existing heavy epoll users.  The
default value for "max_user_instances" is set to 128, that should be
enough too.

This also changes the userspace, because a new error code can now come out
from EPOLL_CTL_ADD (-ENOSPC).  The EMFILE from epoll_create() was already
listed, so that should be ok.

[akpm@linux-foundation.org: use get_current_user()]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <stable@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Reported-by: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 27 ++++++++++++
 fs/eventpoll.c                     | 85 ++++++++++++++++++++++++++++++++++----
 include/linux/sched.h              |  4 ++
 kernel/sysctl.c                    | 10 +++++
 4 files changed, 118 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index bcceb99b81dd..bb1b0dd3bfcb 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -44,6 +44,7 @@ Table of Contents
   2.14	/proc/<pid>/io - Display the IO accounting fields
   2.15	/proc/<pid>/coredump_filter - Core dump filtering settings
   2.16	/proc/<pid>/mountinfo - Information about mounts
+  2.17	/proc/sys/fs/epoll - Configuration options for the epoll interface
 
 ------------------------------------------------------------------------------
 Preface
@@ -2483,4 +2484,30 @@ For more information on mount propagation see:
 
   Documentation/filesystems/sharedsubtree.txt
 
+2.17	/proc/sys/fs/epoll - Configuration options for the epoll interface
+--------------------------------------------------------
+
+This directory contains configuration options for the epoll(7) interface.
+
+max_user_instances
+------------------
+
+This is the maximum number of epoll file descriptors that a single user can
+have open at a given time. The default value is 128, and should be enough
+for normal users.
+
+max_user_watches
+----------------
+
+Every epoll file descriptor can store a number of files to be monitored
+for event readiness. Each one of these monitored files constitutes a "watch".
+This configuration option sets the maximum number of "watches" that are
+allowed for each user.
+Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
+on a 64bit one.
+The current default value for  max_user_watches  is the 1/32 of the available
+low memory, divided for the "watch" cost in bytes.
+
+
 ------------------------------------------------------------------------------
+
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index aec5c13f6341..96355d505347 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -102,6 +102,8 @@
 
 #define EP_UNACTIVE_PTR ((void *) -1L)
 
+#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
+
 struct epoll_filefd {
 	struct file *file;
 	int fd;
@@ -200,6 +202,9 @@ struct eventpoll {
 	 * holding ->lock.
 	 */
 	struct epitem *ovflist;
+
+	/* The user that created the eventpoll descriptor */
+	struct user_struct *user;
 };
 
 /* Wait structure used by the poll hooks */
@@ -226,10 +231,18 @@ struct ep_pqueue {
 	struct epitem *epi;
 };
 
+/*
+ * Configuration options available inside /proc/sys/fs/epoll/
+ */
+/* Maximum number of epoll devices, per user */
+static int max_user_instances __read_mostly;
+/* Maximum number of epoll watched descriptors, per user */
+static int max_user_watches __read_mostly;
+
 /*
  * This mutex is used to serialize ep_free() and eventpoll_release_file().
  */
-static struct mutex epmutex;
+static DEFINE_MUTEX(epmutex);
 
 /* Safe wake up implementation */
 static struct poll_safewake psw;
@@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly;
 /* Slab cache used to allocate "struct eppoll_entry" */
 static struct kmem_cache *pwq_cache __read_mostly;
 
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table epoll_table[] = {
+	{
+		.procname	= "max_user_instances",
+		.data		= &max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "max_user_watches",
+		.data		= &max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
 
 /* Setup the structure that is used as key for the RB tree */
 static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	/* At this point it is safe to free the eventpoll item */
 	kmem_cache_free(epi_cache, epi);
 
+	atomic_dec(&ep->user->epoll_watches);
+
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
 		     current, ep, file));
 
@@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep)
 
 	mutex_unlock(&epmutex);
 	mutex_destroy(&ep->mtx);
+	atomic_dec(&ep->user->epoll_devs);
+	free_uid(ep->user);
 	kfree(ep);
 }
 
@@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file)
 
 static int ep_alloc(struct eventpoll **pep)
 {
-	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	int error;
+	struct user_struct *user;
+	struct eventpoll *ep;
 
-	if (!ep)
-		return -ENOMEM;
+	user = get_current_user();
+	error = -EMFILE;
+	if (unlikely(atomic_read(&user->epoll_devs) >=
+			max_user_instances))
+		goto free_uid;
+	error = -ENOMEM;
+	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+	if (unlikely(!ep))
+		goto free_uid;
 
 	spin_lock_init(&ep->lock);
 	mutex_init(&ep->mtx);
@@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep)
 	INIT_LIST_HEAD(&ep->rdllist);
 	ep->rbr = RB_ROOT;
 	ep->ovflist = EP_UNACTIVE_PTR;
+	ep->user = user;
 
 	*pep = ep;
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
 		     current, ep));
 	return 0;
+
+free_uid:
+	free_uid(user);
+	return error;
 }
 
 /*
@@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	struct epitem *epi;
 	struct ep_pqueue epq;
 
-	error = -ENOMEM;
+	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
+		     max_user_watches))
+		return -ENOSPC;
 	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
-		goto error_return;
+		return -ENOMEM;
 
 	/* Item initialization follow here ... */
 	INIT_LIST_HEAD(&epi->rdllink);
@@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 * install process. Namely an allocation for a wait queue failed due
 	 * high memory pressure.
 	 */
+	error = -ENOMEM;
 	if (epi->nwait < 0)
 		goto error_unregister;
 
@@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
+	atomic_inc(&ep->user->epoll_watches);
+
 	/* We have to call this outside the lock */
 	if (pwake)
 		ep_poll_safewake(&psw, &ep->poll_wait);
@@ -789,7 +852,7 @@ error_unregister:
 	spin_unlock_irqrestore(&ep->lock, flags);
 
 	kmem_cache_free(epi_cache, epi);
-error_return:
+
 	return error;
 }
 
@@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags)
 			      flags & O_CLOEXEC);
 	if (fd < 0)
 		ep_free(ep);
+	atomic_inc(&ep->user->epoll_devs);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
 
 static int __init eventpoll_init(void)
 {
-	mutex_init(&epmutex);
+	struct sysinfo si;
+
+	si_meminfo(&si);
+	max_user_instances = 128;
+	max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
+		EP_ITEM_COST;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_poll_safewake_init(&psw);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 644ffbda17ca..55e30d114477 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -630,6 +630,10 @@ struct user_struct {
 	atomic_t inotify_watches; /* How many inotify watches does this user have? */
 	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
 #endif
+#ifdef CONFIG_EPOLL
+	atomic_t epoll_devs;	/* The number of epoll descriptors currently open */
+	atomic_t epoll_watches;	/* The number of file descriptors currently watched */
+#endif
 #ifdef CONFIG_POSIX_MQUEUE
 	/* protected by mq_lock	*/
 	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d048fa2d902..3d56fe7570da 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -176,6 +176,9 @@ extern struct ctl_table random_table[];
 #ifdef CONFIG_INOTIFY_USER
 extern struct ctl_table inotify_table[];
 #endif
+#ifdef CONFIG_EPOLL
+extern struct ctl_table epoll_table[];
+#endif
 
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 int sysctl_legacy_va_layout;
@@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = {
 		.child		= inotify_table,
 	},
 #endif	
+#ifdef CONFIG_EPOLL
+	{
+		.procname	= "epoll",
+		.mode		= 0555,
+		.child		= epoll_table,
+	},
+#endif
 #endif
 	{
 		.ctl_name	= KERN_SETUID_DUMPABLE,
-- 
cgit v1.2.3


From 03801553630c4bec6682108800c9b2de64bdbd37 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 1 Dec 2008 13:14:04 -0800
Subject: ntfs: don't fool kernel-doc

kernel-doc handles macros now (it has for quite some time), so change the
ntfs_debug() macro's kernel-doc to be just before the macro instead of
before a phony function prototype.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/debug.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 5e6724c1afd1..2142b1c68b61 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -30,7 +30,8 @@
 
 extern int debug_msgs;
 
-#if 0 /* Fool kernel-doc since it doesn't do macros yet */
+extern void __ntfs_debug(const char *file, int line, const char *function,
+	const char *format, ...) __attribute__ ((format (printf, 4, 5)));
 /**
  * ntfs_debug - write a debug level message to syslog
  * @f:		a printf format string containing the message
@@ -39,11 +40,6 @@ extern int debug_msgs;
  * ntfs_debug() writes a DEBUG level message to the syslog but only if the
  * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
  */
-static void ntfs_debug(const char *f, ...);
-#endif
-
-extern void __ntfs_debug (const char *file, int line, const char *function,
-	const char *format, ...) __attribute__ ((format (printf, 4, 5)));
 #define ntfs_debug(f, a...)						\
 	__ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
 
-- 
cgit v1.2.3


From ebbefc011e56bd85b4745d01e5b8d7d05d95ed5d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Nov 2008 14:54:41 +0100
Subject: [PATCH] clean up blkdev_get a little bit

The way the bd_claim for the FMODE_EXCL case is implemented is rather
confusing.  Clean it up to the most logical style.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index db831efbdbbd..7c727523bc54 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1135,12 +1135,15 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (res)
 		return res;
 
-	if (!(filp->f_mode & FMODE_EXCL))
-		return 0;
+	if (filp->f_mode & FMODE_EXCL) {
+		res = bd_claim(bdev, filp);
+		if (res)
+			goto out_blkdev_put;
+	}
 
-	if (!(res = bd_claim(bdev, filp)))
-		return 0;
+	return 0;
 
+ out_blkdev_put:
 	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
-- 
cgit v1.2.3


From fd4ce1acd0f8558033b1a6968001552bd7671e6d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Nov 2008 14:58:42 +0100
Subject: [PATCH 1/2] kill FMODE_NDELAY_NOW

Update FMODE_NDELAY before each ioctl call so that we can kill the
magic FMODE_NDELAY_NOW.  It would be even better to do this directly
in setfl(), but for that we'd need to have FMODE_NDELAY for all files,
not just block special files.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/compat_ioctl.c |  8 +++++++-
 drivers/scsi/sd.c    |  2 +-
 drivers/scsi/sr.c    |  2 +-
 fs/block_dev.c       | 10 +++++++++-
 include/linux/fs.h   |  1 -
 5 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index d43e6087badc..67eb93cff699 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -722,8 +722,14 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	struct backing_dev_info *bdi;
 	loff_t size;
 
+	/*
+	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+	 * to updated it before every ioctl.
+	 */
 	if (file->f_flags & O_NDELAY)
-		mode |= FMODE_NDELAY_NOW;
+		mode |= FMODE_NDELAY;
+	else
+		mode &= ~FMODE_NDELAY;
 
 	switch (cmd) {
 	case HDIO_GETGEO:
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index c9e1242eaf25..5081b3981d3c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -757,7 +757,7 @@ static int sd_ioctl(struct block_device *bdev, fmode_t mode,
 	 * access to the device is prohibited.
 	 */
 	error = scsi_nonblockable_ioctl(sdp, cmd, p,
-					(mode & FMODE_NDELAY_NOW) != 0);
+					(mode & FMODE_NDELAY) != 0);
 	if (!scsi_block_when_processing_errors(sdp) || !error)
 		return error;
 
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 62b6633e3a97..45b66b98a516 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -521,7 +521,7 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	 * if it doesn't recognise the ioctl
 	 */
 	ret = scsi_nonblockable_ioctl(sdev, cmd, argp,
-					(mode & FMODE_NDELAY_NOW) != 0);
+					(mode & FMODE_NDELAY) != 0);
 	if (ret != -ENODEV)
 		return ret;
 	return scsi_ioctl(sdev, cmd, argp);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7c727523bc54..99e0ae1a4c78 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1206,8 +1206,16 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	fmode_t mode = file->f_mode;
+
+	/*
+	 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
+	 * to updated it before every ioctl.
+	 */
 	if (file->f_flags & O_NDELAY)
-		mode |= FMODE_NDELAY_NOW;
+		mode |= FMODE_NDELAY;
+	else
+		mode &= ~FMODE_NDELAY;
+
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0dcdd9458f4b..b3345a90e11a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -79,7 +79,6 @@ extern int dir_notify_enable;
 #define FMODE_NDELAY	((__force fmode_t)32)
 #define FMODE_EXCL	((__force fmode_t)64)
 #define FMODE_WRITE_IOCTL	((__force fmode_t)128)
-#define FMODE_NDELAY_NOW	((__force fmode_t)256)
 
 #define RW_MASK		1
 #define RWA_MASK	2
-- 
cgit v1.2.3


From 576a488a27f267af203f3ea69c700a1612335e9f Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 4 Dec 2008 09:09:34 +1100
Subject: [XFS] Fix hang after disallowed rename across directory quota domains

When project quota is active and is being used for directory tree
quota control, we disallow rename outside the current directory
tree. This requires a check to be made after all the inodes
involved in the rename are locked. We fail to unlock the inodes
correctly if we disallow the rename when the target is outside the
current directory tree. This results in a hang on the next access
to the inodes involved in failed rename.

Reported-by: Arkadiusz Miskiewicz <arekm@maven.pl>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Tested-by: Arkadiusz Miskiewicz <arekm@maven.pl>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_rename.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d700dacdb10e..c903130be7fd 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -212,7 +212,7 @@ xfs_rename(
 	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
 		     (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
 		error = XFS_ERROR(EXDEV);
-		xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
+		xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
 		xfs_trans_cancel(tp, cancel_flags);
 		goto std_return;
 	}
-- 
cgit v1.2.3


From 218d11a8b071b23b76c484fd5f72a4fe3306801e Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Fri, 5 Dec 2008 16:12:48 -0700
Subject: Fix a race condition in FASYNC handling

Changeset a238b790d5f99c7832f9b73ac8847025815b85f7 (Call fasync()
functions without the BKL) introduced a race which could leave
file->f_flags in a state inconsistent with what the underlying
driver/filesystem believes.  Revert that change, and also fix the same
races in ioctl_fioasync() and ioctl_fionbio().

This is a minimal, short-term fix; the real fix will not involve the
BKL.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fcntl.c |  7 +++++++
 fs/ioctl.c | 12 ++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index ac4f7db9f134..549daf8005fb 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -19,6 +19,7 @@
 #include <linux/signal.h>
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
+#include <linux/smp_lock.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -175,6 +176,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	if (error)
 		return error;
 
+	/*
+	 * We still need a lock here for now to keep multiple FASYNC calls
+	 * from racing with each other.
+	 */
+	lock_kernel();
 	if ((arg ^ filp->f_flags) & FASYNC) {
 		if (filp->f_op && filp->f_op->fasync) {
 			error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -185,6 +191,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
  out:
+	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/ioctl.c b/fs/ioctl.c
index d152856c371b..43e8b2c0664b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -400,11 +400,9 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 
 	/* Did FASYNC state change ? */
 	if ((flag ^ filp->f_flags) & FASYNC) {
-		if (filp->f_op && filp->f_op->fasync) {
-			lock_kernel();
+		if (filp->f_op && filp->f_op->fasync)
 			error = filp->f_op->fasync(fd, filp, on);
-			unlock_kernel();
-		} else
+		else
 			error = -ENOTTY;
 	}
 	if (error)
@@ -440,11 +438,17 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		break;
 
 	case FIONBIO:
+		/* BKL needed to avoid races tweaking f_flags */
+		lock_kernel();
 		error = ioctl_fionbio(filp, argp);
+		unlock_kernel();
 		break;
 
 	case FIOASYNC:
+		/* BKL needed to avoid races tweaking f_flags */
+		lock_kernel();
 		error = ioctl_fioasync(fd, filp, argp);
+		unlock_kernel();
 		break;
 
 	case FIOQSIZE:
-- 
cgit v1.2.3


From a4f4d6df537368297a84e6b9444f403f99bf59f6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 8 Dec 2008 18:24:18 -0500
Subject: EXPORTFS: handle NULL returns from fh_to_dentry()/fh_to_parent()

While 440037287c5 "[PATCH] switch all filesystems over to
d_obtain_alias" removed some cases where fh_to_dentry() and
fh_to_parent() could return NULL, there are still a few NULL returns
left in individual filesystems.  Thus it was a mistake for that commit
to remove the handling of NULL returns in the callers.

Revert those parts of 440037287c5 which removed the NULL handling.

(We could, alternatively, modify all implementations to return -ESTALE
instead of NULL, but that proves to require fixing a number of
filesystems, and in some cases it's arguably more natural to return
NULL.)

Thanks to David for original patch and Linus, Christoph, and Hugh for
review.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: David Howells <dhowells@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exportfs/expfs.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 80246bad1b7f..890e01828817 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -367,6 +367,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
+	if (!result)
+		result = ERR_PTR(-ESTALE);
 	if (IS_ERR(result))
 		return result;
 
@@ -420,6 +422,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 
 		target_dir = nop->fh_to_parent(mnt->mnt_sb, fid,
 				fh_len, fileid_type);
+		if (!target_dir)
+			goto err_result;
 		err = PTR_ERR(target_dir);
 		if (IS_ERR(target_dir))
 			goto err_result;
-- 
cgit v1.2.3


From 85f334666a771680472722eee43ae0fc8730a619 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 9 Dec 2008 19:36:38 -0800
Subject: tracehook: exec double-reporting fix

The patch 6341c39 "tracehook: exec" introduced a small regression in
2.6.27 regarding binfmt_misc exec event reporting.  Since the reporting
is now done in the common search_binary_handler() function, an exec
of a misc binary will result in two (or possibly multiple) exec events
being reported, instead of just a single one, because the misc handler
contains a recursive call to search_binary_handler.

To add to the confusion, if PTRACE_O_TRACEEXEC is not active, the multiple
SIGTRAP signals will in fact cause only a single ptrace intercept, as the
signals are not queued.  However, if PTRACE_O_TRACEEXEC is on, the debugger
will actually see multiple ptrace intercepts (PTRACE_EVENT_EXEC).

The test program included below demonstrates the problem.

This change fixes the bug by calling tracehook_report_exec() only in the
outermost search_binary_handler() call (bprm->recursion_depth == 0).

The additional change to restore bprm->recursion_depth after each binfmt
load_binary call is actually superfluous for this bug, since we test the
value saved on entry to search_binary_handler().  But it keeps the use
of the depth count to its most obvious expected meaning.  Depending on what
binfmt handlers do in certain cases, there could have been false-positive
tests for recursion limits before this change.

    /* Test program using PTRACE_O_TRACEEXEC.
       This forks and exec's the first argument with the rest of the arguments,
       while ptrace'ing.  It expects to see one PTRACE_EVENT_EXEC stop and
       then a successful exit, with no other signals or events in between.

       Test for kernel doing two PTRACE_EVENT_EXEC stops for a binfmt_misc exec:

       $ gcc -g traceexec.c -o traceexec
       $ sudo sh -c 'echo :test:M::foobar::/bin/cat: > /proc/sys/fs/binfmt_misc/register'
       $ echo 'foobar test' > ./foobar
       $ chmod +x ./foobar
       $ ./traceexec ./foobar; echo $?
       ==> good <==
       foobar test
       0
       $
       ==> bad <==
       foobar test
       unexpected status 0x4057f != 0
       3
       $

    */

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <sys/ptrace.h>
    #include <unistd.h>
    #include <signal.h>
    #include <stdlib.h>

    /* Wait for a child to change state and check the result.
       Calls wait() once; exits with status 2 if the pid returned is not
       CHILD (including wait() failing), and with status 3 if the raw wait
       status differs from EXPECT.  Returns only when both checks pass.  */
    static void
    wait_for (pid_t child, int expect)
    {
      int status;
      pid_t p = wait (&status);
      if (p != child)
	{
	  perror ("wait");
	  exit (2);
	}
      /* The whole status word is compared, not just the signal or exit code.  */
      if (status != expect)
	{
	  fprintf (stderr, "unexpected status %#x != %#x\n", status, expect);
	  exit (3);
	}
    }

    /* Fork a traced child that execs argv[1] (passing &argv[1] as its
       argument vector) and check the sequence of wait statuses seen by
       the parent: a SIGUSR1 stop, one PTRACE_EVENT_EXEC stop, then a
       normal exit with code 0.
       Returns 0 on success, 127 if fork fails, 4/5/6 if a ptrace request
       fails; wait_for() exits with 2 or 3 on an unexpected wait result.
       NOTE(review): argv[1] is used without checking argc -- confirm the
       program is always run with at least one argument.  */
    int
    main (int argc, char **argv)
    {
      pid_t child = fork ();

      if (child < 0)
	{
	  perror ("fork");
	  return 127;
	}
      else if (child == 0)
	{
	  /* Child: request tracing, stop via SIGUSR1 so the parent can set
	     options, then exec the target.  */
	  ptrace (PTRACE_TRACEME);
	  raise (SIGUSR1);
	  execv (argv[1], &argv[1]);
	  /* Reached only if execv returned, i.e. the exec failed.  */
	  perror ("execve");
	  _exit (127);
	}

      /* Parent: first expect the child's SIGUSR1 stop.  */
      wait_for (child, W_STOPCODE (SIGUSR1));

      if (ptrace (PTRACE_SETOPTIONS, child,
		  0L, (void *) (long) PTRACE_O_TRACEEXEC) != 0)
	{
	  perror ("PTRACE_SETOPTIONS");
	  return 4;
	}

      if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
	{
	  perror ("PTRACE_CONT");
	  return 5;
	}

      /* Expect exactly one exec event stop before the child runs on.  */
      wait_for (child, W_STOPCODE (SIGTRAP | (PTRACE_EVENT_EXEC << 8)));

      if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
	{
	  perror ("PTRACE_CONT");
	  return 6;
	}

      /* Any further stop (such as a second exec event) fails this check.  */
      wait_for (child, W_EXITCODE (0, 0));

      return 0;
    }

Reported-by: Arnd Bergmann <arnd@arndb.de>
CC: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
---
 fs/exec.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 4e834f16d9da..ec5df9a38313 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1159,6 +1159,7 @@ EXPORT_SYMBOL(remove_arg_zero);
  */
 int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 {
+	unsigned int depth = bprm->recursion_depth;
 	int try,retval;
 	struct linux_binfmt *fmt;
 #ifdef __alpha__
@@ -1219,8 +1220,15 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 				continue;
 			read_unlock(&binfmt_lock);
 			retval = fn(bprm, regs);
+			/*
+			 * Restore the depth counter to its starting value
+			 * in this call, so we don't have to rely on every
+			 * load_binary function to restore it on return.
+			 */
+			bprm->recursion_depth = depth;
 			if (retval >= 0) {
-				tracehook_report_exec(fmt, bprm, regs);
+				if (depth == 0)
+					tracehook_report_exec(fmt, bprm, regs);
 				put_binfmt(fmt);
 				allow_write_access(bprm->file);
 				if (bprm->file)
-- 
cgit v1.2.3


From 71c5576fbd809f2015f4eddf72e501e298720cf3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 9 Dec 2008 13:14:13 -0800
Subject: revert "percpu counter: clean up percpu_counter_sum_and_set()"

Revert

    commit 1f7c14c62ce63805f9574664a6c6de3633d4a354
    Author: Mingming Cao <cmm@us.ibm.com>
    Date:   Thu Oct 9 12:50:59 2008 -0400

        percpu counter: clean up percpu_counter_sum_and_set()

Before this patch we had the following:

percpu_counter_sum(): return the percpu_counter's value

percpu_counter_sum_and_set(): return the percpu_counter's value, copying
that value into the central value and zeroing the per-cpu counters before
returning.

After this patch, percpu_counter_sum_and_set() has gone, and
percpu_counter_sum() gets the old percpu_counter_sum_and_set()
functionality.

Problem is, as Eric points out, the old percpu_counter_sum_and_set()
functionality was racy and wrong.  It zeroes out counters on "other" cpus,
without holding any locks which will prevent races against updates from
those other CPUS.

This patch reverts 1f7c14c62ce63805f9574664a6c6de3633d4a354.  This means
that percpu_counter_sum_and_set() still has the race, but
percpu_counter_sum() does not.

Note that this is not a simple revert - ext4 has since started using
percpu_counter_sum() for its dirty_blocks counter as well.

Note that this revert patch changes percpu_counter_sum() semantics.

Before the patch, a call to percpu_counter_sum() will bring the counter's
central counter mostly up-to-date, so a following percpu_counter_read()
will return a close value.

After this patch, a call to percpu_counter_sum() will leave the counter's
central accumulator unaltered, so a subsequent call to
percpu_counter_read() can now return a significantly inaccurate result.

If there is any code in the tree which was introduced after
e8ced39d5e8911c662d4d69a342b9d053eaaac4e was merged, and which depends
upon the new percpu_counter_sum() semantics, that code will break.

Reported-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mingming Cao <cmm@us.ibm.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/balloc.c               |  4 ++--
 include/linux/percpu_counter.h | 12 +++++++++---
 lib/percpu_counter.c           |  8 +++++---
 3 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2003cdc36aa..c17f69bcd7dd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,8 +609,8 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum(fbc);
-		dirty_blocks = percpu_counter_sum(dbc);
+		free_blocks  = percpu_counter_sum_and_set(fbc);
+		dirty_blocks = percpu_counter_sum_and_set(dbc);
 		if (dirty_blocks < 0) {
 			printk(KERN_CRIT "Dirty block accounting "
 					"went wrong %lld\n",
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccdfc112..208388835357 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc);
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
-	s64 ret = __percpu_counter_sum(fbc);
+	s64 ret = __percpu_counter_sum(fbc, 0);
 	return ret < 0 ? 0 : ret;
 }
 
+static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
+{
+	return __percpu_counter_sum(fbc, 1);
+}
+
+
 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum(fbc);
+	return __percpu_counter_sum(fbc, 0);
 }
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 71b265c330ce..dba1530a5b29 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 {
 	s64 ret;
 	int cpu;
@@ -62,9 +62,11 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
-		*pcount = 0;
+		if (set)
+			*pcount = 0;
 	}
-	fbc->count = ret;
+	if (set)
+		fbc->count = ret;
 
 	spin_unlock(&fbc->lock);
 	return ret;
-- 
cgit v1.2.3


From 02d211688727ad02bb4555b1aa8ae2de16b21b39 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 9 Dec 2008 13:14:14 -0800
Subject: revert "percpu_counter: new function percpu_counter_sum_and_set"

Revert

    commit e8ced39d5e8911c662d4d69a342b9d053eaaac4e
    Author: Mingming Cao <cmm@us.ibm.com>
    Date:   Fri Jul 11 19:27:31 2008 -0400

        percpu_counter: new function percpu_counter_sum_and_set

As described in

	revert "percpu counter: clean up percpu_counter_sum_and_set()"

the new percpu_counter_sum_and_set() is racy against updates to the
cpu-local accumulators on other CPUs.  Revert that change.

This means that ext4 will be slow again.  But correct.

Reported-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mingming Cao <cmm@us.ibm.com>
Cc: <linux-ext4@vger.kernel.org>
Cc: <stable@kernel.org>		[2.6.27.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/balloc.c               |  4 ++--
 include/linux/percpu_counter.h | 12 +++---------
 lib/percpu_counter.c           |  7 +------
 3 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c17f69bcd7dd..db35cfdb3c8b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,8 +609,8 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 
 	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
 						EXT4_FREEBLOCKS_WATERMARK) {
-		free_blocks  = percpu_counter_sum_and_set(fbc);
-		dirty_blocks = percpu_counter_sum_and_set(dbc);
+		free_blocks  = percpu_counter_sum_positive(fbc);
+		dirty_blocks = percpu_counter_sum_positive(dbc);
 		if (dirty_blocks < 0) {
 			printk(KERN_CRIT "Dirty block accounting "
 					"went wrong %lld\n",
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 208388835357..9007ccdfc112 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
+s64 __percpu_counter_sum(struct percpu_counter *fbc);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -44,19 +44,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
-	s64 ret = __percpu_counter_sum(fbc, 0);
+	s64 ret = __percpu_counter_sum(fbc);
 	return ret < 0 ? 0 : ret;
 }
 
-static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
-{
-	return __percpu_counter_sum(fbc, 1);
-}
-
-
 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum(fbc, 0);
+	return __percpu_counter_sum(fbc);
 }
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index dba1530a5b29..b255b939bc1b 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
+s64 __percpu_counter_sum(struct percpu_counter *fbc)
 {
 	s64 ret;
 	int cpu;
@@ -62,12 +62,7 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
-		if (set)
-			*pcount = 0;
 	}
-	if (set)
-		fbc->count = ret;
-
 	spin_unlock(&fbc->lock);
 	return ret;
 }
-- 
cgit v1.2.3


From 49c50342c728344b79c8f9e8293637fe80ef5ad5 Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Tue, 9 Dec 2008 13:14:21 -0800
Subject: pagemap: fix 32-bit pagemap regression

The large pages fix from bcf8039ed45 broke 32-bit pagemap by pulling the
pagemap entry code out into a function with the wrong return type.
Pagemap entries are 64 bits on all systems and unsigned long is only 32
bits on 32-bit systems.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Reported-by: Doug Graham <dgraham@nortel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: <stable@kernel.org>		[2.6.26.x, 2.6.27.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b770c095e45c..3a8bdd7f5756 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -557,9 +557,9 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
 	return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
 }
 
-static unsigned long pte_to_pagemap_entry(pte_t pte)
+static u64 pte_to_pagemap_entry(pte_t pte)
 {
-	unsigned long pme = 0;
+	u64 pme = 0;
 	if (is_swap_pte(pte))
 		pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
 			| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
-- 
cgit v1.2.3


From 6ee5a399d6a92a52646836a6e10faf255c16393e Mon Sep 17 00:00:00 2001
From: Dmitri Monakhov <dmonakhov@openvz.org>
Date: Tue, 9 Dec 2008 13:14:26 -0800
Subject: inotify: fix IN_ONESHOT unmount event watcher

On umount two events will be dispatched to the watcher:

1: inotify_dev_queue_event(.., IN_UNMOUNT,..)
2: remove_watch(watch, dev)
    ->inotify_dev_queue_event(.., IN_IGNORED, ..)

But if the watcher has the IN_ONESHOT bit set then the watcher will be
released inside the first event, which results in accessing an invalid
object later.  IMHO it is not a pure regression.  This bug wasn't triggered
during the initial inotify interface testing phase because of another bug
in the IN_ONESHOT handling logic :)

  commit ac74c00e499ed276a965e5b5600667d5dc04a84a
  Author: Ulisses Furquim <ulissesf@gmail.com>
  Date:   Fri Feb 8 04:18:16 2008 -0800
    inotify: fix check for one-shot watches before destroying them
    As the IN_ONESHOT bit is never set when an event is sent we must check it
    in the watch's mask and not in the event's mask.

TESTCASE:
mkdir mnt
mount -ttmpfs none mnt
mkdir mnt/d
./inotify mnt/d&
umount mnt ## << lockup or crash here

TESTSOURCE:
/* gcc -oinotify inotify.c */
#include <stdio.h>
#include <stdlib.h>
#include <sys/inotify.h>
#include <unistd.h>

/*
 * Reproducer: watch the path given as argv[1] with mask ~0 (every bit
 * set, IN_ONESHOT included), block until the first event arrives, then
 * print the number of bytes read and the mask of the first event.
 *
 * Returns 0 on success, 1 on a usage error or when any call fails.
 */
int main(int argc, char **argv)
{
        char buf[1024];
        struct inotify_event *ie;
        char *p;
        int i;
        ssize_t l;

        /* argv[1] is required; do not pass a NULL path to the kernel. */
        if (argc < 2) {
                fprintf(stderr, "usage: %s <path>\n",
                        argc > 0 ? argv[0] : "inotify");
                return 1;
        }

        p = argv[1];
        i = inotify_init();
        if (i < 0) {
                perror("inotify_init");
                return 1;
        }
        if (inotify_add_watch(i, p, ~0) < 0) {
                perror("inotify_add_watch");
                return 1;
        }

        l = read(i, buf, sizeof(buf));
        if (l < 0) {
                perror("read");
                return 1;
        }
        /* l is an ssize_t, so it has to be printed with %zd. */
        printf("read %zd bytes\n", l);

        /* Only look at the event header if a whole one was returned. */
        if ((size_t) l < sizeof(*ie)) {
                fprintf(stderr, "short read\n");
                return 1;
        }
        ie = (struct inotify_event *) buf;
        printf("event mask: %d\n", ie->mask);
        return 0;
}

Signed-off-by: Dmitri Monakhov <dmonakhov@openvz.org>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Robert Love <rlove@google.com>
Cc: Ulisses Furquim <ulissesf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inotify.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index 7bbed1b89825..dae3f28f30d4 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -428,11 +428,13 @@ void inotify_unmount_inodes(struct list_head *list)
 		watches = &inode->inotify_watches;
 		list_for_each_entry_safe(watch, next_w, watches, i_list) {
 			struct inotify_handle *ih= watch->ih;
+			get_inotify_watch(watch);
 			mutex_lock(&ih->mutex);
 			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
 						 NULL, NULL);
 			inotify_remove_watch_locked(ih, watch);
 			mutex_unlock(&ih->mutex);
+			put_inotify_watch(watch);
 		}
 		mutex_unlock(&inode->inotify_mutex);
 		iput(inode);		
-- 
cgit v1.2.3


From 9c24624727f6d6c460e45762a408ca5f5b9b8ef2 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 9 Dec 2008 13:14:27 -0800
Subject: KSYM_SYMBOL_LEN fixes

Miles Lane tailing /sys files hit a BUG which Pekka Enberg has tracked
to my 966c8c12dc9e77f931e2281ba25d2f0244b06949 sprint_symbol(): use
less stack exposing a bug in slub's list_locations() -
kallsyms_lookup() writes a 0 to namebuf[KSYM_NAME_LEN-1], but that was
beyond the end of page provided.

The 100 slop which list_locations() allows at end of page looks roughly
enough for all the other stuff it might print after the symbol before
it checks again: break out KSYM_SYMBOL_LEN earlier than before.

Latencytop and ftrace are using KSYM_NAME_LEN buffers where they
need KSYM_SYMBOL_LEN buffers, and vmallocinfo a 2*KSYM_NAME_LEN buffer
where it wants a KSYM_SYMBOL_LEN buffer: fix those before anyone copies
them.

[akpm@linux-foundation.org: ftrace.h needs module.h]
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Miles Lane <miles.lane@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c         | 2 +-
 include/linux/ftrace.h | 3 ++-
 kernel/latencytop.c    | 2 +-
 mm/slub.c              | 2 +-
 mm/vmalloc.c           | 2 +-
 5 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3fe7139..d4677603c889 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -371,7 +371,7 @@ static int lstats_show_proc(struct seq_file *m, void *v)
 				task->latency_record[i].time,
 				task->latency_record[i].max);
 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-				char sym[KSYM_NAME_LEN];
+				char sym[KSYM_SYMBOL_LEN];
 				char *c;
 				if (!task->latency_record[i].backtrace[q])
 					break;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 703eb53cfa2b..9c5bc6be2b09 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -6,6 +6,7 @@
 #include <linux/ktime.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kallsyms.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
@@ -231,7 +232,7 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { }
 
 struct boot_trace {
 	pid_t			caller;
-	char			func[KSYM_NAME_LEN];
+	char			func[KSYM_SYMBOL_LEN];
 	int			result;
 	unsigned long long	duration;		/* usecs */
 	ktime_t			calltime;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 5e7b45c56923..449db466bdbc 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v)
 				latency_record[i].time,
 				latency_record[i].max);
 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-				char sym[KSYM_NAME_LEN];
+				char sym[KSYM_SYMBOL_LEN];
 				char *c;
 				if (!latency_record[i].backtrace[q])
 					break;
diff --git a/mm/slub.c b/mm/slub.c
index 749588a50a5a..a2cd47d89e0a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3597,7 +3597,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
 	for (i = 0; i < t.count; i++) {
 		struct location *l = &t.loc[i];
 
-		if (len > PAGE_SIZE - 100)
+		if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
 			break;
 		len += sprintf(buf + len, "%7ld ", l->count);
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f3f6e0758562..1ddb77ba3995 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1717,7 +1717,7 @@ static int s_show(struct seq_file *m, void *p)
 		v->addr, v->addr + v->size, v->size);
 
 	if (v->caller) {
-		char buff[2 * KSYM_NAME_LEN];
+		char buff[KSYM_SYMBOL_LEN];
 
 		seq_putc(m, ' ');
 		sprint_symbol(buff, (unsigned long)v->caller);
-- 
cgit v1.2.3