From a1de02dccf906faba2ee2d99cac56799bda3b96a Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 4 Feb 2010 23:58:38 -0500
Subject: ext4: fix async i/o writes beyond 4GB to a sparse file

The "offset" member in ext4_io_end holds bytes, not blocks, so
ext4_lblk_t is wrong - and too small (u32).

This caused the async i/o writes to sparse files beyond 4GB to fail
when they wrapped around to 0.

Also fix up the type of arguments to ext4_convert_unwritten_extents(),
it gets ssize_t from ext4_end_aio_dio_nolock() and
ext4_ext_direct_IO().

Reported-by: Giel de Nijs <giel@vectorwise.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/ext4/ext4.h    | 6 +++---
 fs/ext4/extents.c | 2 +-
 fs/ext4/inode.c   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 874d169a193e..602d5ad6f5e7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -139,8 +139,8 @@ typedef struct ext4_io_end {
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
 	int			error;		/* I/O error code */
-	ext4_lblk_t		offset;		/* offset in the file */
-	size_t			size;		/* size of the extent */
+	loff_t			offset;		/* offset in the file */
+	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 } ext4_io_end_t;
 
@@ -1744,7 +1744,7 @@ extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-			  loff_t len);
+			  ssize_t len);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
 			   struct buffer_head *bh, int flags);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 765a4826b118..c56877972b0e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3603,7 +3603,7 @@ retry:
  * Returns 0 on success.
  */
 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    loff_t len)
+				    ssize_t len)
 {
 	handle_t *handle;
 	ext4_lblk_t block;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e11952404e02..2059c34ac4c8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3551,7 +3551,7 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
 {
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
-	size_t size = io->size;
+	ssize_t size = io->size;
 	int ret = 0;
 
 	ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
-- 
cgit v1.2.3


From 15121c18a22ae483279f76dc9e554334b800d0f7 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 15 Feb 2010 20:17:55 -0500
Subject: ext4: Fix optional-arg mount options

We have 2 mount options, "barrier" and "auto_da_alloc" which may or
may not take a 1/0 argument.  This causes the ext4 superblock mount
code to subtract uninitialized pointers and pass the result to
kmalloc, which results in very noisy failures.

Per Ted's suggestion, initialize the args struct so that
we know whether match_token() found an argument for the
option, and skip match_int() if not.

Also, return error (0) from parse_options if we thought
we found an argument, but match_int() Fails.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 735c20d5fd56..68a55dffb360 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1229,6 +1229,11 @@ static int parse_options(char *options, struct super_block *sb,
 		if (!*p)
 			continue;
 
+		/*
+		 * Initialize args struct so we know whether arg was
+		 * found; some options take optional arguments.
+		 */
+		args[0].to = args[0].from = 0;
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
@@ -1518,10 +1523,11 @@ set_qf_format:
 			clear_opt(sbi->s_mount_opt, BARRIER);
 			break;
 		case Opt_barrier:
-			if (match_int(&args[0], &option)) {
-				set_opt(sbi->s_mount_opt, BARRIER);
-				break;
-			}
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				set_opt(sbi->s_mount_opt, BARRIER);
 			else
@@ -1594,10 +1600,11 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
 		case Opt_auto_da_alloc:
-			if (match_int(&args[0], &option)) {
-				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
-				break;
-			}
+			if (args[0].from) {
+				if (match_int(&args[0], &option))
+					return 0;
+			} else
+				option = 1;	/* No argument, default to 1 */
 			if (option)
 				clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
 			else
-- 
cgit v1.2.3


From 1f2acb6017d8528135ec3b01ab7cd2be6ea0630b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 22 Jan 2010 17:40:42 -0500
Subject: ext4: Add block validity check when truncating indirect block mapped
 inodes

Add checks to ext4_free_branches() to make sure a block number found
in an indirect block are valid before trying to free it.  If a bad
block number is found, stop freeing the indirect block immediately,
since the file system is corrupt and we will need to run fsck anyway.
This also avoids spamming the logs, and specifically avoids
driver-level "attempt to access beyond end of device" errors obscure
what is really going on.

If you get *really*, *really*, *really* unlucky, without this patch, a
supposed indirect block containing garbage might contain a reference
to a primary block group descriptor, in which case
ext4_free_branches() could end up zero'ing out a block group
descriptor block, and if then one of the block bitmaps for a block
group described by that bg descriptor block is not in memory, and is
read in by ext4_read_block_bitmap().  This function calls
ext4_valid_block_bitmap(), which assumes that bg_inode_table() was
validated at mount time and hasn't been modified since.  Since this
assumption is no longer valid, it's possible for the value
(ext4_inode_table(sb, desc) - group_first_block) to go negative, which
will cause ext4_find_next_zero_bit() to trigger a kernel GPF.

Addresses-Google-Bug: #2220436

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/inode.c   | 39 ++++++++++++++++++++++++++++++---------
 fs/ext4/mballoc.c |  7 ++++---
 3 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 602d5ad6f5e7..307ecd13a762 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -377,6 +377,7 @@ struct ext4_new_group_data {
  */
 #define EXT4_FREE_BLOCKS_METADATA	0x0001
 #define EXT4_FREE_BLOCKS_FORGET		0x0002
+#define EXT4_FREE_BLOCKS_VALIDATED	0x0004
 
 /*
  * ioctl commands
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2059c34ac4c8..3e8afd969236 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4130,18 +4130,27 @@ no_top:
  * We release `count' blocks on disk, but (last - first) may be greater
  * than `count' because there can be holes in there.
  */
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
-			      struct buffer_head *bh,
-			      ext4_fsblk_t block_to_free,
-			      unsigned long count, __le32 *first,
-			      __le32 *last)
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+			     struct buffer_head *bh,
+			     ext4_fsblk_t block_to_free,
+			     unsigned long count, __le32 *first,
+			     __le32 *last)
 {
 	__le32 *p;
-	int	flags = EXT4_FREE_BLOCKS_FORGET;
+	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
 		flags |= EXT4_FREE_BLOCKS_METADATA;
 
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+				   count)) {
+		ext4_error(inode->i_sb, __func__, "inode #%lu: "
+			   "attempt to clear blocks %llu len %lu, invalid",
+			   inode->i_ino, (unsigned long long) block_to_free,
+			   count);
+		return 1;
+	}
+
 	if (try_to_extend_transaction(handle, inode)) {
 		if (bh) {
 			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4160,6 +4169,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 		*p = 0;
 
 	ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
+	return 0;
 }
 
 /**
@@ -4215,9 +4225,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 			} else if (nr == block_to_free + count) {
 				count++;
 			} else {
-				ext4_clear_blocks(handle, inode, this_bh,
-						  block_to_free,
-						  count, block_to_free_p, p);
+				if (ext4_clear_blocks(handle, inode, this_bh,
+						      block_to_free, count,
+						      block_to_free_p, p))
+					break;
 				block_to_free = nr;
 				block_to_free_p = p;
 				count = 1;
@@ -4281,6 +4292,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			if (!nr)
 				continue;		/* A hole */
 
+			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+						   nr, 1)) {
+				ext4_error(inode->i_sb, __func__,
+					   "indirect mapped block in inode "
+					   "#%lu invalid (level %d, blk #%lu)",
+					   inode->i_ino, depth,
+					   (unsigned long) nr);
+				break;
+			}
+
 			/* Go read the buffer for the next level down */
 			bh = sb_bread(inode->i_sb, nr);
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d34afad3e137..d129c1039f1d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4476,10 +4476,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 
 	sbi = EXT4_SB(sb);
 	es = EXT4_SB(sb)->s_es;
-	if (!ext4_data_block_valid(sbi, block, count)) {
+	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+	    !ext4_data_block_valid(sbi, block, count)) {
 		ext4_error(sb, __func__,
-			    "Freeing blocks not in datazone - "
-			    "block = %llu, count = %lu", block, count);
+			   "Freeing blocks not in datazone - "
+			   "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
 
-- 
cgit v1.2.3


From 71f2be213a0009098819e5c04f75ff19f84f2122 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 23 Dec 2009 07:45:44 -0500
Subject: ext4: Add new tracepoint for jbd2_cleanup_journal_tail

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c        |  1 +
 include/trace/events/jbd2.h | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 886849370950..30beb11ef928 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -507,6 +507,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	if (blocknr < journal->j_tail)
 		freed = freed + journal->j_last - journal->j_first;
 
+	trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
 	jbd_debug(1,
 		  "Cleaning journal tail from %d to %d (offset %lu), "
 		  "freeing %lu\n",
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index 96b370a050de..bf16545cc977 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -199,6 +199,34 @@ TRACE_EVENT(jbd2_checkpoint_stats,
 		  __entry->forced_to_close, __entry->written, __entry->dropped)
 );
 
+TRACE_EVENT(jbd2_cleanup_journal_tail,
+
+	TP_PROTO(journal_t *journal, tid_t first_tid,
+		 unsigned long block_nr, unsigned long freed),
+
+	TP_ARGS(journal, first_tid, block_nr, freed),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	tid_t,	tail_sequence		)
+		__field(	tid_t,	first_tid		)
+		__field(unsigned long,	block_nr		)
+		__field(unsigned long,	freed			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= journal->j_fs_dev->bd_dev;
+		__entry->tail_sequence	= journal->j_tail_sequence;
+		__entry->first_tid	= first_tid;
+		__entry->block_nr	= block_nr;
+		__entry->freed		= freed;
+	),
+
+	TP_printk("dev %s from %u to %u offset %lu freed %lu",
+		  jbd2_dev_to_name(__entry->dev), __entry->tail_sequence,
+		  __entry->first_tid, __entry->block_nr, __entry->freed)
+);
+
 #endif /* _TRACE_JBD2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From f8ec9d6837241865cf99bed97bb99f4399fd5a03 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 Jan 2010 01:00:21 -0500
Subject: ext4: Add new tracepoints to debug delayed allocation space functions

Add tracepoints for ext4_da_reserve_space(),
ext4_da_update_reserve_space(), and ext4_da_release_space().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c             |   2 +
 include/trace/events/ext4.h | 101 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e8afd969236..1a3d7b232cd7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1061,6 +1061,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
 	int mdb_free = 0, allocated_meta_blocks = 0;
 
 	spin_lock(&ei->i_block_reservation_lock);
+	trace_ext4_da_update_reserve_space(inode, used);
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
 		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
 			 "with only %d reserved data blocks\n",
@@ -1846,6 +1847,7 @@ repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	md_reserved = ei->i_reserved_meta_blocks;
 	md_needed = ext4_calc_metadata_amount(inode, lblock);
+	trace_ext4_da_reserve_space(inode, md_needed);
 	spin_unlock(&ei->i_block_reservation_lock);
 
 	/*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index d0b6cd3afb2f..2aa6aa3e8f61 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -874,6 +874,107 @@ TRACE_EVENT(ext4_forget,
 		  __entry->mode, __entry->is_metadata, __entry->block)
 );
 
+TRACE_EVENT(ext4_da_update_reserve_space,
+	TP_PROTO(struct inode *inode, int used_blocks),
+
+	TP_ARGS(inode, used_blocks),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	umode_t, mode			)
+		__field(	__u64,	i_blocks		)
+		__field(	int,	used_blocks		)
+		__field(	int,	reserved_data_blocks	)
+		__field(	int,	reserved_meta_blocks	)
+		__field(	int,	allocated_meta_blocks	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->mode	= inode->i_mode;
+		__entry->i_blocks = inode->i_blocks;
+		__entry->used_blocks = used_blocks;
+		__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+		__entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
+		__entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
+	),
+
+	TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
+		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+		  __entry->mode,  (unsigned long long) __entry->i_blocks,
+		  __entry->used_blocks, __entry->reserved_data_blocks,
+		  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
+);
+
+TRACE_EVENT(ext4_da_reserve_space,
+	TP_PROTO(struct inode *inode, int md_needed),
+
+	TP_ARGS(inode, md_needed),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	umode_t, mode			)
+		__field(	__u64,	i_blocks		)
+		__field(	int,	md_needed		)
+		__field(	int,	reserved_data_blocks	)
+		__field(	int,	reserved_meta_blocks	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->mode	= inode->i_mode;
+		__entry->i_blocks = inode->i_blocks;
+		__entry->md_needed = md_needed;
+		__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+		__entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
+	),
+
+	TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d",
+		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+		  __entry->mode, (unsigned long long) __entry->i_blocks,
+		  __entry->md_needed, __entry->reserved_data_blocks,
+		  __entry->reserved_meta_blocks)
+);
+
+TRACE_EVENT(ext4_da_release_space,
+	TP_PROTO(struct inode *inode, int freed_blocks),
+
+	TP_ARGS(inode, freed_blocks),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	umode_t, mode			)
+		__field(	__u64,	i_blocks		)
+		__field(	int,	freed_blocks		)
+		__field(	int,	reserved_data_blocks	)
+		__field(	int,	reserved_meta_blocks	)
+		__field(	int,	allocated_meta_blocks	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->mode	= inode->i_mode;
+		__entry->i_blocks = inode->i_blocks;
+		__entry->freed_blocks = freed_blocks;
+		__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+		__entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
+		__entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
+	),
+
+	TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
+		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+		  __entry->mode, (unsigned long long) __entry->i_blocks,
+		  __entry->freed_blocks, __entry->reserved_data_blocks,
+		  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
+);
+
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From d2eecb03936878ec574ade5532fa83df7d75dde7 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 7 Dec 2009 10:36:20 -0500
Subject: ext4: Use slab allocator for sub-page sized allocations

Now that the SLUB seems to be fixed so that it respects the requested
alignment, use kmem_cache_alloc() to allocator if the block size of
the buffer heads to be allocated is less than the page size.
Previously, we were using 16k page on a Power system for each buffer,
even when the file system was using 1k or 4k block size.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c    | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/jbd2.h |  11 +----
 2 files changed, 134 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ac0d027595d0..c03d4dce4d76 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -39,6 +39,8 @@
 #include <linux/seq_file.h>
 #include <linux/math64.h>
 #include <linux/hash.h>
+#include <linux/log2.h>
+#include <linux/vmalloc.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -93,6 +95,7 @@ EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_slab(size_t slab_size);
 
 /*
  * Helper function used to manage commit timeouts
@@ -1248,6 +1251,13 @@ int jbd2_journal_load(journal_t *journal)
 		}
 	}
 
+	/*
+	 * Create a slab for this blocksize
+	 */
+	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
+	if (err)
+		return err;
+
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
 	if (jbd2_journal_recover(journal))
@@ -1806,6 +1816,127 @@ size_t journal_tag_bytes(journal_t *journal)
 		return JBD2_TAG_SIZE32;
 }
 
+/*
+ * JBD memory management
+ *
+ * These functions are used to allocate block-sized chunks of memory
+ * used for making copies of buffer_head data.  Very often it will be
+ * page-sized chunks of data, but sometimes it will be in
+ * sub-page-size chunks.  (For example, 16k pages on Power systems
+ * with a 4k block file system.)  For blocks smaller than a page, we
+ * use a SLAB allocator.  There are slab caches for each block size,
+ * which are allocated at mount time, if necessary, and we only free
+ * (all of) the slab caches when/if the jbd2 module is unloaded.  For
+ * this reason we don't need to a mutex to protect access to
+ * jbd2_slab[] allocating or releasing memory; only in
+ * jbd2_journal_create_slab().
+ */
+#define JBD2_MAX_SLABS 8
+static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];
+static DECLARE_MUTEX(jbd2_slab_create_sem);
+
+static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
+	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
+	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
+};
+
+
+static void jbd2_journal_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < JBD2_MAX_SLABS; i++) {
+		if (jbd2_slab[i])
+			kmem_cache_destroy(jbd2_slab[i]);
+		jbd2_slab[i] = NULL;
+	}
+}
+
+static int jbd2_journal_create_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+	size_t slab_size;
+
+	if (size == PAGE_SIZE)
+		return 0;
+
+	if (i >= JBD2_MAX_SLABS)
+		return -EINVAL;
+
+	if (unlikely(i < 0))
+		i = 0;
+	down(&jbd2_slab_create_sem);
+	if (jbd2_slab[i]) {
+		up(&jbd2_slab_create_sem);
+		return 0;	/* Already created */
+	}
+
+	slab_size = 1 << (i+10);
+	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
+					 slab_size, 0, NULL);
+	up(&jbd2_slab_create_sem);
+	if (!jbd2_slab[i]) {
+		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static struct kmem_cache *get_slab(size_t size)
+{
+	int i = order_base_2(size) - 10;
+
+	BUG_ON(i >= JBD2_MAX_SLABS);
+	if (unlikely(i < 0))
+		i = 0;
+	BUG_ON(jbd2_slab[i] == 0);
+	return jbd2_slab[i];
+}
+
+void *jbd2_alloc(size_t size, gfp_t flags)
+{
+	void *ptr;
+
+	BUG_ON(size & (size-1)); /* Must be a power of 2 */
+
+	flags |= __GFP_REPEAT;
+	if (size == PAGE_SIZE)
+		ptr = (void *)__get_free_pages(flags, 0);
+	else if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			ptr = (void *)__get_free_pages(flags, order);
+		else
+			ptr = vmalloc(size);
+	} else
+		ptr = kmem_cache_alloc(get_slab(size), flags);
+
+	/* Check alignment; SLUB has gotten this wrong in the past,
+	 * and this can lead to user data corruption! */
+	BUG_ON(((unsigned long) ptr) & (size-1));
+
+	return ptr;
+}
+
+void jbd2_free(void *ptr, size_t size)
+{
+	if (size == PAGE_SIZE) {
+		free_pages((unsigned long)ptr, 0);
+		return;
+	}
+	if (size > PAGE_SIZE) {
+		int order = get_order(size);
+
+		if (order < 3)
+			free_pages((unsigned long)ptr, order);
+		else
+			vfree(ptr);
+		return;
+	}
+	kmem_cache_free(get_slab(size), ptr);
+};
+
 /*
  * Journal_head storage management
  */
@@ -2204,6 +2335,7 @@ static void jbd2_journal_destroy_caches(void)
 	jbd2_journal_destroy_revoke_caches();
 	jbd2_journal_destroy_jbd2_journal_head_cache();
 	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_slabs();
 }
 
 static int __init journal_init(void)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 638ce4554c76..8ada2a129d08 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -69,15 +69,8 @@ extern u8 jbd2_journal_enable_debug;
 #define jbd_debug(f, a...)	/**/
 #endif
 
-static inline void *jbd2_alloc(size_t size, gfp_t flags)
-{
-	return (void *)__get_free_pages(flags, get_order(size));
-}
-
-static inline void jbd2_free(void *ptr, size_t size)
-{
-	free_pages((unsigned long)ptr, get_order(size));
-};
+extern void *jbd2_alloc(size_t size, gfp_t flags);
+extern void jbd2_free(void *ptr, size_t size);
 
 #define JBD2_MIN_JOURNAL_BLOCKS 1024
 
-- 
cgit v1.2.3


From 6a5fa2362b628ee950080bef8895a6fb62f58ab4 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 1 Jan 2010 01:28:43 +0000
Subject: [CIFS] Add support for TCP_NODELAY

mount option sockopt=TCP_NODELAY helpful for faster networks
boosting performance.  Kernel bugzilla bug number 14032.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES    |  4 ++++
 fs/cifs/cifsfs.h   |  2 +-
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/connect.c  | 30 ++++++++++++++++++++++++++----
 4 files changed, 32 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 7b2600b380d7..49503d2edc7e 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,7 @@
+Version 1.62
+------------
+Add sockopt=TCP_NODELAY mount option.
+
 Version 1.61
 ------------
 Fix append problem to Samba servers (files opened with O_APPEND could
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index ac2b24c192f8..78c1b86d55f6 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.61"
+#define CIFS_VERSION   "1.62"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4b35f7ec0583..ed751bb657db 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -149,6 +149,7 @@ struct TCP_Server_Info {
 	bool svlocal:1;			/* local server or remote */
 	bool noblocksnd;		/* use blocking sendmsg */
 	bool noautotune;		/* do not autotune send buf sizes */
+	bool tcp_nodelay;
 	atomic_t inFlight;  /* number of requests on the wire to server */
 #ifdef CONFIG_CIFS_STATS2
 	atomic_t inSend; /* requests trying to send */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3bbcaa716b3c..2e9e09ca0e30 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -98,7 +98,7 @@ struct smb_vol {
 	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
 	unsigned int rsize;
 	unsigned int wsize;
-	unsigned int sockopt;
+	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
 	char *prepath;
 };
@@ -1142,9 +1142,11 @@ cifs_parse_mount_options(char *options, const char *devname,
 					simple_strtoul(value, &value, 0);
 			}
 		} else if (strnicmp(data, "sockopt", 5) == 0) {
-			if (value && *value) {
-				vol->sockopt =
-					simple_strtoul(value, &value, 0);
+			if (!value || !*value) {
+				cERROR(1, ("no socket option specified"));
+				continue;
+			} else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
+				vol->sockopt_tcp_nodelay = 1;
 			}
 		} else if (strnicmp(data, "netbiosname", 4) == 0) {
 			if (!value || !*value || (*value == ' ')) {
@@ -1514,6 +1516,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	tcp_ses->noblocksnd = volume_info->noblocksnd;
 	tcp_ses->noautotune = volume_info->noautotune;
+	tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
 	atomic_set(&tcp_ses->inFlight, 0);
 	init_waitqueue_head(&tcp_ses->response_q);
 	init_waitqueue_head(&tcp_ses->request_q);
@@ -1764,6 +1767,7 @@ static int
 ipv4_connect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
+	int val;
 	bool connected = false;
 	__be16 orig_port = 0;
 	struct socket *socket = server->ssocket;
@@ -1845,6 +1849,14 @@ ipv4_connect(struct TCP_Server_Info *server)
 			socket->sk->sk_rcvbuf = 140 * 1024;
 	}
 
+	if (server->tcp_nodelay) {
+		val = 1;
+		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
+				(char *)&val, sizeof(val));
+		if (rc)
+			cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
+	}
+
 	 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
 		 socket->sk->sk_sndbuf,
 		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
@@ -1916,6 +1928,7 @@ static int
 ipv6_connect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
+	int val;
 	bool connected = false;
 	__be16 orig_port = 0;
 	struct socket *socket = server->ssocket;
@@ -1987,6 +2000,15 @@ ipv6_connect(struct TCP_Server_Info *server)
 	 */
 	socket->sk->sk_rcvtimeo = 7 * HZ;
 	socket->sk->sk_sndtimeo = 5 * HZ;
+
+	if (server->tcp_nodelay) {
+		val = 1;
+		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
+				(char *)&val, sizeof(val));
+		if (rc)
+			cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
+	}
+
 	server->ssocket = socket;
 
 	return rc;
-- 
cgit v1.2.3


From e03a72e13648ac6277bf2bab6b8324d51f89c0fa Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Mon, 11 Jan 2010 03:21:51 -0500
Subject: block: Stop using byte offsets

All callers of the stacking functions use 512-byte sector units rather
than byte offsets.  Simplify the code so the stacking functions take
sectors when specifying data offsets.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   | 26 +++++++++-----------------
 fs/partitions/check.c  |  7 ++++---
 include/linux/blkdev.h | 17 +++++------------
 3 files changed, 18 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5eeb9e0d256e..78549c723783 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -507,7 +507,7 @@ static unsigned int lcm(unsigned int a, unsigned int b)
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top device)
  * @b:  the underlying queue limits (bottom, component device)
- * @offset:  offset to beginning of data within component device
+ * @start:  first data sector within component device
  *
  * Description:
  *    This function is used by stacking drivers like MD and DM to ensure
@@ -525,10 +525,9 @@ static unsigned int lcm(unsigned int a, unsigned int b)
  *    the alignment_offset is undefined.
  */
 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
-		     sector_t offset)
+		     sector_t start)
 {
-	sector_t alignment;
-	unsigned int top, bottom, ret = 0;
+	unsigned int top, bottom, alignment, ret = 0;
 
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
@@ -548,7 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	t->misaligned |= b->misaligned;
 
-	alignment = queue_limit_alignment_offset(b, offset);
+	alignment = queue_limit_alignment_offset(b, start);
 
 	/* Bottom device has different alignment.  Check that it is
 	 * compatible with the current top alignment.
@@ -611,11 +610,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	/* Discard alignment and granularity */
 	if (b->discard_granularity) {
-		unsigned int granularity = b->discard_granularity;
-		offset &= granularity - 1;
-
-		alignment = (granularity + b->discard_alignment - offset)
-			& (granularity - 1);
+		alignment = queue_limit_discard_alignment(b, start);
 
 		if (t->discard_granularity != 0 &&
 		    t->discard_alignment != alignment) {
@@ -657,7 +652,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
 
 	start += get_start_sect(bdev);
 
-	return blk_stack_limits(t, &bq->limits, start << 9);
+	return blk_stack_limits(t, &bq->limits, start);
 }
 EXPORT_SYMBOL(bdev_stack_limits);
 
@@ -668,9 +663,8 @@ EXPORT_SYMBOL(bdev_stack_limits);
  * @offset:  offset to beginning of data within component device
  *
  * Description:
- *    Merges the limits for two queues.  Returns 0 if alignment
- *    didn't change.  Returns -1 if adding the bottom device caused
- *    misalignment.
+ *    Merges the limits for a top level gendisk and a bottom level
+ *    block_device.
  */
 void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
 		       sector_t offset)
@@ -678,9 +672,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
 	struct request_queue *t = disk->queue;
 	struct request_queue *b = bdev_get_queue(bdev);
 
-	offset += get_start_sect(bdev) << 9;
-
-	if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) {
+	if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
 		char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
 
 		disk_name(disk, 0, top);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 64bc8998ac9a..e8865c11777f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -412,9 +412,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	pdev = part_to_dev(p);
 
 	p->start_sect = start;
-	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
-	p->discard_alignment = queue_sector_discard_alignment(disk->queue,
-							      start);
+	p->alignment_offset =
+		queue_limit_alignment_offset(&disk->queue->limits, start);
+	p->discard_alignment =
+		queue_limit_discard_alignment(&disk->queue->limits, start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c8018977efa..ffb13ad35716 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1112,18 +1112,13 @@ static inline int queue_alignment_offset(struct request_queue *q)
 	return q->limits.alignment_offset;
 }
 
-static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t offset)
+static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector)
 {
 	unsigned int granularity = max(lim->physical_block_size, lim->io_min);
+	unsigned int alignment = (sector << 9) & (granularity - 1);
 
-	offset &= granularity - 1;
-	return (granularity + lim->alignment_offset - offset) & (granularity - 1);
-}
-
-static inline int queue_sector_alignment_offset(struct request_queue *q,
-						sector_t sector)
-{
-	return queue_limit_alignment_offset(&q->limits, sector << 9);
+	return (granularity + lim->alignment_offset - alignment)
+		& (granularity - 1);
 }
 
 static inline int bdev_alignment_offset(struct block_device *bdev)
@@ -1147,10 +1142,8 @@ static inline int queue_discard_alignment(struct request_queue *q)
 	return q->limits.discard_alignment;
 }
 
-static inline int queue_sector_discard_alignment(struct request_queue *q,
-						 sector_t sector)
+static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector)
 {
-	struct queue_limits *lim = &q->limits;
 	unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1);
 
 	return (lim->discard_granularity + lim->discard_alignment - alignment)
-- 
cgit v1.2.3


From 3a85cd96d3ab3c6dcf88b81fc6eaddb84e565a43 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 14 Jan 2010 01:33:55 +0000
Subject: xfs: add tracing to xfs_swap_extents

To be able to diagnose whether the swap extents function is
detecting compatible inode data fork configurations for swapping
extents, add tracing points to the code to allow us to see the
format of the inode forks before and after the swap.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_trace.h | 53 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_dfrag.c           |  5 +++++
 2 files changed, 58 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index c22a608321a3..3353aef50530 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1414,6 +1414,59 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
 		  __entry->count)
 );
 
+#define XFS_SWAPEXT_INODES \
+	{ 0,	"target" }, \
+	{ 1,	"temp" }
+
+#define XFS_INODE_FORMAT_STR \
+	{ 0,	"invalid" }, \
+	{ 1,	"local" }, \
+	{ 2,	"extent" }, \
+	{ 3,	"btree" }
+
+DECLARE_EVENT_CLASS(xfs_swap_extent_class,
+	TP_PROTO(struct xfs_inode *ip, int which),
+	TP_ARGS(ip, which),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(int, which)
+		__field(xfs_ino_t, ino)
+		__field(int, format)
+		__field(int, nex)
+		__field(int, max_nex)
+		__field(int, broot_size)
+		__field(int, fork_off)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->which = which;
+		__entry->ino = ip->i_ino;
+		__entry->format = ip->i_d.di_format;
+		__entry->nex = ip->i_d.di_nextents;
+		__entry->max_nex = ip->i_df.if_ext_max;
+		__entry->broot_size = ip->i_df.if_broot_bytes;
+		__entry->fork_off = XFS_IFORK_BOFF(ip);
+	),
+	TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+		  "Max in-fork extents %d, broot size %d, fork offset %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
+		  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+		  __entry->nex,
+		  __entry->max_nex,
+		  __entry->broot_size,
+		  __entry->fork_off)
+)
+
+#define DEFINE_SWAPEXT_EVENT(name) \
+DEFINE_EVENT(xfs_swap_extent_class, name, \
+	TP_PROTO(struct xfs_inode *ip, int which), \
+	TP_ARGS(ip, which))
+
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 84ca1cf16a1e..f25e54027d10 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -254,6 +254,9 @@ xfs_swap_extents(
 		goto out_unlock;
 	}
 
+	trace_xfs_swap_extent_before(ip, 0);
+	trace_xfs_swap_extent_before(tip, 1);
+
 	/* check inode formats now that data is flushed */
 	error = xfs_swap_extents_check_format(ip, tip);
 	if (error) {
@@ -421,6 +424,8 @@ xfs_swap_extents(
 
 	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT);
 
+	trace_xfs_swap_extent_after(ip, 0);
+	trace_xfs_swap_extent_after(tip, 1);
 out:
 	kmem_free(tempifp);
 	return error;
-- 
cgit v1.2.3


From 6bded0f383fd7971b76ad6c194dda7d5b814b871 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 14 Jan 2010 01:33:56 +0000
Subject: xfs: clean up inconsistent variable naming in xfs_swap_extent

The swap extent ioctl passes in a target inode and a temporary inode
which are clearly named in the ioctl structure. The code then
assigns temp to target and vice versa, making it extremely difficult
to work out which inode is which later in the code.  Make this
consistent throughout the code.

Also make xfs_swap_extent static as there are no external users of
the function.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_dfrag.c | 38 ++++++++++++++++++++++----------------
 fs/xfs/xfs_dfrag.h |  3 ---
 2 files changed, 22 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index f25e54027d10..cd27c9d6c71f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -45,15 +45,21 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
+
+static int xfs_swap_extents(
+	xfs_inode_t	*ip,	/* target inode */
+	xfs_inode_t	*tip,	/* tmp inode */
+	xfs_swapext_t	*sxp);
+
 /*
- * Syssgi interface for swapext
+ * ioctl interface for swapext
  */
 int
 xfs_swapext(
 	xfs_swapext_t	*sxp)
 {
 	xfs_inode_t     *ip, *tip;
-	struct file	*file, *target_file;
+	struct file	*file, *tmp_file;
 	int		error = 0;
 
 	/* Pull information for the target fd */
@@ -68,46 +74,46 @@ xfs_swapext(
 		goto out_put_file;
 	}
 
-	target_file = fget((int)sxp->sx_fdtmp);
-	if (!target_file) {
+	tmp_file = fget((int)sxp->sx_fdtmp);
+	if (!tmp_file) {
 		error = XFS_ERROR(EINVAL);
 		goto out_put_file;
 	}
 
-	if (!(target_file->f_mode & FMODE_WRITE) ||
-	    (target_file->f_flags & O_APPEND)) {
+	if (!(tmp_file->f_mode & FMODE_WRITE) ||
+	    (tmp_file->f_flags & O_APPEND)) {
 		error = XFS_ERROR(EBADF);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
-	    IS_SWAPFILE(target_file->f_path.dentry->d_inode)) {
+	    IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	ip = XFS_I(file->f_path.dentry->d_inode);
-	tip = XFS_I(target_file->f_path.dentry->d_inode);
+	tip = XFS_I(tmp_file->f_path.dentry->d_inode);
 
 	if (ip->i_mount != tip->i_mount) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (ip->i_ino == tip->i_ino) {
 		error = XFS_ERROR(EINVAL);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		error = XFS_ERROR(EIO);
-		goto out_put_target_file;
+		goto out_put_tmp_file;
 	}
 
 	error = xfs_swap_extents(ip, tip, sxp);
 
- out_put_target_file:
-	fput(target_file);
+ out_put_tmp_file:
+	fput(tmp_file);
  out_put_file:
 	fput(file);
  out:
@@ -186,7 +192,7 @@ xfs_swap_extents_check_format(
 	return 0;
 }
 
-int
+static int
 xfs_swap_extents(
 	xfs_inode_t	*ip,	/* target inode */
 	xfs_inode_t	*tip,	/* tmp inode */
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index 4f55a6306558..20bdd935c121 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,9 +48,6 @@ typedef struct xfs_swapext
  */
 int	xfs_swapext(struct xfs_swapext *sx);
 
-int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
-		struct xfs_swapext *sxp);
-
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_DFRAG_H__ */
-- 
cgit v1.2.3


From 5d77c0dc0c05c2c65aee16149fae06831a118730 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 19 Nov 2009 15:52:00 +0000
Subject: xfs: make several more functions static

Just minor housekeeping, a lot more functions can be trivially made
static; others could if we reordered things a bit...

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 2 +-
 fs/xfs/linux-2.6/xfs_sync.h | 1 -
 fs/xfs/xfs_attr.c           | 2 +-
 fs/xfs/xfs_attr.h           | 1 -
 fs/xfs/xfs_bmap_btree.c     | 2 +-
 fs/xfs/xfs_bmap_btree.h     | 1 -
 fs/xfs/xfs_dir2_node.c      | 2 +-
 fs/xfs/xfs_dir2_node.h      | 2 --
 fs/xfs/xfs_log_priv.h       | 5 -----
 fs/xfs/xfs_log_recover.c    | 6 +++---
 10 files changed, 7 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 1f5e4bb5e970..0f90bfe2815f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -351,7 +351,7 @@ xfs_commit_dummy_trans(
 	return error;
 }
 
-int
+STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp,
 	int			flags)
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index ea932b43335d..d480c346cabb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -37,7 +37,6 @@ void xfs_syncd_stop(struct xfs_mount *mp);
 
 int xfs_sync_attr(struct xfs_mount *mp, int flags);
 int xfs_sync_data(struct xfs_mount *mp, int flags);
-int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e953b6cfb2a8..9d11ebad43b6 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -197,7 +197,7 @@ xfs_attr_get(
 /*
  * Calculate how many blocks we need for the new attribute,
  */
-int
+STATIC int
 xfs_attr_calc_size(
 	struct xfs_inode 	*ip,
 	int			namelen,
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 59b410ce69a1..9c3a24372914 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -139,7 +139,6 @@ typedef struct xfs_attr_list_context {
 /*
  * Overall external interface routines.
  */
-int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
 int xfs_attr_inactive(struct xfs_inode *dp);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
 int xfs_attr_list_int(struct xfs_attr_list_context *);
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 38751d5fac6f..416e47e54b83 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -334,7 +334,7 @@ xfs_bmbt_disk_set_allf(
 /*
  * Set all the fields in a bmap extent record from the uncompressed form.
  */
-void
+STATIC void
 xfs_bmbt_disk_set_all(
 	xfs_bmbt_rec_t	*r,
 	xfs_bmbt_irec_t *s)
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cf07ca7c22e7..0e66c4ea0f85 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -223,7 +223,6 @@ extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
 extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
 extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);
 
-extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
 			xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
 
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index ce6e355199b5..78fc4d9ae756 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -65,7 +65,7 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
 /*
  * Log entries from a freespace block.
  */
-void
+STATIC void
 xfs_dir2_free_log_bests(
 	xfs_trans_t		*tp,		/* transaction pointer */
 	xfs_dabuf_t		*bp,		/* freespace buffer */
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
index dde72db3d695..82dfe7147195 100644
--- a/fs/xfs/xfs_dir2_node.h
+++ b/fs/xfs/xfs_dir2_node.h
@@ -75,8 +75,6 @@ xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
 	return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp));
 }
 
-extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
-				    int first, int last);
 extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
 				 struct xfs_dabuf *lbp);
 extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d55662db7077..fd02a18facd5 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -443,14 +443,9 @@ typedef struct log {
 
 /* common routines */
 extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
-extern int	 xlog_find_tail(xlog_t	*log,
-				xfs_daddr_t *head_blk,
-				xfs_daddr_t *tail_blk);
 extern int	 xlog_recover(xlog_t *log);
 extern int	 xlog_recover_finish(xlog_t *log);
 extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
-extern void	 xlog_put_bp(struct xfs_buf *);
 
 extern kmem_zone_t	*xfs_log_ticket_zone;
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 69ac2e5ef20c..48a7ab1e6311 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -68,7 +68,7 @@ STATIC void	xlog_recover_check_summary(xlog_t *);
 	((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
 #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno)	((bno) & ~(log)->l_sectbb_mask)
 
-xfs_buf_t *
+STATIC xfs_buf_t *
 xlog_get_bp(
 	xlog_t		*log,
 	int		nbblks)
@@ -88,7 +88,7 @@ xlog_get_bp(
 	return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
 }
 
-void
+STATIC void
 xlog_put_bp(
 	xfs_buf_t	*bp)
 {
@@ -805,7 +805,7 @@ xlog_find_head(
  * We could speed up search by using current head_blk buffer, but it is not
  * available.
  */
-int
+STATIC int
 xlog_find_tail(
 	xlog_t			*log,
 	xfs_daddr_t		*head_blk,
-- 
cgit v1.2.3


From f0a7695380efa31cd281730917f7e907a724d5cb Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:49:57 +0000
Subject: xfs: Use list_heads for log recovery item lists

Remove the roll-your-own linked list operations.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 205 ++++++++++++++++-------------------------------
 fs/xfs/xfs_log_recover.h |  23 +++---
 2 files changed, 81 insertions(+), 147 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 48a7ab1e6311..65f1f137d789 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -50,8 +50,6 @@
 
 STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
 STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
-STATIC void	xlog_recover_insert_item_backq(xlog_recover_item_t **q,
-					       xlog_recover_item_t *item);
 #if defined(DEBUG)
 STATIC void	xlog_recover_check_summary(xlog_t *);
 #else
@@ -1367,36 +1365,45 @@ xlog_clear_stale_blocks(
 
 STATIC xlog_recover_t *
 xlog_recover_find_tid(
-	xlog_recover_t		*q,
+	struct hlist_head	*head,
 	xlog_tid_t		tid)
 {
-	xlog_recover_t		*p = q;
+	xlog_recover_t		*trans;
+	struct hlist_node	*n;
 
-	while (p != NULL) {
-		if (p->r_log_tid == tid)
-		    break;
-		p = p->r_next;
+	hlist_for_each_entry(trans, n, head, r_list) {
+		if (trans->r_log_tid == tid)
+			return trans;
 	}
-	return p;
+	return NULL;
 }
 
 STATIC void
-xlog_recover_put_hashq(
-	xlog_recover_t		**q,
-	xlog_recover_t		*trans)
+xlog_recover_new_tid(
+	struct hlist_head	*head,
+	xlog_tid_t		tid,
+	xfs_lsn_t		lsn)
 {
-	trans->r_next = *q;
-	*q = trans;
+	xlog_recover_t		*trans;
+
+	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
+	trans->r_log_tid   = tid;
+	trans->r_lsn	   = lsn;
+	INIT_LIST_HEAD(&trans->r_itemq);
+
+	INIT_HLIST_NODE(&trans->r_list);
+	hlist_add_head(&trans->r_list, head);
 }
 
 STATIC void
 xlog_recover_add_item(
-	xlog_recover_item_t	**itemq)
+	struct list_head	*head)
 {
 	xlog_recover_item_t	*item;
 
 	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
-	xlog_recover_insert_item_backq(itemq, item);
+	INIT_LIST_HEAD(&item->ri_list);
+	list_add_tail(&item->ri_list, head);
 }
 
 STATIC int
@@ -1409,8 +1416,7 @@ xlog_recover_add_to_cont_trans(
 	xfs_caddr_t		ptr, old_ptr;
 	int			old_len;
 
-	item = trans->r_itemq;
-	if (item == NULL) {
+	if (list_empty(&trans->r_itemq)) {
 		/* finish copying rest of trans header */
 		xlog_recover_add_item(&trans->r_itemq);
 		ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1418,7 +1424,8 @@ xlog_recover_add_to_cont_trans(
 		memcpy(ptr, dp, len); /* d, s, l */
 		return 0;
 	}
-	item = item->ri_prev;
+	/* take the tail entry */
+	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
 
 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
@@ -1455,8 +1462,7 @@ xlog_recover_add_to_trans(
 
 	if (!len)
 		return 0;
-	item = trans->r_itemq;
-	if (item == NULL) {
+	if (list_empty(&trans->r_itemq)) {
 		/* we need to catch log corruptions here */
 		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
 			xlog_warn("XFS: xlog_recover_add_to_trans: "
@@ -1474,12 +1480,15 @@ xlog_recover_add_to_trans(
 	memcpy(ptr, dp, len);
 	in_f = (xfs_inode_log_format_t *)ptr;
 
-	if (item->ri_prev->ri_total != 0 &&
-	     item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
+	/* take the tail entry */
+	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+	if (item->ri_total != 0 &&
+	     item->ri_total == item->ri_cnt) {
+		/* tail item is in use, get a new one */
 		xlog_recover_add_item(&trans->r_itemq);
+		item = list_entry(trans->r_itemq.prev,
+					xlog_recover_item_t, ri_list);
 	}
-	item = trans->r_itemq;
-	item = item->ri_prev;
 
 	if (item->ri_total == 0) {		/* first region to be added */
 		if (in_f->ilf_size == 0 ||
@@ -1504,96 +1513,29 @@ xlog_recover_add_to_trans(
 	return 0;
 }
 
-STATIC void
-xlog_recover_new_tid(
-	xlog_recover_t		**q,
-	xlog_tid_t		tid,
-	xfs_lsn_t		lsn)
-{
-	xlog_recover_t		*trans;
-
-	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
-	trans->r_log_tid   = tid;
-	trans->r_lsn	   = lsn;
-	xlog_recover_put_hashq(q, trans);
-}
-
-STATIC int
-xlog_recover_unlink_tid(
-	xlog_recover_t		**q,
-	xlog_recover_t		*trans)
-{
-	xlog_recover_t		*tp;
-	int			found = 0;
-
-	ASSERT(trans != NULL);
-	if (trans == *q) {
-		*q = (*q)->r_next;
-	} else {
-		tp = *q;
-		while (tp) {
-			if (tp->r_next == trans) {
-				found = 1;
-				break;
-			}
-			tp = tp->r_next;
-		}
-		if (!found) {
-			xlog_warn(
-			     "XFS: xlog_recover_unlink_tid: trans not found");
-			ASSERT(0);
-			return XFS_ERROR(EIO);
-		}
-		tp->r_next = tp->r_next->r_next;
-	}
-	return 0;
-}
-
-STATIC void
-xlog_recover_insert_item_backq(
-	xlog_recover_item_t	**q,
-	xlog_recover_item_t	*item)
-{
-	if (*q == NULL) {
-		item->ri_prev = item->ri_next = item;
-		*q = item;
-	} else {
-		item->ri_next		= *q;
-		item->ri_prev		= (*q)->ri_prev;
-		(*q)->ri_prev		= item;
-		item->ri_prev->ri_next	= item;
-	}
-}
-
-STATIC void
-xlog_recover_insert_item_frontq(
-	xlog_recover_item_t	**q,
-	xlog_recover_item_t	*item)
-{
-	xlog_recover_insert_item_backq(q, item);
-	*q = item;
-}
-
+/*
+ * Sort the log items in the transaction. Cancelled buffers need
+ * to be put first so they are processed before any items that might
+ * modify the buffers. If they are cancelled, then the modifications
+ * don't need to be replayed.
+ */
 STATIC int
 xlog_recover_reorder_trans(
 	xlog_recover_t		*trans)
 {
-	xlog_recover_item_t	*first_item, *itemq, *itemq_next;
-	xfs_buf_log_format_t	*buf_f;
-	ushort			flags = 0;
+	xlog_recover_item_t	*item, *n;
+	LIST_HEAD(sort_list);
+
+	list_splice_init(&trans->r_itemq, &sort_list);
+	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
+		xfs_buf_log_format_t	*buf_f;
 
-	first_item = itemq = trans->r_itemq;
-	trans->r_itemq = NULL;
-	do {
-		itemq_next = itemq->ri_next;
-		buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
+		buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
 
-		switch (ITEM_TYPE(itemq)) {
+		switch (ITEM_TYPE(item)) {
 		case XFS_LI_BUF:
-			flags = buf_f->blf_flags;
-			if (!(flags & XFS_BLI_CANCEL)) {
-				xlog_recover_insert_item_frontq(&trans->r_itemq,
-								itemq);
+			if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
+				list_move(&item->ri_list, &trans->r_itemq);
 				break;
 			}
 		case XFS_LI_INODE:
@@ -1601,7 +1543,7 @@ xlog_recover_reorder_trans(
 		case XFS_LI_QUOTAOFF:
 		case XFS_LI_EFD:
 		case XFS_LI_EFI:
-			xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
+			list_move_tail(&item->ri_list, &trans->r_itemq);
 			break;
 		default:
 			xlog_warn(
@@ -1609,8 +1551,8 @@ xlog_recover_reorder_trans(
 			ASSERT(0);
 			return XFS_ERROR(EIO);
 		}
-		itemq = itemq_next;
-	} while (first_item != itemq);
+	}
+	ASSERT(list_empty(&sort_list));
 	return 0;
 }
 
@@ -2814,14 +2756,13 @@ xlog_recover_do_trans(
 	int			pass)
 {
 	int			error = 0;
-	xlog_recover_item_t	*item, *first_item;
+	xlog_recover_item_t	*item;
 
 	error = xlog_recover_reorder_trans(trans);
 	if (error)
 		return error;
 
-	first_item = item = trans->r_itemq;
-	do {
+	list_for_each_entry(item, &trans->r_itemq, ri_list) {
 		switch (ITEM_TYPE(item)) {
 		case XFS_LI_BUF:
 			error = xlog_recover_do_buffer_trans(log, item, pass);
@@ -2854,8 +2795,7 @@ xlog_recover_do_trans(
 
 		if (error)
 			return error;
-		item = item->ri_next;
-	} while (first_item != item);
+	}
 
 	return 0;
 }
@@ -2869,21 +2809,18 @@ STATIC void
 xlog_recover_free_trans(
 	xlog_recover_t		*trans)
 {
-	xlog_recover_item_t	*first_item, *item, *free_item;
+	xlog_recover_item_t	*item, *n;
 	int			i;
 
-	item = first_item = trans->r_itemq;
-	do {
-		free_item = item;
-		item = item->ri_next;
-		 /* Free the regions in the item. */
-		for (i = 0; i < free_item->ri_cnt; i++) {
-			kmem_free(free_item->ri_buf[i].i_addr);
-		}
+	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+		/* Free the regions in the item. */
+		list_del(&item->ri_list);
+		for (i = 0; i < item->ri_cnt; i++)
+			kmem_free(item->ri_buf[i].i_addr);
 		/* Free the item itself */
-		kmem_free(free_item->ri_buf);
-		kmem_free(free_item);
-	} while (first_item != item);
+		kmem_free(item->ri_buf);
+		kmem_free(item);
+	}
 	/* Free the transaction recover structure */
 	kmem_free(trans);
 }
@@ -2891,14 +2828,12 @@ xlog_recover_free_trans(
 STATIC int
 xlog_recover_commit_trans(
 	xlog_t			*log,
-	xlog_recover_t		**q,
 	xlog_recover_t		*trans,
 	int			pass)
 {
 	int			error;
 
-	if ((error = xlog_recover_unlink_tid(q, trans)))
-		return error;
+	hlist_del(&trans->r_list);
 	if ((error = xlog_recover_do_trans(log, trans, pass)))
 		return error;
 	xlog_recover_free_trans(trans);			/* no error */
@@ -2926,7 +2861,7 @@ xlog_recover_unmount_trans(
 STATIC int
 xlog_recover_process_data(
 	xlog_t			*log,
-	xlog_recover_t		*rhash[],
+	struct hlist_head	rhash[],
 	xlog_rec_header_t	*rhead,
 	xfs_caddr_t		dp,
 	int			pass)
@@ -2960,7 +2895,7 @@ xlog_recover_process_data(
 		}
 		tid = be32_to_cpu(ohead->oh_tid);
 		hash = XLOG_RHASH(tid);
-		trans = xlog_recover_find_tid(rhash[hash], tid);
+		trans = xlog_recover_find_tid(&rhash[hash], tid);
 		if (trans == NULL) {		   /* not found; add new tid */
 			if (ohead->oh_flags & XLOG_START_TRANS)
 				xlog_recover_new_tid(&rhash[hash], tid,
@@ -2978,7 +2913,7 @@ xlog_recover_process_data(
 			switch (flags) {
 			case XLOG_COMMIT_TRANS:
 				error = xlog_recover_commit_trans(log,
-						&rhash[hash], trans, pass);
+								trans, pass);
 				break;
 			case XLOG_UNMOUNT_TRANS:
 				error = xlog_recover_unmount_trans(trans);
@@ -3517,7 +3452,7 @@ xlog_do_recovery_pass(
 	int			error = 0, h_size;
 	int			bblks, split_bblks;
 	int			hblks, split_hblks, wrapped_hblks;
-	xlog_recover_t		*rhash[XLOG_RHASH_SIZE];
+	struct hlist_head	rhash[XLOG_RHASH_SIZE];
 
 	ASSERT(head_blk != tail_blk);
 
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index b22545555301..75d749207258 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -35,22 +35,21 @@
  * item headers are in ri_buf[0].  Additional buffers follow.
  */
 typedef struct xlog_recover_item {
-	struct xlog_recover_item *ri_next;
-	struct xlog_recover_item *ri_prev;
-	int			 ri_type;
-	int			 ri_cnt;	/* count of regions found */
-	int			 ri_total;	/* total regions */
-	xfs_log_iovec_t		 *ri_buf;	/* ptr to regions buffer */
+	struct list_head	ri_list;
+	int			ri_type;
+	int			ri_cnt;	/* count of regions found */
+	int			ri_total;	/* total regions */
+	xfs_log_iovec_t		*ri_buf;	/* ptr to regions buffer */
 } xlog_recover_item_t;
 
 struct xlog_tid;
 typedef struct xlog_recover {
-	struct xlog_recover *r_next;
-	xlog_tid_t	    r_log_tid;		/* log's transaction id */
-	xfs_trans_header_t  r_theader;		/* trans header for partial */
-	int		    r_state;		/* not needed */
-	xfs_lsn_t	    r_lsn;		/* xact lsn */
-	xlog_recover_item_t *r_itemq;		/* q for items */
+	struct hlist_node	r_list;
+	xlog_tid_t		r_log_tid;	/* log's transaction id */
+	xfs_trans_header_t	r_theader;	/* trans header for partial */
+	int			r_state;	/* not needed */
+	xfs_lsn_t		r_lsn;		/* xact lsn */
+	struct list_head	r_itemq;	/* q for items */
 } xlog_recover_t;
 
 #define ITEM_TYPE(i)	(*(ushort *)(i)->ri_buf[0].i_addr)
-- 
cgit v1.2.3


From 453eac8a9aa417878a38bdfbccafd5f7ce4e8e4e Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:49:58 +0000
Subject: xfs: Don't wake the aild once per second

Now that the AIL push algorithm is traversal safe, we don't need a
watchdog function in the xfsaild to catch pushes that fail to make
progress. Remove the watchdog timeout and make pushes purely driven
by demand. This will remove the once-per-second wakeup that is seen
when the filesystem is idle and make laptop power misers happy.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c |  7 +++----
 fs/xfs/xfs_trans_ail.c       | 19 +++++++++++--------
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 77414db10dc2..9f2e398a5616 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -877,12 +877,11 @@ xfsaild(
 {
 	struct xfs_ail	*ailp = data;
 	xfs_lsn_t	last_pushed_lsn = 0;
-	long		tout = 0;
+	long		tout = 0; /* milliseconds */
 
 	while (!kthread_should_stop()) {
-		if (tout)
-			schedule_timeout_interruptible(msecs_to_jiffies(tout));
-		tout = 1000;
+		schedule_timeout_interruptible(tout ?
+				msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
 
 		/* swsusp */
 		try_to_freeze();
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2ffc570679be..063dfbdca94b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -237,14 +237,15 @@ out:
 }
 
 /*
- * Function that does the work of pushing on the AIL
+ * xfsaild_push does the work of pushing on the AIL.  Returning a timeout of
+ * zero indicates that the caller should sleep until woken.
  */
 long
 xfsaild_push(
 	struct xfs_ail	*ailp,
 	xfs_lsn_t	*last_lsn)
 {
-	long		tout = 1000; /* milliseconds */
+	long		tout = 0;
 	xfs_lsn_t	last_pushed_lsn = *last_lsn;
 	xfs_lsn_t	target =  ailp->xa_target;
 	xfs_lsn_t	lsn;
@@ -262,7 +263,7 @@ xfsaild_push(
 		 */
 		xfs_trans_ail_cursor_done(ailp, cur);
 		spin_unlock(&ailp->xa_lock);
-		last_pushed_lsn = 0;
+		*last_lsn = 0;
 		return tout;
 	}
 
@@ -279,7 +280,6 @@ xfsaild_push(
 	 * prevents use from spinning when we can't do anything or there is
 	 * lots of contention on the AIL lists.
 	 */
-	tout = 10;
 	lsn = lip->li_lsn;
 	flush_log = stuck = count = 0;
 	while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
@@ -376,14 +376,14 @@ xfsaild_push(
 
 	if (!count) {
 		/* We're past our target or empty, so idle */
-		tout = 1000;
+		last_pushed_lsn = 0;
 	} else if (XFS_LSN_CMP(lsn, target) >= 0) {
 		/*
 		 * We reached the target so wait a bit longer for I/O to
 		 * complete and remove pushed items from the AIL before we
 		 * start the next scan from the start of the AIL.
 		 */
-		tout += 20;
+		tout = 50;
 		last_pushed_lsn = 0;
 	} else if ((stuck * 100) / count > 90) {
 		/*
@@ -395,11 +395,14 @@ xfsaild_push(
 		 * Backoff a bit more to allow some I/O to complete before
 		 * continuing from where we were.
 		 */
-		tout += 10;
+		tout = 20;
+	} else {
+		/* more to do, but wait a short while before continuing */
+		tout = 10;
 	}
 	*last_lsn = last_pushed_lsn;
 	return tout;
-}	/* xfsaild_push */
+}
 
 
 /*
-- 
cgit v1.2.3


From c9c129714e71c890bed1bd5b61697a896c3c2d54 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:49:59 +0000
Subject: xfs: Don't wake xfsbufd when idle

The xfsbufd wakes every xfsbufd_centisecs (once per second by
default) for each filesystem even when the filesystem is idle.  If
the xfsbufd has nothing to do, put it into a long term sleep and
only wake it up when there is work pending (i.e. dirty buffers to
flush soon). This will make laptop power misers happy.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..18ae3ba8f78a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1595,6 +1595,11 @@ xfs_buf_delwri_queue(
 		list_del(&bp->b_list);
 	}
 
+	if (list_empty(dwq)) {
+		/* start xfsbufd as it is about to have something to do */
+		wake_up_process(bp->b_target->bt_task);
+	}
+
 	bp->b_flags |= _XBF_DELWRI_Q;
 	list_add_tail(&bp->b_list, dwq);
 	bp->b_queuetime = jiffies;
@@ -1644,6 +1649,8 @@ xfsbufd_wakeup(
 	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
 		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
 			continue;
+		if (list_empty(&btp->bt_delwrite_queue))
+			continue;
 		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
 		wake_up_process(btp->bt_task);
 	}
@@ -1708,6 +1715,9 @@ xfsbufd(
 	set_freezable();
 
 	do {
+		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 			refrigerator();
@@ -1715,12 +1725,12 @@ xfsbufd(
 			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 		}
 
-		schedule_timeout_interruptible(
-			xfs_buf_timer_centisecs * msecs_to_jiffies(10));
-
-		xfs_buf_delwri_split(target, &tmp,
-				xfs_buf_age_centisecs * msecs_to_jiffies(10));
+		/* sleep for a long time if there is nothing to do. */
+		if (list_empty(&target->bt_delwrite_queue))
+			tout = MAX_SCHEDULE_TIMEOUT;
+		schedule_timeout_interruptible(tout);
 
+		xfs_buf_delwri_split(target, &tmp, age);
 		count = 0;
 		while (!list_empty(&tmp)) {
 			bp = list_entry(tmp.next, xfs_buf_t, b_list);
-- 
cgit v1.2.3


From 5017e97d52628fb8ae56e434e86ac2e72ddaac2b Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:40 +0000
Subject: xfs: rename xfs_get_perag

xfs_get_perag is really getting the perag that an inode belongs to
based on it's inode number. Convert the use of this function to just
get the perag from a provided ag number.  Use this new function to
obtain the per-ag structure when traversing the per AG inode trees
for sync and reclaim.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 22 +++++++++++++---------
 fs/xfs/xfs_iget.c           | 10 +++++-----
 fs/xfs/xfs_inode.c          |  8 +++++---
 fs/xfs/xfs_mount.h          |  8 ++++----
 4 files changed, 27 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 0f90bfe2815f..cc964faf12e9 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -90,14 +90,13 @@ xfs_inode_ag_lookup(
 STATIC int
 xfs_inode_ag_walk(
 	struct xfs_mount	*mp,
-	xfs_agnumber_t		ag,
+	struct xfs_perag	*pag,
 	int			(*execute)(struct xfs_inode *ip,
 					   struct xfs_perag *pag, int flags),
 	int			flags,
 	int			tag,
 	int			exclusive)
 {
-	struct xfs_perag	*pag = &mp->m_perag[ag];
 	uint32_t		first_index;
 	int			last_error = 0;
 	int			skipped;
@@ -141,8 +140,6 @@ restart:
 		delay(1);
 		goto restart;
 	}
-
-	xfs_put_perag(mp, pag);
 	return last_error;
 }
 
@@ -160,10 +157,16 @@ xfs_inode_ag_iterator(
 	xfs_agnumber_t		ag;
 
 	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-		if (!mp->m_perag[ag].pag_ici_init)
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, ag);
+		if (!pag->pag_ici_init) {
+			xfs_perag_put(pag);
 			continue;
-		error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
+		}
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
 						exclusive);
+		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
@@ -690,16 +693,17 @@ void
 xfs_inode_set_reclaim_tag(
 	xfs_inode_t	*ip)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
 
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	read_lock(&pag->pag_ici_lock);
 	spin_lock(&ip->i_flags_lock);
 	__xfs_inode_set_reclaim_tag(pag, ip);
 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
 	spin_unlock(&ip->i_flags_lock);
 	read_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 }
 
 void
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 155e798f30a1..e281eb4a1c49 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -374,7 +374,7 @@ xfs_iget(
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
-	pag = xfs_get_perag(mp, ino);
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
 	if (!pag->pagi_inodeok)
 		return EINVAL;
 	ASSERT(pag->pag_ici_init);
@@ -398,7 +398,7 @@ again:
 		if (error)
 			goto out_error_or_again;
 	}
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 
 	*ipp = ip;
 
@@ -417,7 +417,7 @@ out_error_or_again:
 		delay(1);
 		goto again;
 	}
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 	return error;
 }
 
@@ -488,12 +488,12 @@ xfs_ireclaim(
 	 * added to the tree assert that it's been there before to catch
 	 * problems with the inode life time early on.
 	 */
-	pag = xfs_get_perag(mp, ip->i_ino);
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	write_lock(&pag->pag_ici_lock);
 	if (!radix_tree_delete(&pag->pag_ici_root, agino))
 		ASSERT(0);
 	write_unlock(&pag->pag_ici_lock);
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 
 	/*
 	 * Here we do an (almost) spurious inode lock in order to coordinate
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ef77fd88c8e3..bd3d81636d51 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1946,8 +1946,9 @@ xfs_ifree_cluster(
 	xfs_inode_t		*ip, **ip_found;
 	xfs_inode_log_item_t	*iip;
 	xfs_log_item_t		*lip;
-	xfs_perag_t		*pag = xfs_get_perag(mp, inum);
+	struct xfs_perag	*pag;
 
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
 		blks_per_cluster = 1;
 		ninodes = mp->m_sb.sb_inopblock;
@@ -2088,7 +2089,7 @@ xfs_ifree_cluster(
 	}
 
 	kmem_free(ip_found);
-	xfs_put_perag(mp, pag);
+	xfs_perag_put(pag);
 }
 
 /*
@@ -2675,7 +2676,7 @@ xfs_iflush_cluster(
 	xfs_buf_t	*bp)
 {
 	xfs_mount_t		*mp = ip->i_mount;
-	xfs_perag_t		*pag = xfs_get_perag(mp, ip->i_ino);
+	struct xfs_perag	*pag;
 	unsigned long		first_index, mask;
 	unsigned long		inodes_per_cluster;
 	int			ilist_size;
@@ -2686,6 +2687,7 @@ xfs_iflush_cluster(
 	int			bufwasdelwri;
 	int			i;
 
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	ASSERT(pag->pagi_inodeok);
 	ASSERT(pag->pag_ici_init);
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1df7e4502967..f8a68a2319b5 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -386,14 +386,14 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 /*
  * perag get/put wrappers for eventual ref counting
  */
-static inline xfs_perag_t *
-xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
+static inline struct xfs_perag *
+xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 {
-	return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
+	return &mp->m_perag[agno];
 }
 
 static inline void
-xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
+xfs_perag_put(struct xfs_perag *pag)
 {
 	/* nothing to see here, move along */
 }
-- 
cgit v1.2.3


From a862e0fdcb8862aab2538ec2fc2f0dc07a625c59 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:41 +0000
Subject: xfs: Don't directly reference m_perag in allocation code

Start abstracting the perag references so that the indexing of the
structures is not directly coded into all the places that uses the
perag structures. This will allow us to separate the use of the
perag structure and the way it is indexed and hence avoid the known
deadlocks related to growing a busy filesystem.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_alloc.c       | 82 +++++++++++++++++++++++++++---------------------
 fs/xfs/xfs_alloc_btree.c |  9 ++++--
 2 files changed, 53 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 275b1f4f9430..84070f2e0ba4 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1662,11 +1662,13 @@ xfs_free_ag_extent(
 		xfs_agf_t	*agf;
 		xfs_perag_t	*pag;		/* per allocation group data */
 
+		pag = xfs_perag_get(mp, agno);
+		pag->pagf_freeblks += len;
+		xfs_perag_put(pag);
+
 		agf = XFS_BUF_TO_AGF(agbp);
-		pag = &mp->m_perag[agno];
 		be32_add_cpu(&agf->agf_freeblks, len);
 		xfs_trans_agblocks_delta(tp, len);
-		pag->pagf_freeblks += len;
 		XFS_WANT_CORRUPTED_GOTO(
 			be32_to_cpu(agf->agf_freeblks) <=
 			be32_to_cpu(agf->agf_length),
@@ -1969,10 +1971,12 @@ xfs_alloc_get_freelist(
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
 		agf->agf_flfirst = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, -1);
 	xfs_trans_agflist_delta(tp, -1);
 	pag->pagf_flcount--;
+	xfs_perag_put(pag);
 
 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
 	if (btreeblk) {
@@ -2078,7 +2082,8 @@ xfs_alloc_put_freelist(
 	be32_add_cpu(&agf->agf_fllast, 1);
 	if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
 		agf->agf_fllast = 0;
-	pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
 	be32_add_cpu(&agf->agf_flcount, 1);
 	xfs_trans_agflist_delta(tp, 1);
 	pag->pagf_flcount++;
@@ -2089,6 +2094,7 @@ xfs_alloc_put_freelist(
 		pag->pagf_btreeblks--;
 		logflags |= XFS_AGF_BTREEBLKS;
 	}
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(tp, agbp, logflags);
 
@@ -2152,7 +2158,6 @@ xfs_read_agf(
 		xfs_trans_brelse(tp, *bpp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
-
 	XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
 	return 0;
 }
@@ -2184,7 +2189,7 @@ xfs_alloc_read_agf(
 	ASSERT(!XFS_BUF_GETERROR(*bpp));
 
 	agf = XFS_BUF_TO_AGF(*bpp);
-	pag = &mp->m_perag[agno];
+	pag = xfs_perag_get(mp, agno);
 	if (!pag->pagf_init) {
 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2211,6 +2216,7 @@ xfs_alloc_read_agf(
 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
 	}
 #endif
+	xfs_perag_put(pag);
 	return 0;
 }
 
@@ -2271,7 +2277,7 @@ xfs_alloc_vextent(
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
 		down_read(&mp->m_peraglock);
-		args->pag = &mp->m_perag[args->agno];
+		args->pag = xfs_perag_get(mp, args->agno);
 		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
 		args->minleft = minleft;
@@ -2341,7 +2347,7 @@ xfs_alloc_vextent(
 		 */
 		down_read(&mp->m_peraglock);
 		for (;;) {
-			args->pag = &mp->m_perag[args->agno];
+			args->pag = xfs_perag_get(mp, args->agno);
 			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
 			args->minleft = minleft;
@@ -2400,6 +2406,7 @@ xfs_alloc_vextent(
 					}
 				}
 			}
+			xfs_perag_put(args->pag);
 		}
 		up_read(&mp->m_peraglock);
 		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
@@ -2427,8 +2434,10 @@ xfs_alloc_vextent(
 			args->len);
 #endif
 	}
+	xfs_perag_put(args->pag);
 	return 0;
 error0:
+	xfs_perag_put(args->pag);
 	up_read(&mp->m_peraglock);
 	return error;
 }
@@ -2455,7 +2464,7 @@ xfs_free_extent(
 	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
 	down_read(&args.mp->m_peraglock);
-	args.pag = &args.mp->m_perag[args.agno];
+	args.pag = xfs_perag_get(args.mp, args.agno);
 	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
 		goto error0;
 #ifdef DEBUG
@@ -2465,6 +2474,7 @@ xfs_free_extent(
 #endif
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 error0:
+	xfs_perag_put(args.pag);
 	up_read(&args.mp->m_peraglock);
 	return error;
 }
@@ -2486,15 +2496,15 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		    xfs_agblock_t bno,
 		    xfs_extlen_t len)
 {
-	xfs_mount_t		*mp;
 	xfs_perag_busy_t	*bsy;
+	struct xfs_perag	*pag;
 	int			n;
 
-	mp = tp->t_mountp;
-	spin_lock(&mp->m_perag[agno].pagb_lock);
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
 
 	/* search pagb_list for an open slot */
-	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+	for (bsy = pag->pagb_list, n = 0;
 	     n < XFS_PAGB_NUM_SLOTS;
 	     bsy++, n++) {
 		if (bsy->busy_tp == NULL) {
@@ -2502,11 +2512,11 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		}
 	}
 
-	trace_xfs_alloc_busy(mp, agno, bno, len, n);
+	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
 
 	if (n < XFS_PAGB_NUM_SLOTS) {
-		bsy = &mp->m_perag[agno].pagb_list[n];
-		mp->m_perag[agno].pagb_count++;
+		bsy = &pag->pagb_list[n];
+		pag->pagb_count++;
 		bsy->busy_start = bno;
 		bsy->busy_length = len;
 		bsy->busy_tp = tp;
@@ -2521,7 +2531,8 @@ xfs_alloc_mark_busy(xfs_trans_t *tp,
 		xfs_trans_set_sync(tp);
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 }
 
 void
@@ -2529,24 +2540,23 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
 		     xfs_agnumber_t agno,
 		     int idx)
 {
-	xfs_mount_t		*mp;
+	struct xfs_perag	*pag;
 	xfs_perag_busy_t	*list;
 
-	mp = tp->t_mountp;
-
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-	list = mp->m_perag[agno].pagb_list;
-
 	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
+	list = pag->pagb_list;
 
-	trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp);
+	trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
 
 	if (list[idx].busy_tp == tp) {
 		list[idx].busy_tp = NULL;
-		mp->m_perag[agno].pagb_count--;
+		pag->pagb_count--;
 	}
 
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 }
 
 
@@ -2560,17 +2570,15 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 		    xfs_agblock_t bno,
 		    xfs_extlen_t len)
 {
-	xfs_mount_t		*mp;
+	struct xfs_perag	*pag;
 	xfs_perag_busy_t	*bsy;
 	xfs_agblock_t		uend, bend;
 	xfs_lsn_t		lsn = 0;
 	int			cnt;
 
-	mp = tp->t_mountp;
-
-	spin_lock(&mp->m_perag[agno].pagb_lock);
-
-	uend = bno + len - 1;
+	pag = xfs_perag_get(tp->t_mountp, agno);
+	spin_lock(&pag->pagb_lock);
+	cnt = pag->pagb_count;
 
 	/*
 	 * search pagb_list for this slot, skipping open slots. We have to
@@ -2578,8 +2586,9 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	 * we have to get the most recent LSN for the log force to push out
 	 * all the transactions that span the range.
 	 */
-	for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
-		bsy = &mp->m_perag[agno].pagb_list[cnt];
+	uend = bno + len - 1;
+	for (cnt = 0; cnt < pag->pagb_count; cnt++) {
+		bsy = &pag->pagb_list[cnt];
 		if (!bsy->busy_tp)
 			continue;
 
@@ -2591,7 +2600,8 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 		if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
 			lsn = bsy->busy_tp->t_commit_lsn;
 	}
-	spin_unlock(&mp->m_perag[agno].pagb_lock);
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
 	trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
 
 	/*
@@ -2599,5 +2609,5 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	 * transaction that freed the block
 	 */
 	if (lsn)
-		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+		xfs_log_force(tp->t_mountp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index adbd9141aea1..b726e10d2c1c 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -61,12 +61,14 @@ xfs_allocbt_set_root(
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
 	int			btnum = cur->bc_btnum;
+	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
 
 	ASSERT(ptr->s != 0);
 
 	agf->agf_roots[btnum] = ptr->s;
 	be32_add_cpu(&agf->agf_levels[btnum], inc);
-	cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
+	pag->pagf_levels[btnum] += inc;
+	xfs_perag_put(pag);
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
@@ -150,6 +152,7 @@ xfs_allocbt_update_lastrec(
 {
 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag	*pag;
 	__be32			len;
 	int			numrecs;
 
@@ -193,7 +196,9 @@ xfs_allocbt_update_lastrec(
 	}
 
 	agf->agf_longest = len;
-	cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
+	pag = xfs_perag_get(cur->bc_mp, seqno);
+	pag->pagf_longest = be32_to_cpu(len);
+	xfs_perag_put(pag);
 	xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
 }
 
-- 
cgit v1.2.3


From 4196ac08c023c6dab90c3fa460d9c06deaa304c4 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:42 +0000
Subject: xfs: Convert filestreams code to use per-ag get/put routines

Use xfs_perag_get() and xfs_perag_put() in the filestreams code.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_filestream.c | 19 ++++++++++++-------
 fs/xfs/xfs_filestream.h | 27 ++++++++++++++++++++++++---
 2 files changed, 36 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a631e1451abb..e61f2aa088a9 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -140,6 +140,7 @@ _xfs_filestream_pick_ag(
 	int		flags,
 	xfs_extlen_t	minlen)
 {
+	int		streams, max_streams;
 	int		err, trylock, nscan;
 	xfs_extlen_t	longest, free, minfree, maxfree = 0;
 	xfs_agnumber_t	ag, max_ag = NULLAGNUMBER;
@@ -155,15 +156,15 @@ _xfs_filestream_pick_ag(
 	trylock = XFS_ALLOC_FLAG_TRYLOCK;
 
 	for (nscan = 0; 1; nscan++) {
-
-		TRACE_AG_SCAN(mp, ag, xfs_filestream_peek_ag(mp, ag));
-
-		pag = mp->m_perag + ag;
+		pag = xfs_perag_get(mp, ag);
+		TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
 
 		if (!pag->pagf_init) {
 			err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
-			if (err && !trylock)
+			if (err && !trylock) {
+				xfs_perag_put(pag);
 				return err;
+			}
 		}
 
 		/* Might fail sometimes during the 1st pass with trylock set. */
@@ -173,6 +174,7 @@ _xfs_filestream_pick_ag(
 		/* Keep track of the AG with the most free blocks. */
 		if (pag->pagf_freeblks > maxfree) {
 			maxfree = pag->pagf_freeblks;
+			max_streams = atomic_read(&pag->pagf_fstrms);
 			max_ag = ag;
 		}
 
@@ -195,6 +197,8 @@ _xfs_filestream_pick_ag(
 
 			/* Break out, retaining the reference on the AG. */
 			free = pag->pagf_freeblks;
+			streams = atomic_read(&pag->pagf_fstrms);
+			xfs_perag_put(pag);
 			*agp = ag;
 			break;
 		}
@@ -202,6 +206,7 @@ _xfs_filestream_pick_ag(
 		/* Drop the reference on this AG, it's not usable. */
 		xfs_filestream_put_ag(mp, ag);
 next_ag:
+		xfs_perag_put(pag);
 		/* Move to the next AG, wrapping to AG 0 if necessary. */
 		if (++ag >= mp->m_sb.sb_agcount)
 			ag = 0;
@@ -229,6 +234,7 @@ next_ag:
 		if (max_ag != NULLAGNUMBER) {
 			xfs_filestream_get_ag(mp, max_ag);
 			TRACE_AG_PICK1(mp, max_ag, maxfree);
+			streams = max_streams;
 			free = maxfree;
 			*agp = max_ag;
 			break;
@@ -240,8 +246,7 @@ next_ag:
 		return 0;
 	}
 
-	TRACE_AG_PICK2(mp, startag, *agp, xfs_filestream_peek_ag(mp, *agp),
-			free, nscan, flags);
+	TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
 
 	return 0;
 }
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 4aba67c5f64f..58378b2ea033 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -79,12 +79,21 @@ extern ktrace_t *xfs_filestreams_trace_buf;
  * the cache that reference per-ag array elements that have since been
  * reallocated.
  */
+/*
+ * xfs_filestream_peek_ag is only used in tracing code
+ */
 static inline int
 xfs_filestream_peek_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
 {
-	return atomic_read(&mp->m_perag[agno].pagf_fstrms);
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_read(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
 }
 
 static inline int
@@ -92,7 +101,13 @@ xfs_filestream_get_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
 {
-	return atomic_inc_return(&mp->m_perag[agno].pagf_fstrms);
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_inc_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
 }
 
 static inline int
@@ -100,7 +115,13 @@ xfs_filestream_put_ag(
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agno)
 {
-	return atomic_dec_return(&mp->m_perag[agno].pagf_fstrms);
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_dec_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
 }
 
 /* allocation selection flags */
-- 
cgit v1.2.3


From 44b56e0a1aed522a10051645e85d300e10926fd3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:43 +0000
Subject: xfs: convert remaining direct references to m_perag

Convert the remaining direct lookups of the per ag structures to use
get/put accesses. Ensure that the loops across AGs and prior users
of the interface balance gets and puts correctly.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c   |  8 +++++++-
 fs/xfs/xfs_ialloc.c | 35 +++++++++++++++++++++++++----------
 fs/xfs/xfs_inode.c  |  5 ++++-
 fs/xfs/xfs_mount.c  |  9 ++++++---
 4 files changed, 42 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 98251cdc52aa..a9b95d9cf2ad 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2630,11 +2630,12 @@ xfs_bmap_btalloc(
 			startag = ag = 0;
 		notinit = 0;
 		down_read(&mp->m_peraglock);
+		pag = xfs_perag_get(mp, ag);
 		while (blen < ap->alen) {
-			pag = &mp->m_perag[ag];
 			if (!pag->pagf_init &&
 			    (error = xfs_alloc_pagf_init(mp, args.tp,
 				    ag, XFS_ALLOC_FLAG_TRYLOCK))) {
+				xfs_perag_put(pag);
 				up_read(&mp->m_peraglock);
 				return error;
 			}
@@ -2667,6 +2668,7 @@ xfs_bmap_btalloc(
 						break;
 
 					error = xfs_filestream_new_ag(ap, &ag);
+					xfs_perag_put(pag);
 					if (error) {
 						up_read(&mp->m_peraglock);
 						return error;
@@ -2674,6 +2676,7 @@ xfs_bmap_btalloc(
 
 					/* loop again to set 'blen'*/
 					startag = NULLAGNUMBER;
+					pag = xfs_perag_get(mp, ag);
 					continue;
 				}
 			}
@@ -2681,7 +2684,10 @@ xfs_bmap_btalloc(
 				ag = 0;
 			if (ag == startag)
 				break;
+			xfs_perag_put(pag);
+			pag = xfs_perag_get(mp, ag);
 		}
+		xfs_perag_put(pag);
 		up_read(&mp->m_peraglock);
 		/*
 		 * Since the above loop did a BUF_TRYLOCK, it is
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index cb907ba69c4c..884ee1367f46 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -253,6 +253,7 @@ xfs_ialloc_ag_alloc(
 	xfs_agino_t	thisino;	/* current inode number, for loop */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
+	struct xfs_perag *pag;
 
 	args.tp = tp;
 	args.mp = tp->t_mountp;
@@ -383,7 +384,9 @@ xfs_ialloc_ag_alloc(
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
 	down_read(&args.mp->m_peraglock);
-	args.mp->m_perag[agno].pagi_freecount += newlen;
+	pag = xfs_perag_get(args.mp, agno);
+	pag->pagi_freecount += newlen;
+	xfs_perag_put(pag);
 	up_read(&args.mp->m_peraglock);
 	agi->agi_newino = cpu_to_be32(newino);
 
@@ -488,7 +491,7 @@ xfs_ialloc_ag_select(
 	flags = XFS_ALLOC_FLAG_TRYLOCK;
 	down_read(&mp->m_peraglock);
 	for (;;) {
-		pag = &mp->m_perag[agno];
+		pag = xfs_perag_get(mp, agno);
 		if (!pag->pagi_init) {
 			if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
 				agbp = NULL;
@@ -527,6 +530,7 @@ xfs_ialloc_ag_select(
 					agbp = NULL;
 					goto nextag;
 				}
+				xfs_perag_put(pag);
 				up_read(&mp->m_peraglock);
 				return agbp;
 			}
@@ -535,6 +539,7 @@ unlock_nextag:
 		if (agbp)
 			xfs_trans_brelse(tp, agbp);
 nextag:
+		xfs_perag_put(pag);
 		/*
 		 * No point in iterating over the rest, if we're shutting
 		 * down.
@@ -672,6 +677,7 @@ xfs_dialloc(
 	xfs_agnumber_t	tagno;		/* testing allocation group number */
 	xfs_btree_cur_t	*tcur;		/* temp cursor */
 	xfs_inobt_rec_incore_t trec;	/* temp inode allocation record */
+	struct xfs_perag *pag;
 
 
 	if (*IO_agbp == NULL) {
@@ -772,11 +778,14 @@ nextag:
 			return noroom ? ENOSPC : 0;
 		}
 		down_read(&mp->m_peraglock);
-		if (mp->m_perag[tagno].pagi_inodeok == 0) {
+		pag = xfs_perag_get(mp, tagno);
+		if (pag->pagi_inodeok == 0) {
+			xfs_perag_put(pag);
 			up_read(&mp->m_peraglock);
 			goto nextag;
 		}
 		error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
+		xfs_perag_put(pag);
 		up_read(&mp->m_peraglock);
 		if (error)
 			goto nextag;
@@ -790,6 +799,7 @@ nextag:
 	 */
 	agno = tagno;
 	*IO_agbp = NULL;
+	pag = xfs_perag_get(mp, agno);
 
  restart_pagno:
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
@@ -808,7 +818,6 @@ nextag:
 	 * If in the same AG as the parent, try to get near the parent.
 	 */
 	if (pagno == agno) {
-		xfs_perag_t	*pag = &mp->m_perag[agno];
 		int		doneleft;	/* done, to the left */
 		int		doneright;	/* done, to the right */
 		int		searchdistance = 10;
@@ -1007,7 +1016,7 @@ alloc_inode:
 	be32_add_cpu(&agi->agi_freecount, -1);
 	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
 	down_read(&mp->m_peraglock);
-	mp->m_perag[tagno].pagi_freecount--;
+	pag->pagi_freecount--;
 	up_read(&mp->m_peraglock);
 
 	error = xfs_check_agi_freecount(cur, agi);
@@ -1016,12 +1025,14 @@ alloc_inode:
 
 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+	xfs_perag_put(pag);
 	*inop = ino;
 	return 0;
 error1:
 	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
 error0:
 	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_perag_put(pag);
 	return error;
 }
 
@@ -1052,6 +1063,7 @@ xfs_difree(
 	xfs_mount_t	*mp;	/* mount structure for filesystem */
 	int		off;	/* offset of inode in inode chunk */
 	xfs_inobt_rec_incore_t rec;	/* btree record */
+	struct xfs_perag *pag;
 
 	mp = tp->t_mountp;
 
@@ -1158,7 +1170,9 @@ xfs_difree(
 		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
 		down_read(&mp->m_peraglock);
-		mp->m_perag[agno].pagi_freecount -= ilen - 1;
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount -= ilen - 1;
+		xfs_perag_put(pag);
 		up_read(&mp->m_peraglock);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
@@ -1189,7 +1203,9 @@ xfs_difree(
 		be32_add_cpu(&agi->agi_freecount, 1);
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
 		down_read(&mp->m_peraglock);
-		mp->m_perag[agno].pagi_freecount++;
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount++;
+		xfs_perag_put(pag);
 		up_read(&mp->m_peraglock);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
 	}
@@ -1379,7 +1395,6 @@ xfs_imap(
 			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
 		return XFS_ERROR(EINVAL);
 	}
-
 	return 0;
 }
 
@@ -1523,8 +1538,7 @@ xfs_ialloc_read_agi(
 		return error;
 
 	agi = XFS_BUF_TO_AGI(*bpp);
-	pag = &mp->m_perag[agno];
-
+	pag = xfs_perag_get(mp, agno);
 	if (!pag->pagi_init) {
 		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
 		pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -1537,6 +1551,7 @@ xfs_ialloc_read_agi(
 	 */
 	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
 		XFS_FORCED_SHUTDOWN(mp));
+	xfs_perag_put(pag);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bd3d81636d51..0317b000ab44 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2695,7 +2695,7 @@ xfs_iflush_cluster(
 	ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
 	ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
 	if (!ilist)
-		return 0;
+		goto out_put;
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
@@ -2764,6 +2764,8 @@ xfs_iflush_cluster(
 out_free:
 	read_unlock(&pag->pag_ici_lock);
 	kmem_free(ilist);
+out_put:
+	xfs_perag_put(pag);
 	return 0;
 
 
@@ -2807,6 +2809,7 @@ cluster_corrupt_out:
 	 */
 	xfs_iflush_abort(iq);
 	kmem_free(ilist);
+	xfs_perag_put(pag);
 	return XFS_ERROR(EFSCORRUPTED);
 }
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index eb403b40e120..9055b60730d0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -438,18 +438,20 @@ xfs_initialize_perag(
 			}
 
 			/* This ag is preferred for inodes */
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			if (index < max_metadata)
 				pag->pagf_metadata = 1;
 			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	} else {
 		/* Setup default behavior for smaller filesystems */
 		for (index = 0; index < agcount; index++) {
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	}
 	return index;
@@ -731,12 +733,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 		error = xfs_ialloc_pagi_init(mp, NULL, index);
 		if (error)
 			return error;
-		pag = &mp->m_perag[index];
+		pag = xfs_perag_get(mp, index);
 		ifree += pag->pagi_freecount;
 		ialloc += pag->pagi_count;
 		bfree += pag->pagf_freeblks;
 		bfreelst += pag->pagf_flcount;
 		btree += pag->pagf_btreeblks;
+		xfs_perag_put(pag);
 	}
 	/*
 	 * Overwrite incore superblock counters with just-read data
-- 
cgit v1.2.3


From 1c1c6ebcf5284aee4910f3b906ac90c20e510c82 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:44 +0000
Subject: xfs: Replace per-ag array with a radix tree

The use of an array for the per-ag structures requires reallocation
of the array when growing the filesystem. This requires locking
access to the array to avoid use after free situations, and the
locking is difficult to get right. To avoid needing to reallocate an
array, change the per-ag structures to an allocated object per ag
and index them using a tree structure.

The AGs are always densely indexed (hence the use of an array), but
the number supported is 2^32 and lookups tend to be random and hence
indexing needs to scale. A simple choice is a radix tree - it works
well with this sort of index.  This change also removes another
large contiguous allocation from the mount/growfs path in XFS.

The growing process now needs to change to only initialise the new
AGs required for the extra space, and as such only needs to
exclusively lock the tree for inserts. The rest of the code only
needs to lock the tree while doing lookups, and hence this will
remove all the deadlocks that currently occur on the m_perag_lock as
it is now an innermost lock. The lock is also changed to a spinlock
from a read/write lock as the hold time is now extremely short.

To complete the picture, the per-ag structures will need to be
reference counted to ensure that we don't free/modify them while
they are still in use.  This will be done in subsequent patch.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_alloc.c      |  8 -------
 fs/xfs/xfs_bmap.c       |  7 +-----
 fs/xfs/xfs_filestream.c | 13 ++++------
 fs/xfs/xfs_fsops.c      | 42 ++++++++++++++++-----------------
 fs/xfs/xfs_ialloc.c     | 25 ++------------------
 fs/xfs/xfs_itable.c     |  4 ----
 fs/xfs/xfs_mount.c      | 63 +++++++++++++++++++++++++++++++++++++------------
 fs/xfs/xfs_mount.h      | 14 +++++++----
 8 files changed, 86 insertions(+), 90 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 84070f2e0ba4..4d66bb75579c 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2276,7 +2276,6 @@ xfs_alloc_vextent(
 		 * These three force us into a single a.g.
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
-		down_read(&mp->m_peraglock);
 		args->pag = xfs_perag_get(mp, args->agno);
 		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
@@ -2286,14 +2285,12 @@ xfs_alloc_vextent(
 			goto error0;
 		}
 		if (!args->agbp) {
-			up_read(&mp->m_peraglock);
 			trace_xfs_alloc_vextent_noagbp(args);
 			break;
 		}
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 		if ((error = xfs_alloc_ag_vextent(args)))
 			goto error0;
-		up_read(&mp->m_peraglock);
 		break;
 	case XFS_ALLOCTYPE_START_BNO:
 		/*
@@ -2345,7 +2342,6 @@ xfs_alloc_vextent(
 		 * Loop over allocation groups twice; first time with
 		 * trylock set, second time without.
 		 */
-		down_read(&mp->m_peraglock);
 		for (;;) {
 			args->pag = xfs_perag_get(mp, args->agno);
 			if (no_min) args->minleft = 0;
@@ -2408,7 +2404,6 @@ xfs_alloc_vextent(
 			}
 			xfs_perag_put(args->pag);
 		}
-		up_read(&mp->m_peraglock);
 		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
 			if (args->agno == sagno)
 				mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2438,7 +2433,6 @@ xfs_alloc_vextent(
 	return 0;
 error0:
 	xfs_perag_put(args->pag);
-	up_read(&mp->m_peraglock);
 	return error;
 }
 
@@ -2463,7 +2457,6 @@ xfs_free_extent(
 	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
 	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-	down_read(&args.mp->m_peraglock);
 	args.pag = xfs_perag_get(args.mp, args.agno);
 	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
 		goto error0;
@@ -2475,7 +2468,6 @@ xfs_free_extent(
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 error0:
 	xfs_perag_put(args.pag);
-	up_read(&args.mp->m_peraglock);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index a9b95d9cf2ad..7c6d9acd7154 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2629,14 +2629,12 @@ xfs_bmap_btalloc(
 		if (startag == NULLAGNUMBER)
 			startag = ag = 0;
 		notinit = 0;
-		down_read(&mp->m_peraglock);
 		pag = xfs_perag_get(mp, ag);
 		while (blen < ap->alen) {
 			if (!pag->pagf_init &&
 			    (error = xfs_alloc_pagf_init(mp, args.tp,
 				    ag, XFS_ALLOC_FLAG_TRYLOCK))) {
 				xfs_perag_put(pag);
-				up_read(&mp->m_peraglock);
 				return error;
 			}
 			/*
@@ -2669,10 +2667,8 @@ xfs_bmap_btalloc(
 
 					error = xfs_filestream_new_ag(ap, &ag);
 					xfs_perag_put(pag);
-					if (error) {
-						up_read(&mp->m_peraglock);
+					if (error)
 						return error;
-					}
 
 					/* loop again to set 'blen'*/
 					startag = NULLAGNUMBER;
@@ -2688,7 +2684,6 @@ xfs_bmap_btalloc(
 			pag = xfs_perag_get(mp, ag);
 		}
 		xfs_perag_put(pag);
-		up_read(&mp->m_peraglock);
 		/*
 		 * Since the above loop did a BUF_TRYLOCK, it is
 		 * possible that there is space for this request.
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index e61f2aa088a9..914d00d0f119 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -253,8 +253,7 @@ next_ag:
 
 /*
  * Set the allocation group number for a file or a directory, updating inode
- * references and per-AG references as appropriate.  Must be called with the
- * m_peraglock held in read mode.
+ * references and per-AG references as appropriate.
  */
 static int
 _xfs_filestream_update_ag(
@@ -456,10 +455,10 @@ xfs_filestream_unmount(
 }
 
 /*
- * If the mount point's m_perag array is going to be reallocated, all
+ * If the mount point's m_perag tree is going to be modified, all
  * outstanding cache entries must be flushed to avoid accessing reference count
  * addresses that have been freed.  The call to xfs_filestream_flush() must be
- * made inside the block that holds the m_peraglock in write mode to do the
+ * made inside the block that holds the m_perag_lock in write mode to do the
  * reallocation.
  */
 void
@@ -531,7 +530,6 @@ xfs_filestream_associate(
 
 	mp = pip->i_mount;
 	cache = mp->m_filestream;
-	down_read(&mp->m_peraglock);
 
 	/*
 	 * We have a problem, Houston.
@@ -548,10 +546,8 @@ xfs_filestream_associate(
 	 *
 	 * So, if we can't get the iolock without sleeping then just give up
 	 */
-	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL)) {
-		up_read(&mp->m_peraglock);
+	if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
 		return 1;
-	}
 
 	/* If the parent directory is already in the cache, use its AG. */
 	item = xfs_mru_cache_lookup(cache, pip->i_ino);
@@ -606,7 +602,6 @@ exit_did_pick:
 
 exit:
 	xfs_iunlock(pip, XFS_IOLOCK_EXCL);
-	up_read(&mp->m_peraglock);
 	return -err;
 }
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a13919a6a364..37a6f62c57b6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,27 +167,14 @@ xfs_growfs_data_private(
 	}
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
-	if (nagcount > oagcount) {
-		void *new_perag, *old_perag;
-
-		xfs_filestream_flush(mp);
-
-		new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
-					KM_MAYFAIL);
-		if (!new_perag)
-			return XFS_ERROR(ENOMEM);
-
-		down_write(&mp->m_peraglock);
-		memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
-		old_perag = mp->m_perag;
-		mp->m_perag = new_perag;
-
-		mp->m_flags |= XFS_MOUNT_32BITINODES;
-		nagimax = xfs_initialize_perag(mp, nagcount);
-		up_write(&mp->m_peraglock);
 
-		kmem_free(old_perag);
+	/* allocate the new per-ag structures */
+	if (nagcount > oagcount) {
+		error = xfs_initialize_perag(mp, nagcount, &nagimax);
+		if (error)
+			return error;
 	}
+
 	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
 	tp->t_flags |= XFS_TRANS_RESERVE;
 	if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
@@ -196,6 +183,11 @@ xfs_growfs_data_private(
 		return error;
 	}
 
+	/*
+	 * Write new AG headers to disk. Non-transactional, but written
+	 * synchronously so they are completed prior to the growfs transaction
+	 * being logged.
+	 */
 	nfree = 0;
 	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
 		/*
@@ -359,6 +351,12 @@ xfs_growfs_data_private(
 			goto error0;
 		}
 	}
+
+	/*
+	 * Update changed superblock fields transactionally. These are not
+	 * seen by the rest of the world until the transaction commit applies
+	 * them atomically to the superblock.
+	 */
 	if (nagcount > oagcount)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
 	if (nb > mp->m_sb.sb_dblocks)
@@ -369,9 +367,9 @@ xfs_growfs_data_private(
 	if (dpct)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
 	error = xfs_trans_commit(tp, 0);
-	if (error) {
+	if (error)
 		return error;
-	}
+
 	/* New allocation groups fully initialized, so update mount struct */
 	if (nagimax)
 		mp->m_maxagi = nagimax;
@@ -381,6 +379,8 @@ xfs_growfs_data_private(
 		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
 	} else
 		mp->m_maxicount = 0;
+
+	/* update secondary superblocks. */
 	for (agno = 1; agno < nagcount; agno++) {
 		error = xfs_read_buf(mp, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 884ee1367f46..52c9d006c0e6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -383,11 +383,9 @@ xfs_ialloc_ag_alloc(
 	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
-	down_read(&args.mp->m_peraglock);
 	pag = xfs_perag_get(args.mp, agno);
 	pag->pagi_freecount += newlen;
 	xfs_perag_put(pag);
-	up_read(&args.mp->m_peraglock);
 	agi->agi_newino = cpu_to_be32(newino);
 
 	/*
@@ -489,7 +487,6 @@ xfs_ialloc_ag_select(
 	 */
 	agno = pagno;
 	flags = XFS_ALLOC_FLAG_TRYLOCK;
-	down_read(&mp->m_peraglock);
 	for (;;) {
 		pag = xfs_perag_get(mp, agno);
 		if (!pag->pagi_init) {
@@ -531,7 +528,6 @@ xfs_ialloc_ag_select(
 					goto nextag;
 				}
 				xfs_perag_put(pag);
-				up_read(&mp->m_peraglock);
 				return agbp;
 			}
 		}
@@ -544,18 +540,14 @@ nextag:
 		 * No point in iterating over the rest, if we're shutting
 		 * down.
 		 */
-		if (XFS_FORCED_SHUTDOWN(mp)) {
-			up_read(&mp->m_peraglock);
+		if (XFS_FORCED_SHUTDOWN(mp))
 			return NULL;
-		}
 		agno++;
 		if (agno >= agcount)
 			agno = 0;
 		if (agno == pagno) {
-			if (flags == 0) {
-				up_read(&mp->m_peraglock);
+			if (flags == 0)
 				return NULL;
-			}
 			flags = 0;
 		}
 	}
@@ -777,16 +769,13 @@ nextag:
 			*inop = NULLFSINO;
 			return noroom ? ENOSPC : 0;
 		}
-		down_read(&mp->m_peraglock);
 		pag = xfs_perag_get(mp, tagno);
 		if (pag->pagi_inodeok == 0) {
 			xfs_perag_put(pag);
-			up_read(&mp->m_peraglock);
 			goto nextag;
 		}
 		error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
 		xfs_perag_put(pag);
-		up_read(&mp->m_peraglock);
 		if (error)
 			goto nextag;
 		agi = XFS_BUF_TO_AGI(agbp);
@@ -1015,9 +1004,7 @@ alloc_inode:
 		goto error0;
 	be32_add_cpu(&agi->agi_freecount, -1);
 	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-	down_read(&mp->m_peraglock);
 	pag->pagi_freecount--;
-	up_read(&mp->m_peraglock);
 
 	error = xfs_check_agi_freecount(cur, agi);
 	if (error)
@@ -1100,9 +1087,7 @@ xfs_difree(
 	/*
 	 * Get the allocation group header.
 	 */
-	down_read(&mp->m_peraglock);
 	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-	up_read(&mp->m_peraglock);
 	if (error) {
 		cmn_err(CE_WARN,
 			"xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s.  Returning error.",
@@ -1169,11 +1154,9 @@ xfs_difree(
 		be32_add_cpu(&agi->agi_count, -ilen);
 		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
-		down_read(&mp->m_peraglock);
 		pag = xfs_perag_get(mp, agno);
 		pag->pagi_freecount -= ilen - 1;
 		xfs_perag_put(pag);
-		up_read(&mp->m_peraglock);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
 
@@ -1202,11 +1185,9 @@ xfs_difree(
 		 */
 		be32_add_cpu(&agi->agi_freecount, 1);
 		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-		down_read(&mp->m_peraglock);
 		pag = xfs_perag_get(mp, agno);
 		pag->pagi_freecount++;
 		xfs_perag_put(pag);
-		up_read(&mp->m_peraglock);
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
 	}
 
@@ -1328,9 +1309,7 @@ xfs_imap(
 		xfs_buf_t	*agbp;	/* agi buffer */
 		int		i;	/* temp state */
 
-		down_read(&mp->m_peraglock);
 		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
-		up_read(&mp->m_peraglock);
 		if (error) {
 			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
 					"xfs_ialloc_read_agi() returned "
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 62efab2f3839..940307a6a60b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -420,9 +420,7 @@ xfs_bulkstat(
 	while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
 		cond_resched();
 		bp = NULL;
-		down_read(&mp->m_peraglock);
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-		up_read(&mp->m_peraglock);
 		if (error) {
 			/*
 			 * Skip this allocation group and go to the next one.
@@ -849,9 +847,7 @@ xfs_inumbers(
 	agbp = NULL;
 	while (left > 0 && agno < mp->m_sb.sb_agcount) {
 		if (agbp == NULL) {
-			down_read(&mp->m_peraglock);
 			error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-			up_read(&mp->m_peraglock);
 			if (error) {
 				/*
 				 * If we can't read the AGI of this ag,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 9055b60730d0..c04dd83cb57c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -209,13 +209,16 @@ STATIC void
 xfs_free_perag(
 	xfs_mount_t	*mp)
 {
-	if (mp->m_perag) {
-		int	agno;
+	xfs_agnumber_t	agno;
+	struct xfs_perag *pag;
 
-		for (agno = 0; agno < mp->m_maxagi; agno++)
-			if (mp->m_perag[agno].pagb_list)
-				kmem_free(mp->m_perag[agno].pagb_list);
-		kmem_free(mp->m_perag);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		spin_lock(&mp->m_perag_lock);
+		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		spin_unlock(&mp->m_perag_lock);
+		ASSERT(pag);
+		kmem_free(pag->pagb_list);
+		kmem_free(pag);
 	}
 }
 
@@ -389,10 +392,11 @@ xfs_initialize_perag_icache(
 	}
 }
 
-xfs_agnumber_t
+int
 xfs_initialize_perag(
 	xfs_mount_t	*mp,
-	xfs_agnumber_t	agcount)
+	xfs_agnumber_t	agcount,
+	xfs_agnumber_t	*maxagi)
 {
 	xfs_agnumber_t	index, max_metadata;
 	xfs_perag_t	*pag;
@@ -405,6 +409,33 @@ xfs_initialize_perag(
 	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
 	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
 
+	/*
+	 * Walk the current per-ag tree so we don't try to initialise AGs
+	 * that already exist (growfs case). Allocate and insert all the
+	 * AGs we don't find ready for initialisation.
+	 */
+	for (index = 0; index < agcount; index++) {
+		pag = xfs_perag_get(mp, index);
+		if (pag) {
+			xfs_perag_put(pag);
+			continue;
+		}
+		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+		if (!pag)
+			return -ENOMEM;
+		if (radix_tree_preload(GFP_NOFS))
+			return -ENOMEM;
+		spin_lock(&mp->m_perag_lock);
+		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+			BUG();
+			spin_unlock(&mp->m_perag_lock);
+			kmem_free(pag);
+			return -EEXIST;
+		}
+		spin_unlock(&mp->m_perag_lock);
+		radix_tree_preload_end();
+	}
+
 	/* Clear the mount flag if no inode can overflow 32 bits
 	 * on this filesystem, or if specifically requested..
 	 */
@@ -454,7 +485,9 @@ xfs_initialize_perag(
 			xfs_perag_put(pag);
 		}
 	}
-	return index;
+	if (maxagi)
+		*maxagi = index;
+	return 0;
 }
 
 void
@@ -1155,13 +1188,13 @@ xfs_mountfs(
 	/*
 	 * Allocate and initialize the per-ag data.
 	 */
-	init_rwsem(&mp->m_peraglock);
-	mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
-				  KM_MAYFAIL);
-	if (!mp->m_perag)
+	spin_lock_init(&mp->m_perag_lock);
+	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
 		goto out_remove_uuid;
-
-	mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
+	}
 
 	if (!sbp->sb_logblocks) {
 		cmn_err(CE_WARN, "XFS: no log defined");
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f8a68a2319b5..cfa7a5d22e72 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -207,8 +207,8 @@ typedef struct xfs_mount {
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
 	uint			m_in_maxlevels;	/* max inobt btree levels. */
-	struct xfs_perag	*m_perag;	/* per-ag accounting info */
-	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
+	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
+	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
 	struct mutex		m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
@@ -389,7 +389,12 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 static inline struct xfs_perag *
 xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 {
-	return &mp->m_perag[agno];
+	struct xfs_perag	*pag;
+
+	spin_lock(&mp->m_perag_lock);
+	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	spin_unlock(&mp->m_perag_lock);
+	return pag;
 }
 
 static inline void
@@ -450,7 +455,8 @@ extern struct xfs_dmops xfs_dmcore_xfs;
 #endif	/* __KERNEL__ */
 
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern xfs_agnumber_t	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
+extern int	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
+					xfs_agnumber_t *);
 extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
 extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 
-- 
cgit v1.2.3


From aed3bb90abaf0b42e8c8747e192f7bb97f445279 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:45 +0000
Subject: xfs: Reference count per-ag structures

Reference count the per-ag structures to ensure that we keep get/put
pairs balanced. Assert that the reference counts are zero at unmount
time to catch leaks. In future, reference counts will enable us to
safely remove perag structures by allowing us to detect when they
are no longer in use.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_ag.h    |  4 ++--
 fs/xfs/xfs_mount.c |  1 +
 fs/xfs/xfs_mount.h | 11 +++++++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 6702bd865811..18ae43f4255d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -196,8 +196,8 @@ typedef struct xfs_perag_busy {
 #define XFS_PAGB_NUM_SLOTS	128
 #endif
 
-typedef struct xfs_perag
-{
+typedef struct xfs_perag {
+	atomic_t	pag_ref;	/* perag reference count */
 	char		pagf_init;	/* this agf's entry is initialized */
 	char		pagi_init;	/* this agi's entry is initialized */
 	char		pagf_metadata;	/* the agf is preferred to be metadata */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c04dd83cb57c..f241fec26070 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -215,6 +215,7 @@ xfs_free_perag(
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		spin_lock(&mp->m_perag_lock);
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		kmem_free(pag->pagb_list);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index cfa7a5d22e72..16b22120b98f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -384,7 +384,7 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 }
 
 /*
- * perag get/put wrappers for eventual ref counting
+ * perag get/put wrappers for ref counting
  */
 static inline struct xfs_perag *
 xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
@@ -393,6 +393,12 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 
 	spin_lock(&mp->m_perag_lock);
 	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	if (pag) {
+		ASSERT(atomic_read(&pag->pag_ref) >= 0);
+		/* catch leaks in the positive direction during testing */
+		ASSERT(atomic_read(&pag->pag_ref) < 1000);
+		atomic_inc(&pag->pag_ref);
+	}
 	spin_unlock(&mp->m_perag_lock);
 	return pag;
 }
@@ -400,7 +406,8 @@ xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
 static inline void
 xfs_perag_put(struct xfs_perag *pag)
 {
-	/* nothing to see here, move along */
+	ASSERT(atomic_read(&pag->pag_ref) > 0);
+	atomic_dec(&pag->pag_ref);
 }
 
 /*
-- 
cgit v1.2.3


From 0fa800fbd549736dfdc1d7761f87e33dc8cd973b Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:46 +0000
Subject: xfs: Add trace points for per-ag refcount debugging.

Uninline xfs_perag_{get,put} so that tracepoints can be inserted
into them to speed debugging of reference count problems.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_trace.h | 27 +++++++++++++++++++++++++++
 fs/xfs/xfs_ag.h              |  2 ++
 fs/xfs/xfs_mount.c           | 34 ++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_mount.h           | 25 ++-----------------------
 4 files changed, 65 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 3353aef50530..1bb09e70b2eb 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -78,6 +78,33 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	)
 )
 
+#define DEFINE_PERAG_REF_EVENT(name) \
+TRACE_EVENT(name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
+		 unsigned long caller_ip), \
+	TP_ARGS(mp, agno, refcount, caller_ip), \
+	TP_STRUCT__entry( \
+		__field(dev_t, dev) \
+		__field(xfs_agnumber_t, agno) \
+		__field(int, refcount) \
+		__field(unsigned long, caller_ip) \
+	), \
+	TP_fast_assign( \
+		__entry->dev = mp->m_super->s_dev; \
+		__entry->agno = agno; \
+		__entry->refcount = refcount; \
+		__entry->caller_ip = caller_ip; \
+	), \
+	TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
+		  MAJOR(__entry->dev), MINOR(__entry->dev), \
+		  __entry->agno, \
+		  __entry->refcount, \
+		  (char *)__entry->caller_ip) \
+);
+
+DEFINE_PERAG_REF_EVENT(xfs_perag_get)
+DEFINE_PERAG_REF_EVENT(xfs_perag_put)
+
 #define DEFINE_ATTR_LIST_EVENT(name) \
 DEFINE_EVENT(xfs_attr_list_class, name, \
 	TP_PROTO(struct xfs_attr_list_context *ctx), \
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 18ae43f4255d..963bc2700bf7 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -197,6 +197,8 @@ typedef struct xfs_perag_busy {
 #endif
 
 typedef struct xfs_perag {
+	struct xfs_mount *pag_mount;	/* owner filesystem */
+	xfs_agnumber_t	pag_agno;	/* AG this structure belongs to */
 	atomic_t	pag_ref;	/* perag reference count */
 	char		pagf_init;	/* this agf's entry is initialized */
 	char		pagi_init;	/* this agi's entry is initialized */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f241fec26070..049dbc71c28e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -200,6 +200,38 @@ xfs_uuid_unmount(
 }
 
 
+/*
+ * Reference counting access wrappers to the perag structures.
+ */
+struct xfs_perag *
+xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+	struct xfs_perag	*pag;
+	int			ref = 0;
+
+	spin_lock(&mp->m_perag_lock);
+	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	if (pag) {
+		ASSERT(atomic_read(&pag->pag_ref) >= 0);
+		/* catch leaks in the positive direction during testing */
+		ASSERT(atomic_read(&pag->pag_ref) < 1000);
+		ref = atomic_inc_return(&pag->pag_ref);
+	}
+	spin_unlock(&mp->m_perag_lock);
+	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+	return pag;
+}
+
+void
+xfs_perag_put(struct xfs_perag *pag)
+{
+	int	ref;
+
+	ASSERT(atomic_read(&pag->pag_ref) > 0);
+	ref = atomic_dec_return(&pag->pag_ref);
+	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
 /*
  * Free up the resources associated with a mount structure.  Assume that
  * the structure was initially zeroed, so we can tell which fields got
@@ -433,6 +465,8 @@ xfs_initialize_perag(
 			kmem_free(pag);
 			return -EEXIST;
 		}
+		pag->pag_agno = index;
+		pag->pag_mount = mp;
 		spin_unlock(&mp->m_perag_lock);
 		radix_tree_preload_end();
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 16b22120b98f..e62fd1cde464 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -386,29 +386,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 /*
  * perag get/put wrappers for ref counting
  */
-static inline struct xfs_perag *
-xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
-{
-	struct xfs_perag	*pag;
-
-	spin_lock(&mp->m_perag_lock);
-	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
-	if (pag) {
-		ASSERT(atomic_read(&pag->pag_ref) >= 0);
-		/* catch leaks in the positive direction during testing */
-		ASSERT(atomic_read(&pag->pag_ref) < 1000);
-		atomic_inc(&pag->pag_ref);
-	}
-	spin_unlock(&mp->m_perag_lock);
-	return pag;
-}
-
-static inline void
-xfs_perag_put(struct xfs_perag *pag)
-{
-	ASSERT(atomic_read(&pag->pag_ref) > 0);
-	atomic_dec(&pag->pag_ref);
-}
+struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
+void	xfs_perag_put(struct xfs_perag *pag);
 
 /*
  * Per-cpu superblock locking functions
-- 
cgit v1.2.3


From b657fc82a3ca6d7ad16a59e81765f0fb0e86cdbb Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:47 +0000
Subject: xfs: Kill filestreams cache flush

The filestreams cache flush is not needed in the sync code as it
does not affect data writeback, and it is now not used by the growfs
code, either, so kill it.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c |  3 ---
 fs/xfs/xfs_filestream.c     | 14 --------------
 fs/xfs/xfs_filestream.h     |  1 -
 fs/xfs/xfs_mru_cache.c      |  2 +-
 fs/xfs/xfs_mru_cache.h      |  1 -
 5 files changed, 1 insertion(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index cc964faf12e9..b58f8412dfe2 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -451,9 +451,6 @@ xfs_quiesce_data(
 	xfs_sync_data(mp, SYNC_WAIT);
 	xfs_qm_sync(mp, SYNC_WAIT);
 
-	/* drop inode references pinned by filestreams */
-	xfs_filestream_flush(mp);
-
 	/* write superblock and hoover up shutdown errors */
 	error = xfs_sync_fsdata(mp, SYNC_WAIT);
 
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 914d00d0f119..390850ee6603 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -454,20 +454,6 @@ xfs_filestream_unmount(
 	xfs_mru_cache_destroy(mp->m_filestream);
 }
 
-/*
- * If the mount point's m_perag tree is going to be modified, all
- * outstanding cache entries must be flushed to avoid accessing reference count
- * addresses that have been freed.  The call to xfs_filestream_flush() must be
- * made inside the block that holds the m_perag_lock in write mode to do the
- * reallocation.
- */
-void
-xfs_filestream_flush(
-	xfs_mount_t	*mp)
-{
-	xfs_mru_cache_flush(mp->m_filestream);
-}
-
 /*
  * Return the AG of the filestream the file or directory belongs to, or
  * NULLAGNUMBER otherwise.
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 58378b2ea033..260f757bbc5d 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -135,7 +135,6 @@ int xfs_filestream_init(void);
 void xfs_filestream_uninit(void);
 int xfs_filestream_mount(struct xfs_mount *mp);
 void xfs_filestream_unmount(struct xfs_mount *mp);
-void xfs_filestream_flush(struct xfs_mount *mp);
 xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
 int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
 void xfs_filestream_deassociate(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4b0613d99faa..45ce15dc5b2b 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -398,7 +398,7 @@ exit:
  * guaranteed that all the free functions for all the elements have finished
  * executing and the reaper is not running.
  */
-void
+static void
 xfs_mru_cache_flush(
 	xfs_mru_cache_t		*mru)
 {
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 5d439f34b0c9..36dd3ec8b4eb 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -42,7 +42,6 @@ void xfs_mru_cache_uninit(void);
 int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
 			     unsigned int grp_count,
 			     xfs_mru_cache_free_func_t free_func);
-void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
 void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
 int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 				void *value);
-- 
cgit v1.2.3


From 8b26c5825e023b1bccac7afd174ebe55b8905cb1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:48 +0000
Subject: xfs: handle ENOMEM correctly during initialisation of perag
 structures

Add proper error handling in case an error occurs while initializing
new perag structures for a mount point.  The mount structure is
restored to its previous state by deleting and freeing any perag
structures added during the call.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 049dbc71c28e..be643e588067 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -432,11 +432,13 @@ xfs_initialize_perag(
 	xfs_agnumber_t	*maxagi)
 {
 	xfs_agnumber_t	index, max_metadata;
+	xfs_agnumber_t	first_initialised = 0;
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 	xfs_ino_t	ino;
 	xfs_sb_t	*sbp = &mp->m_sb;
 	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
+	int		error = -ENOMEM;
 
 	/* Check to see if the filesystem can overflow 32 bit inodes */
 	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
@@ -453,17 +455,20 @@ xfs_initialize_perag(
 			xfs_perag_put(pag);
 			continue;
 		}
+		if (!first_initialised)
+			first_initialised = index;
 		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
 		if (!pag)
-			return -ENOMEM;
+			goto out_unwind;
 		if (radix_tree_preload(GFP_NOFS))
-			return -ENOMEM;
+			goto out_unwind;
 		spin_lock(&mp->m_perag_lock);
 		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
 			BUG();
 			spin_unlock(&mp->m_perag_lock);
-			kmem_free(pag);
-			return -EEXIST;
+			radix_tree_preload_end();
+			error = -EEXIST;
+			goto out_unwind;
 		}
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
@@ -523,6 +528,14 @@ xfs_initialize_perag(
 	if (maxagi)
 		*maxagi = index;
 	return 0;
+
+out_unwind:
+	kmem_free(pag);
+	for (; index > first_initialised; index--) {
+		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		kmem_free(pag);
+	}
+	return error;
 }
 
 void
-- 
cgit v1.2.3


From e57336ff7fc7520bec7b3a7741043bdebaf622ea Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:49 +0000
Subject: xfs: embed the pagb_list array in the perag structure

Now that the perag structure is allocated memory rather than held in
an array, we don't need to have the busy extent array external to
the structure. Embed it into the perag structure to avoid needing an
extra allocation when setting up.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_ag.h    | 10 ++--------
 fs/xfs/xfs_alloc.c |  4 ++--
 fs/xfs/xfs_mount.c |  3 +--
 3 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 963bc2700bf7..b1a5a1ff88ea 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,14 +187,8 @@ typedef struct xfs_perag_busy {
 /*
  * Per-ag incore structure, copies of information in agf and agi,
  * to improve the performance of allocation group selection.
- *
- * pick sizes which fit in allocation buckets well
  */
-#if (BITS_PER_LONG == 32)
-#define XFS_PAGB_NUM_SLOTS	84
-#elif (BITS_PER_LONG == 64)
 #define XFS_PAGB_NUM_SLOTS	128
-#endif
 
 typedef struct xfs_perag {
 	struct xfs_mount *pag_mount;	/* owner filesystem */
@@ -212,8 +206,6 @@ typedef struct xfs_perag {
 	__uint32_t	pagf_btreeblks;	/* # of blocks held in AGF btrees */
 	xfs_agino_t	pagi_freecount;	/* number of free inodes */
 	xfs_agino_t	pagi_count;	/* number of allocated inodes */
-	int		pagb_count;	/* pagb slots in use */
-	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
 
 	/*
 	 * Inode allocation search lookup optimisation.
@@ -232,6 +224,8 @@ typedef struct xfs_perag {
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
 #endif
+	int		pagb_count;	/* pagb slots in use */
+	xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS];	/* unstable blocks */
 } xfs_perag_t;
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4d66bb75579c..8aa181d6dd7d 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2200,8 +2200,8 @@ xfs_alloc_read_agf(
 		pag->pagf_levels[XFS_BTNUM_CNTi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
 		spin_lock_init(&pag->pagb_lock);
-		pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
-					sizeof(xfs_perag_busy_t), KM_SLEEP);
+		pag->pagb_count = 0;
+		memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
 		pag->pagf_init = 1;
 	}
 #ifdef DEBUG
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index be643e588067..0df5045abd3b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -247,10 +247,9 @@ xfs_free_perag(
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		spin_lock(&mp->m_perag_lock);
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		spin_unlock(&mp->m_perag_lock);
-		ASSERT(pag);
-		kmem_free(pag->pagb_list);
 		kmem_free(pag);
 	}
 }
-- 
cgit v1.2.3


From 873ff5501d8cd1a21045d6c1da34f0c3876bc235 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 13 Jan 2010 22:17:57 +0000
Subject: xfs: clean up log buffer writes

Don't bother using XFS_bwrite as it doesn't provide much code for
our use case.  Instead opencode it and fold xlog_bdstrat_cb into the
new xlog_bdstrat helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.h |  2 --
 fs/xfs/xfs_log.c           | 67 +++++++++++++++++++++++-----------------------
 2 files changed, 33 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a34c7b54822d..c20a76001867 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -408,8 +408,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
 	return error;
 }
 
-#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
-
 #define xfs_iowait(bp)	xfs_buf_iowait(bp)
 
 #define xfs_baread(target, rablkno, ralen)  \
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 600b5b06aaeb..0d17516fbb13 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -50,7 +50,6 @@ kmem_zone_t	*xfs_log_ticket_zone;
 	  (off) += (bytes);}
 
 /* Local miscellaneous function prototypes */
-STATIC int	 xlog_bdstrat_cb(struct xfs_buf *);
 STATIC int	 xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
 				    xlog_in_core_t **, xfs_lsn_t *);
 STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
@@ -987,35 +986,6 @@ xlog_iodone(xfs_buf_t *bp)
 
 }	/* xlog_iodone */
 
-/*
- * The bdstrat callback function for log bufs. This gives us a central
- * place to trap bufs in case we get hit by a log I/O error and need to
- * shutdown. Actually, in practice, even when we didn't get a log error,
- * we transition the iclogs to IOERROR state *after* flushing all existing
- * iclogs to disk. This is because we don't want anymore new transactions to be
- * started or completed afterwards.
- */
-STATIC int
-xlog_bdstrat_cb(struct xfs_buf *bp)
-{
-	xlog_in_core_t *iclog;
-
-	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
-
-	if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
-	  /* note for irix bstrat will need  struct bdevsw passed
-	   * Fix the following macro if the code ever is merged
-	   */
-	    XFS_bdstrat(bp);
-		return 0;
-	}
-
-	XFS_BUF_ERROR(bp, EIO);
-	XFS_BUF_STALE(bp);
-	xfs_biodone(bp);
-	return XFS_ERROR(EIO);
-}
-
 /*
  * Return size of each in-core log record buffer.
  *
@@ -1158,7 +1128,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	if (!bp)
 		goto out_free_log;
 	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
-	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
 	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
 	ASSERT(XFS_BUF_ISBUSY(bp));
 	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
@@ -1196,7 +1165,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		if (!XFS_BUF_CPSEMA(bp))
 			ASSERT(0);
 		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
-		XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
 		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
 		iclog->ic_bp = bp;
 		iclog->ic_data = bp->b_addr;
@@ -1343,6 +1311,37 @@ xlog_grant_push_ail(xfs_mount_t	*mp,
 	    xfs_trans_ail_push(log->l_ailp, threshold_lsn);
 }	/* xlog_grant_push_ail */
 
+/*
+ * The bdstrat callback function for log bufs. This gives us a central
+ * place to trap bufs in case we get hit by a log I/O error and need to
+ * shutdown. Actually, in practice, even when we didn't get a log error,
+ * we transition the iclogs to IOERROR state *after* flushing all existing
+ * iclogs to disk. This is because we don't want anymore new transactions to be
+ * started or completed afterwards.
+ */
+STATIC int
+xlog_bdstrat(
+	struct xfs_buf		*bp)
+{
+	struct xlog_in_core	*iclog;
+
+	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_STALE(bp);
+		xfs_biodone(bp);
+		/*
+		 * It would seem logical to return EIO here, but we rely on
+		 * the log state machine to propagate I/O errors instead of
+		 * doing it here.
+		 */
+		return 0;
+	}
+
+	bp->b_flags |= _XBF_RUN_QUEUES;
+	xfs_buf_iorequest(bp);
+	return 0;
+}
 
 /*
  * Flush out the in-core log (iclog) to the on-disk log in an asynchronous 
@@ -1462,7 +1461,7 @@ xlog_sync(xlog_t		*log,
 	 */
 	XFS_BUF_WRITE(bp);
 
-	if ((error = XFS_bwrite(bp))) {
+	if ((error = xlog_bdstrat(bp))) {
 		xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
 				  XFS_BUF_ADDR(bp));
 		return error;
@@ -1502,7 +1501,7 @@ xlog_sync(xlog_t		*log,
 		/* account for internal log which doesn't start at block #0 */
 		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
 		XFS_BUF_WRITE(bp);
-		if ((error = XFS_bwrite(bp))) {
+		if ((error = xlog_bdstrat(bp))) {
 			xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
 					  bp, XFS_BUF_ADDR(bp));
 			return error;
-- 
cgit v1.2.3


From 64e0bc7d2a6609ad265757a600e2a0d93c8adb47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 13 Jan 2010 22:17:58 +0000
Subject: xfs: clean up xfs_bwrite

Fold XFS_bwrite into it's only caller, xfs_bwrite and move it into
xfs_buf.c instead of leaving it as a fairly large inline function.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 27 +++++++++++++++++++++++++++
 fs/xfs/linux-2.6/xfs_buf.h | 19 +------------------
 fs/xfs/xfs_rw.c            | 31 -------------------------------
 fs/xfs/xfs_rw.h            |  1 -
 4 files changed, 28 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 18ae3ba8f78a..492465c6e0b4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1050,6 +1050,33 @@ xfs_buf_ioerror(
 	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
 }
 
+int
+xfs_bwrite(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	int			iowait = (bp->b_flags & XBF_ASYNC) == 0;
+	int			error = 0;
+
+	bp->b_strat = xfs_bdstrat_cb;
+	bp->b_mount = mp;
+	bp->b_flags |= XBF_WRITE;
+	if (!iowait)
+		bp->b_flags |= _XBF_RUN_QUEUES;
+
+	xfs_buf_delwri_dequeue(bp);
+	xfs_buf_iostrategy(bp);
+
+	if (iowait) {
+		error = xfs_buf_iowait(bp);
+		if (error)
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+		xfs_buf_relse(bp);
+	}
+
+	return error;
+}
+
 int
 xfs_bawrite(
 	void			*mp,
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index c20a76001867..f69b8e714a11 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -232,6 +232,7 @@ extern void xfs_buf_lock(xfs_buf_t *);
 extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
+extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
 extern void xfs_buf_ioend(xfs_buf_t *,	int);
@@ -390,24 +391,6 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 #define xfs_biozero(bp, off, len) \
 	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
-
-static inline int XFS_bwrite(xfs_buf_t *bp)
-{
-	int	iowait = (bp->b_flags & XBF_ASYNC) == 0;
-	int	error = 0;
-
-	if (!iowait)
-		bp->b_flags |= _XBF_RUN_QUEUES;
-
-	xfs_buf_delwri_dequeue(bp);
-	xfs_buf_iostrategy(bp);
-	if (iowait) {
-		error = xfs_buf_iowait(bp);
-		xfs_buf_relse(bp);
-	}
-	return error;
-}
-
 #define xfs_iowait(bp)	xfs_buf_iowait(bp)
 
 #define xfs_baread(target, rablkno, ralen)  \
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 5aa07caea5f1..9d933a10d6bb 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -305,37 +305,6 @@ xfs_read_buf(
 	return (error);
 }
 
-/*
- * Wrapper around bwrite() so that we can trap
- * write errors, and act accordingly.
- */
-int
-xfs_bwrite(
-	struct xfs_mount *mp,
-	struct xfs_buf	 *bp)
-{
-	int	error;
-
-	/*
-	 * XXXsup how does this work for quotas.
-	 */
-	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
-	bp->b_mount = mp;
-	XFS_BUF_WRITE(bp);
-
-	if ((error = XFS_bwrite(bp))) {
-		ASSERT(mp);
-		/*
-		 * Cannot put a buftrace here since if the buffer is not
-		 * B_HOLD then we will brelse() the buffer before returning
-		 * from bwrite and we could be tracing a buffer that has
-		 * been reused.
-		 */
-		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-	}
-	return (error);
-}
-
 /*
  * helper function to extract extent size hint from inode
  */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index 571f2174435c..ff68eb5e738e 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -40,7 +40,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
 extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
-- 
cgit v1.2.3


From 4e23471a3f3aba885ea70100db47ccacb5f069f6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 13 Jan 2010 22:17:56 +0000
Subject: xfs: move more buffer helpers into xfs_buf.c

Move xfsbdstrat and xfs_bdstrat_cb from xfs_lrw.c and xfs_bioerror
and xfs_bioerror_relse from xfs_rw.c into xfs_buf.c.  This also
means xfs_bioerror and xfs_bioerror_relse can be marked static now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 120 +++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/linux-2.6/xfs_buf.h |   4 ++
 fs/xfs/linux-2.6/xfs_lrw.c |  47 ------------------
 fs/xfs/linux-2.6/xfs_lrw.h |   3 --
 fs/xfs/xfs_rw.c            |  82 -------------------------------
 fs/xfs/xfs_rw.h            |   2 -
 6 files changed, 124 insertions(+), 134 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 492465c6e0b4..158fad4550df 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1112,6 +1112,126 @@ xfs_bdwrite(
 	xfs_buf_delwri_queue(bp, 1);
 }
 
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+	xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+	/*
+	 * No need to wait until the buffer is unpinned, we aren't flushing it.
+	 */
+	XFS_BUF_ERROR(bp, EIO);
+
+	/*
+	 * We're calling biodone, so delete XBF_DONE flag.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_STALE(bp);
+
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	xfs_biodone(bp);
+
+	return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+	struct xfs_buf	*bp)
+{
+	int64_t		fl = XFS_BUF_BFLAGS(bp);
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 *
+	 * chunkhold expects B_DONE to be set, whether
+	 * we actually finish the I/O or not. We don't want to
+	 * change that interface.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_STALE(bp);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	if (!(fl & XFS_B_ASYNC)) {
+		/*
+		 * Mark b_error and B_ERROR _both_.
+		 * Lot's of chunkcache code assumes that.
+		 * There's no reason to mark error for
+		 * ASYNC buffers.
+		 */
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_FINISH_IOWAIT(bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+
+	return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+	struct xfs_buf	*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		/*
+		 * Metadata write that didn't get logged but
+		 * written delayed anyway. These aren't associated
+		 * with a transaction, and can be ignored.
+		 */
+		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+			return xfs_bioerror_relse(bp);
+		else
+			return xfs_bioerror(bp);
+	}
+
+	xfs_buf_iorequest(bp);
+	return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		xfs_bioerror_relse(bp);
+		return;
+	}
+
+	xfs_buf_iorequest(bp);
+}
+
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t		*bp,
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index f69b8e714a11..9a29d18656ec 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -235,6 +235,10 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
+
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+extern int xfs_bdstrat_cb(struct xfs_buf *);
+
 extern void xfs_buf_ioend(xfs_buf_t *,	int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern int xfs_buf_iorequest(xfs_buf_t *);
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 0d32457abef1..c80fa00d2ad7 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -783,53 +783,6 @@ write_retry:
 	return -error;
 }
 
-/*
- * All xfs metadata buffers except log state machine buffers
- * get this attached as their b_bdstrat callback function.
- * This is so that we can catch a buffer
- * after prematurely unpinning it to forcibly shutdown the filesystem.
- */
-int
-xfs_bdstrat_cb(struct xfs_buf *bp)
-{
-	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		/*
-		 * Metadata write that didn't get logged but
-		 * written delayed anyway. These aren't associated
-		 * with a transaction, and can be ignored.
-		 */
-		if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
-		    (XFS_BUF_ISREAD(bp)) == 0)
-			return (xfs_bioerror_relse(bp));
-		else
-			return (xfs_bioerror(bp));
-	}
-
-	xfs_buf_iorequest(bp);
-	return 0;
-}
-
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem.  Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	ASSERT(mp);
-	if (!XFS_FORCED_SHUTDOWN(mp)) {
-		xfs_buf_iorequest(bp);
-		return;
-	}
-
-	trace_xfs_bdstrat_shut(bp, _RET_IP_);
-	xfs_bioerror_relse(bp);
-}
-
 /*
  * If the underlying (data/log/rt) device is readonly, there are some
  * operations that cannot proceed.
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index d1f7789c7ffb..342ae8c0d011 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -22,9 +22,6 @@ struct xfs_mount;
 struct xfs_inode;
 struct xfs_buf;
 
-/* errors from xfsbdstrat() must be extracted from the buffer */
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-extern int xfs_bdstrat_cb(struct xfs_buf *);
 extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
 
 extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 9d933a10d6bb..abb2c458b148 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -153,88 +153,6 @@ xfs_do_force_shutdown(
 	}
 }
 
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call biodone
- * so that the proper iodone callbacks get called.
- */
-int
-xfs_bioerror(
-	xfs_buf_t *bp)
-{
-
-#ifdef XFSERRORDEBUG
-	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 */
-	XFS_BUF_ERROR(bp, EIO);
-	/*
-	 * We're calling biodone, so delete B_DONE flag. Either way
-	 * we have to call the iodone callback, and calling biodone
-	 * probably is the best way since it takes care of
-	 * GRIO as well.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_UNDONE(bp);
-	XFS_BUF_STALE(bp);
-
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	xfs_biodone(bp);
-
-	return (EIO);
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the biodone call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-int
-xfs_bioerror_relse(
-	xfs_buf_t *bp)
-{
-	int64_t fl;
-
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
-	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
-
-	fl = XFS_BUF_BFLAGS(bp);
-	/*
-	 * No need to wait until the buffer is unpinned.
-	 * We aren't flushing it.
-	 *
-	 * chunkhold expects B_DONE to be set, whether
-	 * we actually finish the I/O or not. We don't want to
-	 * change that interface.
-	 */
-	XFS_BUF_UNREAD(bp);
-	XFS_BUF_UNDELAYWRITE(bp);
-	XFS_BUF_DONE(bp);
-	XFS_BUF_STALE(bp);
-	XFS_BUF_CLR_IODONE_FUNC(bp);
-	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	if (!(fl & XFS_B_ASYNC)) {
-		/*
-		 * Mark b_error and B_ERROR _both_.
-		 * Lot's of chunkcache code assumes that.
-		 * There's no reason to mark error for
-		 * ASYNC buffers.
-		 */
-		XFS_BUF_ERROR(bp, EIO);
-		XFS_BUF_FINISH_IOWAIT(bp);
-	} else {
-		xfs_buf_relse(bp);
-	}
-	return (EIO);
-}
-
 /*
  * Prints out an ALERT message about I/O error.
  */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index ff68eb5e738e..a54c3b7cd376 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -40,8 +40,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_bioerror(struct xfs_buf *bp);
-extern int xfs_bioerror_relse(struct xfs_buf *bp);
 extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 			xfs_daddr_t blkno, int len, uint flags,
 			struct xfs_buf **bpp);
-- 
cgit v1.2.3


From e2bcd936eb95d0019ca5e05f9fdd27e770ddded1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:44:58 +1100
Subject: xfs: directory names are unsigned

Convert the struct xfs_name to use unsigned chars for the name
strings to match both what is stored on disk (__uint8_t) and what
the VFS expects (unsigned char).

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index d725428c9df6..b09904555d07 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -151,8 +151,8 @@ typedef enum {
 } xfs_btnum_t;
 
 struct xfs_name {
-	const char	*name;
-	int		len;
+	const unsigned char	*name;
+	int			len;
 };
 
 #endif	/* __XFS_TYPES_H__ */
-- 
cgit v1.2.3


From 046ea753130fc51d885835458bf8c1d84765b9ac Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:47:08 +1100
Subject: xfs: convert DM ops to use unsigned char names

dmops uses a signed char for it's namespace event. To be consistent
with the rest of the code, convert them to unsigned char for the
namespace string.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_mount.h    | 3 ++-
 fs/xfs/xfs_vnodeops.c | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e62fd1cde464..f4d1441f3f15 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -78,7 +78,8 @@ typedef int	(*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
 			struct xfs_inode *, dm_right_t,
 			struct xfs_inode *, dm_right_t,
-			const char *, const char *, mode_t, int, int);
+			const unsigned char *, const unsigned char *,
+			mode_t, int, int);
 typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
 			char *, char *);
 typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 6f268756bf36..9f7c001ef469 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2199,7 +2199,8 @@ xfs_symlink(
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					link_name->name, target_path, 0, 0, 0);
+					link_name->name,
+					(unsigned char *)target_path, 0, 0, 0);
 		if (error)
 			return error;
 	}
@@ -2395,7 +2396,8 @@ std_return:
 					dp, DM_RIGHT_NULL,
 					error ? NULL : ip,
 					DM_RIGHT_NULL, link_name->name,
-					target_path, 0, error, 0);
+					(unsigned char *)target_path,
+					0, error, 0);
 	}
 
 	if (!error)
-- 
cgit v1.2.3


From 2bc754213d40d67c39ddd58cf240f2b948e1951e Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:47:17 +1100
Subject: xfs: convert dirnameops to unsigned char names

To be consistent across the codebase, convert the dirnameops to pass
the directory names by unsigned char strings.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_da_btree.c | 4 ++--
 fs/xfs/xfs_da_btree.h | 5 +++--
 fs/xfs/xfs_dir2.c     | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c0c8869115b1..0ca556b4bf31 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1534,8 +1534,8 @@ xfs_da_hashname(const __uint8_t *name, int namelen)
 enum xfs_dacmp
 xfs_da_compname(
 	struct xfs_da_args *args,
-	const char 	*name,
-	int 		len)
+	const unsigned char *name,
+	int		len)
 {
 	return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
 					XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 30cd08f56a3a..fe9f5a8c1d2a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -209,7 +209,8 @@ typedef struct xfs_da_state {
  */
 struct xfs_nameops {
 	xfs_dahash_t	(*hashname)(struct xfs_name *);
-	enum xfs_dacmp	(*compname)(struct xfs_da_args *, const char *, int);
+	enum xfs_dacmp	(*compname)(struct xfs_da_args *,
+					const unsigned char *, int);
 };
 
 
@@ -260,7 +261,7 @@ int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 
 uint xfs_da_hashname(const __uint8_t *name_string, int name_length);
 enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
-				const char *name, int len);
+				const unsigned char *name, int len);
 
 
 xfs_da_state_t *xfs_da_state_alloc(void);
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 93634a7e90e9..c21c527766bf 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -66,8 +66,8 @@ xfs_ascii_ci_hashname(
 STATIC enum xfs_dacmp
 xfs_ascii_ci_compname(
 	struct xfs_da_args *args,
-	const char	*name,
-	int 		len)
+	const unsigned char *name,
+	int		len)
 {
 	enum xfs_dacmp	result;
 	int		i;
-- 
cgit v1.2.3


From a3380ae39fa321282c407ba5e1835e14b64853d9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:47:25 +1100
Subject: xfs: make xfs_dir_cilookup_result use unsigned char

For consistency with the result of the code.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_dir2.c | 2 +-
 fs/xfs/xfs_dir2.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index c21c527766bf..3a8c6ba0638f 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -247,7 +247,7 @@ xfs_dir_createname(
 int
 xfs_dir_cilookup_result(
 	struct xfs_da_args *args,
-	const char	*name,
+	const unsigned char *name,
 	int		len)
 {
 	if (args->cmpresult == XFS_CMP_DIFFERENT)
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 1d9ef96f33aa..74a3b1057685 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -100,7 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
 extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
 				struct xfs_dabuf *bp);
 
-extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name,
-				int len);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
+				const unsigned char *name, int len);
 
 #endif	/* __XFS_DIR2_H__ */
-- 
cgit v1.2.3


From b9c48649577dfc4a8c263c106d518effa24ea54b Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:47:39 +1100
Subject: xfs: xfs_buf_iomove() doesn't care about signedness

xfs_buf_iomove() uses xfs_caddr_t as it's parameter types, but it doesn't
care about the signedness of the variables as it is just copying the
data. Change the prototype to use void * so that we don't get sign
warnings at call sites.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_buf.c | 2 +-
 fs/xfs/linux-2.6/xfs_buf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 158fad4550df..efd745bb8887 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1443,7 +1443,7 @@ xfs_buf_iomove(
 	xfs_buf_t		*bp,	/* buffer to process		*/
 	size_t			boff,	/* starting buffer offset	*/
 	size_t			bsize,	/* length to copy		*/
-	caddr_t			data,	/* data address			*/
+	void			*data,	/* data address			*/
 	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
 {
 	size_t			bend, cpoff, csize;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 9a29d18656ec..4f2ad66edb76 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -243,7 +243,7 @@ extern void xfs_buf_ioend(xfs_buf_t *,	int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern int xfs_buf_iorequest(xfs_buf_t *);
 extern int xfs_buf_iowait(xfs_buf_t *);
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
+extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 				xfs_buf_rw_t);
 
 static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
-- 
cgit v1.2.3


From a9273ca5c6814f393e18ed66645f817b2b71e9ad Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:47:48 +1100
Subject: xfs: convert attr to use unsigned names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To be consistent with the directory code, the attr code should use
unsigned names. Convert the names from the vfs at the highest level
to unsigned, and ænsure they are consistenly used as unsigned down
to disk.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_acl.c     | 11 ++++++-----
 fs/xfs/linux-2.6/xfs_ioctl.c   | 18 +++++++++---------
 fs/xfs/linux-2.6/xfs_ioctl.h   | 12 ++++++------
 fs/xfs/linux-2.6/xfs_ioctl32.c |  4 ++--
 fs/xfs/linux-2.6/xfs_iops.c    |  4 ++--
 fs/xfs/linux-2.6/xfs_xattr.c   | 27 +++++++++++++++++++--------
 fs/xfs/xfs_acl.h               |  4 ++--
 fs/xfs/xfs_attr.c              | 38 +++++++++++++++++++++++---------------
 fs/xfs/xfs_attr.h              |  2 +-
 fs/xfs/xfs_attr_leaf.c         | 28 ++++++++++++++--------------
 fs/xfs/xfs_attr_sf.h           |  2 +-
 fs/xfs/xfs_vnodeops.h          | 10 +++++-----
 12 files changed, 90 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 883ca5ab8af5..bf85bbe4a9ae 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -106,7 +106,7 @@ xfs_get_acl(struct inode *inode, int type)
 	struct posix_acl *acl;
 	struct xfs_acl *xfs_acl;
 	int len = sizeof(struct xfs_acl);
-	char *ea_name;
+	unsigned char *ea_name;
 	int error;
 
 	acl = get_cached_acl(inode, type);
@@ -133,7 +133,8 @@ xfs_get_acl(struct inode *inode, int type)
 	if (!xfs_acl)
 		return ERR_PTR(-ENOMEM);
 
-	error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT);
+	error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+							&len, ATTR_ROOT);
 	if (error) {
 		/*
 		 * If the attribute doesn't exist make sure we have a negative
@@ -162,7 +163,7 @@ STATIC int
 xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
 	struct xfs_inode *ip = XFS_I(inode);
-	char *ea_name;
+	unsigned char *ea_name;
 	int error;
 
 	if (S_ISLNK(inode->i_mode))
@@ -194,7 +195,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 			(sizeof(struct xfs_acl_entry) *
 			 (XFS_ACL_MAX_ENTRIES - acl->a_count));
 
-		error = -xfs_attr_set(ip, ea_name, (char *)xfs_acl,
+		error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
 				len, ATTR_ROOT);
 
 		kfree(xfs_acl);
@@ -262,7 +263,7 @@ xfs_set_mode(struct inode *inode, mode_t mode)
 }
 
 static int
-xfs_acl_exists(struct inode *inode, char *name)
+xfs_acl_exists(struct inode *inode, unsigned char *name)
 {
 	int len = sizeof(struct xfs_acl);
 
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a034cf624437..3906e85abfdc 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -447,12 +447,12 @@ xfs_attrlist_by_handle(
 int
 xfs_attrmulti_attr_get(
 	struct inode		*inode,
-	char			*name,
-	char			__user *ubuf,
+	unsigned char		*name,
+	unsigned char		__user *ubuf,
 	__uint32_t		*len,
 	__uint32_t		flags)
 {
-	char			*kbuf;
+	unsigned char		*kbuf;
 	int			error = EFAULT;
 
 	if (*len > XATTR_SIZE_MAX)
@@ -476,12 +476,12 @@ xfs_attrmulti_attr_get(
 int
 xfs_attrmulti_attr_set(
 	struct inode		*inode,
-	char			*name,
-	const char		__user *ubuf,
+	unsigned char		*name,
+	const unsigned char	__user *ubuf,
 	__uint32_t		len,
 	__uint32_t		flags)
 {
-	char			*kbuf;
+	unsigned char		*kbuf;
 	int			error = EFAULT;
 
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -501,7 +501,7 @@ xfs_attrmulti_attr_set(
 int
 xfs_attrmulti_attr_remove(
 	struct inode		*inode,
-	char			*name,
+	unsigned char		*name,
 	__uint32_t		flags)
 {
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -519,7 +519,7 @@ xfs_attrmulti_by_handle(
 	xfs_fsop_attrmulti_handlereq_t am_hreq;
 	struct dentry		*dentry;
 	unsigned int		i, size;
-	char			*attr_name;
+	unsigned char		*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
@@ -547,7 +547,7 @@ xfs_attrmulti_by_handle(
 
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
-		ops[i].am_error = strncpy_from_user(attr_name,
+		ops[i].am_error = strncpy_from_user((char *)attr_name,
 				ops[i].am_attrname, MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
 			error = -ERANGE;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 7bd7c6afc1eb..d56173b34a2a 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -45,23 +45,23 @@ xfs_readlink_by_handle(
 extern int
 xfs_attrmulti_attr_get(
 	struct inode		*inode,
-	char			*name,
-	char			__user *ubuf,
+	unsigned char		*name,
+	unsigned char		__user *ubuf,
 	__uint32_t		*len,
 	__uint32_t		flags);
 
 extern int
-	xfs_attrmulti_attr_set(
+xfs_attrmulti_attr_set(
 	struct inode		*inode,
-	char			*name,
-	const char		__user *ubuf,
+	unsigned char		*name,
+	const unsigned char	__user *ubuf,
 	__uint32_t		len,
 	__uint32_t		flags);
 
 extern int
 xfs_attrmulti_attr_remove(
 	struct inode		*inode,
-	char			*name,
+	unsigned char		*name,
 	__uint32_t		flags);
 
 extern struct dentry *
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index be1527b1670c..0bf6d61f0528 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -411,7 +411,7 @@ xfs_compat_attrmulti_by_handle(
 	compat_xfs_fsop_attrmulti_handlereq_t	am_hreq;
 	struct dentry				*dentry;
 	unsigned int				i, size;
-	char					*attr_name;
+	unsigned char				*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
@@ -440,7 +440,7 @@ xfs_compat_attrmulti_by_handle(
 
 	error = 0;
 	for (i = 0; i < am_hreq.opcount; i++) {
-		ops[i].am_error = strncpy_from_user(attr_name,
+		ops[i].am_error = strncpy_from_user((char *)attr_name,
 				compat_ptr(ops[i].am_attrname),
 				MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 225946012d0b..e8566bbf0f00 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -140,10 +140,10 @@ xfs_init_security(
 	struct xfs_inode *ip = XFS_I(inode);
 	size_t		length;
 	void		*value;
-	char		*name;
+	unsigned char	*name;
 	int		error;
 
-	error = security_inode_init_security(inode, dir, &name,
+	error = security_inode_init_security(inode, dir, (char **)&name,
 					     &value, &length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 0b1878857fc3..fa01b9daba6b 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -45,7 +45,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
 		value = NULL;
 	}
 
-	error = -xfs_attr_get(ip, name, value, &asize, xflags);
+	error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
 	if (error)
 		return error;
 	return asize;
@@ -67,8 +67,9 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
 		xflags |= ATTR_REPLACE;
 
 	if (!value)
-		return -xfs_attr_remove(ip, name, xflags);
-	return -xfs_attr_set(ip, name, (void *)value, size, xflags);
+		return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
+	return -xfs_attr_set(ip, (unsigned char *)name,
+				(void *)value, size, xflags);
 }
 
 static struct xattr_handler xfs_xattr_user_handler = {
@@ -124,8 +125,13 @@ static const char *xfs_xattr_prefix(int flags)
 }
 
 static int
-xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
 	char *offset;
@@ -148,7 +154,7 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 	offset = (char *)context->alist + context->count;
 	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
 	offset += prefix_len;
-	strncpy(offset, name, namelen);			/* real name */
+	strncpy(offset, (char *)name, namelen);			/* real name */
 	offset += namelen;
 	*offset = '\0';
 	context->count += prefix_len + namelen + 1;
@@ -156,8 +162,13 @@ xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
 }
 
 static int
-xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags,
-		char *name, int namelen, int valuelen, char *value)
+xfs_xattr_put_listent_sizes(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
 	return 0;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 00fd357c3e46..d13eeba2c8f8 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -36,8 +36,8 @@ struct xfs_acl {
 };
 
 /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE		"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT		"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE		(unsigned char *)"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT		(unsigned char *)"SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 9d11ebad43b6..f7b426a1b6ee 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -93,12 +93,12 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
 STATIC int
 xfs_attr_name_to_xname(
 	struct xfs_name	*xname,
-	const char	*aname)
+	const unsigned char *aname)
 {
 	if (!aname)
 		return EINVAL;
 	xname->name = aname;
-	xname->len = strlen(aname);
+	xname->len = strlen((char *)aname);
 	if (xname->len >= MAXNAMELEN)
 		return EFAULT;		/* match IRIX behaviour */
 
@@ -124,7 +124,7 @@ STATIC int
 xfs_attr_get_int(
 	struct xfs_inode	*ip,
 	struct xfs_name		*name,
-	char			*value,
+	unsigned char		*value,
 	int			*valuelenp,
 	int			flags)
 {
@@ -171,8 +171,8 @@ xfs_attr_get_int(
 int
 xfs_attr_get(
 	xfs_inode_t	*ip,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		*valuelenp,
 	int		flags)
 {
@@ -235,8 +235,12 @@ xfs_attr_calc_size(
 }
 
 STATIC int
-xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
-		char *value, int valuelen, int flags)
+xfs_attr_set_int(
+	struct xfs_inode *dp,
+	struct xfs_name	*name,
+	unsigned char	*value,
+	int		valuelen,
+	int		flags)
 {
 	xfs_da_args_t	args;
 	xfs_fsblock_t	firstblock;
@@ -452,8 +456,8 @@ out:
 int
 xfs_attr_set(
 	xfs_inode_t	*dp,
-	const char	*name,
-	char		*value,
+	const unsigned char *name,
+	unsigned char	*value,
 	int		valuelen,
 	int		flags)
 {
@@ -600,7 +604,7 @@ out:
 int
 xfs_attr_remove(
 	xfs_inode_t	*dp,
-	const char	*name,
+	const unsigned char *name,
 	int		flags)
 {
 	int		error;
@@ -669,9 +673,13 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
  */
 /*ARGSUSED*/
 STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
-		     char *name, int namelen,
-		     int valuelen, char *value)
+xfs_attr_put_listent(
+	xfs_attr_list_context_t *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
 {
 	struct attrlist *alist = (struct attrlist *)context->alist;
 	attrlist_ent_t *aep;
@@ -1980,7 +1988,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
 	xfs_mount_t *mp;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t dst;
+	void *dst;
 	xfs_buf_t *bp;
 	int nmap, error, tmp, valuelen, blkcnt, i;
 	xfs_dablk_t lblkno;
@@ -2039,7 +2047,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 	xfs_inode_t *dp;
 	xfs_bmbt_irec_t map;
 	xfs_daddr_t dblkno;
-	xfs_caddr_t src;
+	void *src;
 	xfs_buf_t *bp;
 	xfs_dablk_t lblkno;
 	int blkcnt, valuelen, nmap, error, tmp, committed;
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 9c3a24372914..e920d68ef509 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -113,7 +113,7 @@ typedef struct attrlist_cursor_kern {
 
 
 typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-				      char *, int, int, char *);
+			      unsigned char *, int, int, unsigned char *);
 
 typedef struct xfs_attr_list_context {
 	struct xfs_inode		*dp;		/* inode */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index baf41b5af756..52519a201849 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -521,11 +521,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 
 	sfe = &sf->list[0];
 	for (i = 0; i < sf->hdr.count; i++) {
-		nargs.name = (char *)sfe->nameval;
+		nargs.name = sfe->nameval;
 		nargs.namelen = sfe->namelen;
-		nargs.value = (char *)&sfe->nameval[nargs.namelen];
+		nargs.value = &sfe->nameval[nargs.namelen];
 		nargs.valuelen = sfe->valuelen;
-		nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
+		nargs.hashval = xfs_da_hashname(sfe->nameval,
 						sfe->namelen);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
 		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
@@ -612,10 +612,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 			error = context->put_listent(context,
 					   sfe->flags,
-					   (char *)sfe->nameval,
+					   sfe->nameval,
 					   (int)sfe->namelen,
 					   (int)sfe->valuelen,
-					   (char*)&sfe->nameval[sfe->namelen]);
+					   &sfe->nameval[sfe->namelen]);
 
 			/*
 			 * Either search callback finished early or
@@ -659,8 +659,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		}
 
 		sbp->entno = i;
-		sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen);
-		sbp->name = (char *)sfe->nameval;
+		sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
+		sbp->name = sfe->nameval;
 		sbp->namelen = sfe->namelen;
 		/* These are bytes, and both on-disk, don't endian-flip */
 		sbp->valuelen = sfe->valuelen;
@@ -818,9 +818,9 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 			continue;
 		ASSERT(entry->flags & XFS_ATTR_LOCAL);
 		name_loc = xfs_attr_leaf_name_local(leaf, i);
-		nargs.name = (char *)name_loc->nameval;
+		nargs.name = name_loc->nameval;
 		nargs.namelen = name_loc->namelen;
-		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
+		nargs.value = &name_loc->nameval[nargs.namelen];
 		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
 		nargs.hashval = be32_to_cpu(entry->hashval);
 		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
@@ -2370,10 +2370,10 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 			retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_loc->nameval,
+						name_loc->nameval,
 						(int)name_loc->namelen,
 						be16_to_cpu(name_loc->valuelen),
-						(char *)&name_loc->nameval[name_loc->namelen]);
+						&name_loc->nameval[name_loc->namelen]);
 			if (retval)
 				return retval;
 		} else {
@@ -2397,15 +2397,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 					return retval;
 				retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_rmt->name,
+						name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen,
-						(char*)args.value);
+						args.value);
 				kmem_free(args.value);
 			} else {
 				retval = context->put_listent(context,
 						entry->flags,
-						(char *)name_rmt->name,
+						name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen,
 						NULL);
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index 76ab7b0cbb3a..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -52,7 +52,7 @@ typedef struct xfs_attr_sf_sort {
 	__uint8_t	valuelen;	/* length of value */
 	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
 	xfs_dahash_t	hash;		/* this entry's hash value */
-	char		*name;		/* name value, pointer into buffer */
+	unsigned char	*name;		/* name value, pointer into buffer */
 } xfs_attr_sf_sort_t;
 
 #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 167a467403a5..774f40729ca1 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -43,11 +43,11 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd,
 int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
 		struct xfs_inode *src_ip, struct xfs_inode *target_dp,
 		struct xfs_name *target_name, struct xfs_inode *target_ip);
-int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
-		int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
-		int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+		unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+		unsigned char *value, int valuelen, int flags);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
 ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
-- 
cgit v1.2.3


From 4a24cb71407dc25035d75dd3d118e0e55679e217 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:48:05 +1100
Subject: xfs: clean up sign warnings in dir2 code

We are now consistently using unsigned char strings for names
so fix up the remaining warnings in the dir2 code to complete
the cleanup.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_dir2.c       | 2 +-
 fs/xfs/xfs_dir2_block.c | 9 +++++----
 fs/xfs/xfs_dir2_leaf.c  | 2 +-
 fs/xfs/xfs_dir2_sf.c    | 2 +-
 4 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 3a8c6ba0638f..42520f041265 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,7 +44,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 
-struct xfs_name xfs_name_dotdot = {"..", 2};
+struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2};
 
 /*
  * ASCII case-insensitive (ie. A-Z) support for directories that was
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ddc4ecc7807f..779a267b0a84 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -57,8 +57,8 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
 void
 xfs_dir_startup(void)
 {
-	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
-	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
+	xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
 }
 
 /*
@@ -513,8 +513,9 @@ xfs_dir2_block_getdents(
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
-		if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
-			    be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
+		if (filldir(dirent, (char *)dep->name, dep->namelen,
+			    cook & 0x7fffffff, be64_to_cpu(dep->inumber),
+			    DT_UNKNOWN)) {
 			*offset = cook & 0x7fffffff;
 			xfs_da_brelse(NULL, bp);
 			return 0;
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 29f484c11b3a..e2d89854ec9e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1081,7 +1081,7 @@ xfs_dir2_leaf_getdents(
 		dep = (xfs_dir2_data_entry_t *)ptr;
 		length = xfs_dir2_data_entsize(dep->namelen);
 
-		if (filldir(dirent, dep->name, dep->namelen,
+		if (filldir(dirent, (char *)dep->name, dep->namelen,
 			    xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
 			    be64_to_cpu(dep->inumber), DT_UNKNOWN))
 			break;
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 9d4f17a69676..c1a5945d463a 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -782,7 +782,7 @@ xfs_dir2_sf_getdents(
 		}
 
 		ino = xfs_dir2_sf_get_inumber(sfp, xfs_dir2_sf_inumberp(sfep));
-		if (filldir(dirent, sfep->name, sfep->namelen,
+		if (filldir(dirent, (char *)sfep->name, sfep->namelen,
 			    off & 0x7fffffff, ino, DT_UNKNOWN)) {
 			*offset = off & 0x7fffffff;
 			return 0;
-- 
cgit v1.2.3


From 58c75cfb51393a52b45262394c1fa81514b4d9bd Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:49:18 +1100
Subject: xfs: make compile warn about char sign mismatches again

The -fno-unsigned-char directive has no effect anymore as the
XFs build is clean. However, the kernel build hides pointer sign
differences so turn that back on so that we can clean up all the
mismatches prior to a userspace code resync.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 56641fe52a23..19267019dacf 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,7 +16,7 @@
 # Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #
 
-EXTRA_CFLAGS +=	 -I$(src) -I$(src)/linux-2.6 -funsigned-char
+EXTRA_CFLAGS +=	 -I$(src) -I$(src)/linux-2.6 -Wpointer-sign
 
 XFS_LINUX := linux-2.6
 
-- 
cgit v1.2.3


From f0a0eaa8da08ebc6519cacd731df05bbb4ca47ce Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 10:50:06 +1100
Subject: xfs: suppress spurious uninitialised var warning in xfs_bmapi()

Initialise the xfs_bmalloca_t structure to zero to avoid uninitialised
variable warnings. This is done by zeroing the arg structure rather than
using the uninitialised_var() trick so we know for certain that the
structure is correctly initialised as xfs_bmapi is a very complex
function and it is difficult to prove warnings are spurious.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7c6d9acd7154..1869fb973819 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4471,7 +4471,7 @@ xfs_bmapi(
 	xfs_fsblock_t	abno;		/* allocated block number */
 	xfs_extlen_t	alen;		/* allocated extent length */
 	xfs_fileoff_t	aoff;		/* allocated file offset */
-	xfs_bmalloca_t	bma;		/* args for xfs_bmap_alloc */
+	xfs_bmalloca_t	bma = { 0 };	/* args for xfs_bmap_alloc */
 	xfs_btree_cur_t	*cur;		/* bmap btree cursor */
 	xfs_fileoff_t	end;		/* end of mapped file region */
 	int		eof;		/* we've hit the end of extents */
-- 
cgit v1.2.3


From 587aa0feb74ffe3239b5e26ff5d017ba9f5daec9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 12:04:53 +1100
Subject: xfs: rearrange xfs_mod_sb() to avoid array subscript warning

gcc warns of an array subscript out of bounds in xfs_mod_sb().
The code is written in such a way that if the array subscript is
out of bounds, then it will assert fail. Rearrange the code to
avoid the bounds check warning.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_mount.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0df5045abd3b..d95bd1809f3c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1631,15 +1631,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
 
 	/* find modified range */
+	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	last = xfs_sb_info[f + 1].offset - 1;
 
 	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
 	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
 	first = xfs_sb_info[f].offset;
 
-	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	last = xfs_sb_info[f + 1].offset - 1;
-
 	xfs_trans_log_buf(tp, bp, first, last);
 }
 
-- 
cgit v1.2.3


From e6a6d3795565b8ccb957afc6ca0e50db40b2d899 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Tue, 22 Sep 2009 19:25:24 +0100
Subject: Squashfs: move zlib decompression wrapper code into a separate file

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/Makefile       |   2 +-
 fs/squashfs/block.c        |  74 ++----------------------------
 fs/squashfs/squashfs.h     |   4 ++
 fs/squashfs/zlib_wrapper.c | 109 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+), 71 deletions(-)
 create mode 100644 fs/squashfs/zlib_wrapper.c

(limited to 'fs')

diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 70e3244fa30f..a397e6f12ab5 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o
+squashfs-y += namei.o super.o symlink.o zlib_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2a7960310349..b8addfdc6094 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -29,7 +29,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/mutex.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/zlib.h>
@@ -153,72 +152,10 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	}
 
 	if (compressed) {
-		int zlib_err = 0, zlib_init = 0;
-
-		/*
-		 * Uncompress block.
-		 */
-
-		mutex_lock(&msblk->read_data_mutex);
-
-		msblk->stream.avail_out = 0;
-		msblk->stream.avail_in = 0;
-
-		bytes = length;
-		do {
-			if (msblk->stream.avail_in == 0 && k < b) {
-				avail = min(bytes, msblk->devblksize - offset);
-				bytes -= avail;
-				wait_on_buffer(bh[k]);
-				if (!buffer_uptodate(bh[k]))
-					goto release_mutex;
-
-				if (avail == 0) {
-					offset = 0;
-					put_bh(bh[k++]);
-					continue;
-				}
-
-				msblk->stream.next_in = bh[k]->b_data + offset;
-				msblk->stream.avail_in = avail;
-				offset = 0;
-			}
-
-			if (msblk->stream.avail_out == 0 && page < pages) {
-				msblk->stream.next_out = buffer[page++];
-				msblk->stream.avail_out = PAGE_CACHE_SIZE;
-			}
-
-			if (!zlib_init) {
-				zlib_err = zlib_inflateInit(&msblk->stream);
-				if (zlib_err != Z_OK) {
-					ERROR("zlib_inflateInit returned"
-						" unexpected result 0x%x,"
-						" srclength %d\n", zlib_err,
-						srclength);
-					goto release_mutex;
-				}
-				zlib_init = 1;
-			}
-
-			zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
-
-			if (msblk->stream.avail_in == 0 && k < b)
-				put_bh(bh[k++]);
-		} while (zlib_err == Z_OK);
-
-		if (zlib_err != Z_STREAM_END) {
-			ERROR("zlib_inflate error, data probably corrupt\n");
-			goto release_mutex;
-		}
-
-		zlib_err = zlib_inflateEnd(&msblk->stream);
-		if (zlib_err != Z_OK) {
-			ERROR("zlib_inflate error, data probably corrupt\n");
-			goto release_mutex;
-		}
-		length = msblk->stream.total_out;
-		mutex_unlock(&msblk->read_data_mutex);
+		length = squashfs_zlib_uncompress(msblk, buffer, bh, b, offset,
+			 length, srclength, pages);
+		if (length < 0)
+			goto read_failure;
 	} else {
 		/*
 		 * Block is uncompressed.
@@ -255,9 +192,6 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	kfree(bh);
 	return length;
 
-release_mutex:
-	mutex_unlock(&msblk->read_data_mutex);
-
 block_release:
 	for (; k < b; k++)
 		put_bh(bh[k]);
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 0e9feb6adf7e..ba87db693651 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -70,6 +70,10 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
 				unsigned int);
 extern int squashfs_read_inode(struct inode *, long long);
 
+/* zlib_wrapper.c */
+extern int squashfs_zlib_uncompress(struct squashfs_sb_info *, void **,
+				struct buffer_head **, int, int, int, int, int);
+
 /*
  * Inodes and files operations
  */
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
new file mode 100644
index 000000000000..3be99642d6af
--- /dev/null
+++ b/fs/squashfs/zlib_wrapper.c
@@ -0,0 +1,109 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * zlib_wrapper.c
+ */
+
+
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+	struct buffer_head **bh, int b, int offset, int length, int srclength,
+	int pages)
+{
+	int zlib_err = 0, zlib_init = 0;
+	int avail, bytes, k = 0, page = 0;
+
+	mutex_lock(&msblk->read_data_mutex);
+
+	msblk->stream.avail_out = 0;
+	msblk->stream.avail_in = 0;
+
+	bytes = length;
+	do {
+		if (msblk->stream.avail_in == 0 && k < b) {
+			avail = min(bytes, msblk->devblksize - offset);
+			bytes -= avail;
+			wait_on_buffer(bh[k]);
+			if (!buffer_uptodate(bh[k]))
+				goto release_mutex;
+
+			if (avail == 0) {
+				offset = 0;
+				put_bh(bh[k++]);
+				continue;
+			}
+
+			msblk->stream.next_in = bh[k]->b_data + offset;
+			msblk->stream.avail_in = avail;
+			offset = 0;
+		}
+
+		if (msblk->stream.avail_out == 0 && page < pages) {
+			msblk->stream.next_out = buffer[page++];
+			msblk->stream.avail_out = PAGE_CACHE_SIZE;
+		}
+
+		if (!zlib_init) {
+			zlib_err = zlib_inflateInit(&msblk->stream);
+			if (zlib_err != Z_OK) {
+				ERROR("zlib_inflateInit returned unexpected "
+					"result 0x%x, srclength %d\n",
+					zlib_err, srclength);
+				goto release_mutex;
+			}
+			zlib_init = 1;
+		}
+
+		zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
+
+		if (msblk->stream.avail_in == 0 && k < b)
+			put_bh(bh[k++]);
+	} while (zlib_err == Z_OK);
+
+	if (zlib_err != Z_STREAM_END) {
+		ERROR("zlib_inflate error, data probably corrupt\n");
+		goto release_mutex;
+	}
+
+	zlib_err = zlib_inflateEnd(&msblk->stream);
+	if (zlib_err != Z_OK) {
+		ERROR("zlib_inflate error, data probably corrupt\n");
+		goto release_mutex;
+	}
+
+	mutex_unlock(&msblk->read_data_mutex);
+	return msblk->stream.total_out;
+
+release_mutex:
+	mutex_unlock(&msblk->read_data_mutex);
+
+	for (; k < b; k++)
+		put_bh(bh[k]);
+
+	return -EIO;
+}
-- 
cgit v1.2.3


From f1a40359f8d8ba073257ed31a513e492621bcbc5 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Wed, 23 Sep 2009 19:04:49 +0100
Subject: Squashfs: factor out remaining zlib dependencies into separate
 wrapper file

Move zlib buffer init/destroy code into separate wrapper file.  Also
make zlib z_stream field a void * removing the need to include zlib.h
for most files.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/block.c          |  1 -
 fs/squashfs/cache.c          |  1 -
 fs/squashfs/dir.c            |  1 -
 fs/squashfs/export.c         |  1 -
 fs/squashfs/file.c           |  1 -
 fs/squashfs/fragment.c       |  1 -
 fs/squashfs/id.c             |  1 -
 fs/squashfs/inode.c          |  1 -
 fs/squashfs/namei.c          |  1 -
 fs/squashfs/squashfs.h       |  2 ++
 fs/squashfs/squashfs_fs_sb.h |  2 +-
 fs/squashfs/super.c          | 14 ++++-------
 fs/squashfs/symlink.c        |  1 -
 fs/squashfs/zlib_wrapper.c   | 56 ++++++++++++++++++++++++++++++++++----------
 14 files changed, 51 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index b8addfdc6094..3f836e181eb8 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -31,7 +31,6 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/buffer_head.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 40c98fa6b5d6..57314bee9059 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -51,7 +51,6 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
-#include <linux/zlib.h>
 #include <linux/pagemap.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 566b0eaed868..12b933ac6585 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -30,7 +30,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 2b1b8fe5e037..7f93d5a9ee05 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -39,7 +39,6 @@
 #include <linux/vfs.h>
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
-#include <linux/zlib.h>
 #include <linux/slab.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 717767d831df..a25c5060bdcb 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -47,7 +47,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/mutex.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index b5a2c15bbbc7..7c90bbd6879d 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -36,7 +36,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 3795b837ba28..b7f64bcd2b70 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -34,7 +34,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 9101dbde39ec..49daaf669e41 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,7 +40,6 @@
 
 #include <linux/fs.h>
 #include <linux/vfs.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 9e398653b22b..5266bd8ad932 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,7 +57,6 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dcache.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index ba87db693651..9c2f76a1c50b 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -71,6 +71,8 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
 extern int squashfs_read_inode(struct inode *, long long);
 
 /* zlib_wrapper.c */
+extern void *squashfs_zlib_init(void);
+extern void squashfs_zlib_free(void *);
 extern int squashfs_zlib_uncompress(struct squashfs_sb_info *, void **,
 				struct buffer_head **, int, int, int, int, int);
 
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index c8c65614dd1c..23a67fa40b03 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -64,7 +64,7 @@ struct squashfs_sb_info {
 	struct mutex		read_data_mutex;
 	struct mutex		meta_index_mutex;
 	struct meta_index	*meta_index;
-	z_stream		stream;
+	void			*stream;
 	__le64			*inode_lookup_table;
 	u64			inode_table;
 	u64			directory_table;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6c197ef53add..b9f8c6a92d6a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,7 +35,6 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/zlib.h>
 #include <linux/magic.h>
 
 #include "squashfs_fs.h"
@@ -87,12 +86,9 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	msblk = sb->s_fs_info;
 
-	msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
-		GFP_KERNEL);
-	if (msblk->stream.workspace == NULL) {
-		ERROR("Failed to allocate zlib workspace\n");
+	msblk->stream = squashfs_zlib_init();
+	if (msblk->stream == NULL)
 		goto failure;
-	}
 
 	sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
 	if (sblk == NULL) {
@@ -292,17 +288,17 @@ failed_mount:
 	squashfs_cache_delete(msblk->block_cache);
 	squashfs_cache_delete(msblk->fragment_cache);
 	squashfs_cache_delete(msblk->read_page);
+	squashfs_zlib_free(msblk->stream);
 	kfree(msblk->inode_lookup_table);
 	kfree(msblk->fragment_index);
 	kfree(msblk->id_table);
-	kfree(msblk->stream.workspace);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	kfree(sblk);
 	return err;
 
 failure:
-	kfree(msblk->stream.workspace);
+	squashfs_zlib_free(msblk->stream);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	return -ENOMEM;
@@ -346,10 +342,10 @@ static void squashfs_put_super(struct super_block *sb)
 		squashfs_cache_delete(sbi->block_cache);
 		squashfs_cache_delete(sbi->fragment_cache);
 		squashfs_cache_delete(sbi->read_page);
+		squashfs_zlib_free(sbi->stream);
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
-		kfree(sbi->stream.workspace);
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 83d87880aac8..e80be2022a7f 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -36,7 +36,6 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
-#include <linux/zlib.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 3be99642d6af..c814594d522e 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -31,21 +31,51 @@
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
 
+void *squashfs_zlib_init()
+{
+	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
+	if (stream == NULL)
+		goto failed;
+	stream->workspace = kmalloc(zlib_inflate_workspacesize(),
+		GFP_KERNEL);
+	if (stream->workspace == NULL)
+		goto failed;
+
+	return stream;
+
+failed:
+	ERROR("Failed to allocate zlib workspace\n");
+	kfree(stream);
+	return NULL;
+}
+
+
+void squashfs_zlib_free(void *strm)
+{
+	z_stream *stream = strm;
+
+	if (stream)
+		kfree(stream->workspace);
+	kfree(stream);
+}
+
+
 int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 	struct buffer_head **bh, int b, int offset, int length, int srclength,
 	int pages)
 {
 	int zlib_err = 0, zlib_init = 0;
 	int avail, bytes, k = 0, page = 0;
+	z_stream *stream = msblk->stream;
 
 	mutex_lock(&msblk->read_data_mutex);
 
-	msblk->stream.avail_out = 0;
-	msblk->stream.avail_in = 0;
+	stream->avail_out = 0;
+	stream->avail_in = 0;
 
 	bytes = length;
 	do {
-		if (msblk->stream.avail_in == 0 && k < b) {
+		if (stream->avail_in == 0 && k < b) {
 			avail = min(bytes, msblk->devblksize - offset);
 			bytes -= avail;
 			wait_on_buffer(bh[k]);
@@ -58,18 +88,18 @@ int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 				continue;
 			}
 
-			msblk->stream.next_in = bh[k]->b_data + offset;
-			msblk->stream.avail_in = avail;
+			stream->next_in = bh[k]->b_data + offset;
+			stream->avail_in = avail;
 			offset = 0;
 		}
 
-		if (msblk->stream.avail_out == 0 && page < pages) {
-			msblk->stream.next_out = buffer[page++];
-			msblk->stream.avail_out = PAGE_CACHE_SIZE;
+		if (stream->avail_out == 0 && page < pages) {
+			stream->next_out = buffer[page++];
+			stream->avail_out = PAGE_CACHE_SIZE;
 		}
 
 		if (!zlib_init) {
-			zlib_err = zlib_inflateInit(&msblk->stream);
+			zlib_err = zlib_inflateInit(stream);
 			if (zlib_err != Z_OK) {
 				ERROR("zlib_inflateInit returned unexpected "
 					"result 0x%x, srclength %d\n",
@@ -79,9 +109,9 @@ int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 			zlib_init = 1;
 		}
 
-		zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
+		zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
 
-		if (msblk->stream.avail_in == 0 && k < b)
+		if (stream->avail_in == 0 && k < b)
 			put_bh(bh[k++]);
 	} while (zlib_err == Z_OK);
 
@@ -90,14 +120,14 @@ int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		goto release_mutex;
 	}
 
-	zlib_err = zlib_inflateEnd(&msblk->stream);
+	zlib_err = zlib_inflateEnd(stream);
 	if (zlib_err != Z_OK) {
 		ERROR("zlib_inflate error, data probably corrupt\n");
 		goto release_mutex;
 	}
 
 	mutex_unlock(&msblk->read_data_mutex);
-	return msblk->stream.total_out;
+	return stream->total_out;
 
 release_mutex:
 	mutex_unlock(&msblk->read_data_mutex);
-- 
cgit v1.2.3


From 4c0f0bb2351bee3de8dd7715ee199454a59f1230 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Tue, 6 Oct 2009 04:04:15 +0100
Subject: Squashfs: add a decompressor framework

This adds a decompressor framework which allows multiple compression
algorithms to be cleanly supported.

Also update zlib wrapper and other code to use the new framework.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/Makefile         |  2 +-
 fs/squashfs/block.c          |  3 ++-
 fs/squashfs/decompressor.c   | 58 ++++++++++++++++++++++++++++++++++++++++++++
 fs/squashfs/decompressor.h   | 55 +++++++++++++++++++++++++++++++++++++++++
 fs/squashfs/squashfs.h       | 14 +++++------
 fs/squashfs/squashfs_fs_sb.h | 41 ++++++++++++++++---------------
 fs/squashfs/super.c          | 45 +++++++++++++++++++---------------
 fs/squashfs/zlib_wrapper.c   | 17 ++++++++++---
 8 files changed, 184 insertions(+), 51 deletions(-)
 create mode 100644 fs/squashfs/decompressor.c
 create mode 100644 fs/squashfs/decompressor.h

(limited to 'fs')

diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index a397e6f12ab5..df8a19ef870d 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
-squashfs-y += namei.o super.o symlink.o zlib_wrapper.o
+squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 3f836e181eb8..1cb0d81b164b 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -36,6 +36,7 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
 
 /*
  * Read the metadata block length, this is stored in the first two
@@ -151,7 +152,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	}
 
 	if (compressed) {
-		length = squashfs_zlib_uncompress(msblk, buffer, bh, b, offset,
+		length = squashfs_decompress(msblk, buffer, bh, b, offset,
 			 length, srclength, pages);
 		if (length < 0)
 			goto read_failure;
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
new file mode 100644
index 000000000000..0072ccdac1e2
--- /dev/null
+++ b/fs/squashfs/decompressor.c
@@ -0,0 +1,58 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.c
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file (and decompressor.h) implements a decompressor framework for
+ * Squashfs, allowing multiple decompressors to be easily supported
+ */
+
+static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
+	NULL, NULL, NULL, 0, "unknown", 0
+};
+
+static const struct squashfs_decompressor *decompressor[] = {
+	&squashfs_zlib_comp_ops,
+	&squashfs_unknown_comp_ops
+};
+
+
+const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
+{
+	int i;
+
+	for (i = 0; decompressor[i]->id; i++)
+		if (id == decompressor[i]->id)
+			break;
+
+	return decompressor[i];
+}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
new file mode 100644
index 000000000000..7425f80783f6
--- /dev/null
+++ b/fs/squashfs/decompressor.h
@@ -0,0 +1,55 @@
+#ifndef DECOMPRESSOR_H
+#define DECOMPRESSOR_H
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * decompressor.h
+ */
+
+struct squashfs_decompressor {
+	void	*(*init)(struct squashfs_sb_info *);
+	void	(*free)(void *);
+	int	(*decompress)(struct squashfs_sb_info *, void **,
+		struct buffer_head **, int, int, int, int, int);
+	int	id;
+	char	*name;
+	int	supported;
+};
+
+static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
+{
+	return msblk->decompressor->init(msblk);
+}
+
+static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
+	void *s)
+{
+	if (msblk->decompressor)
+		msblk->decompressor->free(s);
+}
+
+static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
+	void **buffer, struct buffer_head **bh, int b, int offset, int length,
+	int srclength, int pages)
+{
+	return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
+		length, srclength, pages);
+}
+#endif
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 9c2f76a1c50b..fe2587af5512 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -51,6 +51,9 @@ extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
 				u64, int);
 extern int squashfs_read_table(struct super_block *, void *, u64, int);
 
+/* decompressor.c */
+extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
+
 /* export.c */
 extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
 				unsigned int);
@@ -70,14 +73,8 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
 				unsigned int);
 extern int squashfs_read_inode(struct inode *, long long);
 
-/* zlib_wrapper.c */
-extern void *squashfs_zlib_init(void);
-extern void squashfs_zlib_free(void *);
-extern int squashfs_zlib_uncompress(struct squashfs_sb_info *, void **,
-				struct buffer_head **, int, int, int, int, int);
-
 /*
- * Inodes and files operations
+ * Inodes, files and decompressor operations
  */
 
 /* dir.c */
@@ -94,3 +91,6 @@ extern const struct inode_operations squashfs_dir_inode_ops;
 
 /* symlink.c */
 extern const struct address_space_operations squashfs_symlink_aops;
+
+/* zlib_wrapper.c */
+extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 23a67fa40b03..753335085e41 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -52,25 +52,26 @@ struct squashfs_cache_entry {
 };
 
 struct squashfs_sb_info {
-	int			devblksize;
-	int			devblksize_log2;
-	struct squashfs_cache	*block_cache;
-	struct squashfs_cache	*fragment_cache;
-	struct squashfs_cache	*read_page;
-	int			next_meta_index;
-	__le64			*id_table;
-	__le64			*fragment_index;
-	unsigned int		*fragment_index_2;
-	struct mutex		read_data_mutex;
-	struct mutex		meta_index_mutex;
-	struct meta_index	*meta_index;
-	void			*stream;
-	__le64			*inode_lookup_table;
-	u64			inode_table;
-	u64			directory_table;
-	unsigned int		block_size;
-	unsigned short		block_log;
-	long long		bytes_used;
-	unsigned int		inodes;
+	const struct squashfs_decompressor	*decompressor;
+	int					devblksize;
+	int					devblksize_log2;
+	struct squashfs_cache			*block_cache;
+	struct squashfs_cache			*fragment_cache;
+	struct squashfs_cache			*read_page;
+	int					next_meta_index;
+	__le64					*id_table;
+	__le64					*fragment_index;
+	unsigned int				*fragment_index_2;
+	struct mutex				read_data_mutex;
+	struct mutex				meta_index_mutex;
+	struct meta_index			*meta_index;
+	void					*stream;
+	__le64					*inode_lookup_table;
+	u64					inode_table;
+	u64					directory_table;
+	unsigned int				block_size;
+	unsigned short				block_log;
+	long long				bytes_used;
+	unsigned int				inodes;
 };
 #endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index b9f8c6a92d6a..3550aec2f655 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -41,27 +41,35 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
 
 static struct file_system_type squashfs_fs_type;
 static const struct super_operations squashfs_super_ops;
 
-static int supported_squashfs_filesystem(short major, short minor, short comp)
+static const struct squashfs_decompressor *supported_squashfs_filesystem(short
+	major, short minor, short id)
 {
+	const struct squashfs_decompressor *decompressor;
+
 	if (major < SQUASHFS_MAJOR) {
 		ERROR("Major/Minor mismatch, older Squashfs %d.%d "
 			"filesystems are unsupported\n", major, minor);
-		return -EINVAL;
+		return NULL;
 	} else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
 		ERROR("Major/Minor mismatch, trying to mount newer "
 			"%d.%d filesystem\n", major, minor);
 		ERROR("Please update your kernel\n");
-		return -EINVAL;
+		return NULL;
 	}
 
-	if (comp != ZLIB_COMPRESSION)
-		return -EINVAL;
+	decompressor = squashfs_lookup_decompressor(id);
+	if (!decompressor->supported) {
+		ERROR("Filesystem uses \"%s\" compression. This is not "
+			"supported\n", decompressor->name);
+		return NULL;
+	}
 
-	return 0;
+	return decompressor;
 }
 
 
@@ -86,10 +94,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	msblk = sb->s_fs_info;
 
-	msblk->stream = squashfs_zlib_init();
-	if (msblk->stream == NULL)
-		goto failure;
-
 	sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
 	if (sblk == NULL) {
 		ERROR("Failed to allocate squashfs_super_block\n");
@@ -116,25 +120,25 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	err = -EINVAL;
+
 	/* Check it is a SQUASHFS superblock */
 	sb->s_magic = le32_to_cpu(sblk->s_magic);
 	if (sb->s_magic != SQUASHFS_MAGIC) {
 		if (!silent)
 			ERROR("Can't find a SQUASHFS superblock on %s\n",
 						bdevname(sb->s_bdev, b));
-		err = -EINVAL;
 		goto failed_mount;
 	}
 
-	/* Check the MAJOR & MINOR versions and compression type */
-	err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
+	/* Check the MAJOR & MINOR versions and lookup compression type */
+	msblk->decompressor = supported_squashfs_filesystem(
+			le16_to_cpu(sblk->s_major),
 			le16_to_cpu(sblk->s_minor),
 			le16_to_cpu(sblk->compression));
-	if (err < 0)
+	if (msblk->decompressor == NULL)
 		goto failed_mount;
 
-	err = -EINVAL;
-
 	/*
 	 * Check if there's xattrs in the filesystem.  These are not
 	 * supported in this version, so warn that they will be ignored.
@@ -201,6 +205,10 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	err = -ENOMEM;
 
+	msblk->stream = squashfs_decompressor_init(msblk);
+	if (msblk->stream == NULL)
+		goto failed_mount;
+
 	msblk->block_cache = squashfs_cache_init("metadata",
 			SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
 	if (msblk->block_cache == NULL)
@@ -288,7 +296,7 @@ failed_mount:
 	squashfs_cache_delete(msblk->block_cache);
 	squashfs_cache_delete(msblk->fragment_cache);
 	squashfs_cache_delete(msblk->read_page);
-	squashfs_zlib_free(msblk->stream);
+	squashfs_decompressor_free(msblk, msblk->stream);
 	kfree(msblk->inode_lookup_table);
 	kfree(msblk->fragment_index);
 	kfree(msblk->id_table);
@@ -298,7 +306,6 @@ failed_mount:
 	return err;
 
 failure:
-	squashfs_zlib_free(msblk->stream);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 	return -ENOMEM;
@@ -342,7 +349,7 @@ static void squashfs_put_super(struct super_block *sb)
 		squashfs_cache_delete(sbi->block_cache);
 		squashfs_cache_delete(sbi->fragment_cache);
 		squashfs_cache_delete(sbi->read_page);
-		squashfs_zlib_free(sbi->stream);
+		squashfs_decompressor_free(sbi, sbi->stream);
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index c814594d522e..4dd70e04333b 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -30,8 +30,9 @@
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "decompressor.h"
 
-void *squashfs_zlib_init()
+static void *zlib_init(struct squashfs_sb_info *dummy)
 {
 	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
 	if (stream == NULL)
@@ -50,7 +51,7 @@ failed:
 }
 
 
-void squashfs_zlib_free(void *strm)
+static void zlib_free(void *strm)
 {
 	z_stream *stream = strm;
 
@@ -60,7 +61,7 @@ void squashfs_zlib_free(void *strm)
 }
 
 
-int squashfs_zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 	struct buffer_head **bh, int b, int offset, int length, int srclength,
 	int pages)
 {
@@ -137,3 +138,13 @@ release_mutex:
 
 	return -EIO;
 }
+
+const struct squashfs_decompressor squashfs_zlib_comp_ops = {
+	.init = zlib_init,
+	.free = zlib_free,
+	.decompress = zlib_uncompress,
+	.id = ZLIB_COMPRESSION,
+	.name = "zlib",
+	.supported = 1
+};
+
-- 
cgit v1.2.3


From dc3256782f88602953676c447b243dedb1be99ad Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Wed, 14 Oct 2009 03:58:11 +0100
Subject: Squashfs: add decompressor entries for lzma and lzo

Add knowledge of lzma/lzo compression formats to the decompressor
framework.  For now these are added as unsupported.  Without
these entries lzma/lzo compressed filesystems will be flagged as
having unknown compression which is undesirable.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/decompressor.c | 10 ++++++++++
 fs/squashfs/squashfs_fs.h  |  4 +++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 0072ccdac1e2..157478da6ac9 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -36,12 +36,22 @@
  * Squashfs, allowing multiple decompressors to be easily supported
  */
 
+static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
+	NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+};
+
+static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
+	NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+};
+
 static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 	NULL, NULL, NULL, 0, "unknown", 0
 };
 
 static const struct squashfs_decompressor *decompressor[] = {
 	&squashfs_zlib_comp_ops,
+	&squashfs_lzma_unsupported_comp_ops,
+	&squashfs_lzo_unsupported_comp_ops,
 	&squashfs_unknown_comp_ops
 };
 
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 283daafc568e..36e1604ab1c1 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -211,7 +211,9 @@ struct meta_index {
 /*
  * definitions for structures on disk
  */
-#define ZLIB_COMPRESSION	 1
+#define ZLIB_COMPRESSION	1
+#define LZMA_COMPRESSION	2
+#define LZO_COMPRESSION		3
 
 struct squashfs_super_block {
 	__le32			s_magic;
-- 
cgit v1.2.3


From 512dd1abd9539a474f2792eeaf6783c59ad7778a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 13 Jan 2010 22:05:48 +0000
Subject: xfs: kill XFS_QMOPT_ASYNC

The option is unused and one of the few remaining users of
xfs_bawrite, so let's get rid of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_dquot.c | 2 --
 fs/xfs/xfs_quota.h       | 1 -
 2 files changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d7c7eea09fc2..a447493690eb 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1253,8 +1253,6 @@ xfs_qm_dqflush(
 
 	if (flags & XFS_QMOPT_DELWRI) {
 		xfs_bdwrite(mp, bp);
-	} else if (flags & XFS_QMOPT_ASYNC) {
-		error = xfs_bawrite(mp, bp);
 	} else {
 		error = xfs_bwrite(mp, bp);
 	}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 91bfd60f4c74..21d11d9f48f2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -226,7 +226,6 @@ typedef struct xfs_qoff_logformat {
  * flags for dqflush and dqflush_all.
  */
 #define XFS_QMOPT_SYNC		0x1000000
-#define XFS_QMOPT_ASYNC		0x2000000
 #define XFS_QMOPT_DELWRI	0x4000000
 
 /*
-- 
cgit v1.2.3


From 4d1f88d75b00c4d23f4c51305ab5b779a86ef74e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 13 Jan 2010 22:05:49 +0000
Subject: xfs: clean up error handling in xfs_trans_dqresv

Move the error code selection after the goto label and fold the
xfs_quota_error helper into it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_trans_dquot.c | 48 +++++++++++++++---------------------------
 1 file changed, 17 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 97ac9640be98..b9db6f781cd6 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -589,14 +589,6 @@ xfs_trans_unreserve_and_mod_dquots(
 	}
 }
 
-STATIC int
-xfs_quota_error(uint flags)
-{
-	if (flags & XFS_QMOPT_ENOSPC)
-		return ENOSPC;
-	return EDQUOT;
-}
-
 /*
  * This reserves disk blocks and inodes against a dquot.
  * Flags indicate if the dquot is to be locked here and also
@@ -612,7 +604,6 @@ xfs_trans_dqresv(
 	long		ninos,
 	uint		flags)
 {
-	int		error;
 	xfs_qcnt_t	hardlimit;
 	xfs_qcnt_t	softlimit;
 	time_t		timer;
@@ -649,7 +640,6 @@ xfs_trans_dqresv(
 		warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount);
 		resbcountp = &dqp->q_res_rtbcount;
 	}
-	error = 0;
 
 	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
 	    dqp->q_core.d_id &&
@@ -667,19 +657,13 @@ xfs_trans_dqresv(
 			 * nblks.
 			 */
 			if (hardlimit > 0ULL &&
-			     (hardlimit <= nblks + *resbcountp)) {
-				error = xfs_quota_error(flags);
+			    hardlimit <= nblks + *resbcountp)
 				goto error_return;
-			}
-
 			if (softlimit > 0ULL &&
-			     (softlimit <= nblks + *resbcountp)) {
-				if ((timer != 0 && get_seconds() > timer) ||
-				    (warns != 0 && warns >= warnlimit)) {
-					error = xfs_quota_error(flags);
-					goto error_return;
-				}
-			}
+			    softlimit <= nblks + *resbcountp &&
+			    ((timer != 0 && get_seconds() > timer) ||
+			     (warns != 0 && warns >= warnlimit)))
+				goto error_return;
 		}
 		if (ninos > 0) {
 			count = be64_to_cpu(dqp->q_core.d_icount);
@@ -692,16 +676,13 @@ xfs_trans_dqresv(
 			softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
 			if (!softlimit)
 				softlimit = q->qi_isoftlimit;
-			if (hardlimit > 0ULL && count >= hardlimit) {
-				error = xfs_quota_error(flags);
+
+			if (hardlimit > 0ULL && count >= hardlimit)
+				goto error_return;
+			if (softlimit > 0ULL && count >= softlimit &&
+			    ((timer != 0 && get_seconds() > timer) ||
+			     (warns != 0 && warns >= warnlimit)))
 				goto error_return;
-			} else if (softlimit > 0ULL && count >= softlimit) {
-				if ((timer != 0 && get_seconds() > timer) ||
-				     (warns != 0 && warns >= warnlimit)) {
-					error = xfs_quota_error(flags);
-					goto error_return;
-				}
-			}
 		}
 	}
 
@@ -736,9 +717,14 @@ xfs_trans_dqresv(
 	ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
 	ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
 
+	xfs_dqunlock(dqp);
+	return 0;
+
 error_return:
 	xfs_dqunlock(dqp);
-	return error;
+	if (flags & XFS_QMOPT_ENOSPC)
+		return ENOSPC;
+	return EDQUOT;
 }
 
 
-- 
cgit v1.2.3


From a210c1aa7f6c90b729cc3a72d03e789b13cb6c47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 17 Jan 2010 22:36:19 +0000
Subject: xfs: implement quota warnings via netlink

Wire up quota_send_warning to send quota warnings over netlink.
This is used by various desktops to show user quota warnings.

Tested by running the quota_nld daemon while running the xfstest
quota tests and observing the warnings.  I'll see how I can get a
more formal testcase for it written.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_trans_dquot.c | 49 +++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index b9db6f781cd6..c3ab75cb1d9a 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -589,6 +589,20 @@ xfs_trans_unreserve_and_mod_dquots(
 	}
 }
 
+STATIC void
+xfs_quota_warn(
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*dqp,
+	int			type)
+{
+	/* no warnings for project quotas - we just return ENOSPC later */
+	if (dqp->dq_flags & XFS_DQ_PROJ)
+		return;
+	quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
+			   be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
+			   type);
+}
+
 /*
  * This reserves disk blocks and inodes against a dquot.
  * Flags indicate if the dquot is to be locked here and also
@@ -657,13 +671,21 @@ xfs_trans_dqresv(
 			 * nblks.
 			 */
 			if (hardlimit > 0ULL &&
-			    hardlimit <= nblks + *resbcountp)
+			    hardlimit <= nblks + *resbcountp) {
+				xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
 				goto error_return;
+			}
 			if (softlimit > 0ULL &&
-			    softlimit <= nblks + *resbcountp &&
-			    ((timer != 0 && get_seconds() > timer) ||
-			     (warns != 0 && warns >= warnlimit)))
-				goto error_return;
+			    softlimit <= nblks + *resbcountp) {
+				if ((timer != 0 && get_seconds() > timer) ||
+				    (warns != 0 && warns >= warnlimit)) {
+					xfs_quota_warn(mp, dqp,
+						       QUOTA_NL_BSOFTLONGWARN);
+					goto error_return;
+				}
+
+				xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
+			}
 		}
 		if (ninos > 0) {
 			count = be64_to_cpu(dqp->q_core.d_icount);
@@ -677,12 +699,19 @@ xfs_trans_dqresv(
 			if (!softlimit)
 				softlimit = q->qi_isoftlimit;
 
-			if (hardlimit > 0ULL && count >= hardlimit)
-				goto error_return;
-			if (softlimit > 0ULL && count >= softlimit &&
-			    ((timer != 0 && get_seconds() > timer) ||
-			     (warns != 0 && warns >= warnlimit)))
+			if (hardlimit > 0ULL && count >= hardlimit) {
+				xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
 				goto error_return;
+			}
+			if (softlimit > 0ULL && count >= softlimit) {
+				if  ((timer != 0 && get_seconds() > timer) ||
+				     (warns != 0 && warns >= warnlimit)) {
+					xfs_quota_warn(mp, dqp,
+						       QUOTA_NL_ISOFTLONGWARN);
+					goto error_return;
+				}
+				xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
+			}
 		}
 	}
 
-- 
cgit v1.2.3


From 0cadda1c5f194f98a05d252ff4385d86d2ed0862 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 19 Jan 2010 09:56:44 +0000
Subject: xfs: remove duplicate buffer flags

Currently we define aliases for the buffer flags in various
namespaces, which only adds confusion.  Remove all but the XBF_
flags to clean this up a bit.

Note that we still abuse XFS_B_ASYNC/XBF_ASYNC for some non-buffer
uses, but I'll clean that up later.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c     |  2 +-
 fs/xfs/linux-2.6/xfs_buf.h     | 22 ++++------------------
 fs/xfs/linux-2.6/xfs_fs_subr.c |  2 +-
 fs/xfs/linux-2.6/xfs_sync.c    |  4 ++--
 fs/xfs/quota/xfs_dquot.c       |  3 +--
 fs/xfs/quota/xfs_dquot_item.c  |  3 +--
 fs/xfs/xfs_alloc.c             |  2 +-
 fs/xfs/xfs_attr.c              | 12 +++++-------
 fs/xfs/xfs_attr_leaf.c         |  2 +-
 fs/xfs/xfs_btree.c             |  4 ++--
 fs/xfs/xfs_ialloc.c            |  2 +-
 fs/xfs/xfs_inode.c             | 20 ++++++++++----------
 fs/xfs/xfs_inode_item.c        |  2 +-
 fs/xfs/xfs_log_recover.c       |  8 ++++----
 fs/xfs/xfs_mount.c             |  4 ++--
 fs/xfs/xfs_trans_buf.c         | 27 ++++++++++++++-------------
 fs/xfs/xfs_vnodeops.c          |  4 ++--
 17 files changed, 53 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index efd745bb8887..730eff1e71a3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1169,7 +1169,7 @@ xfs_bioerror_relse(
 	XFS_BUF_STALE(bp);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
 	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
-	if (!(fl & XFS_B_ASYNC)) {
+	if (!(fl & XBF_ASYNC)) {
 		/*
 		 * Mark b_error and B_ERROR _both_.
 		 * Lot's of chunkcache code assumes that.
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 4f2ad66edb76..ea8c198f0c39 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -275,33 +275,19 @@ extern void xfs_buf_terminate(void);
 	({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
 
 
-#define XFS_B_ASYNC		XBF_ASYNC
-#define XFS_B_DELWRI		XBF_DELWRI
-#define XFS_B_READ		XBF_READ
-#define XFS_B_WRITE		XBF_WRITE
-#define XFS_B_STALE		XBF_STALE
-
-#define XFS_BUF_TRYLOCK		XBF_TRYLOCK
-#define XFS_INCORE_TRYLOCK	XBF_TRYLOCK
-#define XFS_BUF_LOCK		XBF_LOCK
-#define XFS_BUF_MAPPED		XBF_MAPPED
-
-#define BUF_BUSY		XBF_DONT_BLOCK
-
 #define XFS_BUF_BFLAGS(bp)	((bp)->b_flags)
 #define XFS_BUF_ZEROFLAGS(bp)	((bp)->b_flags &= \
 		~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
 
-#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XFS_B_STALE)
-#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XFS_B_STALE)
-#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XFS_B_STALE)
+#define XFS_BUF_STALE(bp)	((bp)->b_flags |= XBF_STALE)
+#define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
+#define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XBF_STALE)
 #define XFS_BUF_SUPER_STALE(bp)	do {				\
 					XFS_BUF_STALE(bp);	\
 					xfs_buf_delwri_dequeue(bp);	\
 					XFS_BUF_DONE(bp);	\
 				} while (0)
 
-#define XFS_BUF_MANAGE		XBF_FS_MANAGED
 #define XFS_BUF_UNMANAGE(bp)	((bp)->b_flags &= ~XBF_FS_MANAGED)
 
 #define XFS_BUF_DELAYWRITE(bp)		((bp)->b_flags |= XBF_DELWRI)
@@ -390,7 +376,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
 
 #define xfs_biomove(bp, off, len, data, rw) \
 	    xfs_buf_iomove((bp), (off), (len), (data), \
-		((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ)
+		((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ)
 
 #define xfs_biozero(bp, off, len) \
 	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 7501b85fd860..b6918d76bc7b 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -79,7 +79,7 @@ xfs_flush_pages(
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
 		ret = -filemap_fdatawrite(mapping);
 	}
-	if (flags & XFS_B_ASYNC)
+	if (flags & XBF_ASYNC)
 		return ret;
 	ret2 = xfs_wait_on_pages(ip, first, last);
 	if (!ret)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b58f8412dfe2..58c24be72c65 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -234,7 +234,7 @@ xfs_sync_inode_data(
 	}
 
 	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XFS_B_ASYNC, FI_NONE);
+				0 : XBF_ASYNC, FI_NONE);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
  out_wait:
@@ -370,7 +370,7 @@ xfs_sync_fsdata(
 	if (flags & SYNC_TRYLOCK) {
 		ASSERT(!(flags & SYNC_WAIT));
 
-		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+		bp = xfs_getsb(mp, XBF_TRYLOCK);
 		if (!bp)
 			goto out;
 
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index a447493690eb..5756392ffdee 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1527,8 +1527,7 @@ xfs_qm_dqflock_pushbuf_wait(
 	 * the flush lock when the I/O completes.
 	 */
 	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
-		    XFS_QI_DQCHUNKLEN(dqp->q_mount),
-		    XFS_INCORE_TRYLOCK);
+		    XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
 			int	error;
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index d0d4a9a0bbd7..37929d19ef44 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -237,8 +237,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	}
 	mp = dqp->q_mount;
 	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
-		    XFS_QI_DQCHUNKLEN(mp),
-		    XFS_INCORE_TRYLOCK);
+		    XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
 			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 8aa181d6dd7d..a27aeb7d9e74 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2180,7 +2180,7 @@ xfs_alloc_read_agf(
 	ASSERT(agno != NULLAGNUMBER);
 
 	error = xfs_read_agf(mp, tp, agno,
-			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
 			bpp);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index f7b426a1b6ee..b9c196a53c42 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2015,15 +2015,14 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 			error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-					     blkcnt,
-					     XFS_BUF_LOCK | XBF_DONT_BLOCK,
+					     blkcnt, XBF_LOCK | XBF_DONT_BLOCK,
 					     &bp);
 			if (error)
 				return(error);
 
 			tmp = (valuelen < XFS_BUF_SIZE(bp))
 				? valuelen : XFS_BUF_SIZE(bp);
-			xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
+			xfs_biomove(bp, 0, tmp, dst, XBF_READ);
 			xfs_buf_relse(bp);
 			dst += tmp;
 			valuelen -= tmp;
@@ -2149,13 +2148,13 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
 
 		bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt,
-				 XFS_BUF_LOCK | XBF_DONT_BLOCK);
+				 XBF_LOCK | XBF_DONT_BLOCK);
 		ASSERT(bp);
 		ASSERT(!XFS_BUF_GETERROR(bp));
 
 		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
 							XFS_BUF_SIZE(bp);
-		xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
+		xfs_biomove(bp, 0, tmp, src, XBF_WRITE);
 		if (tmp < XFS_BUF_SIZE(bp))
 			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
 		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
@@ -2216,8 +2215,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		/*
 		 * If the "remote" value is in the cache, remove it.
 		 */
-		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt,
-				XFS_INCORE_TRYLOCK);
+		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK);
 		if (bp) {
 			XFS_BUF_STALE(bp);
 			XFS_BUF_UNDELAYWRITE(bp);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 52519a201849..a90ce74fc256 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2950,7 +2950,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 						map.br_blockcount);
 			bp = xfs_trans_get_buf(*trans,
 					dp->i_mount->m_ddev_targp,
-					dblkno, dblkcnt, XFS_BUF_LOCK);
+					dblkno, dblkcnt, XBF_LOCK);
 			xfs_trans_binval(*trans, bp);
 			/*
 			 * Roll to next transaction.
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 36a0992dd669..96be4b0f2496 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -977,7 +977,7 @@ xfs_btree_get_buf_block(
 	xfs_daddr_t		d;
 
 	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XFS_BUF_TRYLOCK));
+	ASSERT(!(flags & XBF_TRYLOCK));
 
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
@@ -1008,7 +1008,7 @@ xfs_btree_read_buf_block(
 	int			error;
 
 	/* need to sort out how callers deal with failures first */
-	ASSERT(!(flags & XFS_BUF_TRYLOCK));
+	ASSERT(!(flags & XBF_TRYLOCK));
 
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 52c9d006c0e6..9d884c127bb9 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -205,7 +205,7 @@ xfs_ialloc_inode_init(
 		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
 		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 					 mp->m_bsize * blks_per_cluster,
-					 XFS_BUF_LOCK);
+					 XBF_LOCK);
 		ASSERT(fbuf);
 		ASSERT(!XFS_BUF_GETERROR(fbuf));
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0317b000ab44..bbb3bee8e936 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -151,7 +151,7 @@ xfs_imap_to_bp(
 				"an error %d on %s.  Returning error.",
 				error, mp->m_fsname);
 		} else {
-			ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+			ASSERT(buf_flags & XBF_TRYLOCK);
 		}
 		return error;
 	}
@@ -239,7 +239,7 @@ xfs_inotobp(
 	if (error)
 		return error;
 
-	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XBF_LOCK, imap_flags);
 	if (error)
 		return error;
 
@@ -285,7 +285,7 @@ xfs_itobp(
 		return error;
 
 	if (!bp) {
-		ASSERT(buf_flags & XFS_BUF_TRYLOCK);
+		ASSERT(buf_flags & XBF_TRYLOCK);
 		ASSERT(tp == NULL);
 		*bpp = NULL;
 		return EAGAIN;
@@ -807,7 +807,7 @@ xfs_iread(
 	 * Get pointers to the on-disk inode and the buffer containing it.
 	 */
 	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
-			       XFS_BUF_LOCK, iget_flags);
+			       XBF_LOCK, iget_flags);
 	if (error)
 		return error;
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -1751,7 +1751,7 @@ xfs_iunlink(
 		 * Here we put the head pointer into our next pointer,
 		 * and then we fall through to point the head at us.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
 		if (error)
 			return error;
 
@@ -1833,7 +1833,7 @@ xfs_iunlink_remove(
 		 * of dealing with the buffer when there is no need to
 		 * change it.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
 		if (error) {
 			cmn_err(CE_WARN,
 				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -1895,7 +1895,7 @@ xfs_iunlink_remove(
 		 * Now last_ibp points to the buffer previous to us on
 		 * the unlinked list.  Pull us from the list.
 		 */
-		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
 		if (error) {
 			cmn_err(CE_WARN,
 				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
@@ -2040,7 +2040,7 @@ xfs_ifree_cluster(
 
 		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 
 					mp->m_bsize * blks_per_cluster,
-					XFS_BUF_LOCK);
+					XBF_LOCK);
 
 		pre_flushed = 0;
 		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -2151,7 +2151,7 @@ xfs_ifree(
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
+	error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XBF_LOCK);
 	if (error)
 		return error;
 
@@ -2952,7 +2952,7 @@ xfs_iflush(
 	 * Get the buffer containing the on-disk inode.
 	 */
 	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
-				noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
+				noblock ? XBF_TRYLOCK : XBF_LOCK);
 	if (error || !bp) {
 		xfs_ifunlock(ip);
 		return error;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f38855d21ea5..6194fb5d3777 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -785,7 +785,7 @@ xfs_inode_item_pushbuf(
 
 	mp = ip->i_mount;
 	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
-		    iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);
+		    iip->ili_format.ilf_len, XBF_TRYLOCK);
 
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 65f1f137d789..97148f0c4bdd 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2184,9 +2184,9 @@ xlog_recover_do_buffer_trans(
 	}
 
 	mp = log->l_mp;
-	buf_flags = XFS_BUF_LOCK;
+	buf_flags = XBF_LOCK;
 	if (!(flags & XFS_BLI_INODE_BUF))
-		buf_flags |= XFS_BUF_MAPPED;
+		buf_flags |= XBF_MAPPED;
 
 	bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
 	if (XFS_BUF_ISERROR(bp)) {
@@ -2288,7 +2288,7 @@ xlog_recover_do_inode_trans(
 	}
 
 	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
-			  XFS_BUF_LOCK);
+			  XBF_LOCK);
 	if (XFS_BUF_ISERROR(bp)) {
 		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
 				  bp, in_f->ilf_blkno);
@@ -3146,7 +3146,7 @@ xlog_recover_process_one_iunlink(
 	/*
 	 * Get the on disk inode to find the next inode in the bucket.
 	 */
-	error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
+	error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
 	if (error)
 		goto fail_iput;
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d95bd1809f3c..bb0154047e85 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -665,7 +665,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * access to the superblock.
 	 */
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
+	extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
 
 	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
 			  extra_flags);
@@ -1969,7 +1969,7 @@ xfs_getsb(
 
 	ASSERT(mp->m_sb_bp != NULL);
 	bp = mp->m_sb_bp;
-	if (flags & XFS_BUF_TRYLOCK) {
+	if (flags & XBF_TRYLOCK) {
 		if (!XFS_BUF_CPSEMA(bp)) {
 			return NULL;
 		}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 49130628d5ef..5ffd544434eb 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -75,13 +75,14 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	xfs_buf_log_item_t	*bip;
 
 	if (flags == 0)
-		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+		flags = XBF_LOCK | XBF_MAPPED;
 
 	/*
 	 * Default to a normal get_buf() call if the tp is NULL.
 	 */
 	if (tp == NULL)
-		return xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY);
+		return xfs_buf_get(target_dev, blkno, len,
+				   flags | XBF_DONT_BLOCK);
 
 	/*
 	 * If we find the buffer in the cache with this transaction
@@ -117,14 +118,14 @@ xfs_trans_get_buf(xfs_trans_t	*tp,
 	}
 
 	/*
-	 * We always specify the BUF_BUSY flag within a transaction so
-	 * that get_buf does not try to push out a delayed write buffer
+	 * We always specify the XBF_DONT_BLOCK flag within a transaction
+	 * so that get_buf does not try to push out a delayed write buffer
 	 * which might cause another transaction to take place (if the
 	 * buffer was delayed alloc).  Such recursive transactions can
 	 * easily deadlock with our current transaction as well as cause
 	 * us to run out of stack space.
 	 */
-	bp = xfs_buf_get(target_dev, blkno, len, flags | BUF_BUSY);
+	bp = xfs_buf_get(target_dev, blkno, len, flags | XBF_DONT_BLOCK);
 	if (bp == NULL) {
 		return NULL;
 	}
@@ -290,15 +291,15 @@ xfs_trans_read_buf(
 	int			error;
 
 	if (flags == 0)
-		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+		flags = XBF_LOCK | XBF_MAPPED;
 
 	/*
 	 * Default to a normal get_buf() call if the tp is NULL.
 	 */
 	if (tp == NULL) {
-		bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY);
+		bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
 		if (!bp)
-			return (flags & XFS_BUF_TRYLOCK) ?
+			return (flags & XBF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
 
 		if (XFS_BUF_GETERROR(bp) != 0) {
@@ -385,14 +386,14 @@ xfs_trans_read_buf(
 	}
 
 	/*
-	 * We always specify the BUF_BUSY flag within a transaction so
-	 * that get_buf does not try to push out a delayed write buffer
+	 * We always specify the XBF_DONT_BLOCK flag within a transaction
+	 * so that get_buf does not try to push out a delayed write buffer
 	 * which might cause another transaction to take place (if the
 	 * buffer was delayed alloc).  Such recursive transactions can
 	 * easily deadlock with our current transaction as well as cause
 	 * us to run out of stack space.
 	 */
-	bp = xfs_buf_read(target, blkno, len, flags | BUF_BUSY);
+	bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
 	if (bp == NULL) {
 		*bpp = NULL;
 		return 0;
@@ -472,8 +473,8 @@ shutdown_abort:
 	if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
 		cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
 #endif
-	ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) !=
-						(XFS_B_STALE|XFS_B_DELWRI));
+	ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
+				     (XBF_STALE|XBF_DELWRI));
 
 	trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
 	xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9f7c001ef469..4da96cdffb76 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -256,7 +256,7 @@ xfs_setattr(
 		    iattr->ia_size > ip->i_d.di_size) {
 			code = xfs_flush_pages(ip,
 					ip->i_d.di_size, iattr->ia_size,
-					XFS_B_ASYNC, FI_NONE);
+					XBF_ASYNC, FI_NONE);
 		}
 
 		/* wait for all I/O to complete */
@@ -1096,7 +1096,7 @@ xfs_release(
 		 */
 		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 		if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
-			xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
+			xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
 	}
 
 	if (ip->i_d.di_nlink != 0) {
-- 
cgit v1.2.3


From 4139b3b337cffd106744386c842b89dc86e31d4b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 19 Jan 2010 09:56:45 +0000
Subject: xfs: kill XLOG_VEC_SET_TYPE

This macro only obsfucates the log item type assignments, so kill it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_dquot_item.c |  6 +++---
 fs/xfs/xfs_buf_item.c         |  8 ++++----
 fs/xfs/xfs_extfree_item.c     |  4 ++--
 fs/xfs/xfs_inode_item.c       | 18 +++++++++---------
 fs/xfs/xfs_log.c              |  4 ++--
 fs/xfs/xfs_log.h              |  4 +---
 fs/xfs/xfs_trans.c            |  2 +-
 7 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 37929d19ef44..116580d52fae 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -74,11 +74,11 @@ xfs_qm_dquot_logitem_format(
 
 	logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
 	logvec->i_len  = sizeof(xfs_dq_logformat_t);
-	XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT);
+	logvec->i_type = XLOG_REG_TYPE_QFORMAT;
 	logvec++;
 	logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
 	logvec->i_len  = sizeof(xfs_disk_dquot_t);
-	XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT);
+	logvec->i_type = XLOG_REG_TYPE_DQUOT;
 
 	ASSERT(2 == logitem->qli_item.li_desc->lid_size);
 	logitem->qli_format.qlf_size = 2;
@@ -466,7 +466,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t	*qf,
 
 	log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
 	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF);
+	log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
 	qf->qql_format.qf_size = 1;
 }
 
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a30f7e9eb2b9..e0a11583ce5a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -250,7 +250,7 @@ xfs_buf_item_format(
 		       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
 	vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
 	vecp->i_len = base_size;
-	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
+	vecp->i_type = XLOG_REG_TYPE_BFORMAT;
 	vecp++;
 	nvecs = 1;
 
@@ -297,14 +297,14 @@ xfs_buf_item_format(
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 			nvecs++;
 			break;
 		} else if (next_bit != last_bit + 1) {
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 			nvecs++;
 			vecp++;
 			first_bit = next_bit;
@@ -316,7 +316,7 @@ xfs_buf_item_format(
 			buffer_offset = first_bit * XFS_BLI_CHUNK;
 			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 			vecp->i_len = nbits * XFS_BLI_CHUNK;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
+			vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 /* You would think we need to bump the nvecs here too, but we do not
  * this number is used by recovery, and it gets confused by the boundary
  * split here
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 05a4bdd4be39..6f35ed1b39b9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,7 +82,7 @@ xfs_efi_item_format(xfs_efi_log_item_t	*efip,
 
 	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
 	log_vector->i_len = size;
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT);
+	log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
 	ASSERT(size >= sizeof(xfs_efi_log_format_t));
 }
 
@@ -406,7 +406,7 @@ xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
 
 	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
 	log_vector->i_len = size;
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT);
+	log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
 	ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 6194fb5d3777..da4cac67bdae 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -228,7 +228,7 @@ xfs_inode_item_format(
 
 	vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
 	vecp->i_len  = sizeof(xfs_inode_log_format_t);
-	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT);
+	vecp->i_type = XLOG_REG_TYPE_IFORMAT;
 	vecp++;
 	nvecs	     = 1;
 
@@ -279,7 +279,7 @@ xfs_inode_item_format(
 
 	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
 	vecp->i_len  = sizeof(struct xfs_icdinode);
-	XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
+	vecp->i_type = XLOG_REG_TYPE_ICORE;
 	vecp++;
 	nvecs++;
 	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -336,7 +336,7 @@ xfs_inode_item_format(
 				vecp->i_addr =
 					(char *)(ip->i_df.if_u1.if_extents);
 				vecp->i_len = ip->i_df.if_bytes;
-				XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
+				vecp->i_type = XLOG_REG_TYPE_IEXT;
 			} else
 #endif
 			{
@@ -355,7 +355,7 @@ xfs_inode_item_format(
 				vecp->i_addr = (xfs_caddr_t)ext_buffer;
 				vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 						XFS_DATA_FORK);
-				XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
+				vecp->i_type = XLOG_REG_TYPE_IEXT;
 			}
 			ASSERT(vecp->i_len <= ip->i_df.if_bytes);
 			iip->ili_format.ilf_dsize = vecp->i_len;
@@ -373,7 +373,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_df.if_broot != NULL);
 			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
 			vecp->i_len = ip->i_df.if_broot_bytes;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT);
+			vecp->i_type = XLOG_REG_TYPE_IBROOT;
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -399,7 +399,7 @@ xfs_inode_item_format(
 			ASSERT((ip->i_df.if_real_bytes == 0) ||
 			       (ip->i_df.if_real_bytes == data_bytes));
 			vecp->i_len = (int)data_bytes;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL);
+			vecp->i_type = XLOG_REG_TYPE_ILOCAL;
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -477,7 +477,7 @@ xfs_inode_item_format(
 			vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
 					XFS_ATTR_FORK);
 #endif
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT);
+			vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
 			iip->ili_format.ilf_asize = vecp->i_len;
 			vecp++;
 			nvecs++;
@@ -492,7 +492,7 @@ xfs_inode_item_format(
 			ASSERT(ip->i_afp->if_broot != NULL);
 			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
 			vecp->i_len = ip->i_afp->if_broot_bytes;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT);
+			vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -516,7 +516,7 @@ xfs_inode_item_format(
 			ASSERT((ip->i_afp->if_real_bytes == 0) ||
 			       (ip->i_afp->if_real_bytes == data_bytes));
 			vecp->i_len = (int)data_bytes;
-			XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL);
+			vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
 			vecp++;
 			nvecs++;
 			iip->ili_format.ilf_asize = (unsigned)data_bytes;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 0d17516fbb13..20118ddadef6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -617,7 +617,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	if (! (XLOG_FORCED_SHUTDOWN(log))) {
 		reg[0].i_addr = (void*)&magic;
 		reg[0].i_len  = sizeof(magic);
-		XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT);
+		reg[0].i_type = XLOG_REG_TYPE_UNMOUNT;
 
 		error = xfs_log_reserve(mp, 600, 1, &tic,
 					XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
@@ -1236,7 +1236,7 @@ xlog_commit_record(xfs_mount_t  *mp,
 
 	reg[0].i_addr = NULL;
 	reg[0].i_len = 0;
-	XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT);
+	reg[0].i_type = XLOG_REG_TYPE_COMMIT;
 
 	ASSERT_ALWAYS(iclog);
 	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d0c9baa50b1a..811ccf4d8b3e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -110,10 +110,8 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XLOG_REG_TYPE_TRANSHDR		19
 #define XLOG_REG_TYPE_MAX		19
 
-#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
-
 typedef struct xfs_log_iovec {
-	xfs_caddr_t		i_addr;		/* beginning address of region */
+	xfs_caddr_t	i_addr;		/* beginning address of region */
 	int		i_len;		/* length in bytes of region */
 	uint		i_type;		/* type of region */
 } xfs_log_iovec_t;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 237badcbac3b..7dbe3c3051db 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1121,7 +1121,7 @@ xfs_trans_fill_vecs(
 	tp->t_header.th_num_items = nitems;
 	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
 	log_vector->i_len = sizeof(xfs_trans_header_t);
-	XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR);
+	log_vector->i_type = XLOG_REG_TYPE_TRANSHDR;
 }
 
 
-- 
cgit v1.2.3


From a14a348bff2f99471a28e5928eb6801224c053d8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 19 Jan 2010 09:56:46 +0000
Subject: xfs: cleanup up xfs_log_force calling conventions

Remove the XFS_LOG_FORCE argument which was always set, and the
XFS_LOG_URGE define, which was never used.

Split xfs_log_force into a two helpers - xfs_log_force which forces
the whole log, and xfs_log_force_lsn which forces up to the
specified LSN.  The underlying implementations already were entirely
separate, as were the users.

Also re-indent the new _xfs_log_force/_xfs_log_force which
previously had a weird coding style.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c    |  17 +--
 fs/xfs/quota/xfs_dquot.c       |  10 +-
 fs/xfs/quota/xfs_dquot_item.c  |   9 +-
 fs/xfs/quota/xfs_qm_syscalls.c |   4 +-
 fs/xfs/xfs_alloc.c             |   2 +-
 fs/xfs/xfs_inode.c             |   9 +-
 fs/xfs/xfs_inode_item.c        |   7 +-
 fs/xfs/xfs_log.c               | 312 ++++++++++++++++++++---------------------
 fs/xfs/xfs_log.h               |  15 +-
 fs/xfs/xfs_log_recover.c       |   3 +-
 fs/xfs/xfs_mount.c             |   4 +-
 fs/xfs/xfs_trans.c             |   5 +-
 fs/xfs/xfs_trans_ail.c         |   2 +-
 fs/xfs/xfs_vnodeops.c          |   5 +-
 14 files changed, 193 insertions(+), 211 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 58c24be72c65..c9b863eacab7 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -296,10 +296,7 @@ xfs_sync_data(
 	if (error)
 		return XFS_ERROR(error);
 
-	xfs_log_force(mp, 0,
-		      (flags & SYNC_WAIT) ?
-		       XFS_LOG_FORCE | XFS_LOG_SYNC :
-		       XFS_LOG_FORCE);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
 	return 0;
 }
 
@@ -325,10 +322,6 @@ xfs_commit_dummy_trans(
 	struct xfs_inode	*ip = mp->m_rootip;
 	struct xfs_trans	*tp;
 	int			error;
-	int			log_flags = XFS_LOG_FORCE;
-
-	if (flags & SYNC_WAIT)
-		log_flags |= XFS_LOG_SYNC;
 
 	/*
 	 * Put a dummy transaction in the log to tell recovery
@@ -350,7 +343,7 @@ xfs_commit_dummy_trans(
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
 	/* the log force ensures this transaction is pushed to disk */
-	xfs_log_force(mp, 0, log_flags);
+	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
 	return error;
 }
 
@@ -390,7 +383,7 @@ xfs_sync_fsdata(
 		 * become pinned in between there and here.
 		 */
 		if (XFS_BUF_ISPINNED(bp))
-			xfs_log_force(mp, 0, XFS_LOG_FORCE);
+			xfs_log_force(mp, 0);
 	}
 
 
@@ -575,7 +568,7 @@ xfs_flush_inodes(
 	igrab(inode);
 	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
 	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
 }
 
 /*
@@ -591,7 +584,7 @@ xfs_sync_worker(
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 5756392ffdee..f9baeedbfdfe 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1248,7 +1248,7 @@ xfs_qm_dqflush(
 	 */
 	if (XFS_BUF_ISPINNED(bp)) {
 		trace_xfs_dqflush_force(dqp);
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 	}
 
 	if (flags & XFS_QMOPT_DELWRI) {
@@ -1531,11 +1531,9 @@ xfs_qm_dqflock_pushbuf_wait(
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
 			int	error;
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(dqp->q_mount,
-					      (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
+
+			if (XFS_BUF_ISPINNED(bp))
+				xfs_log_force(dqp->q_mount, 0);
 			error = xfs_bawrite(dqp->q_mount, bp);
 			if (error)
 				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 116580d52fae..1b564376d50c 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -190,7 +190,7 @@ xfs_qm_dqunpin_wait(
 	/*
 	 * Give the log a push so we don't wait here too long.
 	 */
-	xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	xfs_log_force(dqp->q_mount, 0);
 	wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
 }
 
@@ -245,10 +245,9 @@ xfs_qm_dquot_logitem_pushbuf(
 			qip->qli_pushbuf_flag = 0;
 			xfs_dqunlock(dqp);
 
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
+			if (XFS_BUF_ISPINNED(bp))
+				xfs_log_force(mp, 0);
+
 			if (dopush) {
 				int	error;
 #ifdef XFSRACEDEBUG
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 873e07e29074..5d0ee8d492db 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1192,9 +1192,9 @@ xfs_qm_internalqcheck(
 	if (! XFS_IS_QUOTA_ON(mp))
 		return XFS_ERROR(ESRCH);
 
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	XFS_bflush(mp->m_ddev_targp);
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	XFS_bflush(mp->m_ddev_targp);
 
 	mutex_lock(&qcheck_lock);
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index a27aeb7d9e74..94cddbfb2560 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2601,5 +2601,5 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
 	 * transaction that freed the block
 	 */
 	if (lsn)
-		xfs_log_force(tp->t_mountp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+		xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bbb3bee8e936..d0d1b5a05183 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2484,8 +2484,11 @@ __xfs_iunpin_wait(
 		return;
 
 	/* Give the log a push to start the unpinning I/O */
-	xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
-				iip->ili_last_lsn : 0, XFS_LOG_FORCE);
+	if (iip && iip->ili_last_lsn)
+		xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0);
+	else
+		xfs_log_force(ip->i_mount, 0);
+
 	if (wait)
 		wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
 }
@@ -2970,7 +2973,7 @@ xfs_iflush(
 	 * get stuck waiting in the write for too long.
 	 */
 	if (XFS_BUF_ISPINNED(bp))
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 
 	/*
 	 * inode clustering:
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index da4cac67bdae..48ec1c0b23ce 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -804,10 +804,9 @@ xfs_inode_item_pushbuf(
 
 			trace_xfs_inode_item_push(bp, _RET_IP_);
 
-			if (XFS_BUF_ISPINNED(bp)) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-					      XFS_LOG_FORCE);
-			}
+			if (XFS_BUF_ISPINNED(bp))
+				xfs_log_force(mp, 0);
+
 			if (dopush) {
 				int	error;
 				error = xfs_bawrite(mp, bp);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 20118ddadef6..4f16be4b6ee5 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -79,11 +79,6 @@ STATIC int  xlog_state_release_iclog(xlog_t		*log,
 STATIC void xlog_state_switch_iclogs(xlog_t		*log,
 				     xlog_in_core_t *iclog,
 				     int		eventual_size);
-STATIC int  xlog_state_sync(xlog_t			*log,
-			    xfs_lsn_t 			lsn,
-			    uint			flags,
-			    int				*log_flushed);
-STATIC int  xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed);
 STATIC void xlog_state_want_sync(xlog_t	*log, xlog_in_core_t *iclog);
 
 /* local functions to manipulate grant head */
@@ -296,65 +291,6 @@ xfs_log_done(xfs_mount_t	*mp,
 	return lsn;
 }	/* xfs_log_done */
 
-
-/*
- * Force the in-core log to disk.  If flags == XFS_LOG_SYNC,
- *	the force is done synchronously.
- *
- * Asynchronous forces are implemented by setting the WANT_SYNC
- * bit in the appropriate in-core log and then returning.
- *
- * Synchronous forces are implemented with a signal variable. All callers
- * to force a given lsn to disk will wait on a the sv attached to the
- * specific in-core log.  When given in-core log finally completes its
- * write to disk, that thread will wake up all threads waiting on the
- * sv.
- */
-int
-_xfs_log_force(
-	xfs_mount_t	*mp,
-	xfs_lsn_t	lsn,
-	uint		flags,
-	int		*log_flushed)
-{
-	xlog_t		*log = mp->m_log;
-	int		dummy;
-
-	if (!log_flushed)
-		log_flushed = &dummy;
-
-	ASSERT(flags & XFS_LOG_FORCE);
-
-	XFS_STATS_INC(xs_log_force);
-
-	if (log->l_flags & XLOG_IO_ERROR)
-		return XFS_ERROR(EIO);
-	if (lsn == 0)
-		return xlog_state_sync_all(log, flags, log_flushed);
-	else
-		return xlog_state_sync(log, lsn, flags, log_flushed);
-}	/* _xfs_log_force */
-
-/*
- * Wrapper for _xfs_log_force(), to be used when caller doesn't care
- * about errors or whether the log was flushed or not. This is the normal
- * interface to use when trying to unpin items or move the log forward.
- */
-void
-xfs_log_force(
-	xfs_mount_t	*mp,
-	xfs_lsn_t	lsn,
-	uint		flags)
-{
-	int	error;
-	error = _xfs_log_force(mp, lsn, flags, NULL);
-	if (error) {
-		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
-			"error %d returned.", error);
-	}
-}
-
-
 /*
  * Attaches a new iclog I/O completion callback routine during
  * transaction commit.  If the log is in error state, a non-zero
@@ -601,7 +537,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return 0;
 
-	error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);
+	error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
 	ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
 
 #ifdef DEBUG
@@ -2853,7 +2789,6 @@ xlog_state_switch_iclogs(xlog_t		*log,
 	log->l_iclog = iclog->ic_next;
 }	/* xlog_state_switch_iclogs */
 
-
 /*
  * Write out all data in the in-core log as of this exact moment in time.
  *
@@ -2881,11 +2816,17 @@ xlog_state_switch_iclogs(xlog_t		*log,
  *	   b) when we return from flushing out this iclog, it is still
  *		not in the active nor dirty state.
  */
-STATIC int
-xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
+int
+_xfs_log_force(
+	struct xfs_mount	*mp,
+	uint			flags,
+	int			*log_flushed)
 {
-	xlog_in_core_t	*iclog;
-	xfs_lsn_t	lsn;
+	struct log		*log = mp->m_log;
+	struct xlog_in_core	*iclog;
+	xfs_lsn_t		lsn;
+
+	XFS_STATS_INC(xs_log_force);
 
 	spin_lock(&log->l_icloglock);
 
@@ -2931,7 +2872,9 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
 
 				if (xlog_state_release_iclog(log, iclog))
 					return XFS_ERROR(EIO);
-				*log_flushed = 1;
+
+				if (log_flushed)
+					*log_flushed = 1;
 				spin_lock(&log->l_icloglock);
 				if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn &&
 				    iclog->ic_state != XLOG_STATE_DIRTY)
@@ -2975,19 +2918,37 @@ maybe_sleep:
 		 */
 		if (iclog->ic_state & XLOG_STATE_IOERROR)
 			return XFS_ERROR(EIO);
-		*log_flushed = 1;
-
+		if (log_flushed)
+			*log_flushed = 1;
 	} else {
 
 no_sleep:
 		spin_unlock(&log->l_icloglock);
 	}
 	return 0;
-}	/* xlog_state_sync_all */
+}
 
+/*
+ * Wrapper for _xfs_log_force(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int	error;
+
+	error = _xfs_log_force(mp, flags, NULL);
+	if (error) {
+		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
+			"error %d returned.", error);
+	}
+}
 
 /*
- * Used by code which implements synchronous log forces.
+ * Force the in-core log to disk for a specific LSN.
  *
  * Find in-core log with lsn.
  *	If it is in the DIRTY state, just return.
@@ -2995,109 +2956,142 @@ no_sleep:
  *		state and go to sleep or return.
  *	If it is in any other state, go to sleep or return.
  *
- * If filesystem activity goes to zero, the iclog will get flushed only by
- * bdflush().
+ * Synchronous forces are implemented with a signal variable. All callers
+ * to force a given lsn to disk will wait on a the sv attached to the
+ * specific in-core log.  When given in-core log finally completes its
+ * write to disk, that thread will wake up all threads waiting on the
+ * sv.
  */
-STATIC int
-xlog_state_sync(xlog_t	  *log,
-		xfs_lsn_t lsn,
-		uint	  flags,
-		int	  *log_flushed)
+int
+_xfs_log_force_lsn(
+	struct xfs_mount	*mp,
+	xfs_lsn_t		lsn,
+	uint			flags,
+	int			*log_flushed)
 {
-    xlog_in_core_t	*iclog;
-    int			already_slept = 0;
-
-try_again:
-    spin_lock(&log->l_icloglock);
-    iclog = log->l_iclog;
+	struct log		*log = mp->m_log;
+	struct xlog_in_core	*iclog;
+	int			already_slept = 0;
 
-    if (iclog->ic_state & XLOG_STATE_IOERROR) {
-	    spin_unlock(&log->l_icloglock);
-	    return XFS_ERROR(EIO);
-    }
+	ASSERT(lsn != 0);
 
-    do {
-	if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
-		iclog = iclog->ic_next;
-		continue;
-	}
+	XFS_STATS_INC(xs_log_force);
 
-	if (iclog->ic_state == XLOG_STATE_DIRTY) {
+try_again:
+	spin_lock(&log->l_icloglock);
+	iclog = log->l_iclog;
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		spin_unlock(&log->l_icloglock);
-		return 0;
+		return XFS_ERROR(EIO);
 	}
 
-	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
-		/*
-		 * We sleep here if we haven't already slept (e.g.
-		 * this is the first time we've looked at the correct
-		 * iclog buf) and the buffer before us is going to
-		 * be sync'ed. The reason for this is that if we
-		 * are doing sync transactions here, by waiting for
-		 * the previous I/O to complete, we can allow a few
-		 * more transactions into this iclog before we close
-		 * it down.
-		 *
-		 * Otherwise, we mark the buffer WANT_SYNC, and bump
-		 * up the refcnt so we can release the log (which drops
-		 * the ref count).  The state switch keeps new transaction
-		 * commits from using this buffer.  When the current commits
-		 * finish writing into the buffer, the refcount will drop to
-		 * zero and the buffer will go out then.
-		 */
-		if (!already_slept &&
-		    (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC |
-						 XLOG_STATE_SYNCING))) {
-			ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
-			XFS_STATS_INC(xs_log_force_sleep);
-			sv_wait(&iclog->ic_prev->ic_write_wait, PSWP,
-				&log->l_icloglock, s);
-			*log_flushed = 1;
-			already_slept = 1;
-			goto try_again;
-		} else {
+	do {
+		if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+			iclog = iclog->ic_next;
+			continue;
+		}
+
+		if (iclog->ic_state == XLOG_STATE_DIRTY) {
+			spin_unlock(&log->l_icloglock);
+			return 0;
+		}
+
+		if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+			/*
+			 * We sleep here if we haven't already slept (e.g.
+			 * this is the first time we've looked at the correct
+			 * iclog buf) and the buffer before us is going to
+			 * be sync'ed. The reason for this is that if we
+			 * are doing sync transactions here, by waiting for
+			 * the previous I/O to complete, we can allow a few
+			 * more transactions into this iclog before we close
+			 * it down.
+			 *
+			 * Otherwise, we mark the buffer WANT_SYNC, and bump
+			 * up the refcnt so we can release the log (which
+			 * drops the ref count).  The state switch keeps new
+			 * transaction commits from using this buffer.  When
+			 * the current commits finish writing into the buffer,
+			 * the refcount will drop to zero and the buffer will
+			 * go out then.
+			 */
+			if (!already_slept &&
+			    (iclog->ic_prev->ic_state &
+			     (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
+				ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
+
+				XFS_STATS_INC(xs_log_force_sleep);
+
+				sv_wait(&iclog->ic_prev->ic_write_wait,
+					PSWP, &log->l_icloglock, s);
+				if (log_flushed)
+					*log_flushed = 1;
+				already_slept = 1;
+				goto try_again;
+			}
 			atomic_inc(&iclog->ic_refcnt);
 			xlog_state_switch_iclogs(log, iclog, 0);
 			spin_unlock(&log->l_icloglock);
 			if (xlog_state_release_iclog(log, iclog))
 				return XFS_ERROR(EIO);
-			*log_flushed = 1;
+			if (log_flushed)
+				*log_flushed = 1;
 			spin_lock(&log->l_icloglock);
 		}
-	}
 
-	if ((flags & XFS_LOG_SYNC) && /* sleep */
-	    !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
+		if ((flags & XFS_LOG_SYNC) && /* sleep */
+		    !(iclog->ic_state &
+		      (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
+			/*
+			 * Don't wait on completion if we know that we've
+			 * gotten a log write error.
+			 */
+			if (iclog->ic_state & XLOG_STATE_IOERROR) {
+				spin_unlock(&log->l_icloglock);
+				return XFS_ERROR(EIO);
+			}
+			XFS_STATS_INC(xs_log_force_sleep);
+			sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
+			/*
+			 * No need to grab the log lock here since we're
+			 * only deciding whether or not to return EIO
+			 * and the memory read should be atomic.
+			 */
+			if (iclog->ic_state & XLOG_STATE_IOERROR)
+				return XFS_ERROR(EIO);
 
-		/*
-		 * Don't wait on completion if we know that we've
-		 * gotten a log write error.
-		 */
-		if (iclog->ic_state & XLOG_STATE_IOERROR) {
+			if (log_flushed)
+				*log_flushed = 1;
+		} else {		/* just return */
 			spin_unlock(&log->l_icloglock);
-			return XFS_ERROR(EIO);
 		}
-		XFS_STATS_INC(xs_log_force_sleep);
-		sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
-		/*
-		 * No need to grab the log lock here since we're
-		 * only deciding whether or not to return EIO
-		 * and the memory read should be atomic.
-		 */
-		if (iclog->ic_state & XLOG_STATE_IOERROR)
-			return XFS_ERROR(EIO);
-		*log_flushed = 1;
-	} else {		/* just return */
-		spin_unlock(&log->l_icloglock);
-	}
-	return 0;
 
-    } while (iclog != log->l_iclog);
+		return 0;
+	} while (iclog != log->l_iclog);
 
-    spin_unlock(&log->l_icloglock);
-    return 0;
-}	/* xlog_state_sync */
+	spin_unlock(&log->l_icloglock);
+	return 0;
+}
+
+/*
+ * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care
+ * about errors or whether the log was flushed or not. This is the normal
+ * interface to use when trying to unpin items or move the log forward.
+ */
+void
+xfs_log_force_lsn(
+	xfs_mount_t	*mp,
+	xfs_lsn_t	lsn,
+	uint		flags)
+{
+	int	error;
 
+	error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
+	if (error) {
+		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
+			"error %d returned.", error);
+	}
+}
 
 /*
  * Called when we want to mark the current iclog as being ready to sync to
@@ -3462,7 +3456,6 @@ xfs_log_force_umount(
 	xlog_ticket_t	*tic;
 	xlog_t		*log;
 	int		retval;
-	int		dummy;
 
 	log = mp->m_log;
 
@@ -3536,13 +3529,14 @@ xfs_log_force_umount(
 	}
 	spin_unlock(&log->l_grant_lock);
 
-	if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
+	if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
 		ASSERT(!logerror);
 		/*
 		 * Force the incore logs to disk before shutting the
 		 * log down completely.
 		 */
-		xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy);
+		_xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+
 		spin_lock(&log->l_icloglock);
 		retval = xlog_state_ioerror(log);
 		spin_unlock(&log->l_icloglock);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 811ccf4d8b3e..7074be9d13e9 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -70,14 +70,8 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
  * Flags to xfs_log_force()
  *
  *	XFS_LOG_SYNC:	Synchronous force in-core log to disk
- *	XFS_LOG_FORCE:	Start in-core log write now.
- *	XFS_LOG_URGE:	Start write within some window of time.
- *
- * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
  */
 #define XFS_LOG_SYNC		0x1
-#define XFS_LOG_FORCE		0x2
-#define XFS_LOG_URGE		0x4
 
 #endif	/* __KERNEL__ */
 
@@ -138,12 +132,17 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 		       void		**iclog,
 		       uint		flags);
 int	  _xfs_log_force(struct xfs_mount *mp,
-			 xfs_lsn_t	lsn,
 			 uint		flags,
 			 int		*log_forced);
 void	  xfs_log_force(struct xfs_mount	*mp,
-			xfs_lsn_t		lsn,
 			uint			flags);
+int	  _xfs_log_force_lsn(struct xfs_mount *mp,
+			     xfs_lsn_t		lsn,
+			     uint		flags,
+			     int		*log_forced);
+void	  xfs_log_force_lsn(struct xfs_mount	*mp,
+			    xfs_lsn_t		lsn,
+			    uint		flags);
 int	  xfs_log_mount(struct xfs_mount	*mp,
 			struct xfs_buftarg	*log_target,
 			xfs_daddr_t		start_block,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 97148f0c4bdd..22e6efdc17ea 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3913,8 +3913,7 @@ xlog_recover_finish(
 		 * case the unlink transactions would have problems
 		 * pushing the EFIs out of the way.
 		 */
-		xfs_log_force(log->l_mp, (xfs_lsn_t)0,
-			      (XFS_LOG_FORCE | XFS_LOG_SYNC));
+		xfs_log_force(log->l_mp, XFS_LOG_SYNC);
 
 		xlog_recover_process_iunlinks(log);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb0154047e85..7f81ed72c875 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1455,7 +1455,7 @@ xfs_unmountfs(
 	 * push out the iclog we will never get that unlocked. hence we
 	 * need to force the log first.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
 
 	xfs_qm_unmount(mp);
@@ -1465,7 +1465,7 @@ xfs_unmountfs(
 	 * that nothing is pinned.  This is important because bflush()
 	 * will skip pinned buffers.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 
 	xfs_binval(mp->m_ddev_targp);
 	if (mp->m_rtdev_targp) {
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 7dbe3c3051db..be942d4e3324 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -981,9 +981,8 @@ shut_us_down:
 	 */
 	if (sync) {
 		if (!error) {
-			error = _xfs_log_force(mp, commit_lsn,
-				      XFS_LOG_FORCE | XFS_LOG_SYNC,
-				      log_flushed);
+			error = _xfs_log_force_lsn(mp, commit_lsn,
+				      XFS_LOG_SYNC, log_flushed);
 		}
 		XFS_STATS_INC(xs_trans_sync);
 	} else {
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 063dfbdca94b..d7b1af8a832d 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -371,7 +371,7 @@ xfsaild_push(
 		 * move forward in the AIL.
 		 */
 		XFS_STATS_INC(xs_push_ail_flush);
-		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		xfs_log_force(mp, 0);
 	}
 
 	if (!count) {
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 4da96cdffb76..fd108b738559 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -631,9 +631,8 @@ xfs_fsync(
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 		if (xfs_ipincount(ip)) {
-			error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
-				      XFS_LOG_FORCE | XFS_LOG_SYNC,
-				      &log_flushed);
+			error = _xfs_log_force(ip->i_mount, XFS_LOG_SYNC,
+					       &log_flushed);
 		} else {
 			/*
 			 * If the inode is not pinned and nothing has changed
-- 
cgit v1.2.3


From bdfb04301fa5fdd95f219539a9a5b9663b1e5fc2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 20 Jan 2010 21:55:30 +0000
Subject: xfs: replace KM_LARGE with explicit vmalloc use

We use the KM_LARGE flag to make kmem_alloc and friends use vmalloc
if necessary.  As we only need this for a few boot/mount time
allocations just switch to explicit vmalloc calls there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/kmem.c    | 56 +++++++++++++++++-----------------------------
 fs/xfs/linux-2.6/kmem.h    | 21 ++++++++++++++---
 fs/xfs/linux-2.6/xfs_buf.c |  6 ++---
 fs/xfs/quota/xfs_qm.c      | 26 ++++++++++++++++-----
 fs/xfs/xfs_itable.c        |  8 ++++---
 5 files changed, 66 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 2d3f90afe5f1..bc7405585def 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -16,7 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include <linux/mm.h>
-#include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
@@ -24,8 +23,25 @@
 #include "time.h"
 #include "kmem.h"
 
-#define MAX_VMALLOCS	6
-#define MAX_SLAB_SIZE	0x20000
+/*
+ * Greedy allocation.  May fail and may return vmalloced memory.
+ *
+ * Must be freed using kmem_free_large.
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+	void		*ptr;
+	size_t		kmsize = maxsize;
+
+	while (!(ptr = kmem_zalloc_large(kmsize))) {
+		if ((kmsize >>= 1) <= minsize)
+			kmsize = minsize;
+	}
+	if (ptr)
+		*size = kmsize;
+	return ptr;
+}
 
 void *
 kmem_alloc(size_t size, unsigned int __nocast flags)
@@ -34,19 +50,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
-#ifdef DEBUG
-	if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
-		printk(KERN_WARNING "Large %s attempt, size=%ld\n",
-			__func__, (long)size);
-		dump_stack();
-	}
-#endif
-
 	do {
-		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
-			ptr = kmalloc(size, lflags);
-		else
-			ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+		ptr = kmalloc(size, lflags);
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
@@ -68,27 +73,6 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
 	return ptr;
 }
 
-void *
-kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
-		   unsigned int __nocast flags)
-{
-	void		*ptr;
-	size_t		kmsize = maxsize;
-	unsigned int	kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
-
-	while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
-		if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
-			break;
-		if ((kmsize >>= 1) <= minsize) {
-			kmsize = minsize;
-			kmflags = flags;
-		}
-	}
-	if (ptr)
-		*size = kmsize;
-	return ptr;
-}
-
 void
 kmem_free(const void *ptr)
 {
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 179cbd630f69..f7c8f7a9ea6d 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 /*
  * General memory allocation interfaces
@@ -30,7 +31,6 @@
 #define KM_NOSLEEP	0x0002u
 #define KM_NOFS		0x0004u
 #define KM_MAYFAIL	0x0008u
-#define KM_LARGE	0x0010u
 
 /*
  * We use a special process flag to avoid recursive callbacks into
@@ -42,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE));
+	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
 
 	if (flags & KM_NOSLEEP) {
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -56,10 +56,25 @@ kmem_flags_convert(unsigned int __nocast flags)
 
 extern void *kmem_alloc(size_t, unsigned int __nocast);
 extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
 extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
 extern void  kmem_free(const void *);
 
+static inline void *kmem_zalloc_large(size_t size)
+{
+	void *ptr;
+
+	ptr = vmalloc(size);
+	if (ptr)
+		memset(ptr, 0, size);
+	return ptr;
+}
+static inline void kmem_free_large(void *ptr)
+{
+	vfree(ptr);
+}
+
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+
 /*
  * Zone interfaces
  */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 730eff1e71a3..44e20e578ba0 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1525,8 +1525,8 @@ xfs_alloc_bufhash(
 
 	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
 	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-					sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
+	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
 		spin_lock_init(&btp->bt_hash[i].bh_lock);
 		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1537,7 +1537,7 @@ STATIC void
 xfs_free_bufhash(
 	xfs_buftarg_t		*btp)
 {
-	kmem_free(btp->bt_hash);
+	kmem_free_large(btp->bt_hash);
 	btp->bt_hash = NULL;
 }
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 9e627a8b5b0e..11cfd8245c7c 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -118,9 +118,14 @@ xfs_Gqm_init(void)
 	 */
 	udqhash = kmem_zalloc_greedy(&hsize,
 				     XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
-				     XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t),
-				     KM_SLEEP | KM_MAYFAIL | KM_LARGE);
-	gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE);
+				     XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
+	if (!udqhash)
+		goto out;
+
+	gdqhash = kmem_zalloc_large(hsize);
+	if (!udqhash)
+		goto out_free_udqhash;
+
 	hsize /= sizeof(xfs_dqhash_t);
 	ndquot = hsize << 8;
 
@@ -170,6 +175,11 @@ xfs_Gqm_init(void)
 	mutex_init(&qcheck_lock);
 #endif
 	return xqm;
+
+ out_free_udqhash:
+	kmem_free_large(udqhash);
+ out:
+	return NULL;
 }
 
 /*
@@ -189,8 +199,8 @@ xfs_qm_destroy(
 		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
 		xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
 	}
-	kmem_free(xqm->qm_usr_dqhtable);
-	kmem_free(xqm->qm_grp_dqhtable);
+	kmem_free_large(xqm->qm_usr_dqhtable);
+	kmem_free_large(xqm->qm_grp_dqhtable);
 	xqm->qm_usr_dqhtable = NULL;
 	xqm->qm_grp_dqhtable = NULL;
 	xqm->qm_dqhashmask = 0;
@@ -219,8 +229,12 @@ xfs_qm_hold_quotafs_ref(
 	 */
 	mutex_lock(&xfs_Gqm_lock);
 
-	if (xfs_Gqm == NULL)
+	if (!xfs_Gqm) {
 		xfs_Gqm = xfs_Gqm_init();
+		if (!xfs_Gqm)
+			return ENOMEM;
+	}
+
 	/*
 	 * We can keep a list of all filesystems with quotas mounted for
 	 * debugging and statistical purposes, but ...
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 940307a6a60b..3af02314c605 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -408,8 +408,10 @@ xfs_bulkstat(
 		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
 	nimask = ~(nicluster - 1);
 	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
-	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4,
-				   KM_SLEEP | KM_MAYFAIL | KM_LARGE);
+	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
+	if (!irbuf)
+		return ENOMEM;
+
 	nirbuf = irbsize / sizeof(*irbuf);
 
 	/*
@@ -727,7 +729,7 @@ xfs_bulkstat(
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
-	kmem_free(irbuf);
+	kmem_free_large(irbuf);
 	*ubcountp = ubelem;
 	/*
 	 * Found some inodes, return them now and return the error next time.
-- 
cgit v1.2.3


From 9b00f30762fe9f914eb6e03057a616ed63a4e8ca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 21 Jan 2010 11:17:20 +0000
Subject: xfs: quota limit statvfs available blocks

A "df" run on an NFS client of an exported XFS file system reports
the wrong information for "available" blocks.  When a block quota is
enforced, the amount reported as free is limited by the quota, but
the amount reported available is not (and should be).

Reported-by: Guk-Bong, Kwon <gbkwon@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_qm_bhv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index a5346630dfae..97b410c12794 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -59,7 +59,7 @@ xfs_fill_statvfs_from_dquot(
 		be64_to_cpu(dp->d_blk_hardlimit);
 	if (limit && statp->f_blocks > limit) {
 		statp->f_blocks = limit;
-		statp->f_bfree =
+		statp->f_bfree = statp->f_bavail =
 			(statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
 			 (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
 	}
-- 
cgit v1.2.3


From 19f5fb7ad679bb361222c7916086435020c37cce Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 24 Jan 2010 14:34:07 -0500
Subject: ext4: Use bitops to read/modify EXT4_I(inode)->i_state

At several places we modify EXT4_I(inode)->i_state without holding
i_mutex (ext4_release_file, ext4_bmap, ext4_journalled_writepage,
ext4_do_update_inode, ...). These modifications are racy and we can
lose updates to i_state. So convert handling of i_state to use bitops
which are atomic.

Cc: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 41 +++++++++++++++++++++++++++++------------
 fs/ext4/extents.c |  8 ++++----
 fs/ext4/file.c    |  4 ++--
 fs/ext4/ialloc.c  |  3 ++-
 fs/ext4/inode.c   | 38 ++++++++++++++++++++------------------
 fs/ext4/migrate.c |  6 +++---
 fs/ext4/xattr.c   | 22 +++++++++++-----------
 7 files changed, 71 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 307ecd13a762..ffe1334c891e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -313,17 +313,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 		return flags & EXT4_OTHER_FLMASK;
 }
 
-/*
- * Inode dynamic state flags
- */
-#define EXT4_STATE_JDATA		0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW			0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR		0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN	0x00000040 /* need convert on dio done*/
-
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
 	__u32 group;		/* Group number for this data */
@@ -631,7 +620,7 @@ struct ext4_inode_info {
 	 * near to their parent directory's inode.
 	 */
 	ext4_group_t	i_block_group;
-	__u32	i_state;		/* Dynamic state flags for ext4 */
+	unsigned long	i_state_flags;		/* Dynamic state flags */
 
 	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -1051,6 +1040,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+	EXT4_STATE_JDATA,		/* journaled data exists */
+	EXT4_STATE_NEW,			/* inode is newly created */
+	EXT4_STATE_XATTR,		/* has in-inode xattrs */
+	EXT4_STATE_NO_EXPAND,		/* No space for expansion */
+	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
+	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
+	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+};
+
+static inline int ext4_test_inode_state(struct inode *inode, int bit)
+{
+	return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_set_inode_state(struct inode *inode, int bit)
+{
+	set_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+{
+	clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c56877972b0e..54616157c0f3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3076,7 +3076,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		if (io)
 			io->flag = DIO_AIO_UNWRITTEN;
 		else
-			EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		goto out;
 	}
 	/* async DIO end_io complete, convert the filled extent to written */
@@ -3362,8 +3362,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			if (io)
 				io->flag = DIO_AIO_UNWRITTEN;
 			else
-				EXT4_I(inode)->i_state |=
-					EXT4_STATE_DIO_UNWRITTEN;;
+				ext4_set_inode_state(inode,
+						     EXT4_STATE_DIO_UNWRITTEN);
 		}
 	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
@@ -3739,7 +3739,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 	int error = 0;
 
 	/* in-inode? */
-	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 		struct ext4_iloc iloc;
 		int offset;	/* offset of xattr in inode */
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..f6071ce0b155 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -35,9 +35,9 @@
  */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
-	if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
 		ext4_alloc_da_blocks(inode);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ead4f6c..2fab5adae1e2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1029,7 +1029,8 @@ got:
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state = EXT4_STATE_NEW;
+	ei->i_state_flags = 0;
+	ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1a3d7b232cd7..3e530119d7f0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1307,7 +1307,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 			 * i_data's format changing.  Force the migrate
 			 * to fail by clearing migrate flags
 			 */
-			EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 		}
 
 		/*
@@ -1794,7 +1794,7 @@ static int ext4_journalled_write_end(struct file *file,
 	new_i_size = pos + copied;
 	if (new_i_size > inode->i_size)
 		i_size_write(inode, pos+copied);
-	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	if (new_i_size > EXT4_I(inode)->i_disksize) {
 		ext4_update_i_disksize(inode, new_i_size);
 		ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2632,7 +2632,7 @@ static int __ext4_journalled_writepage(struct page *page,
 		ret = err;
 
 	walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
-	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
 	return ret;
 }
@@ -3303,7 +3303,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		filemap_write_and_wait(mapping);
 	}
 
-	if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+	if (EXT4_JOURNAL(inode) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
 		 * bmap on dirty files is expected to be extremely rare:
@@ -3322,7 +3323,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		 * everything they get.
 		 */
 
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
 		journal = EXT4_JOURNAL(inode);
 		jbd2_journal_lock_updates(journal);
 		err = jbd2_journal_flush(journal);
@@ -3790,8 +3791,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
 			ext4_free_io_end(iocb->private);
 			iocb->private = NULL;
-		} else if (ret > 0 && (EXT4_I(inode)->i_state &
-				       EXT4_STATE_DIO_UNWRITTEN)) {
+		} else if (ret > 0 && ext4_test_inode_state(inode,
+						EXT4_STATE_DIO_UNWRITTEN)) {
 			int err;
 			/*
 			 * for non AIO case, since the IO is already
@@ -3801,7 +3802,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 							     offset, ret);
 			if (err < 0)
 				ret = err;
-			EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+			ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		}
 		return ret;
 	}
@@ -4457,7 +4458,7 @@ void ext4_truncate(struct inode *inode)
 		return;
 
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
-		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		ext4_ext_truncate(inode);
@@ -4743,7 +4744,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 {
 	/* We have all inode data except xattrs in memory here. */
 	return __ext4_get_inode_loc(inode, iloc,
-		!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+		!ext4_test_inode_state(inode, EXT4_STATE_XATTR));
 }
 
 void ext4_set_inode_flags(struct inode *inode)
@@ -4837,7 +4838,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
 
-	ei->i_state = 0;
+	ei->i_state_flags = 0;
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
@@ -4920,7 +4921,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 					EXT4_GOOD_OLD_INODE_SIZE +
 					ei->i_extra_isize;
 			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
-				ei->i_state |= EXT4_STATE_XATTR;
+				ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 		}
 	} else
 		ei->i_extra_isize = 0;
@@ -5060,7 +5061,7 @@ static int ext4_do_update_inode(handle_t *handle,
 
 	/* For fields not not tracking in the in-memory inode,
 	 * initialise them to zero for new inodes. */
-	if (ei->i_state & EXT4_STATE_NEW)
+	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
 
 	ext4_get_inode_flags(ei);
@@ -5156,7 +5157,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	rc = ext4_handle_dirty_metadata(handle, inode, bh);
 	if (!err)
 		err = rc;
-	ei->i_state &= ~EXT4_STATE_NEW;
+	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 
 	ext4_update_inode_fsync_trans(handle, inode, 0);
 out_brelse:
@@ -5580,8 +5581,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
 	entry = IFIRST(header);
 
 	/* No extended attributes present */
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
-		header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
 		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
 			new_extra_isize);
 		EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5625,7 +5626,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (ext4_handle_valid(handle) &&
 	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
-	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+	    !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
 		/*
 		 * We need extra buffer credits since we may write into EA block
 		 * with this same handle. If journal_extend fails, then it will
@@ -5639,7 +5640,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 						      sbi->s_want_extra_isize,
 						      iloc, handle);
 			if (ret) {
-				EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+				ext4_set_inode_state(inode,
+						     EXT4_STATE_NO_EXPAND);
 				if (mnt_count !=
 					le16_to_cpu(sbi->s_es->s_mnt_count)) {
 					ext4_warning(inode->i_sb, __func__,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 81415814b00b..46a4101e0aec 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 	 * happened after we started the migrate. We need to
 	 * fail the migrate
 	 */
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
+	if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
 		retval = -EAGAIN;
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto err_out;
 	} else
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+		ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 	/*
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
@@ -533,7 +533,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 * allocation.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
+	ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	handle = ext4_journal_start(inode, 1);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f3a2f7ed45aa..c619a7ea670d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -267,7 +267,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
 	void *end;
 	int error;
 
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		return -ENODATA;
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
@@ -396,7 +396,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	void *end;
 	int error;
 
-	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
 		return 0;
 	error = ext4_get_inode_loc(inode, &iloc);
 	if (error)
@@ -908,7 +908,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
 	is->s.base = is->s.first = IFIRST(header);
 	is->s.here = is->s.first;
 	is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
-	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
 		error = ext4_xattr_check_names(IFIRST(header), is->s.end);
 		if (error)
 			return error;
@@ -940,10 +940,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 	header = IHDR(inode, ext4_raw_inode(&is->iloc));
 	if (!IS_LAST_ENTRY(s->first)) {
 		header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
-		EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
 	} else {
 		header->h_magic = cpu_to_le32(0);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
 	}
 	return 0;
 }
@@ -986,8 +986,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (strlen(name) > 255)
 		return -ERANGE;
 	down_write(&EXT4_I(inode)->xattr_sem);
-	no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
-	EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+	no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 
 	error = ext4_get_inode_loc(inode, &is.iloc);
 	if (error)
@@ -997,10 +997,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (error)
 		goto cleanup;
 
-	if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+	if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
 		struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+		ext4_clear_inode_state(inode, EXT4_STATE_NEW);
 	}
 
 	error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1052,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		ext4_xattr_update_super_block(handle, inode->i_sb);
 		inode->i_ctime = ext4_current_time(inode);
 		if (!value)
-			EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+			ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 		/*
 		 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1067,7 @@ cleanup:
 	brelse(is.iloc.bh);
 	brelse(bs.bh);
 	if (no_expand == 0)
-		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
 	up_write(&EXT4_I(inode)->xattr_sem);
 	return error;
 }
-- 
cgit v1.2.3


From f710b4b96ba292dfed2153afc47e9063b0abfd89 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 25 Jan 2010 03:31:32 -0500
Subject: ext4: Reserve INCOMPAT_EA_INODE and INCOMPAT_DIRDATA feature
 codepoints

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ffe1334c891e..61cf3b3cde4e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -284,6 +284,7 @@ struct flex_groups {
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
@@ -1144,6 +1145,8 @@ static inline void ext4_clear_inode_state(struct inode *inode, int bit)
 #define EXT4_FEATURE_INCOMPAT_64BIT		0x0080
 #define EXT4_FEATURE_INCOMPAT_MMP               0x0100
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
+#define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
+#define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
 
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
-- 
cgit v1.2.3


From cbe132a8bdcff0f9afd9060948fb50597c7400b8 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 26 Jan 2010 15:08:49 +1100
Subject: xfs: don't hold onto reserved blocks on remount,ro

If we hold onto reserved blocks when doing a remount,ro we end
up writing the blocks used count to disk that includes the reserved
blocks. Reserved blocks are not actually used, so this results in
the values in the superblock being incorrect.

Hence if we run xfs_check or xfs_repair -n while the filesystem is
mounted remount,ro we end up with an inconsistent filesystem being
reported. Also, running xfs_copy on the remount,ro filesystem will
result in an inconsistent image being generated.

To fix this, unreserve the blocks when doing the remount,ro, and
reserved them again on remount,rw. This way a remount,ro filesystem
will appear consistent on disk to all utilities.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_super.c | 28 ++++++++++++++++++++++++++++
 fs/xfs/xfs_mount.h           |  1 +
 2 files changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f2e398a5616..e9c21454b9e2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1318,6 +1318,8 @@ xfs_fs_remount(
 
 	/* ro -> rw */
 	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+		__uint64_t resblks;
+
 		mp->m_flags &= ~XFS_MOUNT_RDONLY;
 		if (mp->m_flags & XFS_MOUNT_BARRIER)
 			xfs_mountfs_check_barriers(mp);
@@ -1335,11 +1337,37 @@ xfs_fs_remount(
 			}
 			mp->m_update_flags = 0;
 		}
+
+		/*
+		 * Fill out the reserve pool if it is empty. Use the stashed
+		 * value if it is non-zero, otherwise go with the default.
+		 */
+		if (mp->m_resblks_save) {
+			resblks = mp->m_resblks_save;
+			mp->m_resblks_save = 0;
+		} else {
+			resblks = mp->m_sb.sb_dblocks;
+			do_div(resblks, 20);
+			resblks = min_t(__uint64_t, resblks, 1024);
+		}
+		xfs_reserve_blocks(mp, &resblks, NULL);
 	}
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/*
+		 * After we have synced the data but before we sync the
+		 * metadata, we need to free up the reserve block pool so that
+		 * the used block count in the superblock on disk is correct at
+		 * the end of the remount. Stash the current reserve pool size
+		 * so that if we get remounted rw, we can return it to the same
+		 * size.
+		 */
+		__uint64_t resblks = 0;
+
 		xfs_quiesce_data(mp);
+		mp->m_resblks_save = mp->m_resblks;
+		xfs_reserve_blocks(mp, &resblks, NULL);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f4d1441f3f15..02d45f213e58 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -225,6 +225,7 @@ typedef struct xfs_mount {
 	__uint64_t		m_maxioffset;	/* maximum inode offset */
 	__uint64_t		m_resblks;	/* total reserved blocks */
 	__uint64_t		m_resblks_avail;/* available reserved blocks */
+	__uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
 	int			m_dalign;	/* stripe unit */
 	int			m_swidth;	/* stripe width */
 	int			m_sinoalign;	/* stripe unit inode alignment */
-- 
cgit v1.2.3


From 388f1f0c346b533b06d8bc792f7204ebc3e4b7da Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 26 Jan 2010 15:10:15 +1100
Subject: xfs: turn off sign warnings

Because they cause warnings in static inline functions conditionally
compiled into XFS from the VFS (e.g. fsnotify).

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 19267019dacf..5c5a366aa332 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,7 +16,7 @@
 # Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #
 
-EXTRA_CFLAGS +=	 -I$(src) -I$(src)/linux-2.6 -Wpointer-sign
+EXTRA_CFLAGS +=	 -I$(src) -I$(src)/linux-2.6
 
 XFS_LINUX := linux-2.6
 
-- 
cgit v1.2.3


From d5db0f97fbbeff11c88dec1aaf1536a975afbaeb Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 5 Feb 2010 22:59:53 +0000
Subject: xfs: more reserved blocks fixups

This mangles the reserved blocks counts a little more.

1) add a helper function for the default reserved count
2) add helper functions to save/restore counts on ro/rw
3) save/restore reserved blocks on freeze/thaw
4) disallow changing reserved count while readonly

V2: changed field name to match Dave's changes

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c |  3 +++
 fs/xfs/linux-2.6/xfs_super.c | 51 ++++++++++++++++++++++++++++++++------------
 fs/xfs/xfs_mount.c           | 34 +++++++++++++++++++----------
 fs/xfs/xfs_mount.h           |  1 +
 4 files changed, 64 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 3906e85abfdc..4ea1ee18aded 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1431,6 +1431,9 @@ xfs_file_ioctl(
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
+		if (mp->m_flags & XFS_MOUNT_RDONLY)
+			return -XFS_ERROR(EROFS);
+
 		if (copy_from_user(&inout, arg, sizeof(inout)))
 			return -XFS_ERROR(EFAULT);
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index e9c21454b9e2..6ce828e0e17b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1256,6 +1256,29 @@ xfs_fs_statfs(
 	return 0;
 }
 
+STATIC void
+xfs_save_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks = 0;
+
+	mp->m_resblks_save = mp->m_resblks;
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC void
+xfs_restore_resvblks(struct xfs_mount *mp)
+{
+	__uint64_t resblks;
+
+	if (mp->m_resblks_save) {
+		resblks = mp->m_resblks_save;
+		mp->m_resblks_save = 0;
+	} else
+		resblks = xfs_default_resblks(mp);
+
+	xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
 STATIC int
 xfs_fs_remount(
 	struct super_block	*sb,
@@ -1318,8 +1341,6 @@ xfs_fs_remount(
 
 	/* ro -> rw */
 	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
-		__uint64_t resblks;
-
 		mp->m_flags &= ~XFS_MOUNT_RDONLY;
 		if (mp->m_flags & XFS_MOUNT_BARRIER)
 			xfs_mountfs_check_barriers(mp);
@@ -1342,15 +1363,7 @@ xfs_fs_remount(
 		 * Fill out the reserve pool if it is empty. Use the stashed
 		 * value if it is non-zero, otherwise go with the default.
 		 */
-		if (mp->m_resblks_save) {
-			resblks = mp->m_resblks_save;
-			mp->m_resblks_save = 0;
-		} else {
-			resblks = mp->m_sb.sb_dblocks;
-			do_div(resblks, 20);
-			resblks = min_t(__uint64_t, resblks, 1024);
-		}
-		xfs_reserve_blocks(mp, &resblks, NULL);
+		xfs_restore_resvblks(mp);
 	}
 
 	/* rw -> ro */
@@ -1363,11 +1376,9 @@ xfs_fs_remount(
 		 * so that if we get remounted rw, we can return it to the same
 		 * size.
 		 */
-		__uint64_t resblks = 0;
 
 		xfs_quiesce_data(mp);
-		mp->m_resblks_save = mp->m_resblks;
-		xfs_reserve_blocks(mp, &resblks, NULL);
+		xfs_save_resvblks(mp);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
 	}
@@ -1386,10 +1397,21 @@ xfs_fs_freeze(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
+	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
 	return -xfs_fs_log_dummy(mp);
 }
 
+STATIC int
+xfs_fs_unfreeze(
+	struct super_block	*sb)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	xfs_restore_resvblks(mp);
+	return 0;
+}
+
 STATIC int
 xfs_fs_show_options(
 	struct seq_file		*m,
@@ -1612,6 +1634,7 @@ static const struct super_operations xfs_super_operations = {
 	.put_super		= xfs_fs_put_super,
 	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
+	.unfreeze_fs		= xfs_fs_unfreeze,
 	.statfs			= xfs_fs_statfs,
 	.remount_fs		= xfs_fs_remount,
 	.show_options		= xfs_fs_show_options,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7f81ed72c875..5061149b2cc4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1091,6 +1091,22 @@ xfs_mount_reset_sbqflags(
 	return xfs_trans_commit(tp, 0);
 }
 
+__uint64_t
+xfs_default_resblks(xfs_mount_t *mp)
+{
+	__uint64_t resblks;
+
+	/*
+	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
+	 * This may drive us straight to ENOSPC on mount, but that implies
+	 * we were already there on the last unmount. Warn if this occurs.
+	 */
+	resblks = mp->m_sb.sb_dblocks;
+	do_div(resblks, 20);
+	resblks = min_t(__uint64_t, resblks, 1024);
+	return resblks;
+}
+
 /*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
@@ -1401,18 +1417,14 @@ xfs_mountfs(
 	 * when at ENOSPC. This is needed for operations like create with
 	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
 	 * are not allowed to use this reserved space.
-	 *
-	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
-	 * This may drive us straight to ENOSPC on mount, but that implies
-	 * we were already there on the last unmount. Warn if this occurs.
 	 */
-	resblks = mp->m_sb.sb_dblocks;
-	do_div(resblks, 20);
-	resblks = min_t(__uint64_t, resblks, 1024);
-	error = xfs_reserve_blocks(mp, &resblks, NULL);
-	if (error)
-		cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
-				"Continuing without a reserve pool.");
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		resblks = xfs_default_resblks(mp);
+		error = xfs_reserve_blocks(mp, &resblks, NULL);
+		if (error)
+			cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
+				"blocks. Continuing without a reserve pool.");
+	}
 
 	return 0;
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 02d45f213e58..70504fcf14cd 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -421,6 +421,7 @@ typedef struct xfs_mod_sb {
 } xfs_mod_sb_t;
 
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
+extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern int	xfs_mountfs(xfs_mount_t *mp);
 
 extern void	xfs_unmountfs(xfs_mount_t *);
-- 
cgit v1.2.3


From 777df5afdb26c71634edd60582be620ff94e87a0 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sat, 6 Feb 2010 12:37:26 +1100
Subject: xfs: Make inode reclaim states explicit

A.K.A.: don't rely on xfs_iflush() return value in reclaim

We have gradually been moving checks out of the reclaim code because
they are duplicated in xfs_iflush(). We've had a history of problems
in this area, and many of them stem from the overloading of the
return values from xfs_iflush() and interaction with inode flush
locking to determine if the inode is safe to reclaim.

With the desire to move to delayed write flushing of inodes and
non-blocking inode tree reclaim walks, the overloading of the
return value of xfs_iflush makes it very difficult to determine
the correct thing to do next.

This patch explicitly re-adds the checks to the inode reclaim code,
removing the reliance on the return value of xfs_iflush() to
determine what to do next. It also means that we can clearly
document all the inode states that reclaim must handle and hence
we can easily see that we handled all the necessary cases.

This also removes the need for the xfs_inode_clean() check in
xfs_iflush() as all callers now check this first (safely).

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_sync.c | 81 ++++++++++++++++++++++++++++++++++-----------
 fs/xfs/xfs_inode.c          | 11 +-----
 fs/xfs/xfs_inode.h          |  1 +
 3 files changed, 64 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index c9b863eacab7..525260c7617f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -706,12 +706,43 @@ __xfs_inode_clear_reclaim_tag(
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 }
 
+/*
+ * Inodes in different states need to be treated differently, and the return
+ * value of xfs_iflush is not sufficient to get this right. The following table
+ * lists the inode states and the reclaim actions necessary for non-blocking
+ * reclaim:
+ *
+ *
+ *	inode state	     iflush ret		required action
+ *      ---------------      ----------         ---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		unpin and reclaim
+ *	stale, pinned		0		unpin and reclaim
+ *	dirty, async		0		block on flush lock, reclaim
+ *	dirty, sync flush	0		block on flush lock, reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned		=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty		=> flush, wait and reclaim
+ */
 STATIC int
 xfs_reclaim_inode(
 	struct xfs_inode	*ip,
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
+	int	error;
+
 	/*
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode.  Once we have the
@@ -729,30 +760,42 @@ xfs_reclaim_inode(
 	spin_unlock(&ip->i_flags_lock);
 	write_unlock(&pag->pag_ici_lock);
 
-	/*
-	 * If the inode is still dirty, then flush it out.  If the inode
-	 * is not in the AIL, then it will be OK to flush it delwri as
-	 * long as xfs_iflush() does not keep any references to the inode.
-	 * We leave that decision up to xfs_iflush() since it has the
-	 * knowledge of whether it's OK to simply do a delwri flush of
-	 * the inode or whether we need to wait until the inode is
-	 * pulled from the AIL.
-	 * We get the flush lock regardless, though, just to make sure
-	 * we don't free it while it is being flushed.
-	 */
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_iflock(ip);
 
-	/*
-	 * In the case of a forced shutdown we rely on xfs_iflush() to
-	 * wait for the inode to be unpinned before returning an error.
-	 */
-	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
-		/* synchronize with xfs_iflush_done */
-		xfs_iflock(ip);
-		xfs_ifunlock(ip);
+	if (is_bad_inode(VFS_I(ip)))
+		goto reclaim;
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip))
+		xfs_iunpin_wait(ip);
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/* Now we have an inode that needs flushing */
+	error = xfs_iflush(ip, sync_mode);
+	if (!error) {
+		switch(sync_mode) {
+		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
+		case XFS_IFLUSH_DELWRI:
+		case XFS_IFLUSH_ASYNC:
+		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
+		case XFS_IFLUSH_SYNC:
+			/* IO issued, synchronise with IO completion */
+			xfs_iflock(ip);
+			break;
+		default:
+			ASSERT(0);
+			break;
+		}
 	}
 
+reclaim:
+	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_ireclaim(ip);
 	return 0;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d0d1b5a05183..8d0666dd170a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2493,7 +2493,7 @@ __xfs_iunpin_wait(
 		wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
 }
 
-static inline void
+void
 xfs_iunpin_wait(
 	xfs_inode_t	*ip)
 {
@@ -2848,15 +2848,6 @@ xfs_iflush(
 	iip = ip->i_itemp;
 	mp = ip->i_mount;
 
-	/*
-	 * If the inode isn't dirty, then just release the inode flush lock and
-	 * do nothing.
-	 */
-	if (xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		return 0;
-	}
-
 	/*
 	 * We can't flush the inode until it is unpinned, so wait for it if we
 	 * are allowed to block.  We know noone new can pin it, because we are
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ec1f28c4fc4f..8b618ea4d692 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -483,6 +483,7 @@ int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 void		xfs_ipin(xfs_inode_t *);
 void		xfs_iunpin(xfs_inode_t *);
+void		xfs_iunpin_wait(xfs_inode_t *);
 int		xfs_iflush(xfs_inode_t *, uint);
 void		xfs_ichgtime(xfs_inode_t *, int);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
-- 
cgit v1.2.3


From c854363e80b49dd04a4de18ebc379eb8c8806674 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sat, 6 Feb 2010 12:39:36 +1100
Subject: xfs: Use delayed write for inodes rather than async V2

We currently do background inode flush asynchronously, resulting in
inodes being written in whatever order the background writeback
issues them. Not only that, there are also blocking and non-blocking
asynchronous inode flushes, depending on where the flush comes from.

This patch completely removes asynchronous inode writeback. It
removes all the strange writeback modes and replaces them with
either a synchronous flush or a non-blocking delayed write flush.
That is, inode flushes will only issue IO directly if they are
synchronous, and background flushing may do nothing if the operation
would block (e.g. on a pinned inode or buffer lock).

Delayed write flushes will now result in the inode buffer sitting in
the delwri queue of the buffer cache to be flushed by either an AIL
push or by the xfsbufd timing out the buffer. This will allow
accumulation of dirty inode buffers in memory and allow optimisation
of inode cluster writeback at the xfsbufd level where we have much
greater queue depths than the block layer elevators. We will also
get adjacent inode cluster buffer IO merging for free when a later
patch in the series allows sorting of the delayed write buffers
before dispatch.

This effectively means that any inode that is written back by
background writeback will be seen as flush locked during AIL
pushing, and will result in the buffers being pushed from there.
This writeback path is currently non-optimal, but the next patch
in the series will fix that problem.

A side effect of this delayed write mechanism is that background
inode reclaim will no longer directly flush inodes, nor can it wait
on the flush lock. The result is that inode reclaim must leave the
inode in the reclaimable state until it is clean. Hence attempts to
reclaim a dirty inode in the background will simply skip the inode
until it is clean and this allows other mechanisms (i.e. xfsbufd) to
do more optimal writeback of the dirty buffers. As a result, the
inode reclaim code has been rewritten so that it no longer relies on
the ambiguous return values of xfs_iflush() to determine whether it
is safe to reclaim an inode.

Portions of this patch are derived from patches by Christoph
Hellwig.

Version 2:
- cleanup reclaim code as suggested by Christoph
- log background reclaim inode flush errors
- just pass sync flags to xfs_iflush

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_super.c |   4 +-
 fs/xfs/linux-2.6/xfs_sync.c  | 105 +++++++++++++++++++++++++++++++------------
 fs/xfs/xfs_inode.c           |  75 +++----------------------------
 fs/xfs/xfs_inode.h           |  10 -----
 fs/xfs/xfs_inode_item.c      |  10 +++--
 fs/xfs/xfs_mount.c           |  13 +++++-
 6 files changed, 102 insertions(+), 115 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 6ce828e0e17b..3b5b46b8e3b9 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1064,7 +1064,7 @@ xfs_fs_write_inode(
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
 		xfs_iflock(ip);
 
-		error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+		error = xfs_iflush(ip, SYNC_WAIT);
 	} else {
 		error = EAGAIN;
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
@@ -1072,7 +1072,7 @@ xfs_fs_write_inode(
 		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
 			goto out_unlock;
 
-		error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK);
+		error = xfs_iflush(ip, 0);
 	}
 
  out_unlock:
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 525260c7617f..a9f6d20aff41 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -270,8 +270,7 @@ xfs_sync_inode_attr(
 		goto out_unlock;
 	}
 
-	error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
-			   XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+	error = xfs_iflush(ip, flags);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -460,16 +459,18 @@ xfs_quiesce_fs(
 {
 	int	count = 0, pincount;
 
+	xfs_reclaim_inodes(mp, 0);
 	xfs_flush_buftarg(mp->m_ddev_targp, 0);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
 
 	/*
 	 * This loop must run at least twice.  The first instance of the loop
 	 * will flush most meta data but that will generate more meta data
 	 * (typically directory updates).  Which then must be flushed and
-	 * logged before we can write the unmount record.
+	 * logged before we can write the unmount record. We also so sync
+	 * reclaim of inodes to catch any that the above delwri flush skipped.
 	 */
 	do {
+		xfs_reclaim_inodes(mp, SYNC_WAIT);
 		xfs_sync_attr(mp, SYNC_WAIT);
 		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
 		if (!pincount) {
@@ -585,7 +586,7 @@ xfs_sync_worker(
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
@@ -719,21 +720,42 @@ __xfs_inode_clear_reclaim_tag(
  *	shutdown		EIO		unpin and reclaim
  *	clean, unpinned		0		reclaim
  *	stale, unpinned		0		reclaim
- *	clean, pinned(*)	0		unpin and reclaim
- *	stale, pinned		0		unpin and reclaim
- *	dirty, async		0		block on flush lock, reclaim
- *	dirty, sync flush	0		block on flush lock, reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, delwri ok	0		requeue
+ *	dirty, delwri blocked	EAGAIN		requeue
+ *	dirty, sync flush	0		reclaim
  *
  * (*) dgc: I don't think the clean, pinned state is possible but it gets
  * handled anyway given the order of checks implemented.
  *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background relaim, we just requeue the inode for the next pass.
+ *
  * Hence the order of actions after gaining the locks should be:
  *	bad		=> reclaim
  *	shutdown	=> unpin and reclaim
- *	pinned		=> unpin
+ *	pinned, delwri	=> requeue
+ *	pinned, sync	=> unpin
  *	stale		=> reclaim
  *	clean		=> reclaim
- *	dirty		=> flush, wait and reclaim
+ *	dirty, delwri	=> flush and requeue
+ *	dirty, sync	=> flush, wait and reclaim
  */
 STATIC int
 xfs_reclaim_inode(
@@ -741,7 +763,7 @@ xfs_reclaim_inode(
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
-	int	error;
+	int	error = 0;
 
 	/*
 	 * The radix tree lock here protects a thread in xfs_iget from racing
@@ -761,7 +783,11 @@ xfs_reclaim_inode(
 	write_unlock(&pag->pag_ici_lock);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_iflock(ip);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
 
 	if (is_bad_inode(VFS_I(ip)))
 		goto reclaim;
@@ -769,8 +795,13 @@ xfs_reclaim_inode(
 		xfs_iunpin_wait(ip);
 		goto reclaim;
 	}
-	if (xfs_ipincount(ip))
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT)) {
+			xfs_ifunlock(ip);
+			goto out;
+		}
 		xfs_iunpin_wait(ip);
+	}
 	if (xfs_iflags_test(ip, XFS_ISTALE))
 		goto reclaim;
 	if (xfs_inode_clean(ip))
@@ -778,27 +809,43 @@ xfs_reclaim_inode(
 
 	/* Now we have an inode that needs flushing */
 	error = xfs_iflush(ip, sync_mode);
-	if (!error) {
-		switch(sync_mode) {
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-		case XFS_IFLUSH_DELWRI:
-		case XFS_IFLUSH_ASYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-		case XFS_IFLUSH_SYNC:
-			/* IO issued, synchronise with IO completion */
-			xfs_iflock(ip);
-			break;
-		default:
-			ASSERT(0);
-			break;
-		}
+	if (sync_mode & SYNC_WAIT) {
+		xfs_iflock(ip);
+		goto reclaim;
 	}
 
+	/*
+	 * When we have to flush an inode but don't have SYNC_WAIT set, we
+	 * flush the inode out using a delwri buffer and wait for the next
+	 * call into reclaim to find it in a clean state instead of waiting for
+	 * it now. We also don't return errors here - if the error is transient
+	 * then the next reclaim pass will flush the inode, and if the error
+	 * is permanent then the next sync reclaim will relcaim the inode and
+	 * pass on the error.
+	 */
+	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"inode 0x%llx background reclaim flush failed with %d",
+			(long long)ip->i_ino, error);
+	}
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and xfssyncd never goes back to the idle
+	 * state. Instead, return 0 to let the next scheduled background reclaim
+	 * attempt to reclaim the inode again.
+	 */
+	return 0;
+
 reclaim:
 	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	xfs_ireclaim(ip);
-	return 0;
+	return error;
+
 }
 
 int
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8d0666dd170a..fa31360046d4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2835,8 +2835,6 @@ xfs_iflush(
 	xfs_dinode_t		*dip;
 	xfs_mount_t		*mp;
 	int			error;
-	int			noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
-	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -2859,7 +2857,7 @@ xfs_iflush(
 	 * in the same cluster are dirty, they will probably write the inode
 	 * out for us if they occur after the log force completes.
 	 */
-	if (noblock && xfs_ipincount(ip)) {
+	if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
 		xfs_iunpin_nowait(ip);
 		xfs_ifunlock(ip);
 		return EAGAIN;
@@ -2892,61 +2890,11 @@ xfs_iflush(
 		return XFS_ERROR(EIO);
 	}
 
-	/*
-	 * Decide how buffer will be flushed out.  This is done before
-	 * the call to xfs_iflush_int because this field is zeroed by it.
-	 */
-	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
-		/*
-		 * Flush out the inode buffer according to the directions
-		 * of the caller.  In the cases where the caller has given
-		 * us a choice choose the non-delwri case.  This is because
-		 * the inode is in the AIL and we need to get it out soon.
-		 */
-		switch (flags) {
-		case XFS_IFLUSH_SYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-			flags = 0;
-			break;
-		case XFS_IFLUSH_ASYNC_NOBLOCK:
-		case XFS_IFLUSH_ASYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-			flags = INT_ASYNC;
-			break;
-		case XFS_IFLUSH_DELWRI:
-			flags = INT_DELWRI;
-			break;
-		default:
-			ASSERT(0);
-			flags = 0;
-			break;
-		}
-	} else {
-		switch (flags) {
-		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
-		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
-		case XFS_IFLUSH_DELWRI:
-			flags = INT_DELWRI;
-			break;
-		case XFS_IFLUSH_ASYNC_NOBLOCK:
-		case XFS_IFLUSH_ASYNC:
-			flags = INT_ASYNC;
-			break;
-		case XFS_IFLUSH_SYNC:
-			flags = 0;
-			break;
-		default:
-			ASSERT(0);
-			flags = 0;
-			break;
-		}
-	}
-
 	/*
 	 * Get the buffer containing the on-disk inode.
 	 */
 	error = xfs_itobp(mp, NULL, ip, &dip, &bp,
-				noblock ? XBF_TRYLOCK : XBF_LOCK);
+				(flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK);
 	if (error || !bp) {
 		xfs_ifunlock(ip);
 		return error;
@@ -2974,13 +2922,10 @@ xfs_iflush(
 	if (error)
 		goto cluster_corrupt_out;
 
-	if (flags & INT_DELWRI) {
-		xfs_bdwrite(mp, bp);
-	} else if (flags & INT_ASYNC) {
-		error = xfs_bawrite(mp, bp);
-	} else {
+	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	}
+	else
+		xfs_bdwrite(mp, bp);
 	return error;
 
 corrupt_out:
@@ -3015,16 +2960,6 @@ xfs_iflush_int(
 	iip = ip->i_itemp;
 	mp = ip->i_mount;
 
-
-	/*
-	 * If the inode isn't dirty, then just release the inode
-	 * flush lock and do nothing.
-	 */
-	if (xfs_inode_clean(ip)) {
-		xfs_ifunlock(ip);
-		return 0;
-	}
-
 	/* set *dip = inode's place in the buffer */
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8b618ea4d692..6c912b027596 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -419,16 +419,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 #define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
 #define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
 
-/*
- * Flags for xfs_iflush()
- */
-#define	XFS_IFLUSH_DELWRI_ELSE_SYNC	1
-#define	XFS_IFLUSH_DELWRI_ELSE_ASYNC	2
-#define	XFS_IFLUSH_SYNC			3
-#define	XFS_IFLUSH_ASYNC		4
-#define	XFS_IFLUSH_DELWRI		5
-#define	XFS_IFLUSH_ASYNC_NOBLOCK	6
-
 /*
  * Flags for xfs_itruncate_start().
  */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 48ec1c0b23ce..207553e82954 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -866,10 +866,14 @@ xfs_inode_item_push(
 	       iip->ili_format.ilf_fields != 0);
 
 	/*
-	 * Write out the inode.  The completion routine ('iflush_done') will
-	 * pull it from the AIL, mark it clean, unlock the flush lock.
+	 * Push the inode to it's backing buffer. This will not remove the
+	 * inode from the AIL - a further push will be required to trigger a
+	 * buffer push. However, this allows all the dirty inodes to be pushed
+	 * to the buffer before it is pushed to disk. THe buffer IO completion
+	 * will pull th einode from the AIL, mark it clean and unlock the flush
+	 * lock.
 	 */
-	(void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
+	(void) xfs_iflush(ip, 0);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	return;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5061149b2cc4..6afaaeb2950a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1468,7 +1468,18 @@ xfs_unmountfs(
 	 * need to force the log first.
 	 */
 	xfs_log_force(mp, XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
+
+	/*
+	 * Do a delwri reclaim pass first so that as many dirty inodes are
+	 * queued up for IO as possible. Then flush the buffers before making
+	 * a synchronous path to catch all the remaining inodes are reclaimed.
+	 * This makes the reclaim process as quick as possible by avoiding
+	 * synchronous writeout and blocking on inodes already in the delwri
+	 * state as much as possible.
+	 */
+	xfs_reclaim_inodes(mp, 0);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
 
-- 
cgit v1.2.3


From d808f617ad00a413585b806de340feda5ad9a2da Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 2 Feb 2010 10:13:42 +1100
Subject: xfs: Don't issue buffer IO direct from AIL push V2

All buffers logged into the AIL are marked as delayed write.
When the AIL needs to push the buffer out, it issues an async write of the
buffer. This means that IO patterns are dependent on the order of
buffers in the AIL.

Instead of flushing the buffer, promote the buffer in the delayed
write list so that the next time the xfsbufd is run the buffer will
be flushed by the xfsbufd. Return the state to the xfsaild that the
buffer was promoted so that the xfsaild knows that it needs to cause
the xfsbufd to run to flush the buffers that were promoted.

Using the xfsbufd for issuing the IO allows us to dispatch all
buffer IO from the one queue. This means that we can make much more
enlightened decisions on what order to flush buffers to disk as
we don't have multiple places issuing IO. Optimisations to xfsbufd
will be in a future patch.

Version 2
- kill XFS_ITEM_FLUSHING as it is now unused.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_buf.c    | 29 +++++++++++++
 fs/xfs/linux-2.6/xfs_buf.h    |  2 +
 fs/xfs/linux-2.6/xfs_trace.h  |  1 +
 fs/xfs/quota/xfs_dquot_item.c | 85 ++++++-------------------------------
 fs/xfs/quota/xfs_dquot_item.h |  4 --
 fs/xfs/xfs_buf_item.c         | 64 +++++++++++++++-------------
 fs/xfs/xfs_inode_item.c       | 98 +++++++------------------------------------
 fs/xfs/xfs_inode_item.h       |  6 ---
 fs/xfs/xfs_trans.h            |  3 +-
 fs/xfs/xfs_trans_ail.c        | 13 +++---
 10 files changed, 102 insertions(+), 203 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 44e20e578ba0..b306265caa33 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1778,6 +1778,35 @@ xfs_buf_delwri_dequeue(
 	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
 }
 
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+	struct xfs_buf	*bp)
+{
+	struct xfs_buftarg *btp = bp->b_target;
+	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+	ASSERT(bp->b_flags & XBF_DELWRI);
+	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+	/*
+	 * Check the buffer age before locking the delayed write queue as we
+	 * don't need to promote buffers that are already past the flush age.
+	 */
+	if (bp->b_queuetime < jiffies - age)
+		return;
+	bp->b_queuetime = jiffies - age;
+	spin_lock(&btp->bt_delwrite_lock);
+	list_move(&bp->b_list, &btp->bt_delwrite_queue);
+	spin_unlock(&btp->bt_delwrite_lock);
+}
+
 STATIC void
 xfs_buf_runall_queues(
 	struct workqueue_struct	*queue)
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index ea8c198f0c39..be45e8c5768d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -266,6 +266,7 @@ extern int xfs_buf_ispin(xfs_buf_t *);
 
 /* Delayed Write Buffer Routines */
 extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
+extern void xfs_buf_delwri_promote(xfs_buf_t *);
 
 /* Buffer Daemon Setup Routines */
 extern int xfs_buf_init(void);
@@ -395,6 +396,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
+
 #ifdef CONFIG_KDB_MODULES
 extern struct list_head *xfs_get_buftarg_list(void);
 #endif
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 1bb09e70b2eb..a4574dcf5065 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -483,6 +483,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
 DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1b564376d50c..dda0fb045c8a 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -212,18 +212,10 @@ xfs_qm_dquot_logitem_pushbuf(
 	xfs_dquot_t	*dqp;
 	xfs_mount_t	*mp;
 	xfs_buf_t	*bp;
-	uint		dopush;
 
 	dqp = qip->qli_dquot;
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
-	/*
-	 * The qli_pushbuf_flag keeps others from
-	 * trying to duplicate our effort.
-	 */
-	ASSERT(qip->qli_pushbuf_flag != 0);
-	ASSERT(qip->qli_push_owner == current_pid());
-
 	/*
 	 * If flushlock isn't locked anymore, chances are that the
 	 * inode flush completed and the inode was taken off the AIL.
@@ -231,47 +223,20 @@ xfs_qm_dquot_logitem_pushbuf(
 	 */
 	if (completion_done(&dqp->q_flush)  ||
 	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
-		qip->qli_pushbuf_flag = 0;
 		xfs_dqunlock(dqp);
 		return;
 	}
 	mp = dqp->q_mount;
 	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
 		    XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-				  !completion_done(&dqp->q_flush));
-			qip->qli_pushbuf_flag = 0;
-			xfs_dqunlock(dqp);
-
-			if (XFS_BUF_ISPINNED(bp))
-				xfs_log_force(mp, 0);
-
-			if (dopush) {
-				int	error;
-#ifdef XFSRACEDEBUG
-				delay_for_intr();
-				delay(300);
-#endif
-				error = xfs_bawrite(mp, bp);
-				if (error)
-					xfs_fs_cmn_err(CE_WARN, mp,
-	"xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
-							error, qip, bp);
-			} else {
-				xfs_buf_relse(bp);
-			}
-		} else {
-			qip->qli_pushbuf_flag = 0;
-			xfs_dqunlock(dqp);
-			xfs_buf_relse(bp);
-		}
+	xfs_dqunlock(dqp);
+	if (!bp)
 		return;
-	}
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
+	return;
 
-	qip->qli_pushbuf_flag = 0;
-	xfs_dqunlock(dqp);
 }
 
 /*
@@ -289,50 +254,24 @@ xfs_qm_dquot_logitem_trylock(
 	xfs_dq_logitem_t	*qip)
 {
 	xfs_dquot_t		*dqp;
-	uint			retval;
 
 	dqp = qip->qli_dquot;
 	if (atomic_read(&dqp->q_pincount) > 0)
-		return (XFS_ITEM_PINNED);
+		return XFS_ITEM_PINNED;
 
 	if (! xfs_qm_dqlock_nowait(dqp))
-		return (XFS_ITEM_LOCKED);
+		return XFS_ITEM_LOCKED;
 
-	retval = XFS_ITEM_SUCCESS;
 	if (!xfs_dqflock_nowait(dqp)) {
 		/*
-		 * The dquot is already being flushed.	It may have been
-		 * flushed delayed write, however, and we don't want to
-		 * get stuck waiting for that to complete.  So, we want to check
-		 * to see if we can lock the dquot's buffer without sleeping.
-		 * If we can and it is marked for delayed write, then we
-		 * hold it and send it out from the push routine.  We don't
-		 * want to do that now since we might sleep in the device
-		 * strategy routine.  We also don't want to grab the buffer lock
-		 * here because we'd like not to call into the buffer cache
-		 * while holding the AIL lock.
-		 * Make sure to only return PUSHBUF if we set pushbuf_flag
-		 * ourselves.  If someone else is doing it then we don't
-		 * want to go to the push routine and duplicate their efforts.
+		 * dquot has already been flushed to the backing buffer,
+		 * leave it locked, pushbuf routine will unlock it.
 		 */
-		if (qip->qli_pushbuf_flag == 0) {
-			qip->qli_pushbuf_flag = 1;
-			ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
-#ifdef DEBUG
-			qip->qli_push_owner = current_pid();
-#endif
-			/*
-			 * The dquot is left locked.
-			 */
-			retval = XFS_ITEM_PUSHBUF;
-		} else {
-			retval = XFS_ITEM_FLUSHING;
-			xfs_dqunlock_nonotify(dqp);
-		}
+		return XFS_ITEM_PUSHBUF;
 	}
 
 	ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
-	return (retval);
+	return XFS_ITEM_SUCCESS;
 }
 
 
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
index 5a632531f843..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem {
 	xfs_log_item_t		 qli_item;	   /* common portion */
 	struct xfs_dquot	*qli_dquot;	   /* dquot ptr */
 	xfs_lsn_t		 qli_flush_lsn;	   /* lsn at last flush */
-	unsigned short		 qli_pushbuf_flag; /* 1 bit used in push_ail */
-#ifdef DEBUG
-	uint64_t		 qli_push_owner;
-#endif
 	xfs_dq_logformat_t	 qli_format;	   /* logged structure */
 } xfs_dq_logitem_t;
 
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index e0a11583ce5a..f3c49e69eab9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -467,8 +467,10 @@ xfs_buf_item_unpin_remove(
 /*
  * This is called to attempt to lock the buffer associated with this
  * buf log item.  Don't sleep on the buffer lock.  If we can't get
- * the lock right away, return 0.  If we can get the lock, pull the
- * buffer from the free list, mark it busy, and return 1.
+ * the lock right away, return 0.  If we can get the lock, take a
+ * reference to the buffer. If this is a delayed write buffer that
+ * needs AIL help to be written back, invoke the pushbuf routine
+ * rather than the normal success path.
  */
 STATIC uint
 xfs_buf_item_trylock(
@@ -477,24 +479,18 @@ xfs_buf_item_trylock(
 	xfs_buf_t	*bp;
 
 	bp = bip->bli_buf;
-
-	if (XFS_BUF_ISPINNED(bp)) {
+	if (XFS_BUF_ISPINNED(bp))
 		return XFS_ITEM_PINNED;
-	}
-
-	if (!XFS_BUF_CPSEMA(bp)) {
+	if (!XFS_BUF_CPSEMA(bp))
 		return XFS_ITEM_LOCKED;
-	}
 
-	/*
-	 * Remove the buffer from the free list.  Only do this
-	 * if it's on the free list.  Private buffers like the
-	 * superblock buffer are not.
-	 */
+	/* take a reference to the buffer.  */
 	XFS_BUF_HOLD(bp);
 
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
 	trace_xfs_buf_item_trylock(bip);
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		return XFS_ITEM_PUSHBUF;
 	return XFS_ITEM_SUCCESS;
 }
 
@@ -626,11 +622,9 @@ xfs_buf_item_committed(
 }
 
 /*
- * This is called to asynchronously write the buffer associated with this
- * buf log item out to disk. The buffer will already have been locked by
- * a successful call to xfs_buf_item_trylock().  If the buffer still has
- * B_DELWRI set, then get it going out to disk with a call to bawrite().
- * If not, then just release the buffer.
+ * The buffer is locked, but is not a delayed write buffer. This happens
+ * if we race with IO completion and hence we don't want to try to write it
+ * again. Just release the buffer.
  */
 STATIC void
 xfs_buf_item_push(
@@ -642,17 +636,29 @@ xfs_buf_item_push(
 	trace_xfs_buf_item_push(bip);
 
 	bp = bip->bli_buf;
+	ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+	xfs_buf_relse(bp);
+}
 
-	if (XFS_BUF_ISDELAYWRITE(bp)) {
-		int	error;
-		error = xfs_bawrite(bip->bli_item.li_mountp, bp);
-		if (error)
-			xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
-			"xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
-					error, bip, bp);
-	} else {
-		xfs_buf_relse(bp);
-	}
+/*
+ * The buffer is locked and is a delayed write buffer. Promote the buffer
+ * in the delayed write queue as the caller knows that they must invoke
+ * the xfsbufd to get this buffer written. We have to unlock the buffer
+ * to allow the xfsbufd to write it, too.
+ */
+STATIC void
+xfs_buf_item_pushbuf(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	trace_xfs_buf_item_pushbuf(bip);
+
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_ISDELAYWRITE(bp));
+	xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
 }
 
 /* ARGSUSED */
@@ -677,7 +683,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_buf_item_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_buf_item_push,
-	.iop_pushbuf	= NULL,
+	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_buf_item_committing
 };
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 207553e82954..d4dc063111f8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -602,33 +602,20 @@ xfs_inode_item_trylock(
 
 	if (!xfs_iflock_nowait(ip)) {
 		/*
-		 * If someone else isn't already trying to push the inode
-		 * buffer, we get to do it.
+		 * inode has already been flushed to the backing buffer,
+		 * leave it locked in shared mode, pushbuf routine will
+		 * unlock it.
 		 */
-		if (iip->ili_pushbuf_flag == 0) {
-			iip->ili_pushbuf_flag = 1;
-#ifdef DEBUG
-			iip->ili_push_owner = current_pid();
-#endif
-			/*
-			 * Inode is left locked in shared mode.
-			 * Pushbuf routine gets to unlock it.
-			 */
-			return XFS_ITEM_PUSHBUF;
-		} else {
-			/*
-			 * We hold the AIL lock, so we must specify the
-			 * NONOTIFY flag so that we won't double trip.
-			 */
-			xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
-			return XFS_ITEM_FLUSHING;
-		}
-		/* NOTREACHED */
+		return XFS_ITEM_PUSHBUF;
 	}
 
 	/* Stale items should force out the iclog */
 	if (ip->i_flags & XFS_ISTALE) {
 		xfs_ifunlock(ip);
+		/*
+		 * we hold the AIL lock - notify the unlock routine of this
+		 * so it doesn't try to get the lock again.
+		 */
 		xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
 		return XFS_ITEM_PINNED;
 	}
@@ -746,11 +733,8 @@ xfs_inode_item_committed(
  * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
  * failed to get the inode flush lock but did get the inode locked SHARED.
  * Here we're trying to see if the inode buffer is incore, and if so whether it's
- * marked delayed write. If that's the case, we'll initiate a bawrite on that
- * buffer to expedite the process.
- *
- * We aren't holding the AIL lock (or the flush lock) when this gets called,
- * so it is inherently race-y.
+ * marked delayed write. If that's the case, we'll promote it and that will
+ * allow the caller to write the buffer by triggering the xfsbufd to run.
  */
 STATIC void
 xfs_inode_item_pushbuf(
@@ -759,26 +743,16 @@ xfs_inode_item_pushbuf(
 	xfs_inode_t	*ip;
 	xfs_mount_t	*mp;
 	xfs_buf_t	*bp;
-	uint		dopush;
 
 	ip = iip->ili_inode;
-
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
 
-	/*
-	 * The ili_pushbuf_flag keeps others from
-	 * trying to duplicate our effort.
-	 */
-	ASSERT(iip->ili_pushbuf_flag != 0);
-	ASSERT(iip->ili_push_owner == current_pid());
-
 	/*
 	 * If a flush is not in progress anymore, chances are that the
 	 * inode was taken off the AIL. So, just get out.
 	 */
 	if (completion_done(&ip->i_flush) ||
 	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
-		iip->ili_pushbuf_flag = 0;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return;
 	}
@@ -787,53 +761,12 @@ xfs_inode_item_pushbuf(
 	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
 		    iip->ili_format.ilf_len, XBF_TRYLOCK);
 
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			/*
-			 * We were racing with iflush because we don't hold
-			 * the AIL lock or the flush lock. However, at this point,
-			 * we have the buffer, and we know that it's dirty.
-			 * So, it's possible that iflush raced with us, and
-			 * this item is already taken off the AIL.
-			 * If not, we can flush it async.
-			 */
-			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-				  !completion_done(&ip->i_flush));
-			iip->ili_pushbuf_flag = 0;
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-			trace_xfs_inode_item_push(bp, _RET_IP_);
-
-			if (XFS_BUF_ISPINNED(bp))
-				xfs_log_force(mp, 0);
-
-			if (dopush) {
-				int	error;
-				error = xfs_bawrite(mp, bp);
-				if (error)
-					xfs_fs_cmn_err(CE_WARN, mp,
-		"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
-							error, iip, bp);
-			} else {
-				xfs_buf_relse(bp);
-			}
-		} else {
-			iip->ili_pushbuf_flag = 0;
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			xfs_buf_relse(bp);
-		}
-		return;
-	}
-	/*
-	 * We have to be careful about resetting pushbuf flag too early (above).
-	 * Even though in theory we can do it as soon as we have the buflock,
-	 * we don't want others to be doing work needlessly. They'll come to
-	 * this function thinking that pushing the buffer is their
-	 * responsibility only to find that the buffer is still locked by
-	 * another doing the same thing
-	 */
-	iip->ili_pushbuf_flag = 0;
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	if (!bp)
+		return;
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
 	return;
 }
 
@@ -937,7 +870,6 @@ xfs_inode_item_init(
 	/*
 	   We have zeroed memory. No need ...
 	   iip->ili_extents_buf = NULL;
-	   iip->ili_pushbuf_flag = 0;
 	 */
 
 	iip->ili_format.ilf_type = XFS_LI_INODE;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index cc8df1ac7783..9a467958ecdd 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -144,12 +144,6 @@ typedef struct xfs_inode_log_item {
 						      data exts */
 	struct xfs_bmbt_rec	*ili_aextents_buf; /* array of logged
 						      attr exts */
-	unsigned int            ili_pushbuf_flag;  /* one bit used in push_ail */
-
-#ifdef DEBUG
-	uint64_t                ili_push_owner;    /* one who sets pushbuf_flag
-						      above gets to push the buf */
-#endif
 #ifdef XFS_TRANS_DEBUG
 	int			ili_root_size;
 	char			*ili_orig_root;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ca64f33c63a3..c93e3a102857 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -861,8 +861,7 @@ typedef struct xfs_item_ops {
 #define	XFS_ITEM_SUCCESS	0
 #define	XFS_ITEM_PINNED		1
 #define	XFS_ITEM_LOCKED		2
-#define	XFS_ITEM_FLUSHING	3
-#define XFS_ITEM_PUSHBUF	4
+#define XFS_ITEM_PUSHBUF	3
 
 /*
  * This structure is used to maintain a list of block ranges that have been
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d7b1af8a832d..e799824f7245 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -253,6 +253,7 @@ xfsaild_push(
 	int		flush_log, count, stuck;
 	xfs_mount_t	*mp = ailp->xa_mount;
 	struct xfs_ail_cursor	*cur = &ailp->xa_cursors;
+	int		push_xfsbufd = 0;
 
 	spin_lock(&ailp->xa_lock);
 	xfs_trans_ail_cursor_init(ailp, cur);
@@ -308,6 +309,7 @@ xfsaild_push(
 			XFS_STATS_INC(xs_push_ail_pushbuf);
 			IOP_PUSHBUF(lip);
 			last_pushed_lsn = lsn;
+			push_xfsbufd = 1;
 			break;
 
 		case XFS_ITEM_PINNED:
@@ -322,12 +324,6 @@ xfsaild_push(
 			stuck++;
 			break;
 
-		case XFS_ITEM_FLUSHING:
-			XFS_STATS_INC(xs_push_ail_flushing);
-			last_pushed_lsn = lsn;
-			stuck++;
-			break;
-
 		default:
 			ASSERT(0);
 			break;
@@ -374,6 +370,11 @@ xfsaild_push(
 		xfs_log_force(mp, 0);
 	}
 
+	if (push_xfsbufd) {
+		/* we've got delayed write buffers to flush */
+		wake_up_process(mp->m_ddev_targp->bt_task);
+	}
+
 	if (!count) {
 		/* We're past our target or empty, so idle */
 		last_pushed_lsn = 0;
-- 
cgit v1.2.3


From 089716aa1480b7197bcd678b8477774c379a2768 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 26 Jan 2010 15:13:25 +1100
Subject: xfs: Sort delayed write buffers before dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently when the xfsbufd writes delayed write buffers, it pushes
them to disk in the order they come off the delayed write list. If
there are lots of buffers ѕpread widely over the disk, this results
in overwhelming the elevator sort queues in the block layer and we
end up losing the posibility of merging adjacent buffers to minimise
the number of IOs.

Use the new generic list_sort function to sort the delwri dispatch
queue before issue to ensure that the buffers are pushed in the most
friendly order possible to the lower layers.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_buf.c | 87 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 60 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b306265caa33..4556a4c31e36 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -1877,14 +1878,42 @@ xfs_buf_delwri_split(
 
 }
 
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+	void		*priv,
+	struct list_head *a,
+	struct list_head *b)
+{
+	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
+	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
+	xfs_daddr_t		diff;
+
+	diff = ap->b_bn - bp->b_bn;
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+	xfs_buftarg_t	*target,
+	struct list_head *list)
+{
+	list_sort(NULL, list, xfs_buf_cmp);
+}
+
 STATIC int
 xfsbufd(
 	void		*data)
 {
-	struct list_head tmp;
-	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
-	int		count;
-	xfs_buf_t	*bp;
+	xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
 
 	current->flags |= PF_MEMALLOC;
 
@@ -1893,6 +1922,8 @@ xfsbufd(
 	do {
 		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
 		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+		int	count = 0;
+		struct list_head tmp;
 
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1907,11 +1938,10 @@ xfsbufd(
 		schedule_timeout_interruptible(tout);
 
 		xfs_buf_delwri_split(target, &tmp, age);
-		count = 0;
+		list_sort(NULL, &tmp, xfs_buf_cmp);
 		while (!list_empty(&tmp)) {
-			bp = list_entry(tmp.next, xfs_buf_t, b_list);
-			ASSERT(target == bp->b_target);
-
+			struct xfs_buf *bp;
+			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
 			list_del_init(&bp->b_list);
 			xfs_buf_iostrategy(bp);
 			count++;
@@ -1937,42 +1967,45 @@ xfs_flush_buftarg(
 	xfs_buftarg_t	*target,
 	int		wait)
 {
-	struct list_head tmp;
-	xfs_buf_t	*bp, *n;
+	xfs_buf_t	*bp;
 	int		pincount = 0;
+	LIST_HEAD(tmp_list);
+	LIST_HEAD(wait_list);
 
 	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
 	xfs_buf_runall_queues(xfslogd_workqueue);
 
 	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	pincount = xfs_buf_delwri_split(target, &tmp, 0);
+	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
 
 	/*
-	 * Dropped the delayed write list lock, now walk the temporary list
+	 * Dropped the delayed write list lock, now walk the temporary list.
+	 * All I/O is issued async and then if we need to wait for completion
+	 * we do that after issuing all the IO.
 	 */
-	list_for_each_entry_safe(bp, n, &tmp, b_list) {
+	list_sort(NULL, &tmp_list, xfs_buf_cmp);
+	while (!list_empty(&tmp_list)) {
+		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
 		ASSERT(target == bp->b_target);
-		if (wait)
+		list_del_init(&bp->b_list);
+		if (wait) {
 			bp->b_flags &= ~XBF_ASYNC;
-		else
-			list_del_init(&bp->b_list);
-
+			list_add(&bp->b_list, &wait_list);
+		}
 		xfs_buf_iostrategy(bp);
 	}
 
-	if (wait)
+	if (wait) {
+		/* Expedite and wait for IO to complete. */
 		blk_run_address_space(target->bt_mapping);
+		while (!list_empty(&wait_list)) {
+			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
-	/*
-	 * Remaining list items must be flushed before returning
-	 */
-	while (!list_empty(&tmp)) {
-		bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
-		list_del_init(&bp->b_list);
-		xfs_iowait(bp);
-		xfs_buf_relse(bp);
+			list_del_init(&bp->b_list);
+			xfs_iowait(bp);
+			xfs_buf_relse(bp);
+		}
 	}
 
 	return pincount;
-- 
cgit v1.2.3


From 7d6a7bde52e449f21a0e86a7a4955b4e08a49d69 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 26 Jan 2010 15:13:41 +1100
Subject: xfs: Use delay write promotion for dquot flushing

xfs_qm_dqflock_pushbuf_wait() does a very similar trick to item
pushing used to do to flush out delayed write dquot buffers. Change
it to use the new promotion method rather than an async flush.

Also, xfs_qm_dqflock_pushbuf_wait() can return without the flush lock
held, yet the callers make the assumption that after this call the
flush lock is held. Always return with the flush lock held.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/quota/xfs_dquot.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index f9baeedbfdfe..1620a56b067e 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1528,21 +1528,16 @@ xfs_qm_dqflock_pushbuf_wait(
 	 */
 	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
 		    XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK);
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			int	error;
-
-			if (XFS_BUF_ISPINNED(bp))
-				xfs_log_force(dqp->q_mount, 0);
-			error = xfs_bawrite(dqp->q_mount, bp);
-			if (error)
-				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-					"xfs_qm_dqflock_pushbuf_wait: "
-					"pushbuf error %d on dqp %p, bp %p",
-					error, dqp, bp);
-		} else {
-			xfs_buf_relse(bp);
-		}
+	if (!bp)
+		goto out_lock;
+
+	if (XFS_BUF_ISDELAYWRITE(bp)) {
+		if (XFS_BUF_ISPINNED(bp))
+			xfs_log_force(dqp->q_mount, 0);
+		xfs_buf_delwri_promote(bp);
+		wake_up_process(bp->b_target->bt_task);
 	}
+	xfs_buf_relse(bp);
+out_lock:
 	xfs_dqflock(dqp);
 }
-- 
cgit v1.2.3


From 20026d92013d7bb3abb295337191def6758fc086 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 4 Feb 2010 09:48:58 +1100
Subject: xfs: kill the unused XFS_QMOPT_* flush flags V2

dquots are never flushed asynchronously. Remove the flag and the
async write support from the flush function. Make the default flush
a delwri flush to make the inode flush code, which leaves the
XFS_QMOPT_SYNC the only flag remaining.  Convert that to use
SYNC_WAIT instead, just like the inode flush code.

V2:
- just pass flush flags straight through

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/quota/xfs_dquot.c      | 13 ++++++-------
 fs/xfs/quota/xfs_dquot_item.c |  2 +-
 fs/xfs/quota/xfs_qm.c         | 14 ++++++--------
 fs/xfs/xfs_quota.h            |  8 +-------
 4 files changed, 14 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 1620a56b067e..5f79dd78626b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1187,7 +1187,7 @@ xfs_qm_dqflush(
 	 * block, nada.
 	 */
 	if (!XFS_DQ_IS_DIRTY(dqp) ||
-	    (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
+	    (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
 		xfs_dqfunlock(dqp);
 		return 0;
 	}
@@ -1251,18 +1251,17 @@ xfs_qm_dqflush(
 		xfs_log_force(mp, 0);
 	}
 
-	if (flags & XFS_QMOPT_DELWRI) {
-		xfs_bdwrite(mp, bp);
-	} else {
+	if (flags & SYNC_WAIT)
 		error = xfs_bwrite(mp, bp);
-	}
+	else
+		xfs_bdwrite(mp, bp);
 
 	trace_xfs_dqflush_done(dqp);
 
 	/*
 	 * dqp is still locked, but caller is free to unlock it now.
 	 */
-	return (error);
+	return error;
 
 }
 
@@ -1443,7 +1442,7 @@ xfs_qm_dqpurge(
 		 * We don't care about getting disk errors here. We need
 		 * to purge this dquot anyway, so we go ahead regardless.
 		 */
-		error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+		error = xfs_qm_dqflush(dqp, SYNC_WAIT);
 		if (error)
 			xfs_fs_cmn_err(CE_WARN, mp,
 				"xfs_qm_dqpurge: dquot %p flush failed", dqp);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index dda0fb045c8a..4e4ee9a57194 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -153,7 +153,7 @@ xfs_qm_dquot_logitem_push(
 	 * lock without sleeping, then there must not have been
 	 * anyone in the process of flushing the dquot.
 	 */
-	error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+	error = xfs_qm_dqflush(dqp, 0);
 	if (error)
 		xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
 			"xfs_qm_dquot_logitem_push: push error %d on dqp %p",
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 11cfd8245c7c..8699e51cb45e 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -450,7 +450,7 @@ xfs_qm_unmount_quotas(
 STATIC int
 xfs_qm_dqflush_all(
 	xfs_mount_t	*mp,
-	int		flags)
+	int		sync_mode)
 {
 	int		recl;
 	xfs_dquot_t	*dqp;
@@ -486,7 +486,7 @@ again:
 		 * across a disk write.
 		 */
 		xfs_qm_mplist_unlock(mp);
-		error = xfs_qm_dqflush(dqp, flags);
+		error = xfs_qm_dqflush(dqp, sync_mode);
 		xfs_dqunlock(dqp);
 		if (error)
 			return error;
@@ -926,13 +926,11 @@ xfs_qm_sync(
 {
 	int		recl, restarts;
 	xfs_dquot_t	*dqp;
-	uint		flush_flags;
 	int		error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
 
-	flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
 	restarts = 0;
 
   again:
@@ -992,7 +990,7 @@ xfs_qm_sync(
 		 * across a disk write
 		 */
 		xfs_qm_mplist_unlock(mp);
-		error = xfs_qm_dqflush(dqp, flush_flags);
+		error = xfs_qm_dqflush(dqp, flags);
 		xfs_dqunlock(dqp);
 		if (error && XFS_FORCED_SHUTDOWN(mp))
 			return 0;	/* Need to prevent umount failure */
@@ -1796,7 +1794,7 @@ xfs_qm_quotacheck(
 	 * successfully.
 	 */
 	if (!error)
-		error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+		error = xfs_qm_dqflush_all(mp, 0);
 
 	/*
 	 * We can get this error if we couldn't do a dquot allocation inside
@@ -2018,7 +2016,7 @@ xfs_qm_shake_freelist(
 			 * We flush it delayed write, so don't bother
 			 * releasing the mplock.
 			 */
-			error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			error = xfs_qm_dqflush(dqp, 0);
 			if (error) {
 				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
 			"xfs_qm_dqflush_all: dquot %p flush failed", dqp);
@@ -2201,7 +2199,7 @@ xfs_qm_dqreclaim_one(void)
 			 * We flush it delayed write, so don't bother
 			 * releasing the freelist lock.
 			 */
-			error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			error = xfs_qm_dqflush(dqp, 0);
 			if (error) {
 				xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
 			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 21d11d9f48f2..fdcab3f81dde 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -222,16 +222,10 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_DELRTBCOUNT	0x0400000
 #define XFS_QMOPT_RES_INOS	0x0800000
 
-/*
- * flags for dqflush and dqflush_all.
- */
-#define XFS_QMOPT_SYNC		0x1000000
-#define XFS_QMOPT_DELWRI	0x4000000
-
 /*
  * flags for dqalloc.
  */
-#define XFS_QMOPT_INHERIT	0x8000000
+#define XFS_QMOPT_INHERIT	0x1000000
 
 /*
  * flags to xfs_trans_mod_dquot.
-- 
cgit v1.2.3


From e8b217e7530c6a073ac69f1c85b922d93fdf5647 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Feb 2010 10:16:26 +1100
Subject: xfs: remove invalid barrier optimization from xfs_fsync

We always need to flush the disk write cache and can't skip it just because
the no inode attributes have changed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_vnodeops.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index fd108b738559..43241e289800 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -597,7 +597,7 @@ xfs_fsync(
 {
 	xfs_trans_t	*tp;
 	int		error = 0;
-	int		log_flushed = 0, changed = 1;
+	int		log_flushed = 0;
 
 	xfs_itrace_entry(ip);
 
@@ -627,18 +627,10 @@ xfs_fsync(
 		 * disk yet, the inode will be still be pinned.  If it is,
 		 * force the log.
 		 */
-
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
 		if (xfs_ipincount(ip)) {
 			error = _xfs_log_force(ip->i_mount, XFS_LOG_SYNC,
 					       &log_flushed);
-		} else {
-			/*
-			 * If the inode is not pinned and nothing has changed
-			 * we don't need to flush the cache.
-			 */
-			changed = 0;
 		}
 	} else	{
 		/*
@@ -673,7 +665,7 @@ xfs_fsync(
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
-	if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
+	if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
 		/*
 		 * If the log write didn't issue an ordered tag we need
 		 * to flush the disk cache for the data device now.
-- 
cgit v1.2.3


From 07fec73625dc0db6f9aed68019918208a2ca53f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 9 Feb 2010 11:43:49 +1100
Subject: xfs: log changed inodes instead of writing them synchronously

When an inode has already be flushed delayed write,
xfs_inode_clean() returns true and hence xfs_fs_write_inode() can
return on a synchronous inode write without having written the
inode. Currently these sycnhronous writes only come sync(1),
unmount, a sycnhronous NFS export and cachefiles so should be
relatively rare and out of common performance paths.

Realistically, a synchronous inode write is not necessary here; we
can avoid writing the inode by logging any non-transactional changes
that are pending.  This needs to be done with synchronous
transactions, but it avoids seeking between the log and inode
clusters as we do now. We don't force the log if the inode is
pinned, though, so this differs from the fsync case.  For normal
sys_sync and unmount behaviour this is fine because we do a
synchronous log force in xfs_sync_data which is called from the
->sync_fs code.

It does however break the NFS synchronous export guarantees for now,
but work is under way to fix this at a higher level or for the
higher level to provide an additional flag in the writeback control
to tell us that a log force is needed.

Portions of this patch are based on work from Dave Chinner.

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 111 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 82 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 3b5b46b8e3b9..25ea2408118f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1021,12 +1021,45 @@ xfs_fs_dirty_inode(
 	XFS_I(inode)->i_update_core = 1;
 }
 
-/*
- * Attempt to flush the inode, this will actually fail
- * if the inode is pinned, but we dirty the inode again
- * at the point when it is unpinned after a log write,
- * since this is when the inode itself becomes flushable.
- */
+STATIC int
+xfs_log_inode(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+	error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		/* we need to return with the lock hold shared */
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Note - it's possible that we might have pushed ourselves out of the
+	 * way during trans_reserve which would flush the inode.  But there's
+	 * no guarantee that the inode buffer has actually gone out yet (it's
+	 * delwri).  Plus the buffer could be pinned anyway if it's part of
+	 * an inode in another recent transaction.  So we play it safe and
+	 * fire off the transaction anyway.
+	 */
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
 STATIC int
 xfs_fs_write_inode(
 	struct inode		*inode,
@@ -1034,7 +1067,7 @@ xfs_fs_write_inode(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error = 0;
+	int			error = EAGAIN;
 
 	xfs_itrace_entry(ip);
 
@@ -1045,35 +1078,55 @@ xfs_fs_write_inode(
 		error = xfs_wait_on_pages(ip, 0, -1);
 		if (error)
 			goto out;
-	}
-
-	/*
-	 * Bypass inodes which have already been cleaned by
-	 * the inode flush clustering code inside xfs_iflush
-	 */
-	if (xfs_inode_clean(ip))
-		goto out;
 
-	/*
-	 * We make this non-blocking if the inode is contended, return
-	 * EAGAIN to indicate to the caller that they did not succeed.
-	 * This prevents the flush path from blocking on inodes inside
-	 * another operation right now, they get caught later by xfs_sync.
-	 */
-	if (sync) {
+		/*
+		 * Make sure the inode has hit stable storage.  By using the
+		 * log and the fsync transactions we reduce the IOs we have
+		 * to do here from two (log and inode) to just the log.
+		 *
+		 * Note: We still need to do a delwri write of the inode after
+		 * this to flush it to the backing buffer so that bulkstat
+		 * works properly if this is the first time the inode has been
+		 * written.  Because we hold the ilock atomically over the
+		 * transaction commit and the inode flush we are guaranteed
+		 * that the inode is not pinned when it returns. If the flush
+		 * lock is already held, then the inode has already been
+		 * flushed once and we don't need to flush it again.  Hence
+		 * the code will only flush the inode if it isn't already
+		 * being flushed.
+		 */
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
-		xfs_iflock(ip);
-
-		error = xfs_iflush(ip, SYNC_WAIT);
+		if (ip->i_update_core) {
+			error = xfs_log_inode(ip);
+			if (error)
+				goto out_unlock;
+		}
 	} else {
-		error = EAGAIN;
+		/*
+		 * We make this non-blocking if the inode is contended, return
+		 * EAGAIN to indicate to the caller that they did not succeed.
+		 * This prevents the flush path from blocking on inodes inside
+		 * another operation right now, they get caught later by xfs_sync.
+		 */
 		if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 			goto out;
-		if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-			goto out_unlock;
+	}
+
+	if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+		goto out_unlock;
 
-		error = xfs_iflush(ip, 0);
+	/*
+	 * Now we have the flush lock and the inode is not pinned, we can check
+	 * if the inode is really clean as we know that there are no pending
+	 * transaction completions, it is not waiting on the delayed write
+	 * queue and there is no IO in progress.
+	 */
+	if (xfs_inode_clean(ip)) {
+		xfs_ifunlock(ip);
+		error = 0;
+		goto out_unlock;
 	}
+	error = xfs_iflush(ip, 0);
 
  out_unlock:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-- 
cgit v1.2.3


From 5322892d867e186c6b4c5fff5c99ea4863696a60 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 4 Feb 2010 10:09:14 +1100
Subject: xfs: kill xfs_bawrite

There are no more users of this function left in the XFS code
now that we've switched everything to delayed write flushing.
Remove it.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_buf.c | 19 -------------------
 fs/xfs/linux-2.6/xfs_buf.h |  1 -
 2 files changed, 20 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 4556a4c31e36..d50df3a8101c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1078,25 +1078,6 @@ xfs_bwrite(
 	return error;
 }
 
-int
-xfs_bawrite(
-	void			*mp,
-	struct xfs_buf		*bp)
-{
-	trace_xfs_buf_bawrite(bp, _RET_IP_);
-
-	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
-
-	xfs_buf_delwri_dequeue(bp);
-
-	bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
-	bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
-
-	bp->b_mount = mp;
-	bp->b_strat = xfs_bdstrat_cb;
-	return xfs_bdstrat_cb(bp);
-}
-
 void
 xfs_bdwrite(
 	void			*mp,
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index be45e8c5768d..386e7361e50e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -233,7 +233,6 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
 
 extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-- 
cgit v1.2.3


From 002345925e6c45861f60db6f4fc6236713fd8847 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Wed, 3 Feb 2010 15:36:43 -0800
Subject: syslog: distinguish between /proc/kmsg and syscalls

This allows the LSM to distinguish between syslog functions originating
from /proc/kmsg access and direct syscalls.  By default, the commoncaps
will now no longer require CAP_SYS_ADMIN to read an opened /proc/kmsg
file descriptor.  For example the kernel syslog reader can now drop
privileges after opening /proc/kmsg, instead of staying privileged with
CAP_SYS_ADMIN.  MAC systems that implement security_syslog have unchanged
behavior.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/kmsg.c             | 14 +++++++-------
 include/linux/security.h   | 11 ++++++-----
 include/linux/syslog.h     | 29 +++++++++++++++++++++++++++++
 kernel/printk.c            |  7 ++++---
 security/commoncap.c       |  7 ++++++-
 security/security.c        |  4 ++--
 security/selinux/hooks.c   |  5 +++--
 security/smack/smack_lsm.c |  4 ++--
 8 files changed, 59 insertions(+), 22 deletions(-)
 create mode 100644 include/linux/syslog.h

(limited to 'fs')

diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca78346d3f0..6a3d843a1088 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,37 +12,37 @@
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
 #include <linux/fs.h>
+#include <linux/syslog.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
 extern wait_queue_head_t log_wait;
 
-extern int do_syslog(int type, char __user *bug, int count);
-
 static int kmsg_open(struct inode * inode, struct file * file)
 {
-	return do_syslog(1,NULL,0);
+	return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE);
 }
 
 static int kmsg_release(struct inode * inode, struct file * file)
 {
-	(void) do_syslog(0,NULL,0);
+	(void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE);
 	return 0;
 }
 
 static ssize_t kmsg_read(struct file *file, char __user *buf,
 			 size_t count, loff_t *ppos)
 {
-	if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0))
+	if ((file->f_flags & O_NONBLOCK) &&
+	    !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE))
 		return -EAGAIN;
-	return do_syslog(2, buf, count);
+	return do_syslog(2, buf, count, SYSLOG_FROM_FILE);
 }
 
 static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &log_wait, wait);
-	if (do_syslog(9, NULL, 0))
+	if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE))
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
diff --git a/include/linux/security.h b/include/linux/security.h
index 26eca85b2417..a4dc74d86ac6 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -76,7 +76,7 @@ extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp);
 extern int cap_task_setioprio(struct task_struct *p, int ioprio);
 extern int cap_task_setnice(struct task_struct *p, int nice);
-extern int cap_syslog(int type);
+extern int cap_syslog(int type, bool from_file);
 extern int cap_vm_enough_memory(struct mm_struct *mm, long pages);
 
 struct msghdr;
@@ -1349,6 +1349,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *	logging to the console.
  *	See the syslog(2) manual page for an explanation of the @type values.
  *	@type contains the type of action.
+ *	@from_file indicates the context of action (if it came from /proc).
  *	Return 0 if permission is granted.
  * @settime:
  *	Check permission to change the system time.
@@ -1463,7 +1464,7 @@ struct security_operations {
 	int (*sysctl) (struct ctl_table *table, int op);
 	int (*quotactl) (int cmds, int type, int id, struct super_block *sb);
 	int (*quota_on) (struct dentry *dentry);
-	int (*syslog) (int type);
+	int (*syslog) (int type, bool from_file);
 	int (*settime) (struct timespec *ts, struct timezone *tz);
 	int (*vm_enough_memory) (struct mm_struct *mm, long pages);
 
@@ -1762,7 +1763,7 @@ int security_acct(struct file *file);
 int security_sysctl(struct ctl_table *table, int op);
 int security_quotactl(int cmds, int type, int id, struct super_block *sb);
 int security_quota_on(struct dentry *dentry);
-int security_syslog(int type);
+int security_syslog(int type, bool from_file);
 int security_settime(struct timespec *ts, struct timezone *tz);
 int security_vm_enough_memory(long pages);
 int security_vm_enough_memory_mm(struct mm_struct *mm, long pages);
@@ -2008,9 +2009,9 @@ static inline int security_quota_on(struct dentry *dentry)
 	return 0;
 }
 
-static inline int security_syslog(int type)
+static inline int security_syslog(int type, bool from_file)
 {
-	return cap_syslog(type);
+	return cap_syslog(type, from_file);
 }
 
 static inline int security_settime(struct timespec *ts, struct timezone *tz)
diff --git a/include/linux/syslog.h b/include/linux/syslog.h
new file mode 100644
index 000000000000..5f02b1817be1
--- /dev/null
+++ b/include/linux/syslog.h
@@ -0,0 +1,29 @@
+/*  Syslog internals
+ *
+ *  Copyright 2010 Canonical, Ltd.
+ *  Author: Kees Cook <kees.cook@canonical.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _LINUX_SYSLOG_H
+#define _LINUX_SYSLOG_H
+
+#define SYSLOG_FROM_CALL 0
+#define SYSLOG_FROM_FILE 1
+
+int do_syslog(int type, char __user *buf, int count, bool from_file);
+
+#endif /* _LINUX_SYSLOG_H */
diff --git a/kernel/printk.c b/kernel/printk.c
index 17463ca2e229..809cf9a258a0 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -35,6 +35,7 @@
 #include <linux/kexec.h>
 #include <linux/ratelimit.h>
 #include <linux/kmsg_dump.h>
+#include <linux/syslog.h>
 
 #include <asm/uaccess.h>
 
@@ -273,14 +274,14 @@ static inline void boot_delay_msec(void)
  *	9 -- Return number of unread characters in the log buffer
  *     10 -- Return size of the log buffer
  */
-int do_syslog(int type, char __user *buf, int len)
+int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
 	unsigned i, j, limit, count;
 	int do_clear = 0;
 	char c;
 	int error = 0;
 
-	error = security_syslog(type);
+	error = security_syslog(type, from_file);
 	if (error)
 		return error;
 
@@ -417,7 +418,7 @@ out:
 
 SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
 {
-	return do_syslog(type, buf, len);
+	return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
 }
 
 /*
diff --git a/security/commoncap.c b/security/commoncap.c
index f800fdb3de94..677fad9d5cba 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/prctl.h>
 #include <linux/securebits.h>
+#include <linux/syslog.h>
 
 /*
  * If a non-root user executes a setuid-root binary in
@@ -888,12 +889,16 @@ error:
 /**
  * cap_syslog - Determine whether syslog function is permitted
  * @type: Function requested
+ * @from_file: Whether this request came from an open file (i.e. /proc)
  *
  * Determine whether the current process is permitted to use a particular
  * syslog function, returning 0 if permission is granted, -ve if not.
  */
-int cap_syslog(int type)
+int cap_syslog(int type, bool from_file)
 {
+	/* /proc/kmsg can open be opened by CAP_SYS_ADMIN */
+	if (type != 1 && from_file)
+		return 0;
 	if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
diff --git a/security/security.c b/security/security.c
index 440afe5eb54c..971092c06f31 100644
--- a/security/security.c
+++ b/security/security.c
@@ -203,9 +203,9 @@ int security_quota_on(struct dentry *dentry)
 	return security_ops->quota_on(dentry);
 }
 
-int security_syslog(int type)
+int security_syslog(int type, bool from_file)
 {
-	return security_ops->syslog(type);
+	return security_ops->syslog(type, from_file);
 }
 
 int security_settime(struct timespec *ts, struct timezone *tz)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9a2ee845e9d4..a4862a0730fa 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -76,6 +76,7 @@
 #include <linux/selinux.h>
 #include <linux/mutex.h>
 #include <linux/posix-timers.h>
+#include <linux/syslog.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -2049,11 +2050,11 @@ static int selinux_quota_on(struct dentry *dentry)
 	return dentry_has_perm(cred, NULL, dentry, FILE__QUOTAON);
 }
 
-static int selinux_syslog(int type)
+static int selinux_syslog(int type, bool from_file)
 {
 	int rc;
 
-	rc = cap_syslog(type);
+	rc = cap_syslog(type, from_file);
 	if (rc)
 		return rc;
 
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 529c9ca65878..a5721b373f53 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -157,12 +157,12 @@ static int smack_ptrace_traceme(struct task_struct *ptp)
  *
  * Returns 0 on success, error code otherwise.
  */
-static int smack_syslog(int type)
+static int smack_syslog(int type, bool from_file)
 {
 	int rc;
 	char *sp = current_security();
 
-	rc = cap_syslog(type);
+	rc = cap_syslog(type, from_file);
 	if (rc != 0)
 		return rc;
 
-- 
cgit v1.2.3


From d78ca3cd733d8a2c3dcd88471beb1a15d973eed8 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Wed, 3 Feb 2010 15:37:13 -0800
Subject: syslog: use defined constants instead of raw numbers

Right now the syslog "type" action are just raw numbers which makes
the source difficult to follow.  This patch replaces the raw numbers
with defined constants for some level of sanity.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/kmsg.c           | 10 +++++-----
 include/linux/syslog.h   | 23 +++++++++++++++++++++++
 kernel/printk.c          | 45 +++++++++++++++++++--------------------------
 security/commoncap.c     |  5 +++--
 security/selinux/hooks.c | 21 +++++++++++----------
 5 files changed, 61 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 6a3d843a1088..cfe90a48a6e8 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait;
 
 static int kmsg_open(struct inode * inode, struct file * file)
 {
-	return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE);
+	return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE);
 }
 
 static int kmsg_release(struct inode * inode, struct file * file)
 {
-	(void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE);
+	(void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE);
 	return 0;
 }
 
@@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
 			 size_t count, loff_t *ppos)
 {
 	if ((file->f_flags & O_NONBLOCK) &&
-	    !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE))
+	    !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
 		return -EAGAIN;
-	return do_syslog(2, buf, count, SYSLOG_FROM_FILE);
+	return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE);
 }
 
 static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &log_wait, wait);
-	if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE))
+	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE))
 		return POLLIN | POLLRDNORM;
 	return 0;
 }
diff --git a/include/linux/syslog.h b/include/linux/syslog.h
index 5f02b1817be1..38911391a139 100644
--- a/include/linux/syslog.h
+++ b/include/linux/syslog.h
@@ -21,6 +21,29 @@
 #ifndef _LINUX_SYSLOG_H
 #define _LINUX_SYSLOG_H
 
+/* Close the log.  Currently a NOP. */
+#define SYSLOG_ACTION_CLOSE          0
+/* Open the log. Currently a NOP. */
+#define SYSLOG_ACTION_OPEN           1
+/* Read from the log. */
+#define SYSLOG_ACTION_READ           2
+/* Read all messages remaining in the ring buffer. */
+#define SYSLOG_ACTION_READ_ALL       3
+/* Read and clear all messages remaining in the ring buffer */
+#define SYSLOG_ACTION_READ_CLEAR     4
+/* Clear ring buffer. */
+#define SYSLOG_ACTION_CLEAR          5
+/* Disable printk's to console */
+#define SYSLOG_ACTION_CONSOLE_OFF    6
+/* Enable printk's to console */
+#define SYSLOG_ACTION_CONSOLE_ON     7
+/* Set level of messages printed to console */
+#define SYSLOG_ACTION_CONSOLE_LEVEL  8
+/* Return number of unread characters in the log buffer */
+#define SYSLOG_ACTION_SIZE_UNREAD    9
+/* Return size of the log buffer */
+#define SYSLOG_ACTION_SIZE_BUFFER   10
+
 #define SYSLOG_FROM_CALL 0
 #define SYSLOG_FROM_FILE 1
 
diff --git a/kernel/printk.c b/kernel/printk.c
index 809cf9a258a0..3e162d867098 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -259,21 +259,6 @@ static inline void boot_delay_msec(void)
 }
 #endif
 
-/*
- * Commands to do_syslog:
- *
- * 	0 -- Close the log.  Currently a NOP.
- * 	1 -- Open the log. Currently a NOP.
- * 	2 -- Read from the log.
- * 	3 -- Read all messages remaining in the ring buffer.
- * 	4 -- Read and clear all messages remaining in the ring buffer
- * 	5 -- Clear ring buffer.
- * 	6 -- Disable printk's to console
- * 	7 -- Enable printk's to console
- *	8 -- Set level of messages printed to console
- *	9 -- Return number of unread characters in the log buffer
- *     10 -- Return size of the log buffer
- */
 int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
 	unsigned i, j, limit, count;
@@ -286,11 +271,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 		return error;
 
 	switch (type) {
-	case 0:		/* Close log */
+	case SYSLOG_ACTION_CLOSE:	/* Close log */
 		break;
-	case 1:		/* Open log */
+	case SYSLOG_ACTION_OPEN:	/* Open log */
 		break;
-	case 2:		/* Read from log */
+	case SYSLOG_ACTION_READ:	/* Read from log */
 		error = -EINVAL;
 		if (!buf || len < 0)
 			goto out;
@@ -321,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 		if (!error)
 			error = i;
 		break;
-	case 4:		/* Read/clear last kernel messages */
+	/* Read/clear last kernel messages */
+	case SYSLOG_ACTION_READ_CLEAR:
 		do_clear = 1;
 		/* FALL THRU */
-	case 3:		/* Read last kernel messages */
+	/* Read last kernel messages */
+	case SYSLOG_ACTION_READ_ALL:
 		error = -EINVAL;
 		if (!buf || len < 0)
 			goto out;
@@ -377,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			}
 		}
 		break;
-	case 5:		/* Clear ring buffer */
+	/* Clear ring buffer */
+	case SYSLOG_ACTION_CLEAR:
 		logged_chars = 0;
 		break;
-	case 6:		/* Disable logging to console */
+	/* Disable logging to console */
+	case SYSLOG_ACTION_CONSOLE_OFF:
 		if (saved_console_loglevel == -1)
 			saved_console_loglevel = console_loglevel;
 		console_loglevel = minimum_console_loglevel;
 		break;
-	case 7:		/* Enable logging to console */
+	/* Enable logging to console */
+	case SYSLOG_ACTION_CONSOLE_ON:
 		if (saved_console_loglevel != -1) {
 			console_loglevel = saved_console_loglevel;
 			saved_console_loglevel = -1;
 		}
 		break;
-	case 8:		/* Set level of messages printed to console */
+	/* Set level of messages printed to console */
+	case SYSLOG_ACTION_CONSOLE_LEVEL:
 		error = -EINVAL;
 		if (len < 1 || len > 8)
 			goto out;
@@ -402,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 		saved_console_loglevel = -1;
 		error = 0;
 		break;
-	case 9:		/* Number of chars in the log buffer */
+	/* Number of chars in the log buffer */
+	case SYSLOG_ACTION_SIZE_UNREAD:
 		error = log_end - log_start;
 		break;
-	case 10:	/* Size of the log buffer */
+	/* Size of the log buffer */
+	case SYSLOG_ACTION_SIZE_BUFFER:
 		error = log_buf_len;
 		break;
 	default:
diff --git a/security/commoncap.c b/security/commoncap.c
index 677fad9d5cba..cf01b2eebb60 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -897,9 +897,10 @@ error:
 int cap_syslog(int type, bool from_file)
 {
 	/* /proc/kmsg can open be opened by CAP_SYS_ADMIN */
-	if (type != 1 && from_file)
+	if (type != SYSLOG_ACTION_OPEN && from_file)
 		return 0;
-	if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN))
+	if ((type != SYSLOG_ACTION_READ_ALL &&
+	     type != SYSLOG_ACTION_SIZE_BUFFER) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a4862a0730fa..6b36ce2eef2e 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2059,20 +2059,21 @@ static int selinux_syslog(int type, bool from_file)
 		return rc;
 
 	switch (type) {
-	case 3:		/* Read last kernel messages */
-	case 10:	/* Return size of the log buffer */
+	case SYSLOG_ACTION_READ_ALL:	/* Read last kernel messages */
+	case SYSLOG_ACTION_SIZE_BUFFER:	/* Return size of the log buffer */
 		rc = task_has_system(current, SYSTEM__SYSLOG_READ);
 		break;
-	case 6:		/* Disable logging to console */
-	case 7:		/* Enable logging to console */
-	case 8:		/* Set level of messages printed to console */
+	case SYSLOG_ACTION_CONSOLE_OFF:	/* Disable logging to console */
+	case SYSLOG_ACTION_CONSOLE_ON:	/* Enable logging to console */
+	/* Set level of messages printed to console */
+	case SYSLOG_ACTION_CONSOLE_LEVEL:
 		rc = task_has_system(current, SYSTEM__SYSLOG_CONSOLE);
 		break;
-	case 0:		/* Close log */
-	case 1:		/* Open log */
-	case 2:		/* Read from log */
-	case 4:		/* Read/clear last kernel messages */
-	case 5:		/* Clear ring buffer */
+	case SYSLOG_ACTION_CLOSE:	/* Close log */
+	case SYSLOG_ACTION_OPEN:	/* Open log */
+	case SYSLOG_ACTION_READ:	/* Read from log */
+	case SYSLOG_ACTION_READ_CLEAR:	/* Read/clear last kernel messages */
+	case SYSLOG_ACTION_CLEAR:	/* Clear ring buffer */
 	default:
 		rc = task_has_system(current, SYSTEM__SYSLOG_MOD);
 		break;
-- 
cgit v1.2.3


From b21dda438baa450a76a375a35653ae0377793fab Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 5 Feb 2010 12:08:31 +0100
Subject: fuse: cleanup in fuse_notify_inval_...()

Small cleanup in fuse_notify_inval_inode() and
fuse_notify_inval_entry().

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51d9e33d634f..ab622305c2f5 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -865,13 +865,10 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
 
 	down_read(&fc->killsb);
 	err = -ENOENT;
-	if (!fc->sb)
-		goto err_unlock;
-
-	err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
-				       outarg.off, outarg.len);
-
-err_unlock:
+	if (fc->sb) {
+		err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
+					       outarg.off, outarg.len);
+	}
 	up_read(&fc->killsb);
 	return err;
 
@@ -910,12 +907,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 
 	down_read(&fc->killsb);
 	err = -ENOENT;
-	if (!fc->sb)
-		goto err_unlock;
-
-	err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
-
-err_unlock:
+	if (fc->sb)
+		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
 	up_read(&fc->killsb);
 	return err;
 
-- 
cgit v1.2.3


From b2d82ee3c8b2193ee5bc7eca4687ee9be30abd34 Mon Sep 17 00:00:00 2001
From: Fang Wenqi <anton.fang@gmail.com>
Date: Wed, 30 Dec 2009 18:37:13 +0800
Subject: fuse: fix large stack use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc 4.4 warns about:
  fs/fuse/dev.c: In function ‘fuse_notify_inval_entry’:
  fs/fuse/dev.c:925: warning: the frame size of 1060 bytes is larger than 1024 bytes

The problem is we declare two structures and a large array on the stack,
I move the array alway from the stack and allocate memory for it dynamically.

Signed-off-by: Fang Wenqi <antonf@turbolinux.com.cn>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ab622305c2f5..eb7e9423691f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -881,10 +881,15 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 				   struct fuse_copy_state *cs)
 {
 	struct fuse_notify_inval_entry_out outarg;
-	int err = -EINVAL;
-	char buf[FUSE_NAME_MAX+1];
+	int err = -ENOMEM;
+	char *buf;
 	struct qstr name;
 
+	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	err = -EINVAL;
 	if (size < sizeof(outarg))
 		goto err;
 
@@ -910,9 +915,11 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
 	if (fc->sb)
 		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
 	up_read(&fc->killsb);
+	kfree(buf);
 	return err;
 
 err:
+	kfree(buf);
 	fuse_copy_finish(cs);
 	return err;
 }
-- 
cgit v1.2.3


From 73c77e2ccc14413c232c3e0b3aa43a0c4b72ec70 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@suse.de>
Date: Mon, 25 Jan 2010 11:42:24 -0600
Subject: xfs: fix xfs to work with Virtually Indexed architectures

xfs_buf.c includes what is essentially a hand rolled version of
blk_rq_map_kern().  In order to work properly with the vmalloc buffers
that xfs uses, this hand rolled routine must also implement the flushing
API for vmap/vmalloc areas.

[style updates from hch@lst.de]
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: James Bottomley <James.Bottomley@suse.de>
---
 fs/xfs/linux-2.6/xfs_buf.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..6f3ebb634b8b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -76,6 +76,27 @@ struct workqueue_struct *xfsconvertd_workqueue;
 #define xfs_buf_deallocate(bp) \
 	kmem_zone_free(xfs_buf_zone, (bp));
 
+static inline int
+xfs_buf_is_vmapped(
+	struct xfs_buf	*bp)
+{
+	/*
+	 * Return true if the buffer is vmapped.
+	 *
+	 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
+	 * code is clever enough to know it doesn't have to map a single page,
+	 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+	 */
+	return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+	struct xfs_buf	*bp)
+{
+	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
 /*
  *	Page Region interfaces.
  *
@@ -314,7 +335,7 @@ xfs_buf_free(
 	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
-		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
+		if (xfs_buf_is_vmapped(bp))
 			free_address(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
@@ -1107,6 +1128,9 @@ xfs_buf_bio_end_io(
 
 	xfs_buf_ioerror(bp, -error);
 
+	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
 	do {
 		struct page	*page = bvec->bv_page;
 
@@ -1216,6 +1240,10 @@ next_chunk:
 
 submit_io:
 	if (likely(bio->bi_size)) {
+		if (xfs_buf_is_vmapped(bp)) {
+			flush_kernel_vmap_range(bp->b_addr,
+						xfs_buf_vmap_len(bp));
+		}
 		submit_bio(rw, bio);
 		if (size)
 			goto next_chunk;
-- 
cgit v1.2.3


From f12f98dba6ea1517cd7fbb912208893b9c014c15 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 5 Feb 2010 13:14:00 -0500
Subject: cifs: fix length calculation for converted unicode readdir names

cifs_from_ucs2 returns the length of the converted name, including the
length of the NULL terminator. We don't want to include the NULL
terminator in the dentry name length however since that'll throw off the
hash calculation for the dentry cache.

I believe that this is the root cause of several problems that have
cropped up recently that seem to be papered over with the "noserverino"
mount option. More confirmation of that would be good, but this is
clearly a bug and it fixes at least one reproducible problem that
was reported.

This patch fixes at least this reproducer in this kernel.org bug:

    http://bugzilla.kernel.org/show_bug.cgi?id=15088#c12

Reported-by: Bjorn Tore Sund <bjorn.sund@it.uib.no>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/readdir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f84062f9a985..f5618f8cc462 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -666,6 +666,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 					   min(len, max_len), nlt,
 					   cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
+		pqst->len -= nls_nullsize(nlt);
 	} else {
 		pqst->name = filename;
 		pqst->len = len;
-- 
cgit v1.2.3


From 301a6a317797ca362951ea21da397c05236f0070 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 6 Feb 2010 07:08:53 +0000
Subject: [CIFS] Maximum username length check in session setup does not match

Fix length check reported by D. Binderman (see below)

d binderman <dcb314@hotmail.com> wrote:
>
> I just ran the sourceforge tool cppcheck over the source code of the
> new Linux kernel 2.6.33-rc6
>
> It said
>
> [./cifs/sess.c:250]: (error) Buffer access out-of-bounds

May turn out to be harmless, but best to be safe. Note max
username length is defined to 32 due to Linux (Windows
maximum is 20).

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7085a6275c4c..aaa9c1c5a5bd 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -223,9 +223,9 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 		/* null user mount */
 		*bcc_ptr = 0;
 		*(bcc_ptr+1) = 0;
-	} else { /* 300 should be long enough for any conceivable user name */
+	} else {
 		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
-					  300, nls_cp);
+					  MAX_USERNAME_SIZE, nls_cp);
 	}
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* account for null termination */
@@ -246,11 +246,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
 	/* copy user */
 	if (ses->userName == NULL) {
 		/* BB what about null user mounts - check that we do this BB */
-	} else { /* 300 should be long enough for any conceivable user name */
-		strncpy(bcc_ptr, ses->userName, 300);
+	} else {
+		strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE);
 	}
-	/* BB improve check for overflow */
-	bcc_ptr += strnlen(ses->userName, 300);
+	bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
 	*bcc_ptr = 0;
 	bcc_ptr++; /* account for null termination */
 
-- 
cgit v1.2.3


From ccd4bb1beb3316de4611de24d223ad761b5a7e95 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 8 Feb 2010 17:39:58 +0000
Subject: [CIFS] Don't cache timestamps on utimes due to coarse granularity

force revalidate of the file when any of the timestamps are set since
some filesytem types do not have finer granularity timestamps and
we can not always detect which file systems round timestamps down
to determine whether we can cache the mtime on setattr
samba bugzilla 3775

Acked-by: Shirish Pargaonkar <sharishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cf18ee765590..e3fda978f481 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1762,8 +1762,18 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 
-	if (!rc)
+	if (!rc) {
 		rc = inode_setattr(inode, attrs);
+
+		/* force revalidate when any of these times are set since some
+		   of the fs types (eg ext3, fat) do not have fine enough
+		   time granularity to match protocol, and we do not have a
+		   a way (yet) to query the server fs's time granularity (and
+		   whether it rounds times down).
+		*/
+		if (!rc && (attrs->ia_valid & (ATTR_MTIME | ATTR_CTIME)))
+			cifsInode->time = 0;
+	}
 out:
 	kfree(args);
 	kfree(full_path);
-- 
cgit v1.2.3


From 05507fa2ac8d5e503bcf33ee43329449027d9060 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 5 Feb 2010 13:30:36 -0500
Subject: cifs: fix dentry hash calculation for case-insensitive mounts

case-insensitive mounts shouldn't use full_name_hash(). Make sure we
use the parent dentry's d_hash routine when one is set.

Reported-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/readdir.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f5618f8cc462..c343b14ba2d3 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -77,6 +77,11 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 
 	cFYI(1, ("For %s", name->name));
 
+	if (parent->d_op && parent->d_op->d_hash)
+		parent->d_op->d_hash(parent, name);
+	else
+		name->hash = full_name_hash(name->name, name->len);
+
 	dentry = d_lookup(parent, name);
 	if (dentry) {
 		/* FIXME: check for inode number changes? */
@@ -671,8 +676,6 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 		pqst->name = filename;
 		pqst->len = len;
 	}
-	pqst->hash = full_name_hash(pqst->name, pqst->len);
-/*	cFYI(1, ("filldir on %s",pqst->name));  */
 	return rc;
 }
 
-- 
cgit v1.2.3


From 123df2944c436c80640c4281c5bc9c7950b18687 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Dec 2009 04:57:57 -0500
Subject: Lose the new_name argument of fsnotify_move()

it's always new_dentry->d_name.name

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/debugfs/inode.c       | 2 +-
 fs/namei.c               | 6 ++----
 include/linux/fsnotify.h | 3 ++-
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 274ac865bae8..049d6c36da09 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -496,7 +496,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
 	}
 	d_move(old_dentry, dentry);
 	fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
-		old_dentry->d_name.name, S_ISDIR(old_dentry->d_inode->i_mode),
+		S_ISDIR(old_dentry->d_inode->i_mode),
 		NULL, old_dentry);
 	fsnotify_oldname_free(old_name);
 	unlock_rename(new_dir, old_dir);
diff --git a/fs/namei.c b/fs/namei.c
index d62fdc875f22..f69df876fac3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2661,11 +2661,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
 	else
 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
-	if (!error) {
-		const char *new_name = old_dentry->d_name.name;
-		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
+	if (!error)
+		fsnotify_move(old_dir, new_dir, old_name, is_dir,
 			      new_dentry->d_inode, old_dentry);
-	}
 	fsnotify_oldname_free(old_name);
 
 	return error;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 936f9aa8bb97..2d755c49c324 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -65,7 +65,7 @@ static inline void fsnotify_link_count(struct inode *inode)
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
  */
 static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
-				 const char *old_name, const char *new_name,
+				 const char *old_name,
 				 int isdir, struct inode *target, struct dentry *moved)
 {
 	struct inode *source = moved->d_inode;
@@ -73,6 +73,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	u32 fs_cookie = fsnotify_get_cookie();
 	__u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM);
 	__u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO);
+	const char *new_name = moved->d_name.name;
 
 	if (old_dir == new_dir)
 		old_dir_mask |= FS_DN_RENAME;
-- 
cgit v1.2.3


From cccc6bba3f771ef29b33e4f79e70ebc3dba245b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Dec 2009 05:07:33 -0500
Subject: Lose the first argument of audit_inode_child()

it's always equal to ->d_name.name of the second argument

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c               |  2 +-
 include/linux/audit.h    | 11 +++++------
 include/linux/fsnotify.h |  8 ++++----
 kernel/auditsc.c         |  7 ++-----
 4 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f69df876fac3..865282f8e012 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1337,7 +1337,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return -ENOENT;
 
 	BUG_ON(victim->d_parent->d_inode != dir);
-	audit_inode_child(victim->d_name.name, victim, dir);
+	audit_inode_child(victim, dir);
 
 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 3c7a358241a7..f391d45c8aea 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -424,7 +424,7 @@ extern void audit_syscall_exit(int failed, long return_code);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct dentry *dentry);
-extern void __audit_inode_child(const char *dname, const struct dentry *dentry,
+extern void __audit_inode_child(const struct dentry *dentry,
 				const struct inode *parent);
 extern void __audit_ptrace(struct task_struct *t);
 
@@ -442,11 +442,10 @@ static inline void audit_inode(const char *name, const struct dentry *dentry) {
 	if (unlikely(!audit_dummy_context()))
 		__audit_inode(name, dentry);
 }
-static inline void audit_inode_child(const char *dname, 
-				     const struct dentry *dentry,
+static inline void audit_inode_child(const struct dentry *dentry,
 				     const struct inode *parent) {
 	if (unlikely(!audit_dummy_context()))
-		__audit_inode_child(dname, dentry, parent);
+		__audit_inode_child(dentry, parent);
 }
 void audit_core_dumps(long signr);
 
@@ -544,9 +543,9 @@ extern int audit_signals;
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
 #define __audit_inode(n,d) do { ; } while (0)
-#define __audit_inode_child(d,i,p) do { ; } while (0)
+#define __audit_inode_child(i,p) do { ; } while (0)
 #define audit_inode(n,d) do { ; } while (0)
-#define audit_inode_child(d,i,p) do { ; } while (0)
+#define audit_inode_child(i,p) do { ; } while (0)
 #define audit_core_dumps(i) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) (0)
 #define audit_get_loginuid(t) (-1)
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 2d755c49c324..df8fd9a3b214 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -104,7 +104,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
 		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	}
-	audit_inode_child(new_name, moved, new_dir);
+	audit_inode_child(moved, new_dir);
 }
 
 /*
@@ -147,7 +147,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
 				  dentry->d_inode);
-	audit_inode_child(dentry->d_name.name, dentry, inode);
+	audit_inode_child(dentry, inode);
 
 	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
 }
@@ -162,7 +162,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 	inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name,
 				  inode);
 	fsnotify_link_count(inode);
-	audit_inode_child(new_dentry->d_name.name, new_dentry, dir);
+	audit_inode_child(new_dentry, dir);
 
 	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0);
 }
@@ -176,7 +176,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	struct inode *d_inode = dentry->d_inode;
 
 	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
-	audit_inode_child(dentry->d_name.name, dentry, inode);
+	audit_inode_child(dentry, inode);
 
 	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..f3a461c0970a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 
 /**
  * audit_inode_child - collect inode info for created/removed objects
- * @dname: inode's dentry name
  * @dentry: dentry being audited
  * @parent: inode of dentry parent
  *
@@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
  * must be hooked prior, in order to capture the target inode during
  * unsuccessful attempts.
  */
-void __audit_inode_child(const char *dname, const struct dentry *dentry,
+void __audit_inode_child(const struct dentry *dentry,
 			 const struct inode *parent)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
 	const char *found_parent = NULL, *found_child = NULL;
 	const struct inode *inode = dentry->d_inode;
+	const char *dname = dentry->d_name.name;
 	int dirlen = 0;
 
 	if (!context->in_syscall)
@@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
 
 	if (inode)
 		handle_one(inode);
-	/* determine matching parent */
-	if (!dname)
-		goto add_names;
 
 	/* parent is more likely, look for it first */
 	for (idx = 0; idx < context->name_count; idx++) {
-- 
cgit v1.2.3


From 84eb8fb42c120ff32b201c1cdd910033c888f699 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 5 Jan 2010 19:41:44 +0900
Subject: [SCSI] compat_ioct: fix bsg SG_IO

bsg's SG_IO doesn't work on 32-bit userspace and 64-bit kernelspace.

The problem is that both sg and bsg drivers use SG_IO
ioctl. sg_ioctl_trans() does 32/64-bit conversion even against bsg
header. It messes up bsg header. bsg driver gets garbage.

This patch fixes sg_ioctl_trans to handle only sg header (struct
sg_io_hdr).

Reported-by: Giridhar Malavali <giridhar.malavali@qlogic.com>
Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: James Bottomley <James.Bottomley@suse.de>
---
 fs/compat_ioctl.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c5c45de1a2ee..7cbbc7ab4b50 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -301,6 +301,12 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
 	u32 data;
 	void __user *dxferp;
 	int err;
+	int interface_id;
+
+	if (get_user(interface_id, &sgio32->interface_id))
+		return -EFAULT;
+	if (interface_id != 'S')
+		return sys_ioctl(fd, cmd, (unsigned long)sgio32);
 
 	if (get_user(iovec_count, &sgio32->iovec_count))
 		return -EFAULT;
-- 
cgit v1.2.3


From 8cfb3343f70bcf9403218df120ecf345f06dd585 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jeremy.kerr@canonical.com>
Date: Mon, 1 Feb 2010 21:34:14 -0700
Subject: of: make set_node_proc_entry private to proc_devtree.c

We only need set_node_proc_entry in proc_devtree.c, so move it there.

This fixes the !HAVE_ARCH_DEVTREE_FIXUPS build, as we can't make make
the definition in linux/of.h conditional on this #define (definitions in
asm/prom.h can't be exposed to linux/of.h, due to the enforced #include
ordering).

Signed-off-by: Jeremy Kerr <jeremy.kerr@canonical.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 fs/proc/proc_devtree.c | 5 +++--
 include/linux/of.h     | 6 ------
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 123257bb356b..2309bf17203f 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -14,12 +14,13 @@
 #include <asm/uaccess.h>
 #include "internal.h"
 
-#ifndef HAVE_ARCH_DEVTREE_FIXUPS
 static inline void set_node_proc_entry(struct device_node *np,
 				       struct proc_dir_entry *de)
 {
-}
+#ifdef HAVE_ARCH_DEVTREE_FIXUPS
+	np->pde = de;
 #endif
+}
 
 static struct proc_dir_entry *proc_device_tree;
 
diff --git a/include/linux/of.h b/include/linux/of.h
index 3cc0d7ae290e..fd47c81d7a25 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -73,12 +73,6 @@ static inline void of_node_set_flag(struct device_node *n, unsigned long flag)
 	set_bit(flag, &n->_flags);
 }
 
-static inline void
-set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de)
-{
-	dn->pde = de;
-}
-
 extern struct device_node *of_find_all_nodes(struct device_node *prev);
 
 #if defined(CONFIG_SPARC)
-- 
cgit v1.2.3


From 50ab2fe147e22c8786552cda1791a61ae81b84d2 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jeremy.kerr@canonical.com>
Date: Mon, 1 Feb 2010 21:34:14 -0700
Subject: proc_devtree: include linux/of.h

Currenly, proc_devtree.c depends on asm/prom.h to include linux/of.h, to
provide some device-tree definitions (eg, struct property).

Instead, include linux/of.h directly. We still need asm/prom.h for
HAVE_ARCH_DEVTREE_FIXUPS.

Signed-off-by: Jeremy Kerr <jeremy.kerr@canonical.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 fs/proc/proc_devtree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 2309bf17203f..0ec45110e15e 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -10,6 +10,7 @@
 #include <linux/seq_file.h>
 #include <linux/stat.h>
 #include <linux/string.h>
+#include <linux/of.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
-- 
cgit v1.2.3


From 2c1740098c708b465e87637b237feb2fd98f129a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 8 Feb 2010 09:32:27 -0500
Subject: NFS: Fix a bug in nfs_fscache_release_page()

Not having an fscache cookie is perfectly valid if the user didn't mount
with the fscache option.

This patch fixes http://bugzilla.kernel.org/show_bug.cgi?id=15234

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: stable@kernel.org
---
 fs/nfs/fscache.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index fa588006588d..237874f1af23 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -354,12 +354,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
  */
 int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
-	struct nfs_inode *nfsi = NFS_I(page->mapping->host);
-	struct fscache_cookie *cookie = nfsi->fscache;
-
-	BUG_ON(!cookie);
-
 	if (PageFsCache(page)) {
+		struct nfs_inode *nfsi = NFS_I(page->mapping->host);
+		struct fscache_cookie *cookie = nfsi->fscache;
+
+		BUG_ON(!cookie);
 		dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
 			 cookie, page, nfsi);
 
-- 
cgit v1.2.3


From 7549ad5f9b6eda49bbac4b14c5b8f37bf464f922 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 8 Feb 2010 09:32:34 -0500
Subject: NFS: Remove a redundant check for PageFsCache in nfs_migrate_page()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: David Howells <dhowells@redhat.com>
---
 fs/nfs/write.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7b54b8bb101f..d63d964a0392 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1598,8 +1598,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 	struct nfs_page *req;
 	int ret;
 
-	if (PageFsCache(page))
-		nfs_fscache_release_page(page, GFP_KERNEL);
+	nfs_fscache_release_page(page, GFP_KERNEL);
 
 	req = nfs_find_and_lock_request(page);
 	ret = PTR_ERR(req);
-- 
cgit v1.2.3


From fdcb45777a3d1689c5541e1f85ee3ebbd197d2c1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 8 Feb 2010 09:32:40 -0500
Subject: NFS: Fix the mapping of the NFSERR_SERVERFAULT error

It was recently pointed out that the NFSERR_SERVERFAULT error, which is
designed to inform the user of a serious internal error on the server, was
being mapped to an error value that is internal to the kernel.

This patch maps it to the error EREMOTEIO, which is exported to userland
through errno.h.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/mount_clnt.c | 2 +-
 fs/nfs/nfs2xdr.c    | 2 +-
 fs/nfs/nfs4xdr.c    | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 0adefc40cc89..59047f8d7d72 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -120,7 +120,7 @@ static struct {
 	{ .status = MNT3ERR_INVAL,		.errno = -EINVAL,	},
 	{ .status = MNT3ERR_NAMETOOLONG,	.errno = -ENAMETOOLONG,	},
 	{ .status = MNT3ERR_NOTSUPP,		.errno = -ENOTSUPP,	},
-	{ .status = MNT3ERR_SERVERFAULT,	.errno = -ESERVERFAULT,	},
+	{ .status = MNT3ERR_SERVERFAULT,	.errno = -EREMOTEIO,	},
 };
 
 struct mountres {
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5e078b222b4e..7bc2da8efd4a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -699,7 +699,7 @@ static struct {
 	{ NFSERR_BAD_COOKIE,	-EBADCOOKIE	},
 	{ NFSERR_NOTSUPP,	-ENOTSUPP	},
 	{ NFSERR_TOOSMALL,	-ETOOSMALL	},
-	{ NFSERR_SERVERFAULT,	-ESERVERFAULT	},
+	{ NFSERR_SERVERFAULT,	-EREMOTEIO	},
 	{ NFSERR_BADTYPE,	-EBADTYPE	},
 	{ NFSERR_JUKEBOX,	-EJUKEBOX	},
 	{ -1,			-EIO		}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e437fd6a819f..5cd5184b56db 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4631,7 +4631,7 @@ static int decode_sequence(struct xdr_stream *xdr,
 	 * If the server returns different values for sessionID, slotID or
 	 * sequence number, the server is looney tunes.
 	 */
-	status = -ESERVERFAULT;
+	status = -EREMOTEIO;
 
 	if (memcmp(id.data, res->sr_session->sess_id.data,
 		   NFS4_MAX_SESSIONID_LEN)) {
@@ -5774,7 +5774,7 @@ static struct {
 	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
 	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
 	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},
-	{ NFS4ERR_SERVERFAULT,	-ESERVERFAULT	},
+	{ NFS4ERR_SERVERFAULT,	-EREMOTEIO	},
 	{ NFS4ERR_BADTYPE,	-EBADTYPE	},
 	{ NFS4ERR_LOCKED,	-EAGAIN		},
 	{ NFS4ERR_SYMLINK,	-ELOOP		},
@@ -5801,7 +5801,7 @@ nfs4_stat_to_errno(int stat)
 	}
 	if (stat <= 10000 || stat > 10100) {
 		/* The server is looney tunes. */
-		return -ESERVERFAULT;
+		return -EREMOTEIO;
 	}
 	/* If we cannot translate the error, the recovery routines should
 	 * handle it.
-- 
cgit v1.2.3


From f79f11852831ba8837e82b73364e6f1cd0145499 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 10 Feb 2010 16:14:04 +0100
Subject: compat_ioctl: ignore RAID_VERSION ioctl

md ioctls are now handled by the md driver itself, but mdadm
may call RAID_VERSION on other devices as well. Mark the command
as IGNORE_IOCTL so this fails silently rather than printing
an annoying message.

Reported-by: "Michael S. Tsirkin" <m.s.tsirkin@gmail.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c5c45de1a2ee..b6f23b25370e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1038,6 +1038,8 @@ COMPATIBLE_IOCTL(FIOQSIZE)
 #ifdef CONFIG_BLOCK
 /* loop */
 IGNORE_IOCTL(LOOP_CLR_FD)
+/* md calls this on random blockdevs */
+IGNORE_IOCTL(RAID_VERSION)
 /* SG stuff */
 COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
 COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
-- 
cgit v1.2.3


From 66655de6d132b726be64c324bc3f9ea366d20697 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 8 Feb 2010 23:18:22 +0000
Subject: seq_file: Add helpers for iteration over a hlist

Some places in kernel need to iterate over a hlist in seq_file,
so provide some common helpers.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/seq_file.c            | 59 +++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/seq_file.h | 11 +++++++++
 2 files changed, 67 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9dbf3ff..f65b16f02da3 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -674,7 +674,6 @@ struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 
 	return NULL;
 }
-
 EXPORT_SYMBOL(seq_list_start);
 
 struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
@@ -684,7 +683,6 @@ struct list_head *seq_list_start_head(struct list_head *head, loff_t pos)
 
 	return seq_list_start(head, pos - 1);
 }
-
 EXPORT_SYMBOL(seq_list_start_head);
 
 struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
@@ -695,5 +693,60 @@ struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
 	++*ppos;
 	return lh == head ? NULL : lh;
 }
-
 EXPORT_SYMBOL(seq_list_next);
+
+/**
+ * seq_hlist_start - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos)
+{
+	struct hlist_node *node;
+
+	hlist_for_each(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start);
+
+/**
+ * seq_hlist_start_head - start an iteration of a hlist
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ */
+struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head);
+
+/**
+ * seq_hlist_next - move to the next position of the hlist
+ * @v:    the current iterator
+ * @head: the head of the hlist
+ * @pos:  the current posision
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
+				  loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return head->first;
+	else
+		return node->next;
+}
+EXPORT_SYMBOL(seq_hlist_next);
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 8366d8f12e53..c95bcdc18f4c 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -135,4 +135,15 @@ extern struct list_head *seq_list_start_head(struct list_head *head,
 extern struct list_head *seq_list_next(void *v, struct list_head *head,
 		loff_t *ppos);
 
+/*
+ * Helpers for iteration over hlist_head-s in seq_files
+ */
+
+extern struct hlist_node *seq_hlist_start(struct hlist_head *head,
+		loff_t pos);
+extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head,
+		loff_t pos);
+extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
+		loff_t *ppos);
+
 #endif
-- 
cgit v1.2.3


From 4cfbafd33f5ae99688ab82525a1d449c1c1b198f Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@linux-m68k.org>
Date: Wed, 10 Feb 2010 13:56:40 -0800
Subject: compat_ioctl: add compat handler for TIOCGSID ioctl

This is used by tcgetsid(3).

Signed-off-by: Andreas Schwab <schwab@linux-m68k.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat_ioctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index b6f23b25370e..30698a13fb22 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -936,6 +936,7 @@ COMPATIBLE_IOCTL(TCSETSF)
 COMPATIBLE_IOCTL(TIOCLINUX)
 COMPATIBLE_IOCTL(TIOCSBRK)
 COMPATIBLE_IOCTL(TIOCCBRK)
+COMPATIBLE_IOCTL(TIOCGSID)
 COMPATIBLE_IOCTL(TIOCGICOUNT)
 /* Little t */
 COMPATIBLE_IOCTL(TIOCGETD)
-- 
cgit v1.2.3


From 803bf5ec259941936262d10ecc84511b76a20921 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 10 Feb 2010 13:56:42 -0800
Subject: fs/exec.c: restrict initial stack space expansion to rlimit

When reserving stack space for a new process, make sure we're not
attempting to expand the stack by more than rlimit allows.

This fixes a bug caused by b6a2fea39318e43fee84fa7b0b90d68bed92d2ba ("mm:
variable length argument support") and unmasked by
fc63cf237078c86214abcb2ee9926d8ad289da9b ("exec: setup_arg_pages() fails
to return errors").

This bug means that when limiting the stack to less the 20*PAGE_SIZE (eg.
80K on 4K pages or 'ulimit -s 79') all processes will be killed before
they start.  This is particularly bad with 64K pages, where a ulimit below
1280K will kill every process.

To test, do:

  'ulimit -s 15; ls'

before and after the patch is applied.  Before it's applied, 'ls' should
be killed.  After the patch is applied, 'ls' should no longer be killed.

A stack limit of 15KB since it's small enough to trigger 20*PAGE_SIZE.
Also 15KB not a multiple of PAGE_SIZE, which is a trickier case to handle
correctly with this code.

4K pages should be fine to test with.

[kosaki.motohiro@jp.fujitsu.com: cleanup]
[akpm@linux-foundation.org: cleanup cleanup]
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Americo Wang <xiyou.wangcong@gmail.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: James Morris <jmorris@namei.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 0790a107ff7e..e95c692ef0e4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -571,6 +571,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	struct vm_area_struct *prev = NULL;
 	unsigned long vm_flags;
 	unsigned long stack_base;
+	unsigned long stack_size;
+	unsigned long stack_expand;
+	unsigned long rlim_stack;
 
 #ifdef CONFIG_STACK_GROWSUP
 	/* Limit stack size to 1GB */
@@ -627,10 +630,24 @@ int setup_arg_pages(struct linux_binprm *bprm,
 			goto out_unlock;
 	}
 
+	stack_expand = EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+	stack_size = vma->vm_end - vma->vm_start;
+	/*
+	 * Align this down to a page boundary as expand_stack
+	 * will align it up.
+	 */
+	rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
+	rlim_stack = min(rlim_stack, stack_size);
 #ifdef CONFIG_STACK_GROWSUP
-	stack_base = vma->vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+	if (stack_size + stack_expand > rlim_stack)
+		stack_base = vma->vm_start + rlim_stack;
+	else
+		stack_base = vma->vm_end + stack_expand;
 #else
-	stack_base = vma->vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE;
+	if (stack_size + stack_expand > rlim_stack)
+		stack_base = vma->vm_end - rlim_stack;
+	else
+		stack_base = vma->vm_start - stack_expand;
 #endif
 	ret = expand_stack(vma, stack_base);
 	if (ret)
-- 
cgit v1.2.3


From 0e5a9fb0426108d750c97c25b1ab04d3768b5aff Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Fri, 5 Feb 2010 18:25:41 -0500
Subject: GFS2: Fix error code

We need this one-liner to signal the mount helper of the 'insufficient journals' condition.

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8a102f731003..a86ed6381566 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -725,7 +725,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		goto fail;
 	}
 
-	error = -EINVAL;
+	error = -EUSERS;
 	if (!gfs2_jindex_size(sdp)) {
 		fs_err(sdp, "no journals!\n");
 		goto fail_jindex;
-- 
cgit v1.2.3


From 07ccb7bf2c928fef4fea2cda69ba2e23479578db Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 12 Feb 2010 10:10:55 +0000
Subject: GFS2: Fix bmap allocation corner-case bug

This patch solves a corner case during allocation which occurs if both
metadata (indirect) and data blocks are required but there is an
obstacle in the filesystem (e.g. a resource group header or another
allocated block) such that when the allocation is requested only
enough blocks for the metadata are returned.

By changing the exit condition of this loop, we ensure that a
minimum of one data block will always be returned.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d47379e794b..583e823307ae 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -541,7 +541,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 				*ptr++ = cpu_to_be64(bn++);
 			break;
 		}
-	} while (state != ALLOC_DATA);
+	} while ((state != ALLOC_DATA) || !dblock);
 
 	ip->i_height = height;
 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
-- 
cgit v1.2.3


From 87185517de81101da5afbc82cefdeed6eeaa38fb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 3 Feb 2010 19:43:31 +0000
Subject: xfs: only clear the suid bit once in xfs_write

file_remove_suid already calls into ->setattr to clear the suid and
sgid bits if needed, no need to start a second transaction to do it
ourselves.

Note that xfs_write_clear_setuid issues a sync transaction while the
path through ->setattr doesn't, but that is consistant with the
other filesystems.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 15 +++------------
 fs/xfs/xfs_rw.c            | 42 ------------------------------------------
 fs/xfs/xfs_rw.h            |  1 -
 3 files changed, 3 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index c80fa00d2ad7..eac6f80d786d 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -630,18 +630,9 @@ start:
 	 * by root.  This keeps people from modifying setuid and
 	 * setgid binaries.
 	 */
-
-	if (((xip->i_d.di_mode & S_ISUID) ||
-	    ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
-		(S_ISGID | S_IXGRP))) &&
-	     !capable(CAP_FSETID)) {
-		error = xfs_write_clear_setuid(xip);
-		if (likely(!error))
-			error = -file_remove_suid(file);
-		if (unlikely(error)) {
-			goto out_unlock_internal;
-		}
-	}
+	error = -file_remove_suid(file);
+	if (unlikely(error))
+		goto out_unlock_internal;
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index abb2c458b148..e336742a58a4 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -46,48 +46,6 @@
 #include "xfs_rw.h"
 #include "xfs_trace.h"
 
-/*
- * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
- * which clears the setuid and setgid bits when a file is written.
- */
-int
-xfs_write_clear_setuid(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp;
-	xfs_trans_t	*tp;
-	int		error;
-
-	mp = ip->i_mount;
-	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-	if ((error = xfs_trans_reserve(tp, 0,
-				      XFS_WRITEID_LOG_RES(mp),
-				      0, 0, 0))) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	xfs_trans_ihold(tp, ip);
-	ip->i_d.di_mode &= ~S_ISUID;
-
-	/*
-	 * Note that we don't have to worry about mandatory
-	 * file locking being disabled here because we only
-	 * clear the S_ISGID bit if the Group execute bit is
-	 * on, but if it was on then mandatory locking wouldn't
-	 * have been enabled.
-	 */
-	if (ip->i_d.di_mode & S_IXGRP) {
-		ip->i_d.di_mode &= ~S_ISGID;
-	}
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return 0;
-}
-
 /*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index a54c3b7cd376..11c41ec6ed75 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -39,7 +39,6 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 /*
  * Prototypes for functions in xfs_rw.c.
  */
-extern int xfs_write_clear_setuid(struct xfs_inode *ip);
 extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 			xfs_daddr_t blkno, int len, uint flags,
 			struct xfs_buf **bpp);
-- 
cgit v1.2.3


From 180040b89ee2aed88c0a0b1fcf7ada9a512b12e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 5 Feb 2010 09:57:55 +0000
Subject: xfs: optimize log flushing in xfs_fsync

If we have a pinned inode it must have a log item attached to it.
Usually that log item will have ili_last_lsn already set, in which
case we only need to flush the log up to that LSN instead of doing a
full log force.  This gives speedups of about 5% in some fsync heavy
workloads.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 43241e289800..ddd2c5d1b854 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -629,8 +629,14 @@ xfs_fsync(
 		 */
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		if (xfs_ipincount(ip)) {
-			error = _xfs_log_force(ip->i_mount, XFS_LOG_SYNC,
-					       &log_flushed);
+			if (ip->i_itemp->ili_last_lsn) {
+				error = _xfs_log_force_lsn(ip->i_mount,
+						ip->i_itemp->ili_last_lsn,
+						XFS_LOG_SYNC, &log_flushed);
+			} else {
+				error = _xfs_log_force(ip->i_mount,
+						XFS_LOG_SYNC, &log_flushed);
+			}
 		}
 	} else	{
 		/*
-- 
cgit v1.2.3


From 3f6fae9559225741c91f1320090b285da1413290 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 11 Feb 2010 07:43:00 +0000
Subject: Btrfs: btrfs_mark_extent_written uses the wrong slot

My test do: fallocate a big file and do write. The file is 512M, but
after file write is done btrfs-debug-tree shows:
item 6 key (257 EXTENT_DATA 0) itemoff 3516 itemsize 53
                extent data disk byte 1103101952 nr 536870912
                extent data offset 0 nr 399634432 ram 536870912
                extent compression 0
Looks like a regression introducted by
6c7d54ac87f338c479d9729e8392eca3f76e11e1, where we set wrong slot.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Acked-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 413a30dafcda..a7fd9f3a750a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -720,13 +720,15 @@ again:
 					inode->i_ino, orig_offset);
 		BUG_ON(ret);
 	}
-	fi = btrfs_item_ptr(leaf, path->slots[0],
-			   struct btrfs_file_extent_item);
 	if (del_nr == 0) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
 		btrfs_mark_buffer_dirty(leaf);
 	} else {
+		fi = btrfs_item_ptr(leaf, del_slot - 1,
+			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
-- 
cgit v1.2.3


From e902ec9906e844f4613fa6190c6fa65f162dc86e Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Sat, 30 Jan 2010 18:06:35 +0900
Subject: nilfs2: issue discard request after cleaning segments

This adds a function to send discard requests for given array of
segment numbers, and calls the function when garbage collection
succeeded.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 Documentation/filesystems/nilfs2.txt |  3 +++
 fs/nilfs2/segment.c                  | 10 ++++++++++
 fs/nilfs2/super.c                    |  8 +++++++-
 fs/nilfs2/the_nilfs.c                | 38 ++++++++++++++++++++++++++++++++++++
 fs/nilfs2/the_nilfs.h                |  1 +
 include/linux/nilfs2_fs.h            |  1 +
 6 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 839efd8a8a8c..cf6d0d85ca82 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -74,6 +74,9 @@ norecovery		Disable recovery of the filesystem on mount.
 			This disables every write access on the device for
 			read-only mounts or snapshots.  This option will fail
 			for r/w mounts on an unclean volume.
+discard			Issue discard/TRIM commands to the underlying block
+			device when blocks are freed.  This is useful for SSD
+			devices and sparse/thinly-provisioned LUNs.
 
 NILFS2 usage
 ============
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 105b508b47a8..9280b0f10792 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2560,6 +2560,16 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(sci->sc_interval);
 	}
+	if (nilfs_test_opt(sbi, DISCARD)) {
+		int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
+						 sci->sc_nfreesegs);
+		if (ret) {
+			printk(KERN_WARNING
+			       "NILFS warning: error %d on discard request, "
+			       "turning discards off for the device\n", ret);
+			nilfs_clear_opt(sbi, DISCARD);
+		}
+	}
 
  out_unlock:
 	sci->sc_freesegs = NULL;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8173faee31e6..3f88401a375b 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -481,6 +481,8 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",order=strict");
 	if (nilfs_test_opt(sbi, NORECOVERY))
 		seq_printf(seq, ",norecovery");
+	if (nilfs_test_opt(sbi, DISCARD))
+		seq_printf(seq, ",discard");
 
 	return 0;
 }
@@ -550,7 +552,7 @@ static const struct export_operations nilfs_export_ops = {
 enum {
 	Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
-	Opt_err,
+	Opt_discard, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -561,6 +563,7 @@ static match_table_t tokens = {
 	{Opt_snapshot, "cp=%u"},
 	{Opt_order, "order=%s"},
 	{Opt_norecovery, "norecovery"},
+	{Opt_discard, "discard"},
 	{Opt_err, NULL}
 };
 
@@ -614,6 +617,9 @@ static int parse_options(char *options, struct super_block *sb)
 		case Opt_norecovery:
 			nilfs_set_opt(sbi, NORECOVERY);
 			break;
+		case Opt_discard:
+			nilfs_set_opt(sbi, DISCARD);
+			break;
 		default:
 			printk(KERN_ERR
 			       "NILFS: Unrecognized mount option \"%s\"\n", p);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6241e1722efc..92733d5651d2 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -646,6 +646,44 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	goto out;
 }
 
+int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
+			    size_t nsegs)
+{
+	sector_t seg_start, seg_end;
+	sector_t start = 0, nblocks = 0;
+	unsigned int sects_per_block;
+	__u64 *sn;
+	int ret = 0;
+
+	sects_per_block = (1 << nilfs->ns_blocksize_bits) /
+		bdev_logical_block_size(nilfs->ns_bdev);
+	for (sn = segnump; sn < segnump + nsegs; sn++) {
+		nilfs_get_segment_range(nilfs, *sn, &seg_start, &seg_end);
+
+		if (!nblocks) {
+			start = seg_start;
+			nblocks = seg_end - seg_start + 1;
+		} else if (start + nblocks == seg_start) {
+			nblocks += seg_end - seg_start + 1;
+		} else {
+			ret = blkdev_issue_discard(nilfs->ns_bdev,
+						   start * sects_per_block,
+						   nblocks * sects_per_block,
+						   GFP_NOFS,
+						   DISCARD_FL_BARRIER);
+			if (ret < 0)
+				return ret;
+			nblocks = 0;
+		}
+	}
+	if (nblocks)
+		ret = blkdev_issue_discard(nilfs->ns_bdev,
+					   start * sects_per_block,
+					   nblocks * sects_per_block,
+					   GFP_NOFS, DISCARD_FL_BARRIER);
+	return ret;
+}
+
 int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
 {
 	struct inode *dat = nilfs_dat_inode(nilfs);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 589786e33464..fd057f8ad439 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -221,6 +221,7 @@ struct the_nilfs *find_or_create_nilfs(struct block_device *);
 void put_nilfs(struct the_nilfs *);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
+int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
 struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
 int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index 3fe02cf8b65a..640702e97457 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -153,6 +153,7 @@ struct nilfs_super_root {
 						   semantics also for data */
 #define NILFS_MOUNT_NORECOVERY		0x4000  /* Disable write access during
 						   mount-time recovery */
+#define NILFS_MOUNT_DISCARD		0x8000  /* Issue DISCARD requests */
 
 
 /**
-- 
cgit v1.2.3


From 7512487e6d6459e4c3f9c7cedc53050a6c30e387 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Tue, 26 Jan 2010 13:59:40 +0900
Subject: nilfs2: use mnt_want_write in ioctls where write access is needed

A few nilfs2 ioctls need to ask for and then later release write
access to the mount in order to avoid potential write to read-only
mounts.

This adds the missing mnt_want_write and mnt_drop_write in
nilfs_ioctl_change_cpmode, nilfs_ioctl_delete_checkpoint, and
nilfs_ioctl_clean_segments.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c | 60 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6b2b83de363..8e5cad020c30 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,6 +26,7 @@
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
+#include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -107,20 +108,28 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
-		return -EFAULT;
+		goto out;
 
 	mutex_lock(&nilfs->ns_mount_mutex);
+
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_change_cpmode(
 		cpfile, cpmode.cm_cno, cpmode.cm_mode);
-	if (unlikely(ret < 0)) {
+	if (unlikely(ret < 0))
 		nilfs_transaction_abort(inode->i_sb);
-		mutex_unlock(&nilfs->ns_mount_mutex);
-		return ret;
-	}
-	nilfs_transaction_commit(inode->i_sb); /* never fails */
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+
 	mutex_unlock(&nilfs->ns_mount_mutex);
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
@@ -135,16 +144,23 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(&cno, argp, sizeof(cno)))
-		return -EFAULT;
+		goto out;
 
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
-	if (unlikely(ret < 0)) {
+	if (unlikely(ret < 0))
 		nilfs_transaction_abort(inode->i_sb);
-		return ret;
-	}
-	nilfs_transaction_commit(inode->i_sb); /* never fails */
+	else
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
@@ -496,12 +512,19 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	ret = -EFAULT;
 	if (copy_from_user(argv, argp, sizeof(argv)))
-		return -EFAULT;
+		goto out;
 
+	ret = -EINVAL;
 	nsegs = argv[4].v_nmembs;
 	if (argv[4].v_size != argsz[4])
-		return -EINVAL;
+		goto out;
+
 	/*
 	 * argv[4] points to segment numbers this ioctl cleans.  We
 	 * use kmalloc() for its buffer because memory used for the
@@ -509,9 +532,10 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 	 */
 	kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
 			       nsegs * sizeof(__u64));
-	if (IS_ERR(kbufs[4]))
-		return PTR_ERR(kbufs[4]);
-
+	if (IS_ERR(kbufs[4])) {
+		ret = PTR_ERR(kbufs[4]);
+		goto out;
+	}
 	nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
 
 	for (n = 0; n < 4; n++) {
@@ -563,10 +587,12 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 		nilfs_remove_all_gcinode(nilfs);
 	clear_nilfs_gc_running(nilfs);
 
- out_free:
+out_free:
 	while (--n >= 0)
 		vfree(kbufs[n]);
 	kfree(kbufs[4]);
+out:
+	mnt_drop_write(filp->f_path.mnt);
 	return ret;
 }
 
-- 
cgit v1.2.3


From fe5f171bb272946ce5fbf843ce2f8467d0d41b9a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 31 Jan 2010 19:46:40 +0900
Subject: nilfs2: fix potential hang in nilfs_error on errors=remount-ro

nilfs_error() calls nilfs_detach_segment_constructor() if
errors=remount-ro option is specified, and this may lead to a hang due
to recursive locking of, for instance, nilfs->ns_segctor_sem and
others.

In this case, detaching segment constructor is not necessary because
read-only flag is set to the filesystem and further writes are
blocked.

This fixes the potential hang issue by removing the
nilfs_detach_segment_constructor() call from nilfs_error.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c | 11 +++++++++--
 fs/nilfs2/super.c   |  3 ---
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9280b0f10792..ab439a7ef2d8 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2875,8 +2875,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
 
-	/* Each field of nilfs_segctor is cleared through the initialization
-	   of super-block info */
+	if (NILFS_SC(sbi)) {
+		/*
+		 * This happens if the filesystem was remounted
+		 * read/write after nilfs_error degenerated it into a
+		 * read-only mount.
+		 */
+		nilfs_detach_segment_constructor(sbi);
+	}
+
 	sbi->s_sc_info = nilfs_segctor_new(sbi);
 	if (!sbi->s_sc_info)
 		return -ENOMEM;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3f88401a375b..f068270f6c75 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -96,9 +96,6 @@ void nilfs_error(struct super_block *sb, const char *function,
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct the_nilfs *nilfs = sbi->s_nilfs;
 
-		if (!nilfs_test_opt(sbi, ERRORS_CONT))
-			nilfs_detach_segment_constructor(sbi);
-
 		down_write(&nilfs->ns_sem);
 		if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
 			nilfs->ns_mount_state |= NILFS_ERROR_FS;
-- 
cgit v1.2.3


From 086d1764b22bb2d9d79bb8e2198927acf028d732 Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Fri, 5 Feb 2010 23:15:26 +0900
Subject: nilfs2: delete unnecessary condition in nilfs_dat_translate

This is a trivial patch to delete unnecessary condition in nilfs_dat_translate.

nilfs_dat_translate() will asign translated address to *blocknrp if blocknrp
is not NULL.  However the condition is unneeded, because all callers of
nilfs_dat_translate() pass blocknrp properly.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/dat.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 187dd07ba86c..9d1e5de91afb 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -388,8 +388,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 		ret = -ENOENT;
 		goto out;
 	}
-	if (blocknrp != NULL)
-		*blocknrp = blocknr;
+	*blocknrp = blocknr;
 
  out:
 	kunmap_atomic(kaddr, KM_USER0);
-- 
cgit v1.2.3


From dcd76186955e2b595c378dbe5b9bb6c8c5374b10 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Tue, 26 Jan 2010 15:20:15 +0900
Subject: nilfs2: get rid of nilfs_segctor_req struct

This will clean up nilfs_segctor_req struct and the obscure request
argument passed among private methods of segment constructor.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c | 79 ++++++++++++++++++++++++++---------------------------
 fs/nilfs2/segment.h |  2 ++
 2 files changed, 40 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ab439a7ef2d8..fa4abfc22d33 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2425,43 +2425,43 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 	return err;
 }
 
-struct nilfs_segctor_req {
-	int mode;
-	__u32 seq_accepted;
-	int sc_err;  /* construction failure */
-	int sb_err;  /* super block writeback failure */
-};
-
 #define FLUSH_FILE_BIT	(0x1) /* data file only */
 #define FLUSH_DAT_BIT	(1 << NILFS_DAT_INO) /* DAT only */
 
-static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
-				 struct nilfs_segctor_req *req)
+/**
+ * nilfs_segctor_accept - record accepted sequence count of log-write requests
+ * @sci: segment constructor object
+ */
+static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
 {
-	req->sc_err = req->sb_err = 0;
 	spin_lock(&sci->sc_state_lock);
-	req->seq_accepted = sci->sc_seq_request;
+	sci->sc_seq_accepted = sci->sc_seq_request;
 	spin_unlock(&sci->sc_state_lock);
 
 	if (sci->sc_timer)
 		del_timer_sync(sci->sc_timer);
 }
 
-static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
-				 struct nilfs_segctor_req *req)
+/**
+ * nilfs_segctor_notify - notify the result of request to caller threads
+ * @sci: segment constructor object
+ * @mode: mode of log forming
+ * @err: error code to be notified
+ */
+static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
 {
 	/* Clear requests (even when the construction failed) */
 	spin_lock(&sci->sc_state_lock);
 
-	if (req->mode == SC_LSEG_SR) {
+	if (mode == SC_LSEG_SR) {
 		sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
-		sci->sc_seq_done = req->seq_accepted;
-		nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
+		sci->sc_seq_done = sci->sc_seq_accepted;
+		nilfs_segctor_wakeup(sci, err);
 		sci->sc_flush_request = 0;
 	} else {
-		if (req->mode == SC_FLUSH_FILE)
+		if (mode == SC_FLUSH_FILE)
 			sci->sc_flush_request &= ~FLUSH_FILE_BIT;
-		else if (req->mode == SC_FLUSH_DAT)
+		else if (mode == SC_FLUSH_DAT)
 			sci->sc_flush_request &= ~FLUSH_DAT_BIT;
 
 		/* re-enable timer if checkpoint creation was not done */
@@ -2472,30 +2472,37 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
 	spin_unlock(&sci->sc_state_lock);
 }
 
-static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
-				   struct nilfs_segctor_req *req)
+/**
+ * nilfs_segctor_construct - form logs and write them to disk
+ * @sci: segment constructor object
+ * @mode: mode of log forming
+ */
+static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err = 0;
 
+	nilfs_segctor_accept(sci);
+
 	if (nilfs_discontinued(nilfs))
-		req->mode = SC_LSEG_SR;
-	if (!nilfs_segctor_confirm(sci)) {
-		err = nilfs_segctor_do_construct(sci, req->mode);
-		req->sc_err = err;
-	}
+		mode = SC_LSEG_SR;
+	if (!nilfs_segctor_confirm(sci))
+		err = nilfs_segctor_do_construct(sci, mode);
+
 	if (likely(!err)) {
-		if (req->mode != SC_FLUSH_DAT)
+		if (mode != SC_FLUSH_DAT)
 			atomic_set(&nilfs->ns_ndirtyblks, 0);
 		if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
 		    nilfs_discontinued(nilfs)) {
 			down_write(&nilfs->ns_sem);
-			req->sb_err = nilfs_commit_super(sbi,
-					nilfs_altsb_need_update(nilfs));
+			err = nilfs_commit_super(
+				sbi, nilfs_altsb_need_update(nilfs));
 			up_write(&nilfs->ns_sem);
 		}
 	}
+
+	nilfs_segctor_notify(sci, mode, err);
 	return err;
 }
 
@@ -2526,7 +2533,6 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	struct nilfs_sc_info *sci = NILFS_SC(sbi);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_transaction_info ti;
-	struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
 	int err;
 
 	if (unlikely(!sci))
@@ -2547,10 +2553,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	list_splice_tail_init(&nilfs->ns_gc_inodes, &sci->sc_gc_inodes);
 
 	for (;;) {
-		nilfs_segctor_accept(sci, &req);
-		err = nilfs_segctor_construct(sci, &req);
+		err = nilfs_segctor_construct(sci, SC_LSEG_SR);
 		nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
-		nilfs_segctor_notify(sci, &req);
 
 		if (likely(!err))
 			break;
@@ -2583,13 +2587,9 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
 {
 	struct nilfs_sb_info *sbi = sci->sc_sbi;
 	struct nilfs_transaction_info ti;
-	struct nilfs_segctor_req req = { .mode = mode };
 
 	nilfs_transaction_lock(sbi, &ti, 0);
-
-	nilfs_segctor_accept(sci, &req);
-	nilfs_segctor_construct(sci, &req);
-	nilfs_segctor_notify(sci, &req);
+	nilfs_segctor_construct(sci, mode);
 
 	/*
 	 * Unclosed segment should be retried.  We do this using sc_timer.
@@ -2807,12 +2807,9 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
 	do {
 		struct nilfs_sb_info *sbi = sci->sc_sbi;
 		struct nilfs_transaction_info ti;
-		struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
 
 		nilfs_transaction_lock(sbi, &ti, 0);
-		nilfs_segctor_accept(sci, &req);
-		ret = nilfs_segctor_construct(sci, &req);
-		nilfs_segctor_notify(sci, &req);
+		ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
 		nilfs_transaction_unlock(sbi);
 
 	} while (ret && retrycount-- > 0);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3d3ab2f9864c..3155e0c7f415 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -116,6 +116,7 @@ struct nilfs_segsum_pointer {
  * @sc_wait_daemon: Daemon wait queue
  * @sc_wait_task: Start/end wait queue to control segctord task
  * @sc_seq_request: Request counter
+ * @sc_seq_accept: Accepted request count
  * @sc_seq_done: Completion counter
  * @sc_sync: Request of explicit sync operation
  * @sc_interval: Timeout value of background construction
@@ -169,6 +170,7 @@ struct nilfs_sc_info {
 	wait_queue_head_t	sc_wait_task;
 
 	__u32			sc_seq_request;
+	__u32			sc_seq_accepted;
 	__u32			sc_seq_done;
 
 	int			sc_sync;
-- 
cgit v1.2.3


From e605f0a7249d8002c660af379f884896cbaa45ae Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Dec 2009 00:57:52 +0900
Subject: nilfs2: get rid of s_dirt flag use

This replaces s_dirt flag use in nilfs with a new flag added on the
nilfs object.  The s_dirt flag was used to indicate if
sop->write_super() should be called, however the current version of
nilfs does not use the callback.  Thus, it can be replaced with the
own flag.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Jiro SEKIBA <jir@unicus.jp>
---
 fs/nilfs2/segment.c   | 11 +++++------
 fs/nilfs2/super.c     |  4 ++--
 fs/nilfs2/the_nilfs.h |  2 ++
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index fa4abfc22d33..c4fcdd1567a9 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1937,8 +1937,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
 	int update_sr = (sci->sc_super_root != NULL);
 
 	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -2020,7 +2019,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	if (update_sr) {
 		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
 				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
-		sbi->s_super->s_dirt = 1;
+		set_nilfs_sb_dirty(nilfs);
 
 		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2645,6 +2644,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
 static int nilfs_segctor_thread(void *arg)
 {
 	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
+	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
 	struct timer_list timer;
 	int timeout = 0;
 
@@ -2690,7 +2690,6 @@ static int nilfs_segctor_thread(void *arg)
 	} else {
 		DEFINE_WAIT(wait);
 		int should_sleep = 1;
-		struct the_nilfs *nilfs;
 
 		prepare_to_wait(&sci->sc_wait_daemon, &wait,
 				TASK_INTERRUPTIBLE);
@@ -2711,8 +2710,8 @@ static int nilfs_segctor_thread(void *arg)
 		finish_wait(&sci->sc_wait_daemon, &wait);
 		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
 			   time_after_eq(jiffies, sci->sc_timer->expires));
-		nilfs = sci->sc_sbi->s_nilfs;
-		if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
+
+		if (nilfs_sb_dirty(nilfs) && nilfs_sb_need_update(nilfs))
 			set_nilfs_discontinued(nilfs);
 	}
 	goto loop;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f068270f6c75..92579cc4c935 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -298,7 +298,7 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
 		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
 		nilfs->ns_sbwtime[1] = t;
 	}
-	sbi->s_super->s_dirt = 0;
+	clear_nilfs_sb_dirty(nilfs);
 	return nilfs_sync_super(sbi, dupsb);
 }
 
@@ -342,7 +342,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 		err = nilfs_construct_segment(sb);
 
 	down_write(&nilfs->ns_sem);
-	if (sb->s_dirt)
+	if (nilfs_sb_dirty(nilfs))
 		nilfs_commit_super(sbi, 1);
 	up_write(&nilfs->ns_sem);
 
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index fd057f8ad439..e9795f1724d7 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -38,6 +38,7 @@ enum {
 				   the latest checkpoint was loaded */
 	THE_NILFS_DISCONTINUED,	/* 'next' pointer chain has broken */
 	THE_NILFS_GC_RUNNING,	/* gc process is running */
+	THE_NILFS_SB_DIRTY,	/* super block is dirty */
 };
 
 /**
@@ -197,6 +198,7 @@ THE_NILFS_FNS(INIT, init)
 THE_NILFS_FNS(LOADED, loaded)
 THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
+THE_NILFS_FNS(SB_DIRTY, sb_dirty)
 
 /* Minimum interval of periodical update of superblocks (in seconds) */
 #define NILFS_SB_FREQ		10
-- 
cgit v1.2.3


From d1c6b72a7224f6cd6924f7079f79580cde696d68 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 17 Dec 2009 00:55:40 +0900
Subject: nilfs2: move iterator to write log into segment buffer

This moves iterator to submit write requests for a series of logs into
segbuf.c, and hides nilfs_segbuf_write() and nilfs_segbuf_wait() in
the file.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segbuf.c  | 18 ++++++++++++++++++
 fs/nilfs2/segbuf.h  |  5 +----
 fs/nilfs2/segment.c |  9 ++-------
 3 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 645c78656aa0..ab56fe44e377 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -40,6 +40,11 @@ struct nilfs_write_info {
 };
 
 
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs);
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
+
+
 static struct kmem_cache *nilfs_segbuf_cachep;
 
 static void nilfs_segbuf_init_once(void *obj)
@@ -302,6 +307,19 @@ void nilfs_truncate_logs(struct list_head *logs,
 	}
 }
 
+int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int ret = 0;
+
+	list_for_each_entry(segbuf, logs, sb_list) {
+		ret = nilfs_segbuf_write(segbuf, nilfs);
+		if (ret)
+			break;
+	}
+	return ret;
+}
+
 int nilfs_wait_on_logs(struct list_head *logs)
 {
 	struct nilfs_segment_buffer *segbuf;
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 6af1630fb401..94dfd3517bc0 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -166,13 +166,10 @@ nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
 	segbuf->sb_sum.nfileblk++;
 }
 
-int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct the_nilfs *nilfs);
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf);
-
 void nilfs_clear_logs(struct list_head *logs);
 void nilfs_truncate_logs(struct list_head *logs,
 			 struct nilfs_segment_buffer *last);
+int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs);
 int nilfs_wait_on_logs(struct list_head *logs);
 
 static inline void nilfs_destroy_logs(struct list_head *logs)
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c4fcdd1567a9..ada2f1b947a3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1764,14 +1764,9 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
 static int nilfs_segctor_write(struct nilfs_sc_info *sci,
 			       struct the_nilfs *nilfs)
 {
-	struct nilfs_segment_buffer *segbuf;
-	int ret = 0;
+	int ret;
 
-	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
-		ret = nilfs_segbuf_write(segbuf, nilfs);
-		if (ret)
-			break;
-	}
+	ret = nilfs_write_logs(&sci->sc_segbufs, nilfs);
 	list_splice_tail_init(&sci->sc_segbufs, &sci->sc_write_logs);
 	return ret;
 }
-- 
cgit v1.2.3


From d67b1b03254c501fef371b0e5916c94a52bfc2c5 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Sat, 6 Feb 2010 08:45:15 +0000
Subject: fs/xfs: Correct NULL test

Test the value that was just allocated rather than the previously tested one.

A simplified version of the semantic match that finds this problem is as
follows: (http://coccinelle.lip6.fr/)

// <smpl>
@r@
expression *x;
expression e;
identifier l;
@@

if (x == NULL || ...) {
    ... when forall
    return ...; }
... when != goto l;
    when != x = e
    when != &x
*x == NULL
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8699e51cb45e..417e61e3d9dd 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -123,7 +123,7 @@ xfs_Gqm_init(void)
 		goto out;
 
 	gdqhash = kmem_zalloc_large(hsize);
-	if (!udqhash)
+	if (!gdqhash)
 		goto out_free_udqhash;
 
 	hsize /= sizeof(xfs_dqhash_t);
-- 
cgit v1.2.3


From 7c540d9e3da38c3d1c15fb8059e4577a84ac0066 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jeremy.kerr@canonical.com>
Date: Sun, 14 Feb 2010 07:13:41 -0700
Subject: proc_devtree: fix THIS_MODULE without module.h

Commit e22f628395432b967f2f505858c64450f7835365 introduced a build
breakage for ARM devtree work: the THIS_MODULE macro was added, but we
don't have module.h

This change adds the necessary #include to get THIS_MODULE defined.
While we could just replace it with NULL (PROC_FS is a bool, not a
tristate), using THIS_MODULE will prevent unexpected breakage if we
ever do compile this as a module.

Signed-off-by: Jeremy Kerr <jeremy.kerr@canonical.com>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Michal Simek <monstr@monstr.eu>
---
 fs/proc/proc_devtree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 0ec45110e15e..f8650dce74fb 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -11,6 +11,7 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/of.h>
+#include <linux/module.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
-- 
cgit v1.2.3


From 175359f89df39f4faed663c8cfd6ee0222d2fa1e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 11 Feb 2010 13:13:10 +0100
Subject: reiserfs: Fix softlockup while waiting on an inode

When we wait for an inode through reiserfs_iget(), we hold
the reiserfs lock. And waiting for an inode may imply waiting
for its writeback. But the inode writeback path may also require
the reiserfs lock, which leads to a deadlock.

We just need to release the reiserfs lock from reiserfs_iget()
to fix this.

Reported-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Tested-by: Christian Kujau <lists@nerdbynature.de>
Cc: Chris Mason <chris.mason@oracle.com>
---
 fs/reiserfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9087b10209e6..2df0f5c7c60b 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1497,9 +1497,11 @@ struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
 
 	args.objectid = key->on_disk_key.k_objectid;
 	args.dirid = key->on_disk_key.k_dir_id;
+	reiserfs_write_unlock(s);
 	inode = iget5_locked(s, key->on_disk_key.k_objectid,
 			     reiserfs_find_actor, reiserfs_init_locked_inode,
 			     (void *)(&args));
+	reiserfs_write_lock(s);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-- 
cgit v1.2.3


From 12062dddda450976b129dcb1bacd91acaf4d8030 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 15 Feb 2010 14:19:27 -0500
Subject: ext4: move __func__ into a macro for ext4_warning, ext4_error

Just a pet peeve of mine; we had a mishash of calls with either __func__
or "function_name" and the latter tends to get out of sync.

I think it's easier to just hide the __func__ in a macro, and it'll
be consistent from then on.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c      |  29 +++++---------
 fs/ext4/dir.c         |   4 +-
 fs/ext4/ext4.h        |   7 +++-
 fs/ext4/ext4_jbd2.c   |   2 +-
 fs/ext4/extents.c     |   9 +++--
 fs/ext4/ialloc.c      |  27 +++++--------
 fs/ext4/inode.c       |  43 +++++++++------------
 fs/ext4/mballoc.c     |  27 ++++++-------
 fs/ext4/move_extent.c |  12 +++---
 fs/ext4/namei.c       |  51 +++++++++++--------------
 fs/ext4/resize.c      | 102 +++++++++++++++++++-------------------------------
 fs/ext4/super.c       |  11 +++---
 fs/ext4/xattr.c       |  32 +++++++---------
 13 files changed, 146 insertions(+), 210 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 22bc7435d913..720061a0ee53 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -97,8 +97,8 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		/* If checksum is bad mark all blocks used to prevent allocation
 		 * essentially implementing a per-group read-only flag. */
 		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-			ext4_error(sb, __func__,
-				  "Checksum bad for group %u", block_group);
+			ext4_error(sb, "Checksum bad for group %u",
+					block_group);
 			ext4_free_blks_set(sb, gdp, 0);
 			ext4_free_inodes_set(sb, gdp, 0);
 			ext4_itable_unused_set(sb, gdp, 0);
@@ -210,10 +210,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= ngroups) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "block_group >= groups_count - "
-			   "block_group = %u, groups_count = %u",
-			   block_group, ngroups);
+		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
+			   " groups_count = %u", block_group, ngroups);
 
 		return NULL;
 	}
@@ -221,8 +219,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error(sb, "ext4_get_group_desc",
-			   "Group descriptor not loaded - "
+		ext4_error(sb, "Group descriptor not loaded - "
 			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
 		return NULL;
@@ -282,9 +279,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
 		return 1;
 
 err_out:
-	ext4_error(sb, __func__,
-			"Invalid block bitmap - "
-			"block_group = %d, block = %llu",
+	ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
 			block_group, bitmap_blk);
 	return 0;
 }
@@ -311,8 +306,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, __func__,
-			    "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot read block bitmap - "
 			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -354,8 +348,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
-		ext4_error(sb, __func__,
-			    "Cannot read block bitmap - "
+		ext4_error(sb, "Cannot read block bitmap - "
 			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -419,8 +412,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error(sb, __func__,
-			   "Adding blocks in system zones - "
+		ext4_error(sb, "Adding blocks in system zones - "
 			   "Block = %llu, count = %lu",
 			   block, count);
 		goto error_return;
@@ -453,8 +445,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 		BUFFER_TRACE(bitmap_bh, "clear bit");
 		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, __func__,
-				   "bit already cleared for block %llu",
+			ext4_error(sb, "bit already cleared for block %llu",
 				   (ext4_fsblk_t)(block + i));
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc93168e262..0e0bef3ba91e 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,7 +83,7 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		ext4_error(dir->i_sb, function,
+		__ext4_error(dir->i_sb, function,
 			"bad entry in directory #%lu: %s - "
 			"offset=%u, inode=%u, rec_len=%d, name_len=%d",
 			dir->i_ino, error_msg, offset,
@@ -150,7 +150,7 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				ext4_error(sb, __func__, "directory #%lu "
+				ext4_error(sb, "directory #%lu "
 					   "contains a hole at offset %Lu",
 					   inode->i_ino,
 					   (unsigned long long) filp->f_pos);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 61cf3b3cde4e..509437ffb71b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1486,13 +1486,16 @@ extern int ext4_group_extend(struct super_block *sb,
 				ext4_fsblk_t n_blocks_count);
 
 /* super.c */
-extern void ext4_error(struct super_block *, const char *, const char *, ...)
+extern void __ext4_error(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+#define ext4_error(sb, message...)	__ext4_error(sb, __func__, ## message)
 extern void __ext4_std_error(struct super_block *, const char *, int);
 extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning(struct super_block *, const char *, const char *, ...)
+extern void __ext4_warning(struct super_block *, const char *,
+			  const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, ## message)
 extern void ext4_msg(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b57e5c711b6d..2f407c487e6b 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -132,7 +132,7 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
-				ext4_error(inode->i_sb, __func__,
+				ext4_error(inode->i_sb,
 					   "IO error syncing inode, "
 					   "inode=%lu, block=%llu",
 					   inode->i_ino,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54616157c0f3..bd808915ad2f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -440,7 +440,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
 	return 0;
 
 corrupted:
-	ext4_error(inode->i_sb, function,
+	__ext4_error(inode->i_sb, function,
 			"bad header/extent in inode #%lu: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
 			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
@@ -1538,8 +1538,9 @@ int ext4_ext_try_to_merge(struct inode *inode,
 		merge_done = 1;
 		WARN_ON(eh->eh_entries == 0);
 		if (!eh->eh_entries)
-			ext4_error(inode->i_sb, "ext4_ext_try_to_merge",
-			   "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+			ext4_error(inode->i_sb,
+				   "inode#%lu, eh->eh_entries = 0!",
+				   inode->i_ino);
 	}
 
 	return merge_done;
@@ -3238,7 +3239,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * this is why assert can't be put in ext4_ext_find_extent()
 	 */
 	if (path[depth].p_ext == NULL && depth != 0) {
-		ext4_error(inode->i_sb, __func__, "bad extent address "
+		ext4_error(inode->i_sb, "bad extent address "
 			   "inode: %lu, iblock: %d, depth: %d",
 			   inode->i_ino, iblock, depth);
 		err = -EIO;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 2fab5adae1e2..e4aaf619b56d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,8 +76,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks and inodes use to prevent
 	 * allocation, essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__, "Checksum bad for group %u",
-			   block_group);
+		ext4_error(sb, "Checksum bad for group %u", block_group);
 		ext4_free_blks_set(sb, gdp, 0);
 		ext4_free_inodes_set(sb, gdp, 0);
 		ext4_itable_unused_set(sb, gdp, 0);
@@ -111,8 +110,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	bitmap_blk = ext4_inode_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, __func__,
-			    "Cannot read inode bitmap - "
+		ext4_error(sb, "Cannot read inode bitmap - "
 			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -153,8 +151,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
-		ext4_error(sb, __func__,
-			    "Cannot read inode bitmap - "
+		ext4_error(sb, "Cannot read inode bitmap - "
 			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
@@ -229,8 +226,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 	es = EXT4_SB(sb)->s_es;
 	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext4_error(sb, "ext4_free_inode",
-			   "reserved or nonexistent inode %lu", ino);
+		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
 		goto error_return;
 	}
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
@@ -248,8 +244,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 					bit, bitmap_bh->b_data);
 	if (!cleared)
-		ext4_error(sb, "ext4_free_inode",
-			   "bit already cleared for inode %lu", ino);
+		ext4_error(sb, "bit already cleared for inode %lu", ino);
 	else {
 		gdp = ext4_get_group_desc(sb, block_group, &bh2);
 
@@ -736,8 +731,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
 			ino > EXT4_INODES_PER_GROUP(sb)) {
 		ext4_unlock_group(sb, group);
-		ext4_error(sb, __func__,
-			   "reserved inode or inode > inodes count - "
+		ext4_error(sb, "reserved inode or inode > inodes count - "
 			   "block_group = %u, inode=%lu", group,
 			   ino + group * EXT4_INODES_PER_GROUP(sb));
 		return 1;
@@ -1099,8 +1093,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 
 	/* Error cases - e2fsck has already cleaned up for us */
 	if (ino > max_ino) {
-		ext4_warning(sb, __func__,
-			     "bad orphan ino %lu!  e2fsck was run?", ino);
+		ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
 		goto error;
 	}
 
@@ -1108,8 +1101,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
 	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh) {
-		ext4_warning(sb, __func__,
-			     "inode bitmap error for orphan %lu", ino);
+		ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
 		goto error;
 	}
 
@@ -1141,8 +1133,7 @@ iget_failed:
 	err = PTR_ERR(inode);
 	inode = NULL;
 bad_orphan:
-	ext4_warning(sb, __func__,
-		     "bad orphan inode %lu!  e2fsck was run?", ino);
+	ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
 	printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
 	       bit, (unsigned long long)bitmap_bh->b_blocknr,
 	       ext4_test_bit(bit, bitmap_bh->b_data));
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e530119d7f0..536067bcf75b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -194,7 +194,7 @@ void ext4_delete_inode(struct inode *inode)
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
-		ext4_warning(inode->i_sb, __func__,
+		ext4_warning(inode->i_sb,
 			     "couldn't mark inode dirty (err %d)", err);
 		goto stop_handle;
 	}
@@ -212,7 +212,7 @@ void ext4_delete_inode(struct inode *inode)
 		if (err > 0)
 			err = ext4_journal_restart(handle, 3);
 		if (err != 0) {
-			ext4_warning(inode->i_sb, __func__,
+			ext4_warning(inode->i_sb,
 				     "couldn't extend journal (err %d)", err);
 		stop_handle:
 			ext4_journal_stop(handle);
@@ -323,8 +323,7 @@ static int ext4_block_to_path(struct inode *inode,
 		offsets[n++] = i_block & (ptrs - 1);
 		final = ptrs;
 	} else {
-		ext4_warning(inode->i_sb, "ext4_block_to_path",
-			     "block %lu > max in inode %lu",
+		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
 			     i_block + direct_blocks +
 			     indirect_blocks + double_blocks, inode->i_ino);
 	}
@@ -344,7 +343,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 		if (blk &&
 		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						    blk, 1))) {
-			ext4_error(inode->i_sb, function,
+			__ext4_error(inode->i_sb, function,
 				   "invalid block reference %u "
 				   "in inode #%lu", blk, inode->i_ino);
 			return -EIO;
@@ -1125,7 +1124,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
 				sector_t logical, sector_t phys, int len)
 {
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-		ext4_error(inode->i_sb, msg,
+		__ext4_error(inode->i_sb, msg,
 			   "inode #%lu logical block %llu mapped to %llu "
 			   "(size %d)", inode->i_ino,
 			   (unsigned long long) logical,
@@ -4147,7 +4146,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
-		ext4_error(inode->i_sb, __func__, "inode #%lu: "
+		ext4_error(inode->i_sb, "inode #%lu: "
 			   "attempt to clear blocks %llu len %lu, invalid",
 			   inode->i_ino, (unsigned long long) block_to_free,
 			   count);
@@ -4255,7 +4254,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
 			ext4_handle_dirty_metadata(handle, inode, this_bh);
 		else
-			ext4_error(inode->i_sb, __func__,
+			ext4_error(inode->i_sb,
 				   "circular indirect block detected, "
 				   "inode=%lu, block=%llu",
 				   inode->i_ino,
@@ -4297,7 +4296,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 
 			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						   nr, 1)) {
-				ext4_error(inode->i_sb, __func__,
+				ext4_error(inode->i_sb,
 					   "indirect mapped block in inode "
 					   "#%lu invalid (level %d, blk #%lu)",
 					   inode->i_ino, depth,
@@ -4313,7 +4312,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			 * (should be rare).
 			 */
 			if (!bh) {
-				ext4_error(inode->i_sb, "ext4_free_branches",
+				ext4_error(inode->i_sb,
 					   "Read failure, inode=%lu, block=%llu",
 					   inode->i_ino, nr);
 				continue;
@@ -4628,9 +4627,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 	bh = sb_getblk(sb, block);
 	if (!bh) {
-		ext4_error(sb, "ext4_get_inode_loc", "unable to read "
-			   "inode block - inode=%lu, block=%llu",
-			   inode->i_ino, block);
+		ext4_error(sb, "unable to read inode block - "
+			   "inode=%lu, block=%llu", inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -4728,9 +4726,8 @@ make_io:
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			ext4_error(sb, __func__,
-				   "unable to read inode block - inode=%lu, "
-				   "block=%llu", inode->i_ino, block);
+			ext4_error(sb, "unable to read inode block - inode=%lu,"
+				   " block=%llu", inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
 		}
@@ -4941,8 +4938,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ret = 0;
 	if (ei->i_file_acl &&
 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-		ext4_error(sb, __func__,
-			   "bad extended attribute block %llu in inode #%lu",
+		ext4_error(sb, "bad extended attribute block %llu inode #%lu",
 			   ei->i_file_acl, inode->i_ino);
 		ret = -EIO;
 		goto bad_inode;
@@ -4988,8 +4984,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	} else {
 		ret = -EIO;
-		ext4_error(inode->i_sb, __func__,
-			   "bogus i_mode (%o) for inode=%lu",
+		ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
 			   inode->i_mode, inode->i_ino);
 		goto bad_inode;
 	}
@@ -5228,10 +5223,8 @@ int ext4_write_inode(struct inode *inode, int wait)
 		if (wait)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-			ext4_error(inode->i_sb, __func__,
-				   "IO error syncing inode, "
-				   "inode=%lu, block=%llu",
-				   inode->i_ino,
+			ext4_error(inode->i_sb, "IO error syncing inode, "
+				   "inode=%lu, block=%llu", inode->i_ino,
 				   (unsigned long long)iloc.bh->b_blocknr);
 			err = -EIO;
 		}
@@ -5644,7 +5637,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 						     EXT4_STATE_NO_EXPAND);
 				if (mnt_count !=
 					le16_to_cpu(sbi->s_es->s_mnt_count)) {
-					ext4_warning(inode->i_sb, __func__,
+					ext4_warning(inode->i_sb,
 					"Unable to expand inode %lu. Delete"
 					" some EAs or run e2fsck.",
 					inode->i_ino);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d129c1039f1d..415e11f1e9ee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2709,8 +2709,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 
 	len = ac->ac_b_ex.fe_len;
 	if (!ext4_data_block_valid(sbi, block, len)) {
-		ext4_error(sb, __func__,
-			   "Allocating blocks %llu-%llu which overlap "
+		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
 			   "fs metadata\n", block, block+len);
 		/* File system mounted not to panic on error
 		 * Fix the bitmap and repeat the block allocation
@@ -3623,15 +3622,13 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (bitmap_bh == NULL) {
-		ext4_error(sb, __func__, "Error in reading block "
-				"bitmap for %u", group);
+		ext4_error(sb, "Error reading block bitmap for %u", group);
 		return 0;
 	}
 
 	err = ext4_mb_load_buddy(sb, group, &e4b);
 	if (err) {
-		ext4_error(sb, __func__, "Error in loading buddy "
-				"information for %u", group);
+		ext4_error(sb, "Error loading buddy information for %u", group);
 		put_bh(bitmap_bh);
 		return 0;
 	}
@@ -3804,15 +3801,15 @@ repeat:
 
 		err = ext4_mb_load_buddy(sb, group, &e4b);
 		if (err) {
-			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %u", group);
+			ext4_error(sb, "Error loading buddy information for %u",
+					group);
 			continue;
 		}
 
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (bitmap_bh == NULL) {
-			ext4_error(sb, __func__, "Error in reading block "
-					"bitmap for %u", group);
+			ext4_error(sb, "Error reading block bitmap for %u",
+					group);
 			ext4_mb_release_desc(&e4b);
 			continue;
 		}
@@ -4077,8 +4074,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 		if (ext4_mb_load_buddy(sb, group, &e4b)) {
-			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %u", group);
+			ext4_error(sb, "Error loading buddy information for %u",
+					group);
 			continue;
 		}
 		ext4_lock_group(sb, group);
@@ -4478,8 +4475,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	es = EXT4_SB(sb)->s_es;
 	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
 	    !ext4_data_block_valid(sbi, block, count)) {
-		ext4_error(sb, __func__,
-			   "Freeing blocks not in datazone - "
+		ext4_error(sb, "Freeing blocks not in datazone - "
 			   "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
@@ -4548,8 +4544,7 @@ do_more:
 	    in_range(block + count - 1, ext4_inode_table(sb, gdp),
 		      EXT4_SB(sb)->s_itb_per_group)) {
 
-		ext4_error(sb, __func__,
-			   "Freeing blocks in system zone - "
+		ext4_error(sb, "Freeing blocks in system zone - "
 			   "Block = %llu, count = %lu", block, count);
 		/* err = 0. ext4_std_error should be a no op */
 		goto error_return;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82c415be87a4..1654eb862d74 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -152,12 +152,12 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
 	int ret = 0;
 
 	if (inode1 == NULL) {
-		ext4_error(inode2->i_sb, function,
+		__ext4_error(inode2->i_sb, function,
 			"Both inodes should not be NULL: "
 			"inode1 NULL inode2 %lu", inode2->i_ino);
 		ret = -EIO;
 	} else if (inode2 == NULL) {
-		ext4_error(inode1->i_sb, function,
+		__ext4_error(inode1->i_sb, function,
 			"Both inodes should not be NULL: "
 			"inode1 %lu inode2 NULL", inode1->i_ino);
 		ret = -EIO;
@@ -526,7 +526,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 * new_ext       |-------|
 	 */
 	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		ext4_error(orig_inode->i_sb, __func__,
+		ext4_error(orig_inode->i_sb,
 			"new_ext_end(%u) should be less than or equal to "
 			"oext->ee_block(%u) + oext_alen(%d) - 1",
 			new_ext_end, le32_to_cpu(oext->ee_block),
@@ -689,12 +689,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	while (1) {
 		/* The extent for donor must be found. */
 		if (!dext) {
-			ext4_error(donor_inode->i_sb, __func__,
+			ext4_error(donor_inode->i_sb,
 				   "The extent for donor must be found");
 			*err = -EIO;
 			goto out;
 		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			ext4_error(donor_inode->i_sb, __func__,
+			ext4_error(donor_inode->i_sb,
 				"Donor offset(%u) and the first block of donor "
 				"extent(%u) should be equal",
 				donor_off,
@@ -1351,7 +1351,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			if (ret1 < 0)
 				break;
 			if (*moved_len > len) {
-				ext4_error(orig_inode->i_sb, __func__,
+				ext4_error(orig_inode->i_sb,
 					"We replaced blocks too much! "
 					"sum of replaced: %llu requested: %llu",
 					*moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 17a17e10dd60..bd2dc0b71c8c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -383,8 +383,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	if (root->info.hash_version != DX_HASH_TEA &&
 	    root->info.hash_version != DX_HASH_HALF_MD4 &&
 	    root->info.hash_version != DX_HASH_LEGACY) {
-		ext4_warning(dir->i_sb, __func__,
-			     "Unrecognised inode hash code %d",
+		ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
 			     root->info.hash_version);
 		brelse(bh);
 		*err = ERR_BAD_DX_DIR;
@@ -399,8 +398,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
-		ext4_warning(dir->i_sb, __func__,
-			     "Unimplemented inode hash flags: %#06x",
+		ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
 			     root->info.unused_flags);
 		brelse(bh);
 		*err = ERR_BAD_DX_DIR;
@@ -408,8 +406,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	}
 
 	if ((indirect = root->info.indirect_levels) > 1) {
-		ext4_warning(dir->i_sb, __func__,
-			     "Unimplemented inode hash depth: %#06x",
+		ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
 			     root->info.indirect_levels);
 		brelse(bh);
 		*err = ERR_BAD_DX_DIR;
@@ -421,8 +418,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 
 	if (dx_get_limit(entries) != dx_root_limit(dir,
 						   root->info.info_length)) {
-		ext4_warning(dir->i_sb, __func__,
-			     "dx entry: limit != root limit");
+		ext4_warning(dir->i_sb, "dx entry: limit != root limit");
 		brelse(bh);
 		*err = ERR_BAD_DX_DIR;
 		goto fail;
@@ -433,7 +429,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 	{
 		count = dx_get_count(entries);
 		if (!count || count > dx_get_limit(entries)) {
-			ext4_warning(dir->i_sb, __func__,
+			ext4_warning(dir->i_sb,
 				     "dx entry: no count or count > limit");
 			brelse(bh);
 			*err = ERR_BAD_DX_DIR;
@@ -478,7 +474,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 			goto fail2;
 		at = entries = ((struct dx_node *) bh->b_data)->entries;
 		if (dx_get_limit(entries) != dx_node_limit (dir)) {
-			ext4_warning(dir->i_sb, __func__,
+			ext4_warning(dir->i_sb,
 				     "dx entry: limit != node limit");
 			brelse(bh);
 			*err = ERR_BAD_DX_DIR;
@@ -494,7 +490,7 @@ fail2:
 	}
 fail:
 	if (*err == ERR_BAD_DX_DIR)
-		ext4_warning(dir->i_sb, __func__,
+		ext4_warning(dir->i_sb,
 			     "Corrupt dir inode %ld, running e2fsck is "
 			     "recommended.", dir->i_ino);
 	return NULL;
@@ -947,9 +943,8 @@ restart:
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* read error, skip block & hope for the best */
-			ext4_error(sb, __func__, "reading directory #%lu "
-				   "offset %lu", dir->i_ino,
-				   (unsigned long)block);
+			ext4_error(sb, "reading directory #%lu offset %lu",
+				   dir->i_ino, (unsigned long)block);
 			brelse(bh);
 			goto next;
 		}
@@ -1041,7 +1036,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
 		retval = ext4_htree_next_block(dir, hash, frame,
 					       frames, NULL);
 		if (retval < 0) {
-			ext4_warning(sb, __func__,
+			ext4_warning(sb,
 			     "error reading index page in directory #%lu",
 			     dir->i_ino);
 			*err = retval;
@@ -1071,14 +1066,13 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 		__u32 ino = le32_to_cpu(de->inode);
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
-			ext4_error(dir->i_sb, "ext4_lookup",
-				   "bad inode number: %u", ino);
+			ext4_error(dir->i_sb, "bad inode number: %u", ino);
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
 		if (unlikely(IS_ERR(inode))) {
 			if (PTR_ERR(inode) == -ESTALE) {
-				ext4_error(dir->i_sb, __func__,
+				ext4_error(dir->i_sb,
 						"deleted inode referenced: %u",
 						ino);
 				return ERR_PTR(-EIO);
@@ -1110,7 +1104,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	brelse(bh);
 
 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
-		ext4_error(child->d_inode->i_sb, "ext4_get_parent",
+		ext4_error(child->d_inode->i_sb,
 			   "bad inode number: %u", ino);
 		return ERR_PTR(-EIO);
 	}
@@ -1410,7 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
 		ext4_rec_len_from_disk(fde->rec_len, blocksize));
 	if ((char *) de >= (((char *) root) + blocksize)) {
-		ext4_error(dir->i_sb, __func__,
+		ext4_error(dir->i_sb,
 			   "invalid rec_len for '..' in inode %lu",
 			   dir->i_ino);
 		brelse(bh);
@@ -1575,8 +1569,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 
 		if (levels && (dx_get_count(frames->entries) ==
 			       dx_get_limit(frames->entries))) {
-			ext4_warning(sb, __func__,
-				     "Directory index full!");
+			ext4_warning(sb, "Directory index full!");
 			err = -ENOSPC;
 			goto cleanup;
 		}
@@ -1916,11 +1909,11 @@ static int empty_dir(struct inode *inode)
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
 	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
-			ext4_error(inode->i_sb, __func__,
+			ext4_error(inode->i_sb,
 				   "error %d reading directory #%lu offset 0",
 				   err, inode->i_ino);
 		else
-			ext4_warning(inode->i_sb, __func__,
+			ext4_warning(inode->i_sb,
 				     "bad directory (dir #%lu) - no data block",
 				     inode->i_ino);
 		return 1;
@@ -1931,7 +1924,7 @@ static int empty_dir(struct inode *inode)
 			!le32_to_cpu(de1->inode) ||
 			strcmp(".", de->name) ||
 			strcmp("..", de1->name)) {
-		ext4_warning(inode->i_sb, "empty_dir",
+		ext4_warning(inode->i_sb,
 			     "bad directory (dir #%lu) - no `.' or `..'",
 			     inode->i_ino);
 		brelse(bh);
@@ -1949,7 +1942,7 @@ static int empty_dir(struct inode *inode)
 				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
 			if (!bh) {
 				if (err)
-					ext4_error(sb, __func__,
+					ext4_error(sb,
 						   "error %d reading directory"
 						   " #%lu offset %u",
 						   err, inode->i_ino, offset);
@@ -2163,7 +2156,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	if (retval)
 		goto end_rmdir;
 	if (!EXT4_DIR_LINK_EMPTY(inode))
-		ext4_warning(inode->i_sb, "ext4_rmdir",
+		ext4_warning(inode->i_sb,
 			     "empty directory has too many links (%d)",
 			     inode->i_nlink);
 	inode->i_version++;
@@ -2215,7 +2208,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
-		ext4_warning(inode->i_sb, "ext4_unlink",
+		ext4_warning(inode->i_sb,
 			     "Deleting nonexistent file (%lu), %d",
 			     inode->i_ino, inode->i_nlink);
 		inode->i_nlink = 1;
@@ -2462,7 +2455,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 	}
 	if (retval) {
-		ext4_warning(old_dir->i_sb, "ext4_rename",
+		ext4_warning(old_dir->i_sb,
 				"Deleting old file (%lu), %d, error=%d",
 				old_dir->i_ino, old_dir->i_nlink, retval);
 	}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3b2c5541d8a6..5692c48754a0 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -48,65 +48,54 @@ static int verify_group_input(struct super_block *sb,
 
 	ext4_get_group_no_and_offset(sb, start, NULL, &offset);
 	if (group != sbi->s_groups_count)
-		ext4_warning(sb, __func__,
-			     "Cannot add at group %u (only %u groups)",
+		ext4_warning(sb, "Cannot add at group %u (only %u groups)",
 			     input->group, sbi->s_groups_count);
 	else if (offset != 0)
-			ext4_warning(sb, __func__, "Last group not full");
+			ext4_warning(sb, "Last group not full");
 	else if (input->reserved_blocks > input->blocks_count / 5)
-		ext4_warning(sb, __func__, "Reserved blocks too high (%u)",
+		ext4_warning(sb, "Reserved blocks too high (%u)",
 			     input->reserved_blocks);
 	else if (free_blocks_count < 0)
-		ext4_warning(sb, __func__, "Bad blocks count %u",
+		ext4_warning(sb, "Bad blocks count %u",
 			     input->blocks_count);
 	else if (!(bh = sb_bread(sb, end - 1)))
-		ext4_warning(sb, __func__,
-			     "Cannot read last block (%llu)",
+		ext4_warning(sb, "Cannot read last block (%llu)",
 			     end - 1);
 	else if (outside(input->block_bitmap, start, end))
-		ext4_warning(sb, __func__,
-			     "Block bitmap not in group (block %llu)",
+		ext4_warning(sb, "Block bitmap not in group (block %llu)",
 			     (unsigned long long)input->block_bitmap);
 	else if (outside(input->inode_bitmap, start, end))
-		ext4_warning(sb, __func__,
-			     "Inode bitmap not in group (block %llu)",
+		ext4_warning(sb, "Inode bitmap not in group (block %llu)",
 			     (unsigned long long)input->inode_bitmap);
 	else if (outside(input->inode_table, start, end) ||
 		 outside(itend - 1, start, end))
-		ext4_warning(sb, __func__,
-			     "Inode table not in group (blocks %llu-%llu)",
+		ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)",
 			     (unsigned long long)input->inode_table, itend - 1);
 	else if (input->inode_bitmap == input->block_bitmap)
-		ext4_warning(sb, __func__,
-			     "Block bitmap same as inode bitmap (%llu)",
+		ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)",
 			     (unsigned long long)input->block_bitmap);
 	else if (inside(input->block_bitmap, input->inode_table, itend))
-		ext4_warning(sb, __func__,
-			     "Block bitmap (%llu) in inode table (%llu-%llu)",
+		ext4_warning(sb, "Block bitmap (%llu) in inode table "
+			     "(%llu-%llu)",
 			     (unsigned long long)input->block_bitmap,
 			     (unsigned long long)input->inode_table, itend - 1);
 	else if (inside(input->inode_bitmap, input->inode_table, itend))
-		ext4_warning(sb, __func__,
-			     "Inode bitmap (%llu) in inode table (%llu-%llu)",
+		ext4_warning(sb, "Inode bitmap (%llu) in inode table "
+			     "(%llu-%llu)",
 			     (unsigned long long)input->inode_bitmap,
 			     (unsigned long long)input->inode_table, itend - 1);
 	else if (inside(input->block_bitmap, start, metaend))
-		ext4_warning(sb, __func__,
-			     "Block bitmap (%llu) in GDT table"
-			     " (%llu-%llu)",
+		ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)",
 			     (unsigned long long)input->block_bitmap,
 			     start, metaend - 1);
 	else if (inside(input->inode_bitmap, start, metaend))
-		ext4_warning(sb, __func__,
-			     "Inode bitmap (%llu) in GDT table"
-			     " (%llu-%llu)",
+		ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)",
 			     (unsigned long long)input->inode_bitmap,
 			     start, metaend - 1);
 	else if (inside(input->inode_table, start, metaend) ||
 		 inside(itend - 1, start, metaend))
-		ext4_warning(sb, __func__,
-			     "Inode table (%llu-%llu) overlaps"
-			     "GDT table (%llu-%llu)",
+		ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table "
+			     "(%llu-%llu)",
 			     (unsigned long long)input->inode_table,
 			     itend - 1, start, metaend - 1);
 	else
@@ -364,8 +353,7 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) !=
 		    grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
-			ext4_warning(sb, __func__,
-				     "reserved GDT %llu"
+			ext4_warning(sb, "reserved GDT %llu"
 				     " missing grp %d (%llu)",
 				     blk, grp,
 				     grp *
@@ -420,8 +408,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
          */
 	if (EXT4_SB(sb)->s_sbh->b_blocknr !=
 	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
-		ext4_warning(sb, __func__,
-			"won't resize using backup superblock at %llu",
+		ext4_warning(sb, "won't resize using backup superblock at %llu",
 			(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
 		return -EPERM;
 	}
@@ -444,8 +431,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 
 	data = (__le32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
-		ext4_warning(sb, __func__,
-			     "new group %u GDT block %llu not reserved",
+		ext4_warning(sb, "new group %u GDT block %llu not reserved",
 			     input->group, gdblock);
 		err = -EINVAL;
 		goto exit_dind;
@@ -468,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 			GFP_NOFS);
 	if (!n_group_desc) {
 		err = -ENOMEM;
-		ext4_warning(sb, __func__,
+		ext4_warning(sb,
 			      "not enough memory for %lu groups", gdb_num + 1);
 		goto exit_inode;
 	}
@@ -567,8 +553,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	/* Get each reserved primary GDT block and verify it holds backups */
 	for (res = 0; res < reserved_gdb; res++, blk++) {
 		if (le32_to_cpu(*data) != blk) {
-			ext4_warning(sb, __func__,
-				     "reserved block %llu"
+			ext4_warning(sb, "reserved block %llu"
 				     " not at offset %ld",
 				     blk,
 				     (long)(data - (__le32 *)dind->b_data));
@@ -713,8 +698,7 @@ static void update_backups(struct super_block *sb,
 	 */
 exit_err:
 	if (err) {
-		ext4_warning(sb, __func__,
-			     "can't update backup for group %u (err %d), "
+		ext4_warning(sb, "can't update backup for group %u (err %d), "
 			     "forcing fsck on next reboot", group, err);
 		sbi->s_mount_state &= ~EXT4_VALID_FS;
 		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -753,20 +737,19 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
 					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
-		ext4_warning(sb, __func__,
-			     "Can't resize non-sparse filesystem further");
+		ext4_warning(sb, "Can't resize non-sparse filesystem further");
 		return -EPERM;
 	}
 
 	if (ext4_blocks_count(es) + input->blocks_count <
 	    ext4_blocks_count(es)) {
-		ext4_warning(sb, __func__, "blocks_count overflow");
+		ext4_warning(sb, "blocks_count overflow");
 		return -EINVAL;
 	}
 
 	if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
 	    le32_to_cpu(es->s_inodes_count)) {
-		ext4_warning(sb, __func__, "inodes_count overflow");
+		ext4_warning(sb, "inodes_count overflow");
 		return -EINVAL;
 	}
 
@@ -774,14 +757,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 		if (!EXT4_HAS_COMPAT_FEATURE(sb,
 					     EXT4_FEATURE_COMPAT_RESIZE_INODE)
 		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
-			ext4_warning(sb, __func__,
+			ext4_warning(sb,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
 		}
 		inode = ext4_iget(sb, EXT4_RESIZE_INO);
 		if (IS_ERR(inode)) {
-			ext4_warning(sb, __func__,
-				     "Error opening resize inode");
+			ext4_warning(sb, "Error opening resize inode");
 			return PTR_ERR(inode);
 		}
 	}
@@ -810,8 +792,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
-		ext4_warning(sb, __func__,
-			     "multiple resizers run on filesystem!");
+		ext4_warning(sb, "multiple resizers run on filesystem!");
 		err = -EBUSY;
 		goto exit_journal;
 	}
@@ -997,13 +978,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			" too large to resize to %llu blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
+			ext4_warning(sb, "CONFIG_LBDAF not enabled");
 		return -EINVAL;
 	}
 
 	if (n_blocks_count < o_blocks_count) {
-		ext4_warning(sb, __func__,
-			     "can't shrink FS - resize aborted");
+		ext4_warning(sb, "can't shrink FS - resize aborted");
 		return -EBUSY;
 	}
 
@@ -1011,15 +991,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
 
 	if (last == 0) {
-		ext4_warning(sb, __func__,
-			     "need to use ext2online to resize further");
+		ext4_warning(sb, "need to use ext2online to resize further");
 		return -EPERM;
 	}
 
 	add = EXT4_BLOCKS_PER_GROUP(sb) - last;
 
 	if (o_blocks_count + add < o_blocks_count) {
-		ext4_warning(sb, __func__, "blocks_count overflow");
+		ext4_warning(sb, "blocks_count overflow");
 		return -EINVAL;
 	}
 
@@ -1027,16 +1006,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		add = n_blocks_count - o_blocks_count;
 
 	if (o_blocks_count + add < n_blocks_count)
-		ext4_warning(sb, __func__,
-			     "will only finish group (%llu"
-			     " blocks, %u new)",
+		ext4_warning(sb, "will only finish group (%llu blocks, %u new)",
 			     o_blocks_count + add, add);
 
 	/* See if the device is actually as big as what was requested */
 	bh = sb_bread(sb, o_blocks_count + add - 1);
 	if (!bh) {
-		ext4_warning(sb, __func__,
-			     "can't read last block, resize aborted");
+		ext4_warning(sb, "can't read last block, resize aborted");
 		return -ENOSPC;
 	}
 	brelse(bh);
@@ -1047,14 +1023,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	handle = ext4_journal_start_sb(sb, 3);
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
-		ext4_warning(sb, __func__, "error %d on journal start", err);
+		ext4_warning(sb, "error %d on journal start", err);
 		goto exit_put;
 	}
 
 	mutex_lock(&EXT4_SB(sb)->s_resize_lock);
 	if (o_blocks_count != ext4_blocks_count(es)) {
-		ext4_warning(sb, __func__,
-			     "multiple resizers run on filesystem!");
+		ext4_warning(sb, "multiple resizers run on filesystem!");
 		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		err = -EBUSY;
@@ -1063,8 +1038,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 
 	if ((err = ext4_journal_get_write_access(handle,
 						 EXT4_SB(sb)->s_sbh))) {
-		ext4_warning(sb, __func__,
-			     "error %d on journal write access", err);
+		ext4_warning(sb, "error %d on journal write access", err);
 		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		goto exit_put;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 68a55dffb360..1c85bb67e6eb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -333,7 +333,7 @@ static void ext4_handle_error(struct super_block *sb)
 			sb->s_id);
 }
 
-void ext4_error(struct super_block *sb, const char *function,
+void __ext4_error(struct super_block *sb, const char *function,
 		const char *fmt, ...)
 {
 	va_list args;
@@ -450,7 +450,7 @@ void ext4_msg (struct super_block * sb, const char *prefix,
 	va_end(args);
 }
 
-void ext4_warning(struct super_block *sb, const char *function,
+void __ext4_warning(struct super_block *sb, const char *function,
 		  const char *fmt, ...)
 {
 	va_list args;
@@ -507,7 +507,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
 		return;
 
-	ext4_warning(sb, __func__,
+	ext4_warning(sb,
 		     "updating to rev %d because of new feature flag, "
 		     "running e2fsck is recommended",
 		     EXT4_DYNAMIC_REV);
@@ -3367,10 +3367,9 @@ static void ext4_clear_journal_err(struct super_block *sb,
 		char nbuf[16];
 
 		errstr = ext4_decode_error(sb, j_errno, nbuf);
-		ext4_warning(sb, __func__, "Filesystem error recorded "
+		ext4_warning(sb, "Filesystem error recorded "
 			     "from previous mount: %s", errstr);
-		ext4_warning(sb, __func__, "Marking fs in need of "
-			     "filesystem check.");
+		ext4_warning(sb, "Marking fs in need of filesystem check.");
 
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c619a7ea670d..627c98abbed9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -227,7 +227,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-bad_block:	ext4_error(inode->i_sb, __func__,
+bad_block:
+		ext4_error(inode->i_sb,
 			   "inode %lu: bad block %llu", inode->i_ino,
 			   EXT4_I(inode)->i_file_acl);
 		error = -EIO;
@@ -371,7 +372,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-		ext4_error(inode->i_sb, __func__,
+		ext4_error(inode->i_sb,
 			   "inode %lu: bad block %llu", inode->i_ino,
 			   EXT4_I(inode)->i_file_acl);
 		error = -EIO;
@@ -665,9 +666,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext4_xattr_check_block(bs->bh)) {
-			ext4_error(sb, __func__,
-				"inode %lu: bad block %llu", inode->i_ino,
-				EXT4_I(inode)->i_file_acl);
+			ext4_error(sb, "inode %lu: bad block %llu",
+				   inode->i_ino, EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -880,9 +880,8 @@ cleanup_dquot:
 	goto cleanup;
 
 bad_block:
-	ext4_error(inode->i_sb, __func__,
-		   "inode %lu: bad block %llu", inode->i_ino,
-		   EXT4_I(inode)->i_file_acl);
+	ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+		   inode->i_ino, EXT4_I(inode)->i_file_acl);
 	goto cleanup;
 
 #undef header
@@ -1195,9 +1194,8 @@ retry:
 		if (!bh)
 			goto cleanup;
 		if (ext4_xattr_check_block(bh)) {
-			ext4_error(inode->i_sb, __func__,
-				"inode %lu: bad block %llu", inode->i_ino,
-				EXT4_I(inode)->i_file_acl);
+			ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+				   inode->i_ino, EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -1372,16 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
-		ext4_error(inode->i_sb, __func__,
-			"inode %lu: block %llu read error", inode->i_ino,
-			EXT4_I(inode)->i_file_acl);
+		ext4_error(inode->i_sb, "inode %lu: block %llu read error",
+			   inode->i_ino, EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		ext4_error(inode->i_sb, __func__,
-			"inode %lu: bad block %llu", inode->i_ino,
-			EXT4_I(inode)->i_file_acl);
+		ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+			   inode->i_ino, EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	ext4_xattr_release_block(handle, inode, bh);
@@ -1506,7 +1502,7 @@ again:
 		}
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
-			ext4_error(inode->i_sb, __func__,
+			ext4_error(inode->i_sb,
 				"inode %lu: block %lu read error",
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
-- 
cgit v1.2.3


From 9aaab0589baa61d637a52badddbff2d74f35a955 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Mon, 15 Feb 2010 14:26:16 -0500
Subject: ext4: add missing error checking to ext4_expand_extra_isize_ea()

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
---
 fs/ext4/xattr.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 627c98abbed9..efc16a4b7ceb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1300,6 +1300,8 @@ retry:
 
 		/* Remove the chosen entry from the inode */
 		error = ext4_xattr_ibody_set(handle, inode, &i, is);
+		if (error)
+			goto cleanup;
 
 		entry = IFIRST(header);
 		if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
-- 
cgit v1.2.3


From aca92ff6f57c000d1b4523e383c8bd6b8269b8b1 Mon Sep 17 00:00:00 2001
From: Leonard Michlmayr <leonard.michlmayr@gmail.com>
Date: Thu, 4 Mar 2010 17:07:28 -0500
Subject: ext4: correctly calculate number of blocks for fiemap

ext4_fiemap() rounds the length of the requested range down to
blocksize, which is is not the true number of blocks that cover the
requested region.  This problem is especially impressive if the user
requests only the first byte of a file: not a single extent will be
reported.

We fix this by calculating the last block of the region and then
subtract to find the number of blocks in the extents.

Signed-off-by: Leonard Michlmayr <leonard.michlmayr@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bd808915ad2f..7d54850f7136 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3768,7 +3768,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
 	ext4_lblk_t start_blk;
-	ext4_lblk_t len_blks;
 	int error = 0;
 
 	/* fallback to generic here if not in extents fmt */
@@ -3782,8 +3781,14 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
 		error = ext4_xattr_fiemap(inode, fieinfo);
 	} else {
+		ext4_lblk_t len_blks;
+		__u64 last_blk;
+
 		start_blk = start >> inode->i_sb->s_blocksize_bits;
-		len_blks = len >> inode->i_sb->s_blocksize_bits;
+		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+		if (last_blk >= EXT_MAX_BLOCK)
+			last_blk = EXT_MAX_BLOCK-1;
+		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
 
 		/*
 		 * Walk the extent tree gathering extent information.
-- 
cgit v1.2.3


From ba869023eac8354b17acdcff82b851ea8e7b1809 Mon Sep 17 00:00:00 2001
From: dingdinghua <dingdinghua@nrchpc.ac.cn>
Date: Mon, 15 Feb 2010 16:35:42 -0500
Subject: jbd2: delay discarding buffers in journal_unmap_buffer

Delay discarding buffers in journal_unmap_buffer until
we know that "add to orphan" operation has definitely been
committed, otherwise the log space of committing transation
may be freed and reused before truncate get committed, updates
may get lost if crash happens.

Signed-off-by: dingdinghua <dingdinghua@nrchpc.ac.cn>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c      | 10 +++++-----
 fs/jbd2/transaction.c | 43 +++++++++++++++++++++++++++++++------------
 2 files changed, 36 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1bc74b6f26d2..3ee211ed58f1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -930,12 +930,12 @@ restart_loop:
 		/* A buffer which has been freed while still being
 		 * journaled by a previous transaction may end up still
 		 * being dirty here, but we want to avoid writing back
-		 * that buffer in the future now that the last use has
-		 * been committed.  That's not only a performance gain,
-		 * it also stops aliasing problems if the buffer is left
-		 * behind for writeback and gets reallocated for another
+		 * that buffer in the future after the "add to orphan"
+		 * operation been committed,  That's not only a performance
+		 * gain, it also stops aliasing problems if the buffer is
+		 * left behind for writeback and gets reallocated for another
 		 * use in a different page. */
-		if (buffer_freed(bh)) {
+		if (buffer_freed(bh) && !jh->b_next_transaction) {
 			clear_buffer_freed(bh);
 			clear_buffer_jbddirty(bh);
 		}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0512700542f..bfc70f57900f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1727,6 +1727,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	if (!jh)
 		goto zap_buffer_no_jh;
 
+	/*
+	 * We cannot remove the buffer from checkpoint lists until the
+	 * transaction adding inode to orphan list (let's call it T)
+	 * is committed.  Otherwise if the transaction changing the
+	 * buffer would be cleaned from the journal before T is
+	 * committed, a crash will cause that the correct contents of
+	 * the buffer will be lost.  On the other hand we have to
+	 * clear the buffer dirty bit at latest at the moment when the
+	 * transaction marking the buffer as freed in the filesystem
+	 * structures is committed because from that moment on the
+	 * buffer can be reallocated and used by a different page.
+	 * Since the block hasn't been freed yet but the inode has
+	 * already been added to orphan list, it is safe for us to add
+	 * the buffer to BJ_Forget list of the newest transaction.
+	 */
 	transaction = jh->b_transaction;
 	if (transaction == NULL) {
 		/* First case: not on any transaction.  If it
@@ -1783,16 +1798,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 	} else if (transaction == journal->j_committing_transaction) {
 		JBUFFER_TRACE(jh, "on committing transaction");
 		/*
-		 * If it is committing, we simply cannot touch it.  We
-		 * can remove it's next_transaction pointer from the
-		 * running transaction if that is set, but nothing
-		 * else. */
+		 * The buffer is committing, we simply cannot touch
+		 * it. So we just set j_next_transaction to the
+		 * running transaction (if there is one) and mark
+		 * buffer as freed so that commit code knows it should
+		 * clear dirty bits when it is done with the buffer.
+		 */
 		set_buffer_freed(bh);
-		if (jh->b_next_transaction) {
-			J_ASSERT(jh->b_next_transaction ==
-					journal->j_running_transaction);
-			jh->b_next_transaction = NULL;
-		}
+		if (journal->j_running_transaction && buffer_jbddirty(bh))
+			jh->b_next_transaction = journal->j_running_transaction;
 		jbd2_journal_put_journal_head(jh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
@@ -1969,7 +1983,7 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
  */
 void __jbd2_journal_refile_buffer(struct journal_head *jh)
 {
-	int was_dirty;
+	int was_dirty, jlist;
 	struct buffer_head *bh = jh2bh(jh);
 
 	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -1991,8 +2005,13 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
 	__jbd2_journal_temp_unlink_buffer(jh);
 	jh->b_transaction = jh->b_next_transaction;
 	jh->b_next_transaction = NULL;
-	__jbd2_journal_file_buffer(jh, jh->b_transaction,
-				jh->b_modified ? BJ_Metadata : BJ_Reserved);
+	if (buffer_freed(bh))
+		jlist = BJ_Forget;
+	else if (jh->b_modified)
+		jlist = BJ_Metadata;
+	else
+		jlist = BJ_Reserved;
+	__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
 	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
 
 	if (was_dirty)
-- 
cgit v1.2.3


From 65d269538a1129495ac45a14a777cd11cfe881d8 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 15 Feb 2010 12:19:53 -0500
Subject: NFS: Too many GETATTR and ACCESS calls after direct I/O

The cached read and write paths initialize fattr->time_start in their
setup procedures.  The value of fattr->time_start is propagated to
read_cache_jiffies by nfs_update_inode().  Subsequent calls to
nfs_attribute_timeout() will then use a good time stamp when
computing the attribute cache timeout, and squelch unneeded GETATTR
calls.

Since the direct I/O paths erroneously leave the inode's
fattr->time_start field set to zero, read_cache_jiffies for that inode
is set to zero after any direct read or write operation.  This
triggers an otw GETATTR or ACCESS call to update the file's attribute
and access caches properly, even when the NFS READ or WRITE replies
have usable post-op attributes.

Make sure the direct read and write setup code performs the same fattr
initialization as the cached I/O paths to prevent unnecessary GETATTR
calls.

This was likely introduced by commit 0e574af1 in 2.6.15, which appears
to add new nfs_fattr_init() call sites in the cached read and write
paths, but not in the equivalent places in fs/nfs/direct.c.  A
subsequent commit in the same series, 33801147, introduces the
fattr->time_start field.

Interestingly, the direct write reschedule path already has a call to
nfs_fattr_init() in the right place.

Reported-by: Quentin Barnes <qbarnes@yahoo-inc.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/direct.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e1d415e97849..0d289823e856 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -342,6 +342,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
 		data->res.count = bytes;
+		nfs_fattr_init(&data->fattr);
 		msg.rpc_argp = &data->args;
 		msg.rpc_resp = &data->res;
 
@@ -575,6 +576,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
+	nfs_fattr_init(&data->fattr);
 
 	NFS_PROTO(data->inode)->commit_setup(data, &msg);
 
@@ -766,6 +768,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
 		data->res.verf = &data->verf;
+		nfs_fattr_init(&data->fattr);
 
 		task_setup_data.task = &data->task;
 		task_setup_data.callback_data = data;
-- 
cgit v1.2.3


From 73b50c1c92666d326b5fa2c945d46509f2f6d91f Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Tue, 16 Feb 2010 15:06:29 -0500
Subject: ext4: Fix BUG_ON at fs/buffer.c:652 in no journal mode

Calls to ext4_handle_dirty_metadata should only pass in an inode
pointer for inode-specific metadata, and not for shared metadata
blocks such as inode table blocks, block group descriptors, the
superblock, etc.

The BUG_ON can get tripped when updating a special device (such as a
block device) that is opened (so that i_mapping is set in
fs/block_dev.c) and the file system is mounted in no journal mode.

Addresses-Google-Bug: #2404870

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.c | 2 +-
 fs/ext4/ialloc.c    | 2 +-
 fs/ext4/inode.c     | 6 +++---
 fs/ext4/namei.c     | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 2f407c487e6b..53d2764d71ca 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -125,7 +125,7 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 			ext4_journal_abort_handle(where, __func__, bh,
 						  handle, err);
 	} else {
-		if (inode && bh)
+		if (inode)
 			mark_buffer_dirty_inode(bh, inode);
 		else
 			mark_buffer_dirty(bh);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e4aaf619b56d..004c9da9e5c6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -898,7 +898,7 @@ repeat_in_this_group:
 				BUFFER_TRACE(inode_bitmap_bh,
 					"call ext4_handle_dirty_metadata");
 				err = ext4_handle_dirty_metadata(handle,
-								 inode,
+								 NULL,
 							inode_bitmap_bh);
 				if (err)
 					goto fail;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 536067bcf75b..ecac8c5a6f5c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5120,7 +5120,7 @@ static int ext4_do_update_inode(handle_t *handle,
 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 			sb->s_dirt = 1;
 			ext4_handle_sync(handle);
-			err = ext4_handle_dirty_metadata(handle, inode,
+			err = ext4_handle_dirty_metadata(handle, NULL,
 					EXT4_SB(sb)->s_sbh);
 		}
 	}
@@ -5149,7 +5149,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	}
 
 	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-	rc = ext4_handle_dirty_metadata(handle, inode, bh);
+	rc = ext4_handle_dirty_metadata(handle, NULL, bh);
 	if (!err)
 		err = rc;
 	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
@@ -5701,7 +5701,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
 			err = jbd2_journal_get_write_access(handle, iloc.bh);
 			if (!err)
 				err = ext4_handle_dirty_metadata(handle,
-								 inode,
+								 NULL,
 								 iloc.bh);
 			brelse(iloc.bh);
 		}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index bd2dc0b71c8c..a13948f8242f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2017,7 +2017,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	/* Insert this inode at the head of the on-disk orphan list... */
 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
 	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-	err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
+	err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	if (!err)
 		err = rc;
@@ -2089,7 +2089,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 		if (err)
 			goto out_brelse;
 		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-		err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
+		err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
 	} else {
 		struct ext4_iloc iloc2;
 		struct inode *i_prev =
-- 
cgit v1.2.3


From 7c0ff870d1ed287504a61ed865f3d728c757436b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 3 Feb 2010 23:13:24 -0800
Subject: sysfs: sysfs_sd_setattr set iattrs unconditionally

There is currently a bug in sysfs_sd_setattr inherited from
sysfs_setattr in 2.6.32 where the first time we set the attributes
on a sysfs file we allocate backing store but do not set the
backing store attributes.  Resulting in overly restrictive
permissions on sysfs files.

The fix is to simply modify the code so that it always executes
when we update the sysfs attributes, as we did in 2.6.31 and earlier.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Tested-by: Jean Delvare <khali@linux-fr.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/inode.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 220b758523ae..6a06a1d1ea7b 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -81,24 +81,23 @@ int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr * iattr)
 		if (!sd_attrs)
 			return -ENOMEM;
 		sd->s_iattr = sd_attrs;
-	} else {
-		/* attributes were changed at least once in past */
-		iattrs = &sd_attrs->ia_iattr;
-
-		if (ia_valid & ATTR_UID)
-			iattrs->ia_uid = iattr->ia_uid;
-		if (ia_valid & ATTR_GID)
-			iattrs->ia_gid = iattr->ia_gid;
-		if (ia_valid & ATTR_ATIME)
-			iattrs->ia_atime = iattr->ia_atime;
-		if (ia_valid & ATTR_MTIME)
-			iattrs->ia_mtime = iattr->ia_mtime;
-		if (ia_valid & ATTR_CTIME)
-			iattrs->ia_ctime = iattr->ia_ctime;
-		if (ia_valid & ATTR_MODE) {
-			umode_t mode = iattr->ia_mode;
-			iattrs->ia_mode = sd->s_mode = mode;
-		}
+	}
+	/* attributes were changed at least once in past */
+	iattrs = &sd_attrs->ia_iattr;
+
+	if (ia_valid & ATTR_UID)
+		iattrs->ia_uid = iattr->ia_uid;
+	if (ia_valid & ATTR_GID)
+		iattrs->ia_gid = iattr->ia_gid;
+	if (ia_valid & ATTR_ATIME)
+		iattrs->ia_atime = iattr->ia_atime;
+	if (ia_valid & ATTR_MTIME)
+		iattrs->ia_mtime = iattr->ia_mtime;
+	if (ia_valid & ATTR_CTIME)
+		iattrs->ia_ctime = iattr->ia_ctime;
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = iattr->ia_mode;
+		iattrs->ia_mode = sd->s_mode = mode;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 003cb608a2533d0927a83bc4e07e46d7a622eda9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 2 Feb 2010 14:39:01 +0900
Subject: percpu: add __percpu sparse annotations to fs

Add __percpu sparse annotations to fs.

These annotations are to make sparse consider percpu variables to be
in a different address space and warn if accessed without going
through percpu accessors.  This patch doesn't affect normal builds.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Alex Elder <aelder@sgi.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/ext4.h            | 2 +-
 fs/nfs/iostat.h           | 4 ++--
 fs/xfs/xfs_mount.h        | 2 +-
 include/linux/mount.h     | 2 +-
 include/linux/nfs_fs_sb.h | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 874d169a193e..4cedc91ec59d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1014,7 +1014,7 @@ struct ext4_sb_info {
 	atomic_t s_lock_busy;
 
 	/* locality groups */
-	struct ext4_locality_group *s_locality_groups;
+	struct ext4_locality_group __percpu *s_locality_groups;
 
 	/* for write statistics */
 	unsigned long s_sectors_written_start;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 46d779abafd3..1d8d5c813b01 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -57,12 +57,12 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
 }
 #endif
 
-static inline struct nfs_iostats *nfs_alloc_iostats(void)
+static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
 {
 	return alloc_percpu(struct nfs_iostats);
 }
 
-static inline void nfs_free_iostats(struct nfs_iostats *stats)
+static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
 {
 	if (stats != NULL)
 		free_percpu(stats);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1df7e4502967..24c88870cdb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -243,7 +243,7 @@ typedef struct xfs_mount {
 	struct xfs_qmops	*m_qm_ops;	/* vector of XQM ops */
 	atomic_t		m_active_trans;	/* number trans frozen */
 #ifdef HAVE_PERCPU_SB
-	xfs_icsb_cnts_t		*m_sb_cnts;	/* per-cpu superblock counters */
+	xfs_icsb_cnts_t __percpu *m_sb_cnts;	/* per-cpu superblock counters */
 	unsigned long		m_icsb_counters; /* disabled per-cpu counters */
 	struct notifier_block	m_icsb_notifier; /* hotplug cpu notifier */
 	struct mutex		m_icsb_mutex;	/* balancer sync lock */
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5d5275364867..b5f43a34ef88 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -66,7 +66,7 @@ struct vfsmount {
 	int mnt_pinned;
 	int mnt_ghosts;
 #ifdef CONFIG_SMP
-	int *mnt_writers;
+	int __percpu *mnt_writers;
 #else
 	int mnt_writers;
 #endif
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 34fc6be5bfcf..6a2e44fd75e2 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -105,7 +105,7 @@ struct nfs_server {
 	struct rpc_clnt *	client;		/* RPC client handle */
 	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
 	struct nlm_host		*nlm_host;	/* NLM client handle */
-	struct nfs_iostats *	io_stats;	/* I/O statistics */
+	struct nfs_iostats __percpu *io_stats;	/* I/O statistics */
 	struct backing_dev_info	backing_dev_info;
 	atomic_long_t		writeback;	/* number of writeback pages */
 	int			flags;		/* various flags */
-- 
cgit v1.2.3


From 03f29365e84ff6d651be4e6186e0400ca59da6cd Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Thu, 18 Feb 2010 19:11:35 +0900
Subject: nilfs2: delete unnecessary condition in load_segment_summary

This is a trivial patch to remove unnecessary condition.

load_segment_summary() checks crc of segment_summary OR crc of whole
log data blocks based on boolean argument full_check.  However,
callers of the function pass only 1 as full_check, which means only
whole log data blocks checking code is running all the time.

This patch deletes the condition and full_check argument and also
deletes enum 'NILFS_SEG_FAIL_CHECKSUM_SEGSUM' and corresponding case
clause, for it is nolonger used anymore.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/recovery.c | 41 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index c9c96c7825dc..017bedc761a0 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -39,7 +39,6 @@ enum {
 	NILFS_SEG_FAIL_IO,
 	NILFS_SEG_FAIL_MAGIC,
 	NILFS_SEG_FAIL_SEQ,
-	NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
 	NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
 	NILFS_SEG_FAIL_CHECKSUM_FULL,
 	NILFS_SEG_FAIL_CONSISTENCY,
@@ -71,10 +70,6 @@ static int nilfs_warn_segment_error(int err)
 		printk(KERN_WARNING
 		       "NILFS warning: Sequence number mismatch\n");
 		break;
-	case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
-		printk(KERN_WARNING
-		       "NILFS warning: Checksum error in segment summary\n");
-		break;
 	case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
 		printk(KERN_WARNING
 		       "NILFS warning: Checksum error in super root\n");
@@ -206,19 +201,15 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
  * @pseg_start: start disk block number of partial segment
  * @seg_seq: sequence number requested
  * @ssi: pointer to nilfs_segsum_info struct to store information
- * @full_check: full check flag
- *              (0: only checks segment summary CRC, 1: data CRC)
  */
 static int
 load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
-		     u64 seg_seq, struct nilfs_segsum_info *ssi,
-		     int full_check)
+		     u64 seg_seq, struct nilfs_segsum_info *ssi)
 {
 	struct buffer_head *bh_sum;
 	struct nilfs_segment_summary *sum;
-	unsigned long offset, nblock;
-	u64 check_bytes;
-	u32 crc, crc_sum;
+	unsigned long nblock;
+	u32 crc;
 	int ret = NILFS_SEG_FAIL_IO;
 
 	bh_sum = sb_bread(sbi->s_super, pseg_start);
@@ -237,34 +228,24 @@ load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
 		ret = NILFS_SEG_FAIL_SEQ;
 		goto failed;
 	}
-	if (full_check) {
-		offset = sizeof(sum->ss_datasum);
-		check_bytes =
-			((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
-		nblock = ssi->nblocks;
-		crc_sum = le32_to_cpu(sum->ss_datasum);
-		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
-	} else { /* only checks segment summary */
-		offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
-		check_bytes = ssi->sumbytes;
-		nblock = ssi->nsumblk;
-		crc_sum = le32_to_cpu(sum->ss_sumsum);
-		ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
-	}
 
+	nblock = ssi->nblocks;
 	if (unlikely(nblock == 0 ||
 		     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
 		/* This limits the number of blocks read in the CRC check */
 		ret = NILFS_SEG_FAIL_CONSISTENCY;
 		goto failed;
 	}
-	if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
+	if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
+			  ((u64)nblock << sbi->s_super->s_blocksize_bits),
 			  pseg_start, nblock)) {
 		ret = NILFS_SEG_FAIL_IO;
 		goto failed;
 	}
-	if (crc == crc_sum)
+	if (crc == le32_to_cpu(sum->ss_datasum))
 		ret = 0;
+	else
+		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
  failed:
 	brelse(bh_sum);
  out:
@@ -598,7 +579,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 
 	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
 
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
+		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO) {
 				err = -EIO;
@@ -821,7 +802,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 
 	for (;;) {
 		/* Load segment summary */
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
+		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO)
 				goto failed;
-- 
cgit v1.2.3


From c44dcc56d2b5c79ba3063d20f76e5347e2e418f6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 11 Feb 2010 02:24:46 -0500
Subject: switch inotify_user to anon_inode

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/notify/inotify/inotify_user.c | 59 ++++------------------------------------
 include/linux/magic.h            |  1 -
 2 files changed, 6 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a94e8bd8eb1f..472cdf29ef82 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -29,14 +29,12 @@
 #include <linux/init.h> /* module_init */
 #include <linux/inotify.h>
 #include <linux/kernel.h> /* roundup() */
-#include <linux/magic.h> /* superblock magic number */
-#include <linux/mount.h> /* mntget */
 #include <linux/namei.h> /* LOOKUP_FOLLOW */
-#include <linux/path.h> /* struct path */
 #include <linux/sched.h> /* struct user */
 #include <linux/slab.h> /* struct kmem_cache */
 #include <linux/syscalls.h>
 #include <linux/types.h>
+#include <linux/anon_inodes.h>
 #include <linux/uaccess.h>
 #include <linux/poll.h>
 #include <linux/wait.h>
@@ -45,8 +43,6 @@
 
 #include <asm/ioctls.h>
 
-static struct vfsmount *inotify_mnt __read_mostly;
-
 /* these are configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_user_instances __read_mostly;
 static int inotify_max_queued_events __read_mostly;
@@ -645,9 +641,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 {
 	struct fsnotify_group *group;
 	struct user_struct *user;
-	struct file *filp;
-	struct path path;
-	int fd, ret;
+	int ret;
 
 	/* Check the IN_* constants for consistency.  */
 	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
@@ -656,10 +650,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
 		return -EINVAL;
 
-	fd = get_unused_fd_flags(flags & O_CLOEXEC);
-	if (fd < 0)
-		return fd;
-
 	user = get_current_user();
 	if (unlikely(atomic_read(&user->inotify_devs) >=
 			inotify_max_user_instances)) {
@@ -676,27 +666,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 
 	atomic_inc(&user->inotify_devs);
 
-	path.mnt = inotify_mnt;
-	path.dentry = inotify_mnt->mnt_root;
-	path_get(&path);
-	filp = alloc_file(&path, FMODE_READ, &inotify_fops);
-	if (!filp)
-		goto Enfile;
+	ret = anon_inode_getfd("inotify", &inotify_fops, group,
+				  O_RDONLY | flags);
+	if (ret >= 0)
+		return ret;
 
-	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-	filp->private_data = group;
-
-	fd_install(fd, filp);
-
-	return fd;
-
-Enfile:
-	ret = -ENFILE;
-	path_put(&path);
 	atomic_dec(&user->inotify_devs);
 out_free_uid:
 	free_uid(user);
-	put_unused_fd(fd);
 	return ret;
 }
 
@@ -783,20 +760,6 @@ out:
 	return ret;
 }
 
-static int
-inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_pseudo(fs_type, "inotify", NULL,
-			INOTIFYFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type inotify_fs_type = {
-    .name	= "inotifyfs",
-    .get_sb	= inotify_get_sb,
-    .kill_sb	= kill_anon_super,
-};
-
 /*
  * inotify_user_setup - Our initialization function.  Note that we cannnot return
  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
@@ -804,16 +767,6 @@ static struct file_system_type inotify_fs_type = {
  */
 static int __init inotify_user_setup(void)
 {
-	int ret;
-
-	ret = register_filesystem(&inotify_fs_type);
-	if (unlikely(ret))
-		panic("inotify: register_filesystem returned %d!\n", ret);
-
-	inotify_mnt = kern_mount(&inotify_fs_type);
-	if (IS_ERR(inotify_mnt))
-		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
-
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
 	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
 
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 76285e01b39e..eb9800f05782 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -52,7 +52,6 @@
 #define CGROUP_SUPER_MAGIC	0x27e0eb
 
 #define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
-#define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
 
 #define STACK_END_MAGIC		0x57AC6E9D
 
-- 
cgit v1.2.3


From ac278a9c505092dd82077a2446af8f9fc0d9c095 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Tue, 16 Feb 2010 18:09:36 +0000
Subject: fix LOOKUP_FOLLOW on automount "symlinks"

Make sure that automount "symlinks" are followed regardless of LOOKUP_FOLLOW;
it should have no effect on them.

Cc: stable@kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d62fdc875f22..a4855af776a8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -822,6 +822,17 @@ fail:
 	return PTR_ERR(dentry);
 }
 
+/*
+ * This is a temporary kludge to deal with "automount" symlinks; proper
+ * solution is to trigger them on follow_mount(), so that do_lookup()
+ * would DTRT.  To be killed before 2.6.34-final.
+ */
+static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
+{
+	return inode && unlikely(inode->i_op->follow_link) &&
+		((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -942,8 +953,7 @@ last_component:
 		if (err)
 			break;
 		inode = next.dentry->d_inode;
-		if ((lookup_flags & LOOKUP_FOLLOW)
-		    && inode && inode->i_op->follow_link) {
+		if (follow_on_final(inode, lookup_flags)) {
 			err = do_follow_link(&next, nd);
 			if (err)
 				goto return_err;
-- 
cgit v1.2.3


From 7fee4868be91e71a3ee8e57289ebf5e10a12297e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 14 Jan 2010 01:03:28 -0500
Subject: Switch proc/self to nd_set_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e42bbd843ed1..58324c299165 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2369,16 +2369,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
-	char tmp[PROC_NUMBUF];
-	if (!tgid)
-		return ERR_PTR(-ENOENT);
-	sprintf(tmp, "%d", task_tgid_nr_ns(current, ns));
-	return ERR_PTR(vfs_follow_link(nd,tmp));
+	char *name = ERR_PTR(-ENOENT);
+	if (tgid) {
+		name = __getname();
+		if (!name)
+			name = ERR_PTR(-ENOMEM);
+		else
+			sprintf(name, "%d", tgid);
+	}
+	nd_set_link(nd, name);
+	return NULL;
+}
+
+static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		__putname(s);
 }
 
 static const struct inode_operations proc_self_inode_operations = {
 	.readlink	= proc_self_readlink,
 	.follow_link	= proc_self_follow_link,
+	.put_link	= proc_self_put_link,
 };
 
 /*
-- 
cgit v1.2.3


From aeaa5ccd6421fbf9e7ded0ac67b12ea2b9fcf51e Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Mon, 15 Feb 2010 18:07:39 -0500
Subject: vfs: don't call ima_file_check() unconditionally in nfsd_open()

commit 1e41568d7378d1ba8c64ba137b9ddd00b59f893a ("Take ima_path_check()
in nfsd past dentry_open() in nfsd_open()") moved this code back to its
original location but missed the "else".

Signed-off-by: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/vfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 97d79eff6b7f..8715d194561a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -752,7 +752,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			    flags, current_cred());
 	if (IS_ERR(*filp))
 		host_err = PTR_ERR(*filp);
-	host_err = ima_file_check(*filp, access);
+	else
+		host_err = ima_file_check(*filp, access);
 out_nfserr:
 	err = nfserrno(host_err);
 out:
-- 
cgit v1.2.3


From 0d561f12b490dd2b993d73112d3297007688e6df Mon Sep 17 00:00:00 2001
From: Jiro SEKIBA <jir@unicus.jp>
Date: Sat, 20 Feb 2010 19:47:49 +0900
Subject: nilfs2: add reader's lock for cno in nilfs_ioctl_sync

This adds reader's lock for the_nilfs->cno in nilfs_ioctl_sync,
for the_nilfs->cno should be proctected by segctor_sem when reading.

Signed-off-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 8e5cad020c30..313d0a21da48 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -601,13 +601,17 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
 {
 	__u64 cno;
 	int ret;
+	struct the_nilfs *nilfs;
 
 	ret = nilfs_construct_segment(inode->i_sb);
 	if (ret < 0)
 		return ret;
 
 	if (argp != NULL) {
-		cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
+		nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+		down_read(&nilfs->ns_segctor_sem);
+		cno = nilfs->ns_cno - 1;
+		up_read(&nilfs->ns_segctor_sem);
 		if (copy_to_user(argp, &cno, sizeof(cno)))
 			return -EFAULT;
 	}
-- 
cgit v1.2.3


From 8f9941aeccc318f243ab3fa55aaa17f4c1cb33f9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 19 Feb 2010 18:14:21 +0000
Subject: CacheFiles: Fix a race in cachefiles_delete_object() vs rename

cachefiles_delete_object() can race with rename.  It gets the parent directory
of the object it's asked to delete, then locks it - but rename may have changed
the object's parent between the get and the completion of the lock.

However, if such a circumstance is detected, we abandon our attempt to delete
the object - since it's no longer in the index key path, it won't be seen
again by lookups of that key.  The assumption is that cachefilesd may have
culled it by renaming it to the graveyard for later destruction.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cachefiles/namei.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 14ac4806e291..eeb4986ea7db 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -348,7 +348,17 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 	dir = dget_parent(object->dentry);
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
-	ret = cachefiles_bury_object(cache, dir, object->dentry);
+
+	/* we need to check that our parent is _still_ our parent - it may have
+	 * been renamed */
+	if (dir == object->dentry->d_parent) {
+		ret = cachefiles_bury_object(cache, dir, object->dentry);
+	} else {
+		/* it got moved, presumably by cachefilesd culling it, so it's
+		 * no longer in the key path and we can ignore it */
+		mutex_unlock(&dir->d_inode->i_mutex);
+		ret = 0;
+	}
 
 	dput(dir);
 	_leave(" = %d", ret);
-- 
cgit v1.2.3


From 1cc523271ef0b6305c565a143e3d48f6fff826dd Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Mon, 22 Feb 2010 07:57:17 +0000
Subject: seq_file: add RCU versions of new hlist/list iterators (v3)

Many usages of seq_file use RCU protected lists, so non RCU
iterators will not work safely.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/seq_file.c            | 71 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rculist.h  |  5 ++++
 include/linux/seq_file.h | 15 +++++++---
 3 files changed, 87 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index f65b16f02da3..5afd554efad3 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -750,3 +750,74 @@ struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
 		return node->next;
 }
 EXPORT_SYMBOL(seq_hlist_next);
+
+/**
+ * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
+				       loff_t pos)
+{
+	struct hlist_node *node;
+
+	__hlist_for_each_rcu(node, head)
+		if (pos-- == 0)
+			return node;
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_rcu);
+
+/**
+ * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU
+ * @head: the head of the hlist
+ * @pos:  the start position of the sequence
+ *
+ * Called at seq_file->op->start(). Call this function if you want to
+ * print a header at the top of the output.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
+					    loff_t pos)
+{
+	if (!pos)
+		return SEQ_START_TOKEN;
+
+	return seq_hlist_start_rcu(head, pos - 1);
+}
+EXPORT_SYMBOL(seq_hlist_start_head_rcu);
+
+/**
+ * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU
+ * @v:    the current iterator
+ * @head: the head of the hlist
+ * @pos:  the current posision
+ *
+ * Called at seq_file->op->next().
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as hlist_add_head_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
+ */
+struct hlist_node *seq_hlist_next_rcu(void *v,
+				      struct hlist_head *head,
+				      loff_t *ppos)
+{
+	struct hlist_node *node = v;
+
+	++*ppos;
+	if (v == SEQ_START_TOKEN)
+		return rcu_dereference(head->first);
+	else
+		return rcu_dereference(node->next);
+}
+EXPORT_SYMBOL(seq_hlist_next_rcu);
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 1bf0f708c4fc..701fe9cb552a 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -406,6 +406,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
 		n->next->pprev = &n->next;
 }
 
+#define __hlist_for_each_rcu(pos, head)			\
+	for (pos = rcu_dereference((head)->first);	\
+	     pos && ({ prefetch(pos->next); 1; });	\
+	     pos = rcu_dereference(pos->next))
+
 /**
  * hlist_for_each_entry_rcu - iterate over rcu list of given type
  * @tpos:	the type * to use as a loop cursor.
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index c95bcdc18f4c..03c0232b4169 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -140,10 +140,17 @@ extern struct list_head *seq_list_next(void *v, struct list_head *head,
  */
 
 extern struct hlist_node *seq_hlist_start(struct hlist_head *head,
-		loff_t pos);
+					  loff_t pos);
 extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head,
-		loff_t pos);
+					       loff_t pos);
 extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head,
-		loff_t *ppos);
-
+					 loff_t *ppos);
+
+extern struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head,
+					      loff_t pos);
+extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head,
+						   loff_t pos);
+extern struct hlist_node *seq_hlist_next_rcu(void *v,
+						   struct hlist_head *head,
+						   loff_t *ppos);
 #endif
-- 
cgit v1.2.3


From a17e18790a8c47113a73139d54a375dc9ccd8f08 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Mon, 22 Feb 2010 12:44:24 -0800
Subject: fs/exec.c: fix initial stack reservation

803bf5ec259941936262d10ecc84511b76a20921 ("fs/exec.c: restrict initial
stack space expansion to rlimit") attempts to limit the initial stack to
20*PAGE_SIZE.  Unfortunately, in attempting ensure the stack is not
reduced in size, we ended up not changing the stack at all.

This size reduction check is not necessary as the expand_stack call does
this already.

This caused a regression in UML resulting in most guest processes being
killed.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: James Morris <jmorris@namei.org>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Jouni Malinen <j@w1.fi>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index e95c692ef0e4..cce6bbdbdbb1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -637,7 +637,6 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	 * will align it up.
 	 */
 	rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
-	rlim_stack = min(rlim_stack, stack_size);
 #ifdef CONFIG_STACK_GROWSUP
 	if (stack_size + stack_expand > rlim_stack)
 		stack_base = vma->vm_start + rlim_stack;
-- 
cgit v1.2.3


From 370b41911c0f1074e654a28466411b5c879e4dfd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Feb 2010 16:18:25 -0500
Subject: cifs: add parens around smb_var in BCC macros

...to remove ambiguity about how these values are interpreted when
passing in more complex values as arguments.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifspdu.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 3877737f96a6..14d036d8db11 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -415,10 +415,10 @@ struct smb_hdr {
 	__u8 WordCount;
 } __attribute__((packed));
 /* given a pointer to an smb_hdr retrieve the value of byte count */
-#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
-#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
+#define BCC(smb_var) (*(__u16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
+#define BCC_LE(smb_var) (*(__le16 *)((char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount)))
 /* given a pointer to an smb_hdr retrieve the pointer to the byte area */
-#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2)
+#define pByteArea(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + (2 * (smb_var)->WordCount) + 2)
 
 /*
  * Computer Name Length (since Netbios name was length 16 with last byte 0x20)
-- 
cgit v1.2.3


From f0d3868b78752024dd7550d0d18fb9e0601cfb69 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Feb 2010 16:18:26 -0500
Subject: cifs: clean up indentation in CIFSSMBQAllEAs

Add a label that we can goto on error, and reduce some of the
if/then/else indentation in this function.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 151 +++++++++++++++++++++++++++---------------------------
 1 file changed, 75 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 941441d3e386..30709589e0c9 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5282,9 +5282,10 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	int bytes_returned;
 	int name_len;
+	struct fealist *ea_response_data;
 	struct fea *temp_fea;
 	char *temp_ptr;
-	__u16 params, byte_count;
+	__u16 params, byte_count, data_offset;
 
 	cFYI(1, ("In Query All EAs path %s", searchName));
 QAllEAsRetry:
@@ -5334,85 +5335,83 @@ QAllEAsRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
 		cFYI(1, ("Send error in QueryAllEAs = %d", rc));
-	} else {		/* decode response */
-		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+		goto QAllEAsOut;
+	}
 
-		/* BB also check enough total bytes returned */
-		/* BB we need to improve the validity checking
-		of these trans2 responses */
-		if (rc || (pSMBr->ByteCount < 4))
-			rc = -EIO;	/* bad smb */
-	   /* else if (pFindData){
-			memcpy((char *) pFindData,
-			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, kl);
-		}*/ else {
-			/* check that length of list is not more than bcc */
-			/* check that each entry does not go beyond length
-			   of list */
-			/* check that each element of each entry does not
-			   go beyond end of list */
-			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			struct fealist *ea_response_data;
-			rc = 0;
-			/* validate_trans2_offsets() */
-			/* BB check if start of smb + data_offset > &bcc+ bcc */
-			ea_response_data = (struct fealist *)
-				(((char *) &pSMBr->hdr.Protocol) +
-				data_offset);
-			name_len = le32_to_cpu(ea_response_data->list_len);
-			cFYI(1, ("ea length %d", name_len));
-			if (name_len <= 8) {
-			/* returned EA size zeroed at top of function */
-				cFYI(1, ("empty EA list returned from server"));
-			} else {
-				/* account for ea list len */
-				name_len -= 4;
-				temp_fea = ea_response_data->list;
-				temp_ptr = (char *)temp_fea;
-				while (name_len > 0) {
-					__u16 value_len;
-					name_len -= 4;
-					temp_ptr += 4;
-					rc += temp_fea->name_len;
-				/* account for prefix user. and trailing null */
-					rc = rc + 5 + 1;
-					if (rc < (int)buf_size) {
-						memcpy(EAData, "user.", 5);
-						EAData += 5;
-						memcpy(EAData, temp_ptr,
-						       temp_fea->name_len);
-						EAData += temp_fea->name_len;
-						/* null terminate name */
-						*EAData = 0;
-						EAData = EAData + 1;
-					} else if (buf_size == 0) {
-						/* skip copy - calc size only */
-					} else {
-						/* stop before overrun buffer */
-						rc = -ERANGE;
-						break;
-					}
-					name_len -= temp_fea->name_len;
-					temp_ptr += temp_fea->name_len;
-					/* account for trailing null */
-					name_len--;
-					temp_ptr++;
-					value_len =
-					      le16_to_cpu(temp_fea->value_len);
-					name_len -= value_len;
-					temp_ptr += value_len;
-					/* BB check that temp_ptr is still
-					      within the SMB BB*/
 
-					/* no trailing null to account for
-					   in value len */
-					/* go on to next EA */
-					temp_fea = (struct fea *)temp_ptr;
-				}
-			}
+	/* BB also check enough total bytes returned */
+	/* BB we need to improve the validity checking
+	of these trans2 responses */
+
+	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+	if (rc || (pSMBr->ByteCount < 4)) {
+		rc = -EIO;	/* bad smb */
+		goto QAllEAsOut;
+	}
+
+	/* check that length of list is not more than bcc */
+	/* check that each entry does not go beyond length
+	   of list */
+	/* check that each element of each entry does not
+	   go beyond end of list */
+	/* validate_trans2_offsets() */
+	/* BB check if start of smb + data_offset > &bcc+ bcc */
+
+	data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+	ea_response_data = (struct fealist *)
+				(((char *) &pSMBr->hdr.Protocol) + data_offset);
+
+	name_len = le32_to_cpu(ea_response_data->list_len);
+	cFYI(1, ("ea length %d", name_len));
+	if (name_len <= 8) {
+		cFYI(1, ("empty EA list returned from server"));
+		goto QAllEAsOut;
+	}
+
+	/* account for ea list len */
+	name_len -= 4;
+	temp_fea = ea_response_data->list;
+	temp_ptr = (char *)temp_fea;
+	while (name_len > 0) {
+		__u16 value_len;
+		name_len -= 4;
+		temp_ptr += 4;
+		rc += temp_fea->name_len;
+	/* account for prefix user. and trailing null */
+		rc = rc + 5 + 1;
+		if (rc < (int) buf_size) {
+			memcpy(EAData, "user.", 5);
+			EAData += 5;
+			memcpy(EAData, temp_ptr, temp_fea->name_len);
+			EAData += temp_fea->name_len;
+			/* null terminate name */
+			*EAData = 0;
+			EAData = EAData + 1;
+		} else if (buf_size == 0) {
+			/* skip copy - calc size only */
+		} else {
+			/* stop before overrun buffer */
+			rc = -ERANGE;
+			break;
 		}
+		name_len -= temp_fea->name_len;
+		temp_ptr += temp_fea->name_len;
+		/* account for trailing null */
+		name_len--;
+		temp_ptr++;
+		value_len = le16_to_cpu(temp_fea->value_len);
+		name_len -= value_len;
+		temp_ptr += value_len;
+		/* BB check that temp_ptr is still
+		      within the SMB BB*/
+
+		/* no trailing null to account for
+		   in value len */
+		/* go on to next EA */
+		temp_fea = (struct fea *)temp_ptr;
 	}
+
+QAllEAsOut:
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
 		goto QAllEAsRetry;
-- 
cgit v1.2.3


From 6e462b9f2c37101312109aaa46e196bdca6c846d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Feb 2010 16:18:26 -0500
Subject: cifs: rename name_len to list_len in CIFSSMBQAllEAs

...for clarity and so we can reuse the name for the real name_len.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 30709589e0c9..f5e15279ea29 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5281,7 +5281,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
 	TRANSACTION2_QPI_RSP *pSMBr = NULL;
 	int rc = 0;
 	int bytes_returned;
-	int name_len;
+	int list_len;
 	struct fealist *ea_response_data;
 	struct fea *temp_fea;
 	char *temp_ptr;
@@ -5295,18 +5295,18 @@ QAllEAsRetry:
 		return rc;
 
 	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len =
+		list_len =
 		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
 				     PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
-		name_len *= 2;
+		list_len++;	/* trailing null */
+		list_len *= 2;
 	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
+		list_len = strnlen(searchName, PATH_MAX);
+		list_len++;	/* trailing null */
+		strncpy(pSMB->FileName, searchName, list_len);
 	}
 
-	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
+	params = 2 /* level */ + 4 /* reserved */ + list_len /* includes NUL */;
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
 	/* BB find exact max SMB PDU from sess structure BB */
@@ -5361,23 +5361,23 @@ QAllEAsRetry:
 	ea_response_data = (struct fealist *)
 				(((char *) &pSMBr->hdr.Protocol) + data_offset);
 
-	name_len = le32_to_cpu(ea_response_data->list_len);
-	cFYI(1, ("ea length %d", name_len));
-	if (name_len <= 8) {
+	list_len = le32_to_cpu(ea_response_data->list_len);
+	cFYI(1, ("ea length %d", list_len));
+	if (list_len <= 8) {
 		cFYI(1, ("empty EA list returned from server"));
 		goto QAllEAsOut;
 	}
 
 	/* account for ea list len */
-	name_len -= 4;
+	list_len -= 4;
 	temp_fea = ea_response_data->list;
 	temp_ptr = (char *)temp_fea;
-	while (name_len > 0) {
+	while (list_len > 0) {
 		__u16 value_len;
-		name_len -= 4;
+		list_len -= 4;
 		temp_ptr += 4;
 		rc += temp_fea->name_len;
-	/* account for prefix user. and trailing null */
+		/* account for prefix user. and trailing null */
 		rc = rc + 5 + 1;
 		if (rc < (int) buf_size) {
 			memcpy(EAData, "user.", 5);
@@ -5386,7 +5386,7 @@ QAllEAsRetry:
 			EAData += temp_fea->name_len;
 			/* null terminate name */
 			*EAData = 0;
-			EAData = EAData + 1;
+			++EAData;
 		} else if (buf_size == 0) {
 			/* skip copy - calc size only */
 		} else {
@@ -5394,13 +5394,13 @@ QAllEAsRetry:
 			rc = -ERANGE;
 			break;
 		}
-		name_len -= temp_fea->name_len;
+		list_len -= temp_fea->name_len;
 		temp_ptr += temp_fea->name_len;
 		/* account for trailing null */
-		name_len--;
+		list_len--;
 		temp_ptr++;
 		value_len = le16_to_cpu(temp_fea->value_len);
-		name_len -= value_len;
+		list_len -= value_len;
 		temp_ptr += value_len;
 		/* BB check that temp_ptr is still
 		      within the SMB BB*/
-- 
cgit v1.2.3


From e529614ad0c2bf1f3a6fcf1402aa77430ecfaa2c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Feb 2010 16:18:26 -0500
Subject: cifs: increase maximum buffer size in CIFSSMBQAllEAs

It's 4000 now, but there's no reason to limit it to that. We should be
able to handle a response up to CIFSMaxBufSize.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f5e15279ea29..4f24f83ce623 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5310,7 +5310,7 @@ QAllEAsRetry:
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
 	/* BB find exact max SMB PDU from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
 	pSMB->MaxSetupCount = 0;
 	pSMB->Reserved = 0;
 	pSMB->Flags = 0;
-- 
cgit v1.2.3


From 0cd126b504cede8a74acf7583a44eba32f9a1da1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Feb 2010 16:18:26 -0500
Subject: cifs: verify lengths of QueryAllEAs reply

Make sure the lengths in a QUERY_ALL_EAS reply don't make the parser walk
off the end of the SMB.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4f24f83ce623..e197e1647d5d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5285,6 +5285,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
 	struct fealist *ea_response_data;
 	struct fea *temp_fea;
 	char *temp_ptr;
+	char *end_of_smb;
 	__u16 params, byte_count, data_offset;
 
 	cFYI(1, ("In Query All EAs path %s", searchName));
@@ -5368,22 +5369,47 @@ QAllEAsRetry:
 		goto QAllEAsOut;
 	}
 
+	/* make sure list_len doesn't go past end of SMB */
+	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
+	if ((char *)ea_response_data + list_len > end_of_smb) {
+		cFYI(1, ("EA list appears to go beyond SMB"));
+		rc = -EIO;
+		goto QAllEAsOut;
+	}
+
 	/* account for ea list len */
 	list_len -= 4;
 	temp_fea = ea_response_data->list;
 	temp_ptr = (char *)temp_fea;
 	while (list_len > 0) {
+		__u8 name_len;
 		__u16 value_len;
+
 		list_len -= 4;
 		temp_ptr += 4;
-		rc += temp_fea->name_len;
+		/* make sure we can read name_len and value_len */
+		if (list_len < 0) {
+			cFYI(1, ("EA entry goes beyond length of list"));
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
+
+		name_len = temp_fea->name_len;
+		value_len = le16_to_cpu(temp_fea->value_len);
+		list_len -= name_len + 1 + value_len;
+		if (list_len < 0) {
+			cFYI(1, ("EA entry goes beyond length of list"));
+			rc = -EIO;
+			goto QAllEAsOut;
+		}
+
 		/* account for prefix user. and trailing null */
-		rc = rc + 5 + 1;
+		rc += (5 + 1 + name_len);
 		if (rc < (int) buf_size) {
 			memcpy(EAData, "user.", 5);
 			EAData += 5;
-			memcpy(EAData, temp_ptr, temp_fea->name_len);
-			EAData += temp_fea->name_len;
+			memcpy(EAData, temp_ptr, name_len);
+			EAData += name_len;
 			/* null terminate name */
 			*EAData = 0;
 			++EAData;
@@ -5394,20 +5420,7 @@ QAllEAsRetry:
 			rc = -ERANGE;
 			break;
 		}
-		list_len -= temp_fea->name_len;
-		temp_ptr += temp_fea->name_len;
-		/* account for trailing null */
-		list_len--;
-		temp_ptr++;
-		value_len = le16_to_cpu(temp_fea->value_len);
-		list_len -= value_len;
-		temp_ptr += value_len;
-		/* BB check that temp_ptr is still
-		      within the SMB BB*/
-
-		/* no trailing null to account for
-		   in value len */
-		/* go on to next EA */
+		temp_ptr += name_len + 1 + value_len;
 		temp_fea = (struct fea *)temp_ptr;
 	}
 
-- 
cgit v1.2.3


From 31c0519f7af99ae60fd39f7f1c1f7ae1efb56760 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Feb 2010 16:18:26 -0500
Subject: cifs: merge CIFSSMBQueryEA with CIFSSMBQAllEAs

Add an "ea_name" parameter to CIFSSMBQAllEAs. When it's set make it
behave like CIFSSMBQueryEA does now. The current callers of
CIFSSMBQueryEA are converted to use CIFSSMBQAllEAs, and the old
CIFSSMBQueryEA function is removed.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |   7 +-
 fs/cifs/cifssmb.c   | 214 ++++++++++++----------------------------------------
 fs/cifs/inode.c     |   2 +-
 fs/cifs/xattr.c     |   8 +-
 4 files changed, 54 insertions(+), 177 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5646727e33f5..88e2bc44ac58 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -363,13 +363,10 @@ extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 			__u32 filter, struct file *file, int multishot,
 			const struct nls_table *nls_codepage);
 extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
-			const unsigned char *searchName, char *EAData,
+			const unsigned char *searchName,
+			const unsigned char *ea_name, char *EAData,
 			size_t bufsize, const struct nls_table *nls_codepage,
 			int remap_special_chars);
-extern ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, const unsigned char *ea_name,
-		unsigned char *ea_value, size_t buf_size,
-		const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon,
 		const char *fileName, const char *ea_name,
 		const void *ea_value, const __u16 ea_value_len,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index e197e1647d5d..d6d40b883abc 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5269,12 +5269,22 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	cifs_buf_release(pSMB);
 	return rc;
 }
+
 #ifdef CONFIG_CIFS_XATTR
+/*
+ * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
+ * function used by listxattr and getxattr type calls. When ea_name is set,
+ * it looks for that attribute name and stuffs that value into the EAData
+ * buffer. When ea_name is NULL, it stuffs a list of attribute names into the
+ * buffer. In both cases, the return value is either the length of the
+ * resulting data or a negative error code. If EAData is a NULL pointer then
+ * the data isn't copied to it, but the length is returned.
+ */
 ssize_t
 CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
-		 const unsigned char *searchName,
-		 char *EAData, size_t buf_size,
-		 const struct nls_table *nls_codepage, int remap)
+		const unsigned char *searchName, const unsigned char *ea_name,
+		char *EAData, size_t buf_size,
+		const struct nls_table *nls_codepage, int remap)
 {
 		/* BB assumes one setup word */
 	TRANSACTION2_QPI_REQ *pSMB = NULL;
@@ -5403,27 +5413,46 @@ QAllEAsRetry:
 			goto QAllEAsOut;
 		}
 
-		/* account for prefix user. and trailing null */
-		rc += (5 + 1 + name_len);
-		if (rc < (int) buf_size) {
-			memcpy(EAData, "user.", 5);
-			EAData += 5;
-			memcpy(EAData, temp_ptr, name_len);
-			EAData += name_len;
-			/* null terminate name */
-			*EAData = 0;
-			++EAData;
-		} else if (buf_size == 0) {
-			/* skip copy - calc size only */
+		if (ea_name) {
+			if (strncmp(ea_name, temp_ptr, name_len) == 0) {
+				temp_ptr += name_len + 1;
+				rc = value_len;
+				if (buf_size == 0)
+					goto QAllEAsOut;
+				if ((size_t)value_len > buf_size) {
+					rc = -ERANGE;
+					goto QAllEAsOut;
+				}
+				memcpy(EAData, temp_ptr, value_len);
+				goto QAllEAsOut;
+			}
 		} else {
-			/* stop before overrun buffer */
-			rc = -ERANGE;
-			break;
+			/* account for prefix user. and trailing null */
+			rc += (5 + 1 + name_len);
+			if (rc < (int) buf_size) {
+				memcpy(EAData, "user.", 5);
+				EAData += 5;
+				memcpy(EAData, temp_ptr, name_len);
+				EAData += name_len;
+				/* null terminate name */
+				*EAData = 0;
+				++EAData;
+			} else if (buf_size == 0) {
+				/* skip copy - calc size only */
+			} else {
+				/* stop before overrun buffer */
+				rc = -ERANGE;
+				break;
+			}
 		}
 		temp_ptr += name_len + 1 + value_len;
 		temp_fea = (struct fea *)temp_ptr;
 	}
 
+	/* didn't find the named attribute */
+	if (ea_name)
+		rc = -ENODATA;
+
 QAllEAsOut:
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -5432,155 +5461,6 @@ QAllEAsOut:
 	return (ssize_t)rc;
 }
 
-ssize_t CIFSSMBQueryEA(const int xid, struct cifsTconInfo *tcon,
-		const unsigned char *searchName, const unsigned char *ea_name,
-		unsigned char *ea_value, size_t buf_size,
-		const struct nls_table *nls_codepage, int remap)
-{
-	TRANSACTION2_QPI_REQ *pSMB = NULL;
-	TRANSACTION2_QPI_RSP *pSMBr = NULL;
-	int rc = 0;
-	int bytes_returned;
-	int name_len;
-	struct fea *temp_fea;
-	char *temp_ptr;
-	__u16 params, byte_count;
-
-	cFYI(1, ("In Query EA path %s", searchName));
-QEARetry:
-	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
-	if (rc)
-		return rc;
-
-	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		name_len =
-		    cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
-				     PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
-		name_len *= 2;
-	} else {	/* BB improve the check for buffer overruns BB */
-		name_len = strnlen(searchName, PATH_MAX);
-		name_len++;	/* trailing null */
-		strncpy(pSMB->FileName, searchName, name_len);
-	}
-
-	params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
-	pSMB->TotalDataCount = 0;
-	pSMB->MaxParameterCount = cpu_to_le16(2);
-	/* BB find exact max SMB PDU from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
-	pSMB->MaxSetupCount = 0;
-	pSMB->Reserved = 0;
-	pSMB->Flags = 0;
-	pSMB->Timeout = 0;
-	pSMB->Reserved2 = 0;
-	pSMB->ParameterOffset = cpu_to_le16(offsetof(
-		struct smb_com_transaction2_qpi_req, InformationLevel) - 4);
-	pSMB->DataCount = 0;
-	pSMB->DataOffset = 0;
-	pSMB->SetupCount = 1;
-	pSMB->Reserved3 = 0;
-	pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_PATH_INFORMATION);
-	byte_count = params + 1 /* pad */ ;
-	pSMB->TotalParameterCount = cpu_to_le16(params);
-	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
-	pSMB->Reserved4 = 0;
-	pSMB->hdr.smb_buf_length += byte_count;
-	pSMB->ByteCount = cpu_to_le16(byte_count);
-
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-	if (rc) {
-		cFYI(1, ("Send error in Query EA = %d", rc));
-	} else {		/* decode response */
-		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
-
-		/* BB also check enough total bytes returned */
-		/* BB we need to improve the validity checking
-		of these trans2 responses */
-		if (rc || (pSMBr->ByteCount < 4))
-			rc = -EIO;	/* bad smb */
-	   /* else if (pFindData){
-			memcpy((char *) pFindData,
-			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, kl);
-		}*/ else {
-			/* check that length of list is not more than bcc */
-			/* check that each entry does not go beyond length
-			   of list */
-			/* check that each element of each entry does not
-			   go beyond end of list */
-			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			struct fealist *ea_response_data;
-			rc = -ENODATA;
-			/* validate_trans2_offsets() */
-			/* BB check if start of smb + data_offset > &bcc+ bcc*/
-			ea_response_data = (struct fealist *)
-				(((char *) &pSMBr->hdr.Protocol) +
-				data_offset);
-			name_len = le32_to_cpu(ea_response_data->list_len);
-			cFYI(1, ("ea length %d", name_len));
-			if (name_len <= 8) {
-			/* returned EA size zeroed at top of function */
-				cFYI(1, ("empty EA list returned from server"));
-			} else {
-				/* account for ea list len */
-				name_len -= 4;
-				temp_fea = ea_response_data->list;
-				temp_ptr = (char *)temp_fea;
-				/* loop through checking if we have a matching
-				name and then return the associated value */
-				while (name_len > 0) {
-					__u16 value_len;
-					name_len -= 4;
-					temp_ptr += 4;
-					value_len =
-					      le16_to_cpu(temp_fea->value_len);
-				/* BB validate that value_len falls within SMB,
-				even though maximum for name_len is 255 */
-					if (memcmp(temp_fea->name, ea_name,
-						  temp_fea->name_len) == 0) {
-						/* found a match */
-						rc = value_len;
-				/* account for prefix user. and trailing null */
-						if (rc <= (int)buf_size) {
-							memcpy(ea_value,
-								temp_fea->name+temp_fea->name_len+1,
-								rc);
-							/* ea values, unlike ea
-							   names, are not null
-							   terminated */
-						} else if (buf_size == 0) {
-						/* skip copy - calc size only */
-						} else {
-						/* stop before overrun buffer */
-							rc = -ERANGE;
-						}
-						break;
-					}
-					name_len -= temp_fea->name_len;
-					temp_ptr += temp_fea->name_len;
-					/* account for trailing null */
-					name_len--;
-					temp_ptr++;
-					name_len -= value_len;
-					temp_ptr += value_len;
-					/* No trailing null to account for in
-					   value_len.  Go on to next EA */
-					temp_fea = (struct fea *)temp_ptr;
-				}
-			}
-		}
-	}
-	cifs_buf_release(pSMB);
-	if (rc == -EAGAIN)
-		goto QEARetry;
-
-	return (ssize_t)rc;
-}
-
 int
 CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	     const char *ea_name, const void *ea_value,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index e3fda978f481..5e9cc9c2fc7d 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -366,7 +366,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	char ea_value[4];
 	__u32 mode;
 
-	rc = CIFSSMBQueryEA(xid, cifs_sb->tcon, path, "SETFILEBITS",
+	rc = CIFSSMBQAllEAs(xid, cifs_sb->tcon, path, "SETFILEBITS",
 			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
 			    cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index a75afa3dd9e1..3e2ef0de1209 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -244,7 +244,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += 5; /* skip past user. prefix */
-		rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value,
+		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 			buf_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else if (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4) == 0) {
@@ -252,7 +252,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			goto get_ea_exit;
 
 		ea_name += 4; /* skip past os2. prefix */
-		rc = CIFSSMBQueryEA(xid, pTcon, full_path, ea_name, ea_value,
+		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
 			buf_size, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
@@ -364,8 +364,8 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 	/* if proc/fs/cifs/streamstoxattr is set then
 		search server for EAs or streams to
 		returns as xattrs */
-	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, data, buf_size,
-				cifs_sb->local_nls,
+	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
+				buf_size, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
-- 
cgit v1.2.3


From 96c03bccc7defb1f0d486165af5c0b8590f56c11 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 23 Feb 2010 20:51:43 +0000
Subject: [CIFS] Minor cleanup to EA patch

CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   | 3 ++-
 fs/cifs/cifssmb.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 49503d2edc7e..bc0025cdd1c9 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,6 +1,7 @@
 Version 1.62
 ------------
-Add sockopt=TCP_NODELAY mount option.
+Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
+to more strictly handle corrupt frames.
 
 Version 1.61
 ------------
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d6d40b883abc..99ae57d01d4f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5392,7 +5392,7 @@ QAllEAsRetry:
 	temp_fea = ea_response_data->list;
 	temp_ptr = (char *)temp_fea;
 	while (list_len > 0) {
-		__u8 name_len;
+		int name_len;
 		__u16 value_len;
 
 		list_len -= 4;
-- 
cgit v1.2.3


From 835a36ca4a4cd9da557318c0e213346644a4b2c8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Feb 2010 16:21:33 -0500
Subject: cifs: set server_eof in cifs_fattr_to_inode

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5e9cc9c2fc7d..8bdbc818164c 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -111,6 +111,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 
 	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
 
+	cifs_i->server_eof = fattr->cf_eof;
 	/*
 	 * Can't safely change the file size here if the client is writing to
 	 * it due to potential races.
-- 
cgit v1.2.3


From c8d46e41bc744c8fa0092112af3942fcd46c8b18 Mon Sep 17 00:00:00 2001
From: Jiaying Zhang <jiayingz@google.com>
Date: Wed, 24 Feb 2010 09:52:53 -0500
Subject: ext4: Add flag to files with blocks intentionally past EOF

fallocate() may potentially instantiate blocks past EOF, depending
on the flags used when it is called.

e2fsck currently has a test for blocks past i_size, and it
sometimes trips up - noticeably on xfstests 013 which runs fsstress.

This patch from Jiayang does fix it up - it (along with
e2fsprogs updates and other patches recently from Aneesh) has
survived many fsstress runs in a row.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  5 +++--
 fs/ext4/extents.c | 22 +++++++++++++++++++++-
 fs/ext4/inode.c   |  9 ++++++++-
 fs/ext4/ioctl.c   |  9 +++++++++
 4 files changed, 41 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 509437ffb71b..74664ca19e22 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -285,10 +285,11 @@ struct flex_groups {
 #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
 #define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
 #define EXT4_EA_INODE_FL	        0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL		0x00400000 /* Blocks allocated beyond EOF */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE		0x004B80FF /* User modifiable flags */
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7d54850f7136..a2c21aa09e2b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3186,7 +3186,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
-	struct ext4_extent newex, *ex;
+	struct ext4_extent newex, *ex, *last_ex;
 	ext4_fsblk_t newblock;
 	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
@@ -3367,6 +3367,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
 		}
 	}
+
+	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+		if (eh->eh_entries) {
+			last_ex = EXT_LAST_EXTENT(eh);
+			if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+					    + ext4_ext_get_actual_len(last_ex))
+				EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+		} else {
+			WARN_ON(eh->eh_entries == 0);
+			ext4_error(inode->i_sb, __func__,
+				"inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+			}
+	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
 		/* free data blocks we just allocated */
@@ -3500,6 +3513,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
 			i_size_write(inode, new_size);
 		if (new_size > EXT4_I(inode)->i_disksize)
 			ext4_update_i_disksize(inode, new_size);
+	} else {
+		/*
+		 * Mark that we allocate beyond EOF so the subsequent truncate
+		 * can proceed even if the new size is the same as i_size.
+		 */
+		if (new_size > i_size_read(inode))
+			EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
 	}
 
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ecac8c5a6f5c..edb7edc99f71 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4456,6 +4456,8 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
+	EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
@@ -5305,7 +5307,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+	    attr->ia_valid & ATTR_SIZE &&
+	    (attr->ia_size < inode->i_size ||
+	     (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
 		handle_t *handle;
 
 		handle = ext4_journal_start(inode, 3);
@@ -5336,6 +5340,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				goto err_out;
 			}
 		}
+		/* ext4_truncate will clear the flag */
+		if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+			ext4_truncate(inode);
 	}
 
 	rc = inode_setattr(inode, attr);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193126db..2220feb2dcc1 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			flags &= ~EXT4_EXTENTS_FL;
 		}
 
+		if (flags & EXT4_EOFBLOCKS_FL) {
+			/* we don't support adding EOFBLOCKS flag */
+			if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+				err = -EOPNOTSUPP;
+				goto flags_out;
+			}
+		} else if (oldflags & EXT4_EOFBLOCKS_FL)
+			ext4_truncate(inode);
+
 		handle = ext4_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
-- 
cgit v1.2.3


From 482a74258fd08d30bf2ab0f5549afab5a5c9daba Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 24 Feb 2010 11:35:32 -0500
Subject: ext4: mount flags manipulation cleanup

Replace intermediate EXT4_MOUNT_XXX flags manipulation to
corresponding macro.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1c85bb67e6eb..7e8b1b4236d3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -796,10 +796,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 	if (sbi->s_qf_names[GRPQUOTA])
 		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
 
-	if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
+	if (test_opt(sb, USRQUOTA))
 		seq_puts(seq, ",usrquota");
 
-	if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
+	if (test_opt(sb, GRPQUOTA))
 		seq_puts(seq, ",grpquota");
 #endif
 }
@@ -1383,14 +1383,13 @@ static int parse_options(char *options, struct super_block *sb,
 			data_opt = EXT4_MOUNT_WRITEBACK_DATA;
 		datacheck:
 			if (is_remount) {
-				if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
-						!= data_opt) {
+				if (test_opt(sb, DATA_FLAGS) != data_opt) {
 					ext4_msg(sb, KERN_ERR,
 						"Cannot change data mode on remount");
 					return 0;
 				}
 			} else {
-				sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
+				clear_opt(sbi->s_mount_opt, DATA_FLAGS);
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
@@ -1625,18 +1624,14 @@ set_qf_format:
 	}
 #ifdef CONFIG_QUOTA
 	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
-		if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
-		     sbi->s_qf_names[USRQUOTA])
+		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
 			clear_opt(sbi->s_mount_opt, USRQUOTA);
 
-		if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
-		     sbi->s_qf_names[GRPQUOTA])
+		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
 			clear_opt(sbi->s_mount_opt, GRPQUOTA);
 
-		if ((sbi->s_qf_names[USRQUOTA] &&
-				(sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
-		    (sbi->s_qf_names[GRPQUOTA] &&
-				(sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
+		if ((sbi->s_qf_names[USRQUOTA] && test_opt(sb, GRPQUOTA)) ||
+			(sbi->s_qf_names[GRPQUOTA] && test_opt(sb, USRQUOTA))) {
 			ext4_msg(sb, KERN_ERR, "old and new quota "
 					"format mixing");
 			return 0;
@@ -2452,11 +2447,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
-		sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+		set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-		sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+		set_opt(sbi->s_mount_opt, ORDERED_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
-		sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
+		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
 
 	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
@@ -2484,7 +2479,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -3520,7 +3515,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		ext4_abort(sb, __func__, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 
 	es = sbi->s_es;
 
-- 
cgit v1.2.3


From 56c50f11f4d11cb14d78fe52330efb69d219c62f Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 1 Mar 2010 23:28:41 -0500
Subject: ext4: trivial quota cleanup

The patch is aimed to reorganize and simplify quota code a bit.
Quota code is itself complex enough, but we can make it more readable
in some places:
- Move quota option parsing to separate functions.
- Simplify old-quota and journaled-quota mix check.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 123 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 69 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7e8b1b4236d3..79937e921988 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1206,6 +1206,64 @@ static ext4_fsblk_t get_sb_block(void **data)
 
 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
 
+#ifdef CONFIG_QUOTA
+static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char *qname;
+
+	if (sb_any_quota_loaded(sb) &&
+		!sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR,
+			"Cannot change journaled "
+			"quota options when quota turned on");
+		return 0;
+	}
+	qname = match_strdup(args);
+	if (!qname) {
+		ext4_msg(sb, KERN_ERR,
+			"Not enough memory for storing quotafile name");
+		return 0;
+	}
+	if (sbi->s_qf_names[qtype] &&
+		strcmp(sbi->s_qf_names[qtype], qname)) {
+		ext4_msg(sb, KERN_ERR,
+			"%s quota file already specified", QTYPE2NAME(qtype));
+		kfree(qname);
+		return 0;
+	}
+	sbi->s_qf_names[qtype] = qname;
+	if (strchr(sbi->s_qf_names[qtype], '/')) {
+		ext4_msg(sb, KERN_ERR,
+			"quotafile must be on filesystem root");
+		kfree(sbi->s_qf_names[qtype]);
+		sbi->s_qf_names[qtype] = NULL;
+		return 0;
+	}
+	set_opt(sbi->s_mount_opt, QUOTA);
+	return 1;
+}
+
+static int clear_qf_name(struct super_block *sb, int qtype)
+{
+
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sb_any_quota_loaded(sb) &&
+		sbi->s_qf_names[qtype]) {
+		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+			" when quota turned on");
+		return 0;
+	}
+	/*
+	 * The space will be released later when all options are confirmed
+	 * to be correct
+	 */
+	sbi->s_qf_names[qtype] = NULL;
+	return 1;
+}
+#endif
+
 static int parse_options(char *options, struct super_block *sb,
 			 unsigned long *journal_devnum,
 			 unsigned int *journal_ioprio,
@@ -1217,8 +1275,7 @@ static int parse_options(char *options, struct super_block *sb,
 	int data_opt = 0;
 	int option;
 #ifdef CONFIG_QUOTA
-	int qtype, qfmt;
-	char *qname;
+	int qfmt;
 #endif
 
 	if (!options)
@@ -1401,63 +1458,22 @@ static int parse_options(char *options, struct super_block *sb,
 			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
-			qtype = USRQUOTA;
-			goto set_qf_name;
-		case Opt_grpjquota:
-			qtype = GRPQUOTA;
-set_qf_name:
-			if (sb_any_quota_loaded(sb) &&
-			    !sbi->s_qf_names[qtype]) {
-				ext4_msg(sb, KERN_ERR,
-				       "Cannot change journaled "
-				       "quota options when quota turned on");
+			if (!set_qf_name(sb, USRQUOTA, &args[0]))
 				return 0;
-			}
-			qname = match_strdup(&args[0]);
-			if (!qname) {
-				ext4_msg(sb, KERN_ERR,
-					"Not enough memory for "
-					"storing quotafile name");
-				return 0;
-			}
-			if (sbi->s_qf_names[qtype] &&
-			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				ext4_msg(sb, KERN_ERR,
-					"%s quota file already "
-					"specified", QTYPE2NAME(qtype));
-				kfree(qname);
-				return 0;
-			}
-			sbi->s_qf_names[qtype] = qname;
-			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				ext4_msg(sb, KERN_ERR,
-					"quotafile must be on "
-					"filesystem root");
-				kfree(sbi->s_qf_names[qtype]);
-				sbi->s_qf_names[qtype] = NULL;
+			break;
+		case Opt_grpjquota:
+			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
 				return 0;
-			}
-			set_opt(sbi->s_mount_opt, QUOTA);
 			break;
 		case Opt_offusrjquota:
-			qtype = USRQUOTA;
-			goto clear_qf_name;
+			if (!clear_qf_name(sb, USRQUOTA))
+				return 0;
+			break;
 		case Opt_offgrpjquota:
-			qtype = GRPQUOTA;
-clear_qf_name:
-			if (sb_any_quota_loaded(sb) &&
-			    sbi->s_qf_names[qtype]) {
-				ext4_msg(sb, KERN_ERR, "Cannot change "
-					"journaled quota options when "
-					"quota turned on");
+			if (!clear_qf_name(sb, GRPQUOTA))
 				return 0;
-			}
-			/*
-			 * The space will be released later when all options
-			 * are confirmed to be correct
-			 */
-			sbi->s_qf_names[qtype] = NULL;
 			break;
+
 		case Opt_jqfmt_vfsold:
 			qfmt = QFMT_VFS_OLD;
 			goto set_qf_format;
@@ -1630,8 +1646,7 @@ set_qf_format:
 		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
 			clear_opt(sbi->s_mount_opt, GRPQUOTA);
 
-		if ((sbi->s_qf_names[USRQUOTA] && test_opt(sb, GRPQUOTA)) ||
-			(sbi->s_qf_names[GRPQUOTA] && test_opt(sb, USRQUOTA))) {
+		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
 			ext4_msg(sb, KERN_ERR, "old and new quota "
 					"format mixing");
 			return 0;
-- 
cgit v1.2.3


From 23e2af3518facab6838d7aac1f46fbd7a5a290ce Mon Sep 17 00:00:00 2001
From: dingdinghua <dingdinghua@nrchpc.ac.cn>
Date: Wed, 24 Feb 2010 12:11:20 -0500
Subject: jbd2: clean up an assertion in jbd2_journal_commit_transaction()

commit_transaction has the same value as journal->j_running_transaction,
so we can simplify the assert statement.

Signed-off-by: dingdinghua <dingdinghua@nrchpc.ac.cn>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3ee211ed58f1..671da7fb7ffd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -883,8 +883,7 @@ restart_loop:
 		spin_unlock(&journal->j_list_lock);
 		bh = jh2bh(jh);
 		jbd_lock_bh_state(bh);
-		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
-			jh->b_transaction == journal->j_running_transaction);
+		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
 
 		/*
 		 * If there is undo-protected committed data against
-- 
cgit v1.2.3


From 7fe2b3190b8b299409f13cf3a6f85c2bd371f8bb Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 24 Feb 2010 11:08:18 -0600
Subject: dlm: fix ordering of bast and cast

When both blocking and completion callbacks are queued for lock,
the dlm would always deliver the completion callback (cast) first.
In some cases the blocking callback (bast) is queued before the
cast, though, and should be delivered first.  This patch keeps
track of the order in which they were queued and delivers them
in that order.

This patch also keeps track of the granted mode in the last cast
and eliminates the following bast if the bast mode is compatible
with the preceding cast mode.  This happens when a remotely mastered
lock is demoted, e.g. EX->NL, in which case the local node queues
a cast immediately after sending the demote message.  In this way
a cast can be queued for a mode, e.g. NL, that makes an in-transit
bast extraneous.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c          | 74 ++++++++++++++++++++++++++++++++++++++++-----------
 fs/dlm/ast.h          |  4 +--
 fs/dlm/dlm_internal.h | 10 +++++--
 fs/dlm/lock.c         |  4 +--
 fs/dlm/user.c         | 10 ++++---
 fs/dlm/user.h         |  4 +--
 6 files changed, 78 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index dc2ad6008b2d..4314f0d48d85 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
 	spin_unlock(&ast_queue_lock);
 }
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
 {
 	if (lkb->lkb_flags & DLM_IFL_USER) {
-		dlm_user_add_ast(lkb, type, bastmode);
+		dlm_user_add_ast(lkb, type, mode);
 		return;
 	}
 
@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+		lkb->lkb_ast_first = type;
 	}
+
+	/* sanity check, this should not happen */
+
+	if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
+		log_print("repeat cast %d castmode %d lock %x %s",
+			  mode, lkb->lkb_castmode,
+			  lkb->lkb_id, lkb->lkb_resource->res_name);
+
 	lkb->lkb_ast_type |= type;
-	if (bastmode)
-		lkb->lkb_bastmode = bastmode;
+	if (type == AST_BAST)
+		lkb->lkb_bastmode = mode;
+	else
+		lkb->lkb_castmode = mode;
 	spin_unlock(&ast_queue_lock);
 
 	set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,9 +70,9 @@ static void process_asts(void)
 	struct dlm_ls *ls = NULL;
 	struct dlm_rsb *r = NULL;
 	struct dlm_lkb *lkb;
-	void (*cast) (void *astparam);
-	void (*bast) (void *astparam, int mode);
-	int type = 0, bastmode;
+	void (*castfn) (void *astparam);
+	void (*bastfn) (void *astparam, int mode);
+	int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
 
 repeat:
 	spin_lock(&ast_queue_lock);
@@ -75,17 +86,48 @@ repeat:
 		list_del(&lkb->lkb_astqueue);
 		type = lkb->lkb_ast_type;
 		lkb->lkb_ast_type = 0;
+		first = lkb->lkb_ast_first;
+		lkb->lkb_ast_first = 0;
 		bastmode = lkb->lkb_bastmode;
-
+		castmode = lkb->lkb_castmode;
+		castfn = lkb->lkb_astfn;
+		bastfn = lkb->lkb_bastfn;
 		spin_unlock(&ast_queue_lock);
-		cast = lkb->lkb_astfn;
-		bast = lkb->lkb_bastfn;
-
-		if ((type & AST_COMP) && cast)
-			cast(lkb->lkb_astparam);
 
-		if ((type & AST_BAST) && bast)
-			bast(lkb->lkb_astparam, bastmode);
+		do_cast = (type & AST_COMP) && castfn;
+		do_bast = (type & AST_BAST) && bastfn;
+
+		/* Skip a bast if its blocking mode is compatible with the
+		   granted mode of the preceding cast. */
+
+		if (do_bast) {
+			if (first == AST_COMP)
+				last_castmode = castmode;
+			else
+				last_castmode = lkb->lkb_castmode_done;
+			if (dlm_modes_compat(bastmode, last_castmode))
+				do_bast = 0;
+		}
+
+		if (first == AST_COMP) {
+			if (do_cast)
+				castfn(lkb->lkb_astparam);
+			if (do_bast)
+				bastfn(lkb->lkb_astparam, bastmode);
+		} else if (first == AST_BAST) {
+			if (do_bast)
+				bastfn(lkb->lkb_astparam, bastmode);
+			if (do_cast)
+				castfn(lkb->lkb_astparam);
+		} else {
+			log_error(ls, "bad ast_first %d ast_type %d",
+				  first, type);
+		}
+
+		if (do_cast)
+			lkb->lkb_castmode_done = castmode;
+		if (do_bast)
+			lkb->lkb_bastmode_done = bastmode;
 
 		/* this removes the reference added by dlm_add_ast
 		   and may result in the lkb being freed */
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 1b5fc5f428fd..bcb1aaba519d 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
 void dlm_del_ast(struct dlm_lkb *lkb);
 
 void dlm_astd_wake(void);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 826d3dc6e0ab..f632b58cd222 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -232,11 +232,17 @@ struct dlm_lkb {
 	int8_t			lkb_status;     /* granted, waiting, convert */
 	int8_t			lkb_rqmode;	/* requested lock mode */
 	int8_t			lkb_grmode;	/* granted lock mode */
-	int8_t			lkb_bastmode;	/* requested mode */
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
+
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
 	int8_t			lkb_wait_count;
 	int8_t			lkb_ast_type;	/* type of ast queued for */
+	int8_t			lkb_ast_first;	/* type of first ast queued */
+
+	int8_t			lkb_bastmode;	/* req mode of queued bast */
+	int8_t			lkb_castmode;	/* gr mode of queued cast */
+	int8_t			lkb_bastmode_done; /* last delivered bastmode */
+	int8_t			lkb_castmode_done; /* last delivered castmode */
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
 	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 9c0c1db1e105..e08ea93432bc 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	lkb->lkb_lksb->sb_status = rv;
 	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
 
-	dlm_add_ast(lkb, AST_COMP, 0);
+	dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
 }
 
 static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index e73a4bb572aa..a4bfd31ac45b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2009 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -173,7 +173,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
 /* we could possibly check if the cancel of an orphan has resulted in the lkb
    being removed and then remove that lkb from the orphans list and free it */
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
@@ -206,8 +206,10 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 
 	ast_type = lkb->lkb_ast_type;
 	lkb->lkb_ast_type |= type;
-	if (bastmode)
-		lkb->lkb_bastmode = bastmode;
+	if (type == AST_BAST)
+		lkb->lkb_bastmode = mode;
+	else
+		lkb->lkb_castmode = mode;
 
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 1c9686492286..f196091dd7ff 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -9,7 +9,7 @@
 #ifndef __USER_DOT_H__
 #define __USER_DOT_H__
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
-- 
cgit v1.2.3


From 122ca0076e5f84fa12943f22f6586ff285670783 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 24 Feb 2010 21:56:48 +0000
Subject: [CIFS] Use unsigned ea length for clarity

Jeff correctly noted that using unsigned ea length is more intuitive.
CC: Jeff Lyaton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 99ae57d01d4f..b79ff68c47c7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -5392,7 +5392,7 @@ QAllEAsRetry:
 	temp_fea = ea_response_data->list;
 	temp_ptr = (char *)temp_fea;
 	while (list_len > 0) {
-		int name_len;
+		unsigned int name_len;
 		__u16 value_len;
 
 		list_len -= 4;
-- 
cgit v1.2.3


From d7b619cf56218704ffce9d510aa497f0a0bcda0b Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 25 Feb 2010 05:36:46 +0000
Subject: [CIFS] pSesInfo->sesSem is used as mutex. Rename it to session_mutex
 and convert it to a real mutex.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |  2 +-
 fs/cifs/cifssmb.c  | 12 ++++++------
 fs/cifs/connect.c  |  8 ++++----
 fs/cifs/misc.c     |  2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ed751bb657db..a1c817eb291a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -205,7 +205,7 @@ struct cifsUidInfo {
 struct cifsSesInfo {
 	struct list_head smb_ses_list;
 	struct list_head tcon_list;
-	struct semaphore sesSem;
+	struct mutex session_mutex;
 #if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
 #endif
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b79ff68c47c7..9d17df3e0768 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -170,19 +170,19 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * need to prevent multiple threads trying to simultaneously
 	 * reconnect the same SMB session
 	 */
-	down(&ses->sesSem);
+	mutex_lock(&ses->session_mutex);
 	if (ses->need_reconnect)
 		rc = cifs_setup_session(0, ses, nls_codepage);
 
 	/* do we need to reconnect tcon? */
 	if (rc || !tcon->need_reconnect) {
-		up(&ses->sesSem);
+		mutex_unlock(&ses->session_mutex);
 		goto out;
 	}
 
 	mark_open_files_invalid(tcon);
 	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
-	up(&ses->sesSem);
+	mutex_unlock(&ses->session_mutex);
 	cFYI(1, ("reconnect tcon rc = %d", rc));
 
 	if (rc)
@@ -700,13 +700,13 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	if (!ses || !ses->server)
 		return -EIO;
 
-	down(&ses->sesSem);
+	mutex_lock(&ses->session_mutex);
 	if (ses->need_reconnect)
 		goto session_already_dead; /* no need to send SMBlogoff if uid
 					      already closed due to reconnect */
 	rc = small_smb_init(SMB_COM_LOGOFF_ANDX, 2, NULL, (void **)&pSMB);
 	if (rc) {
-		up(&ses->sesSem);
+		mutex_unlock(&ses->session_mutex);
 		return rc;
 	}
 
@@ -721,7 +721,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (struct smb_hdr *) pSMB, 0);
 session_already_dead:
-	up(&ses->sesSem);
+	mutex_unlock(&ses->session_mutex);
 
 	/* if session dead then we do not need to do ulogoff,
 		since server closed smb session, no sense reporting
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2e9e09ca0e30..45eb6cba793f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2388,13 +2388,13 @@ try_mount_again:
 		 */
 		cifs_put_tcp_session(srvTcp);
 
-		down(&pSesInfo->sesSem);
+		mutex_lock(&pSesInfo->session_mutex);
 		if (pSesInfo->need_reconnect) {
 			cFYI(1, ("Session needs reconnect"));
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
 		}
-		up(&pSesInfo->sesSem);
+		mutex_unlock(&pSesInfo->session_mutex);
 	} else if (!rc) {
 		cFYI(1, ("Existing smb sess not found"));
 		pSesInfo = sesInfoAlloc();
@@ -2437,12 +2437,12 @@ try_mount_again:
 		}
 		pSesInfo->linux_uid = volume_info->linux_uid;
 		pSesInfo->overrideSecFlg = volume_info->secFlg;
-		down(&pSesInfo->sesSem);
+		mutex_lock(&pSesInfo->session_mutex);
 
 		/* BB FIXME need to pass vol->secFlgs BB */
 		rc = cifs_setup_session(xid, pSesInfo,
 					cifs_sb->local_nls);
-		up(&pSesInfo->sesSem);
+		mutex_unlock(&pSesInfo->session_mutex);
 	}
 
 	/* search for existing tcon to this server share */
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d27d4ec6579b..d1474996a812 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -79,7 +79,7 @@ sesInfoAlloc(void)
 		++ret_buf->ses_count;
 		INIT_LIST_HEAD(&ret_buf->smb_ses_list);
 		INIT_LIST_HEAD(&ret_buf->tcon_list);
-		init_MUTEX(&ret_buf->sesSem);
+		mutex_init(&ret_buf->session_mutex);
 	}
 	return ret_buf;
 }
-- 
cgit v1.2.3


From 7dc52157982ab771f40e3c0b7dc55b954c3c2d19 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 22 Feb 2010 17:04:52 -0800
Subject: vfs: Apply lockdep-based checking to rcu_dereference() uses

Add lockdep-ified RCU primitives to alloc_fd(), files_fdtable()
and fcheck_files().

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
LKML-Reference: <1266887105-1528-8-git-send-email-paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/file.c               | 2 +-
 fs/proc/array.c         | 2 ++
 fs/proc/base.c          | 6 +++++-
 include/linux/fdtable.h | 8 ++++++--
 4 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 87e129030ab1..38039af67663 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -478,7 +478,7 @@ repeat:
 	error = fd;
 #if 1
 	/* Sanity check */
-	if (rcu_dereference(fdt->fd[fd]) != NULL) {
+	if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
 		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
 		rcu_assign_pointer(fdt->fd[fd], NULL);
 	}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 13b5d0708175..18e20feee251 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -270,7 +270,9 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		blocked = p->blocked;
 		collect_sigign_sigcatch(p, &ignored, &caught);
 		num_threads = atomic_read(&p->signal->count);
+		rcu_read_lock();  /* FIXME: is this correct? */
 		qsize = atomic_read(&__task_cred(p)->user->sigpending);
+		rcu_read_unlock();
 		qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
 		unlock_task_sighand(p, &flags);
 	}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 58324c299165..623e2ffb5d2b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1095,8 +1095,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	if (!capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 
-	if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
+	rcu_read_lock();
+	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
+		rcu_read_unlock();
 		return -EPERM;
+	}
+	rcu_read_unlock();
 
 	if (count >= PAGE_SIZE)
 		count = PAGE_SIZE - 1;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index a2ec74bc4812..144412ffaced 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -57,7 +57,11 @@ struct files_struct {
 	struct file * fd_array[NR_OPEN_DEFAULT];
 };
 
-#define files_fdtable(files) (rcu_dereference((files)->fdt))
+#define files_fdtable(files) \
+	(rcu_dereference_check((files)->fdt, \
+			       rcu_read_lock_held() || \
+			       lockdep_is_held(&(files)->file_lock) || \
+			       atomic_read(&files->count) == 1))
 
 struct file_operations;
 struct vfsmount;
@@ -78,7 +82,7 @@ static inline struct file * fcheck_files(struct files_struct *files, unsigned in
 	struct fdtable *fdt = files_fdtable(files);
 
 	if (fd < fdt->max_fds)
-		file = rcu_dereference(fdt->fd[fd]);
+		file = rcu_dereference_check(fdt->fd[fd], rcu_read_lock_held() || lockdep_is_held(&files->file_lock) || atomic_read(&files->count) == 1);
 	return file;
 }
 
-- 
cgit v1.2.3


From 8a78362c4eefc1deddbefe2c7f38aabbc2429d6b Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 26 Feb 2010 00:20:39 -0500
Subject: block: Consolidate phys_segment and hw_segment limits

Except for SCSI no device drivers distinguish between physical and
hardware segment limits.  Consolidate the two into a single segment
limit.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/um/drivers/ubd_kern.c          |  2 +-
 block/blk-core.c                    |  3 +-
 block/blk-merge.c                   |  8 ++---
 block/blk-settings.c                | 60 ++++++++-----------------------------
 drivers/ata/sata_nv.c               |  2 +-
 drivers/block/DAC960.c              |  2 +-
 drivers/block/cciss.c               |  5 +---
 drivers/block/cpqarray.c            |  5 +---
 drivers/block/drbd/drbd_nl.c        |  3 +-
 drivers/block/paride/pf.c           |  3 +-
 drivers/block/pktcdvd.c             |  4 +--
 drivers/block/ps3disk.c             |  3 +-
 drivers/block/ps3vram.c             |  3 +-
 drivers/block/sunvdc.c              |  3 +-
 drivers/block/sx8.c                 |  3 +-
 drivers/block/ub.c                  |  3 +-
 drivers/block/viodasd.c             |  3 +-
 drivers/block/xen-blkfront.c        |  3 +-
 drivers/cdrom/gdrom.c               |  2 +-
 drivers/cdrom/viocd.c               |  3 +-
 drivers/ide/ide-probe.c             |  3 +-
 drivers/md/raid5.c                  |  2 +-
 drivers/memstick/core/mspro_block.c |  3 +-
 drivers/message/i2o/i2o_block.c     |  3 +-
 drivers/mmc/card/queue.c            |  6 ++--
 drivers/s390/block/dasd.c           |  3 +-
 drivers/s390/char/tape_block.c      |  3 +-
 drivers/scsi/ibmvscsi/ibmvfc.c      |  4 +--
 drivers/scsi/scsi_lib.c             |  4 +--
 drivers/scsi/sg.c                   |  6 ++--
 drivers/scsi/st.c                   |  3 +-
 drivers/staging/hv/blkvsc_drv.c     |  5 +---
 fs/bio.c                            |  9 ++----
 include/linux/blkdev.h              | 27 ++++++++++-------
 include/linux/i2o.h                 |  2 +-
 35 files changed, 70 insertions(+), 136 deletions(-)

(limited to 'fs')

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index c2051b0737cb..c1ff6903b622 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -849,7 +849,7 @@ static int ubd_add(int n, char **error_out)
 	}
 	ubd_dev->queue->queuedata = ubd_dev;
 
-	blk_queue_max_hw_segments(ubd_dev->queue, MAX_SG);
+	blk_queue_max_segments(ubd_dev->queue, MAX_SG);
 	err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
 	if(err){
 		*error_out = "Failed to register device";
diff --git a/block/blk-core.c b/block/blk-core.c
index 36c0deebc2dc..9fe174dc74d1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1614,8 +1614,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 	 * limitation.
 	 */
 	blk_recalc_rq_segments(rq);
-	if (rq->nr_phys_segments > queue_max_phys_segments(q) ||
-	    rq->nr_phys_segments > queue_max_hw_segments(q)) {
+	if (rq->nr_phys_segments > queue_max_segments(q)) {
 		printk(KERN_ERR "%s: over max segments limit.\n", __func__);
 		return -EIO;
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 99cb5cf1f447..5e7dc9973458 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -206,8 +206,7 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 {
 	int nr_phys_segs = bio_phys_segments(q, bio);
 
-	if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) ||
-	    req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) {
+	if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -300,10 +299,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 		total_phys_segments--;
 	}
 
-	if (total_phys_segments > queue_max_phys_segments(q))
-		return 0;
-
-	if (total_phys_segments > queue_max_hw_segments(q))
+	if (total_phys_segments > queue_max_segments(q))
 		return 0;
 
 	/* Merge is OK... */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 61afae9dbc6d..31e7a9375c13 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,8 +91,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
  */
 void blk_set_default_limits(struct queue_limits *lim)
 {
-	lim->max_phys_segments = MAX_PHYS_SEGMENTS;
-	lim->max_hw_segments = MAX_HW_SEGMENTS;
+	lim->max_segments = BLK_MAX_SEGMENTS;
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
 	lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@ -252,17 +251,15 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
 EXPORT_SYMBOL(blk_queue_max_discard_sectors);
 
 /**
- * blk_queue_max_phys_segments - set max phys segments for a request for this queue
+ * blk_queue_max_segments - set max hw segments for a request for this queue
  * @q:  the request queue for the device
  * @max_segments:  max number of segments
  *
  * Description:
  *    Enables a low level driver to set an upper limit on the number of
- *    physical data segments in a request.  This would be the largest sized
- *    scatter list the driver could handle.
+ *    hw data segments in a request.
  **/
-void blk_queue_max_phys_segments(struct request_queue *q,
-				 unsigned short max_segments)
+void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
 {
 	if (!max_segments) {
 		max_segments = 1;
@@ -270,33 +267,9 @@ void blk_queue_max_phys_segments(struct request_queue *q,
 		       __func__, max_segments);
 	}
 
-	q->limits.max_phys_segments = max_segments;
+	q->limits.max_segments = max_segments;
 }
-EXPORT_SYMBOL(blk_queue_max_phys_segments);
-
-/**
- * blk_queue_max_hw_segments - set max hw segments for a request for this queue
- * @q:  the request queue for the device
- * @max_segments:  max number of segments
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the number of
- *    hw data segments in a request.  This would be the largest number of
- *    address/length pairs the host adapter can actually give at once
- *    to the device.
- **/
-void blk_queue_max_hw_segments(struct request_queue *q,
-			       unsigned short max_segments)
-{
-	if (!max_segments) {
-		max_segments = 1;
-		printk(KERN_INFO "%s: set to minimum %d\n",
-		       __func__, max_segments);
-	}
-
-	q->limits.max_hw_segments = max_segments;
-}
-EXPORT_SYMBOL(blk_queue_max_hw_segments);
+EXPORT_SYMBOL(blk_queue_max_segments);
 
 /**
  * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
@@ -531,11 +504,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
 
-	t->max_phys_segments = min_not_zero(t->max_phys_segments,
-					    b->max_phys_segments);
-
-	t->max_hw_segments = min_not_zero(t->max_hw_segments,
-					  b->max_hw_segments);
+	t->max_segments = min_not_zero(t->max_segments, b->max_segments);
 
 	t->max_segment_size = min_not_zero(t->max_segment_size,
 					   b->max_segment_size);
@@ -739,22 +708,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad);
  * does is adjust the queue so that the buf is always appended
  * silently to the scatterlist.
  *
- * Note: This routine adjusts max_hw_segments to make room for
- * appending the drain buffer.  If you call
- * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after
- * calling this routine, you must set the limit to one fewer than your
- * device can support otherwise there won't be room for the drain
- * buffer.
+ * Note: This routine adjusts max_hw_segments to make room for appending
+ * the drain buffer.  If you call blk_queue_max_segments() after calling
+ * this routine, you must set the limit to one fewer than your device
+ * can support otherwise there won't be room for the drain buffer.
  */
 int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size)
 {
-	if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2)
+	if (queue_max_segments(q) < 2)
 		return -EINVAL;
 	/* make room for appending the drain */
-	blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1);
-	blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1);
+	blk_queue_max_segments(q, queue_max_segments(q) - 1);
 	q->dma_drain_needed = dma_drain_needed;
 	q->dma_drain_buffer = buf;
 	q->dma_drain_size = size;
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 0c82d335c55d..684fe04dbbb7 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -772,7 +772,7 @@ static int nv_adma_slave_config(struct scsi_device *sdev)
 	}
 
 	blk_queue_segment_boundary(sdev->request_queue, segment_boundary);
-	blk_queue_max_hw_segments(sdev->request_queue, sg_tablesize);
+	blk_queue_max_segments(sdev->request_queue, sg_tablesize);
 	ata_port_printk(ap, KERN_INFO,
 		"DMA mask 0x%llX, segment boundary 0x%lX, hw segs %hu\n",
 		(unsigned long long)*ap->host->dev->dma_mask,
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 1c0cd35e1913..459f1bc25a7b 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -2534,7 +2534,7 @@ static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
   	blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit);
   	RequestQueue->queuedata = Controller;
   	blk_queue_max_hw_segments(RequestQueue, Controller->DriverScatterGatherLimit);
-	blk_queue_max_phys_segments(RequestQueue, Controller->DriverScatterGatherLimit);
+	blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit);
 	blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
 	disk->queue = RequestQueue;
 	sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 030e52d72254..a29e69418a03 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1797,10 +1797,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 	blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
 
 	/* This is a hardware imposed limit. */
-	blk_queue_max_hw_segments(disk->queue, h->maxsgentries);
-
-	/* This is a limit in the driver and could be eliminated. */
-	blk_queue_max_phys_segments(disk->queue, h->maxsgentries);
+	blk_queue_max_segments(disk->queue, h->maxsgentries);
 
 	blk_queue_max_hw_sectors(disk->queue, h->cciss_max_sectors);
 
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 6422651ec364..91d11631cec9 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -448,11 +448,8 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev)
 		blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask);
 
 	/* This is a hardware imposed limit. */
-	blk_queue_max_hw_segments(q, SG_MAX);
+	blk_queue_max_segments(q, SG_MAX);
 
-	/* This is a driver limit and could be eliminated. */
-	blk_queue_max_phys_segments(q, SG_MAX);
-	
 	init_timer(&hba[i]->timer);
 	hba[i]->timer.expires = jiffies + IDA_TIMER;
 	hba[i]->timer.data = (unsigned long)hba[i];
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9b55e64196fc..4df3b40b1057 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -710,8 +710,7 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
 	max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
 
 	blk_queue_max_hw_sectors(q, max_seg_s >> 9);
-	blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS);
-	blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS);
+	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
 	blk_queue_max_segment_size(q, max_seg_s);
 	blk_queue_logical_block_size(q, 512);
 	blk_queue_segment_boundary(q, PAGE_SIZE-1);
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index ea54ea393553..ddb4f9abd480 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -956,8 +956,7 @@ static int __init pf_init(void)
 		return -ENOMEM;
 	}
 
-	blk_queue_max_phys_segments(pf_queue, cluster);
-	blk_queue_max_hw_segments(pf_queue, cluster);
+	blk_queue_max_segments(pf_queue, cluster);
 
 	for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
 		struct gendisk *disk = pf->disk;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 6e1daa24da2f..b72935b8f203 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -950,14 +950,14 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
 {
 	if ((pd->settings.size << 9) / CD_FRAMESIZE
-	    <= queue_max_phys_segments(q)) {
+	    <= queue_max_segments(q)) {
 		/*
 		 * The cdrom device can handle one segment/frame
 		 */
 		clear_bit(PACKET_MERGE_SEGS, &pd->flags);
 		return 0;
 	} else if ((pd->settings.size << 9) / PAGE_SIZE
-		   <= queue_max_phys_segments(q)) {
+		   <= queue_max_segments(q)) {
 		/*
 		 * We can handle this case at the expense of some extra memory
 		 * copies during write operations
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 9cd1a4a542b8..bc95469d33c1 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -482,8 +482,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH,
 			  ps3disk_prepare_flush);
 
-	blk_queue_max_phys_segments(queue, -1);
-	blk_queue_max_hw_segments(queue, -1);
+	blk_queue_max_segments(queue, -1);
 	blk_queue_max_segment_size(queue, dev->bounce_size);
 
 	gendisk = alloc_disk(PS3DISK_MINORS);
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 6416b262934b..83ebb390b164 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -751,8 +751,7 @@ static int __devinit ps3vram_probe(struct ps3_system_bus_device *dev)
 	priv->queue = queue;
 	queue->queuedata = dev;
 	blk_queue_make_request(queue, ps3vram_make_request);
-	blk_queue_max_phys_segments(queue, MAX_PHYS_SEGMENTS);
-	blk_queue_max_hw_segments(queue, MAX_HW_SEGMENTS);
+	blk_queue_max_segments(queue, BLK_MAX_HW_SEGMENTS);
 	blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE);
 	blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS);
 
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index dd30cddd0f7f..48e8fee9f2d4 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -691,8 +691,7 @@ static int probe_disk(struct vdc_port *port)
 
 	port->disk = g;
 
-	blk_queue_max_hw_segments(q, port->ring_cookies);
-	blk_queue_max_phys_segments(q, port->ring_cookies);
+	blk_queue_max_segments(q, port->ring_cookies);
 	blk_queue_max_hw_sectors(q, port->max_xfer_size);
 	g->major = vdc_major;
 	g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT;
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 7bd7b2f83116..b70f0fca9a42 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -1518,8 +1518,7 @@ static int carm_init_disks(struct carm_host *host)
 			break;
 		}
 		disk->queue = q;
-		blk_queue_max_hw_segments(q, CARM_MAX_REQ_SG);
-		blk_queue_max_phys_segments(q, CARM_MAX_REQ_SG);
+		blk_queue_max_segments(q, CARM_MAX_REQ_SG);
 		blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
 
 		q->queuedata = port;
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 352ea24d66e8..2e889838e819 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -2320,8 +2320,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum)
 	disk->queue = q;
 
 	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
-	blk_queue_max_hw_segments(q, UB_MAX_REQ_SG);
-	blk_queue_max_phys_segments(q, UB_MAX_REQ_SG);
+	blk_queue_max_segments(q, UB_MAX_REQ_SG);
 	blk_queue_segment_boundary(q, 0xffffffff);	/* Dubious. */
 	blk_queue_max_hw_sectors(q, UB_MAX_SECTORS);
 	blk_queue_logical_block_size(q, lun->capacity.bsize);
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index d44ece7d7b7c..c12b31362ac6 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -471,8 +471,7 @@ retry:
 	}
 
 	d->disk = g;
-	blk_queue_max_hw_segments(q, VIOMAXBLOCKDMA);
-	blk_queue_max_phys_segments(q, VIOMAXBLOCKDMA);
+	blk_queue_max_segments(q, VIOMAXBLOCKDMA);
 	blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS);
 	g->major = VIODASD_MAJOR;
 	g->first_minor = dev_no << PARTITION_SHIFT;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index f9861aaa1fef..9c09694b2520 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -353,8 +353,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 	blk_queue_max_segment_size(rq, PAGE_SIZE);
 
 	/* Ensure a merged request will fit in a single I/O ring slot. */
-	blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
 
 	/* Make sure buffer addresses are sector-aligned. */
 	blk_queue_dma_alignment(rq, 511);
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index e789e6c9a422..03c71f7698cb 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -741,7 +741,7 @@ static int __devinit probe_gdrom_setupqueue(void)
 {
 	blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
 	/* using DMA so memory will need to be contiguous */
-	blk_queue_max_hw_segments(gd.gdrom_rq, 1);
+	blk_queue_max_segments(gd.gdrom_rq, 1);
 	/* set a large max size to get most from DMA */
 	blk_queue_max_segment_size(gd.gdrom_rq, 0x40000);
 	gd.disk->queue = gd.gdrom_rq;
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index b1dfd23eb832..cc435be0bc13 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -616,8 +616,7 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	gendisk->first_minor = deviceno;
 	strncpy(gendisk->disk_name, c->name,
 			sizeof(gendisk->disk_name));
-	blk_queue_max_hw_segments(q, 1);
-	blk_queue_max_phys_segments(q, 1);
+	blk_queue_max_segments(q, 1);
 	blk_queue_max_hw_sectors(q, 4096 / 512);
 	gendisk->queue = q;
 	gendisk->fops = &viocd_fops;
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 1ec8b31277bd..f8c1ae6ad74c 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -790,8 +790,7 @@ static int ide_init_queue(ide_drive_t *drive)
 		max_sg_entries >>= 1;
 #endif /* CONFIG_PCI */
 
-	blk_queue_max_hw_segments(q, max_sg_entries);
-	blk_queue_max_phys_segments(q, max_sg_entries);
+	blk_queue_max_segments(q, max_sg_entries);
 
 	/* assign drive queue */
 	drive->queue = q;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ceb24afdc147..509c8f3dd9a5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3739,7 +3739,7 @@ static int bio_fits_rdev(struct bio *bi)
 	if ((bi->bi_size>>9) > queue_max_sectors(q))
 		return 0;
 	blk_recount_segments(q, bi);
-	if (bi->bi_phys_segments > queue_max_phys_segments(q))
+	if (bi->bi_phys_segments > queue_max_segments(q))
 		return 0;
 
 	if (q->merge_bvec_fn)
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 44d4178c4c15..972b87069d55 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1227,8 +1227,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
 
 	blk_queue_bounce_limit(msb->queue, limit);
 	blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES);
-	blk_queue_max_phys_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
-	blk_queue_max_hw_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
+	blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
 	blk_queue_max_segment_size(msb->queue,
 				   MSPRO_BLOCK_MAX_PAGES * msb->page_size);
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index d033cfdb516f..2658b1484a2c 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -1065,9 +1065,8 @@ static int i2o_block_probe(struct device *dev)
 	queue = gd->queue;
 	queue->queuedata = i2o_blk_dev;
 
-	blk_queue_max_phys_segments(queue, I2O_MAX_PHYS_SEGMENTS);
 	blk_queue_max_hw_sectors(queue, max_sectors);
-	blk_queue_max_hw_segments(queue, i2o_sg_tablesize(c, body_size));
+	blk_queue_max_segments(queue, i2o_sg_tablesize(c, body_size));
 
 	osm_debug("max sectors = %d\n", queue->max_sectors);
 	osm_debug("phys segments = %d\n", queue->max_phys_segments);
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 09b633d5657b..381fe032caa1 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -155,8 +155,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock
 		if (mq->bounce_buf) {
 			blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY);
 			blk_queue_max_hw_sectors(mq->queue, bouncesz / 512);
-			blk_queue_max_phys_segments(mq->queue, bouncesz / 512);
-			blk_queue_max_hw_segments(mq->queue, bouncesz / 512);
+			blk_queue_max_segments(mq->queue, bouncesz / 512);
 			blk_queue_max_segment_size(mq->queue, bouncesz);
 
 			mq->sg = kmalloc(sizeof(struct scatterlist),
@@ -182,8 +181,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock
 		blk_queue_bounce_limit(mq->queue, limit);
 		blk_queue_max_hw_sectors(mq->queue,
 			min(host->max_blk_count, host->max_req_size / 512));
-		blk_queue_max_phys_segments(mq->queue, host->max_phys_segs);
-		blk_queue_max_hw_segments(mq->queue, host->max_hw_segs);
+		blk_queue_max_segments(mq->queue, host->max_hw_segs);
 		blk_queue_max_segment_size(mq->queue, host->max_seg_size);
 
 		mq->sg = kmalloc(sizeof(struct scatterlist) *
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 14b1e25b9dcf..8831e9308d05 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2130,8 +2130,7 @@ static void dasd_setup_queue(struct dasd_block *block)
 	blk_queue_logical_block_size(block->request_queue, block->bp_block);
 	max = block->base->discipline->max_blocks << block->s2b_shift;
 	blk_queue_max_hw_sectors(block->request_queue, max);
-	blk_queue_max_phys_segments(block->request_queue, -1L);
-	blk_queue_max_hw_segments(block->request_queue, -1L);
+	blk_queue_max_segments(block->request_queue, -1L);
 	/* with page sized segments we can translate each segement into
 	 * one idaw/tidaw
 	 */
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 509ed056fddd..097da8ce6be6 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -223,8 +223,7 @@ tapeblock_setup_device(struct tape_device * device)
 
 	blk_queue_logical_block_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE);
 	blk_queue_max_hw_sectors(blkdat->request_queue, TAPEBLOCK_MAX_SEC);
-	blk_queue_max_phys_segments(blkdat->request_queue, -1L);
-	blk_queue_max_hw_segments(blkdat->request_queue, -1L);
+	blk_queue_max_segments(blkdat->request_queue, -1L);
 	blk_queue_max_segment_size(blkdat->request_queue, -1L);
 	blk_queue_segment_boundary(blkdat->request_queue, -1L);
 
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 87b536a97cb4..732f6d35b4a8 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -4195,7 +4195,7 @@ static void ibmvfc_tgt_add_rport(struct ibmvfc_target *tgt)
 		if (tgt->service_parms.class3_parms[0] & 0x80000000)
 			rport->supported_classes |= FC_COS_CLASS3;
 		if (rport->rqst_q)
-			blk_queue_max_hw_segments(rport->rqst_q, 1);
+			blk_queue_max_segments(rport->rqst_q, 1);
 	} else
 		tgt_dbg(tgt, "rport add failed\n");
 	spin_unlock_irqrestore(vhost->host->host_lock, flags);
@@ -4669,7 +4669,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	}
 
 	if (shost_to_fc_host(shost)->rqst_q)
-		blk_queue_max_hw_segments(shost_to_fc_host(shost)->rqst_q, 1);
+		blk_queue_max_segments(shost_to_fc_host(shost)->rqst_q, 1);
 	dev_set_drvdata(dev, vhost);
 	spin_lock(&ibmvfc_driver_lock);
 	list_add_tail(&vhost->queue, &ibmvfc_head);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ac3cca74bdfb..f8fbf47377ae 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1624,8 +1624,8 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
 	/*
 	 * this limit is imposed by hardware restrictions
 	 */
-	blk_queue_max_hw_segments(q, shost->sg_tablesize);
-	blk_queue_max_phys_segments(q, SCSI_MAX_SG_CHAIN_SEGMENTS);
+	blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
+					SCSI_MAX_SG_CHAIN_SEGMENTS));
 
 	blk_queue_max_hw_sectors(q, shost->max_sectors);
 	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 040f751809ea..c996d98636f3 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -287,8 +287,7 @@ sg_open(struct inode *inode, struct file *filp)
 	if (list_empty(&sdp->sfds)) {	/* no existing opens on this device */
 		sdp->sgdebug = 0;
 		q = sdp->device->request_queue;
-		sdp->sg_tablesize = min(queue_max_hw_segments(q),
-					queue_max_phys_segments(q));
+		sdp->sg_tablesize = queue_max_segments(q);
 	}
 	if ((sfp = sg_add_sfp(sdp, dev)))
 		filp->private_data = sfp;
@@ -1376,8 +1375,7 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp)
 	sdp->device = scsidp;
 	INIT_LIST_HEAD(&sdp->sfds);
 	init_waitqueue_head(&sdp->o_excl_wait);
-	sdp->sg_tablesize = min(queue_max_hw_segments(q),
-				queue_max_phys_segments(q));
+	sdp->sg_tablesize = queue_max_segments(q);
 	sdp->index = k;
 	kref_init(&sdp->d_ref);
 
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index d04ea9a6f673..f67d1a159aad 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -3983,8 +3983,7 @@ static int st_probe(struct device *dev)
 		return -ENODEV;
 	}
 
-	i = min(queue_max_hw_segments(SDp->request_queue),
-		queue_max_phys_segments(SDp->request_queue));
+	i = queue_max_segments(SDp->request_queue);
 	if (st_max_sg_segs < i)
 		i = st_max_sg_segs;
 	buffer = new_tape_buffer((SDp->host)->unchecked_isa_dma, i);
diff --git a/drivers/staging/hv/blkvsc_drv.c b/drivers/staging/hv/blkvsc_drv.c
index 62b282844a53..45d908114d11 100644
--- a/drivers/staging/hv/blkvsc_drv.c
+++ b/drivers/staging/hv/blkvsc_drv.c
@@ -363,10 +363,7 @@ static int blkvsc_probe(struct device *device)
 	blkdev->gd->queue = blk_init_queue(blkvsc_request, &blkdev->lock);
 
 	blk_queue_max_segment_size(blkdev->gd->queue, PAGE_SIZE);
-	blk_queue_max_phys_segments(blkdev->gd->queue,
-				    MAX_MULTIPAGE_BUFFER_COUNT);
-	blk_queue_max_hw_segments(blkdev->gd->queue,
-				  MAX_MULTIPAGE_BUFFER_COUNT);
+	blk_queue_max_segments(blkdev->gd->queue, MAX_MULTIPAGE_BUFFER_COUNT);
 	blk_queue_segment_boundary(blkdev->gd->queue, PAGE_SIZE-1);
 	blk_queue_bounce_limit(blkdev->gd->queue, BLK_BOUNCE_ANY);
 	blk_queue_dma_alignment(blkdev->gd->queue, 511);
diff --git a/fs/bio.c b/fs/bio.c
index 88094afc29ea..dc17afd672e3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -507,10 +507,8 @@ int bio_get_nr_vecs(struct block_device *bdev)
 	int nr_pages;
 
 	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (nr_pages > queue_max_phys_segments(q))
-		nr_pages = queue_max_phys_segments(q);
-	if (nr_pages > queue_max_hw_segments(q))
-		nr_pages = queue_max_hw_segments(q);
+	if (nr_pages > queue_max_segments(q))
+		nr_pages = queue_max_segments(q);
 
 	return nr_pages;
 }
@@ -575,8 +573,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 * make this too complex.
 	 */
 
-	while (bio->bi_phys_segments >= queue_max_phys_segments(q)
-	       || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
+	while (bio->bi_phys_segments >= queue_max_segments(q)) {
 
 		if (retried_segments)
 			return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 57bc48446e92..ebd22dbed861 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -316,8 +316,7 @@ struct queue_limits {
 	unsigned int		discard_alignment;
 
 	unsigned short		logical_block_size;
-	unsigned short		max_hw_segments;
-	unsigned short		max_phys_segments;
+	unsigned short		max_segments;
 
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
@@ -929,8 +928,19 @@ static inline void blk_queue_max_sectors(struct request_queue *q, unsigned int m
 	blk_queue_max_hw_sectors(q, max);
 }
 
-extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
-extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
+extern void blk_queue_max_segments(struct request_queue *, unsigned short);
+
+static inline void blk_queue_max_phys_segments(struct request_queue *q, unsigned short max)
+{
+	blk_queue_max_segments(q, max);
+}
+
+static inline void blk_queue_max_hw_segments(struct request_queue *q, unsigned short max)
+{
+	blk_queue_max_segments(q, max);
+}
+
+
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
@@ -1055,14 +1065,9 @@ static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
 	return q->limits.max_hw_sectors;
 }
 
-static inline unsigned short queue_max_hw_segments(struct request_queue *q)
-{
-	return q->limits.max_hw_segments;
-}
-
-static inline unsigned short queue_max_phys_segments(struct request_queue *q)
+static inline unsigned short queue_max_segments(struct request_queue *q)
 {
-	return q->limits.max_phys_segments;
+	return q->limits.max_segments;
 }
 
 static inline unsigned int queue_max_segment_size(struct request_queue *q)
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 4c4e57d1f19d..87018dc5527d 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -385,7 +385,7 @@
 /* defines for max_sectors and max_phys_segments */
 #define I2O_MAX_SECTORS			1024
 #define I2O_MAX_SECTORS_LIMITED		128
-#define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
+#define I2O_MAX_PHYS_SEGMENTS		BLK_MAX_SEGMENTS
 
 /*
  *	Message structures
-- 
cgit v1.2.3


From cf6620acc0f6fac57968aafef79ab372bdcf6157 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 24 Feb 2010 11:59:23 -0600
Subject: dlm: send reply before bast

When the lock master processes a successful operation (request,
convert, cancel, or unlock), it will process the effects of the
change before sending the reply for the operation.  The "effects"
of the operation are:

- blocking callbacks (basts) for any newly granted locks
- waiting or converting locks that can now be granted

The cast is queued on the local node when the reply from the lock
master is received.  This means that a lock holder can receive a
bast for a lock mode that is doesn't yet know has been granted.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 110 ++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 84 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e08ea93432bc..d0e43a3da887 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2280,20 +2280,30 @@ static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_queued(lkb)) {
 		error = -EINPROGRESS;
 		add_lkb(r, lkb, DLM_LKSTS_WAITING);
-		send_blocking_asts(r, lkb);
 		add_timeout(lkb);
 		goto out;
 	}
 
 	error = -EAGAIN;
-	if (force_blocking_asts(lkb))
-		send_blocking_asts_all(r, lkb);
 	queue_cast(r, lkb, -EAGAIN);
-
  out:
 	return error;
 }
 
+static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
 static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error = 0;
@@ -2304,7 +2314,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	if (can_be_granted(r, lkb, 1, &deadlk)) {
 		grant_lock(r, lkb);
 		queue_cast(r, lkb, 0);
-		grant_pending_locks(r);
 		goto out;
 	}
 
@@ -2334,7 +2343,6 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		if (_can_be_granted(r, lkb, 1)) {
 			grant_lock(r, lkb);
 			queue_cast(r, lkb, 0);
-			grant_pending_locks(r);
 			goto out;
 		}
 		/* else fall through and move to convert queue */
@@ -2344,28 +2352,47 @@ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		error = -EINPROGRESS;
 		del_lkb(r, lkb);
 		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
-		send_blocking_asts(r, lkb);
 		add_timeout(lkb);
 		goto out;
 	}
 
 	error = -EAGAIN;
-	if (force_blocking_asts(lkb))
-		send_blocking_asts_all(r, lkb);
 	queue_cast(r, lkb, -EAGAIN);
-
  out:
 	return error;
 }
 
+static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			       int error)
+{
+	switch (error) {
+	case 0:
+		grant_pending_locks(r);
+		/* grant_pending_locks also sends basts */
+		break;
+	case -EAGAIN:
+		if (force_blocking_asts(lkb))
+			send_blocking_asts_all(r, lkb);
+		break;
+	case -EINPROGRESS:
+		send_blocking_asts(r, lkb);
+		break;
+	}
+}
+
 static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	remove_lock(r, lkb);
 	queue_cast(r, lkb, -DLM_EUNLOCK);
-	grant_pending_locks(r);
 	return -DLM_EUNLOCK;
 }
 
+static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	grant_pending_locks(r);
+}
+
 /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
  
 static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -2375,12 +2402,18 @@ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
 	error = revert_lock(r, lkb);
 	if (error) {
 		queue_cast(r, lkb, -DLM_ECANCEL);
-		grant_pending_locks(r);
 		return -DLM_ECANCEL;
 	}
 	return 0;
 }
 
+static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			      int error)
+{
+	if (error)
+		grant_pending_locks(r);
+}
+
 /*
  * Four stage 3 varieties:
  * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
@@ -2402,11 +2435,15 @@ static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 		goto out;
 	}
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_request() calls do_request() on remote node */
 		error = send_request(r, lkb);
-	else
+	} else {
 		error = do_request(r, lkb);
+		/* for remote locks the request_reply is sent
+		   between do_request and do_request_effects */
+		do_request_effects(r, lkb, error);
+	}
  out:
 	return error;
 }
@@ -2417,11 +2454,15 @@ static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_convert() calls do_convert() on remote node */
 		error = send_convert(r, lkb);
-	else
+	} else {
 		error = do_convert(r, lkb);
+		/* for remote locks the convert_reply is sent
+		   between do_convert and do_convert_effects */
+		do_convert_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2432,11 +2473,15 @@ static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_unlock() calls do_unlock() on remote node */
 		error = send_unlock(r, lkb);
-	else
+	} else {
 		error = do_unlock(r, lkb);
+		/* for remote locks the unlock_reply is sent
+		   between do_unlock and do_unlock_effects */
+		do_unlock_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -2447,11 +2492,15 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
 {
 	int error;
 
-	if (is_remote(r))
+	if (is_remote(r)) {
 		/* receive_cancel() calls do_cancel() on remote node */
 		error = send_cancel(r, lkb);
-	else
+	} else {
 		error = do_cancel(r, lkb);
+		/* for remote locks the cancel_reply is sent
+		   between do_cancel and do_cancel_effects */
+		do_cancel_effects(r, lkb, error);
+	}
 
 	return error;
 }
@@ -3191,6 +3240,7 @@ static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
 	attach_lkb(r, lkb);
 	error = do_request(r, lkb);
 	send_request_reply(r, lkb, error);
+	do_request_effects(r, lkb, error);
 
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3226,15 +3276,19 @@ static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
 		goto out;
 
 	receive_flags(lkb, ms);
+
 	error = receive_convert_args(ls, lkb, ms);
-	if (error)
-		goto out_reply;
+	if (error) {
+		send_convert_reply(r, lkb, error);
+		goto out;
+	}
+
 	reply = !down_conversion(lkb);
 
 	error = do_convert(r, lkb);
- out_reply:
 	if (reply)
 		send_convert_reply(r, lkb, error);
+	do_convert_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3266,13 +3320,16 @@ static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
 		goto out;
 
 	receive_flags(lkb, ms);
+
 	error = receive_unlock_args(ls, lkb, ms);
-	if (error)
-		goto out_reply;
+	if (error) {
+		send_unlock_reply(r, lkb, error);
+		goto out;
+	}
 
 	error = do_unlock(r, lkb);
- out_reply:
 	send_unlock_reply(r, lkb, error);
+	do_unlock_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
@@ -3307,6 +3364,7 @@ static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
 
 	error = do_cancel(r, lkb);
 	send_cancel_reply(r, lkb, error);
+	do_cancel_effects(r, lkb, error);
  out:
 	unlock_rsb(r);
 	put_rsb(r);
-- 
cgit v1.2.3


From b4a5d4bc377e49239374f266f0a0e2772c29749c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 17 Feb 2010 09:41:34 +0000
Subject: dlm: Send lockspace name with uevents

Although it is possible to get this information from the path,
its much easier to provide the lockspace as a seperate env
variable.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lockspace.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index c010ecfc0d29..26a8bd40400a 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -191,6 +191,18 @@ static int do_uevent(struct dlm_ls *ls, int in)
 	return error;
 }
 
+static int dlm_uevent(struct kset *kset, struct kobject *kobj,
+		      struct kobj_uevent_env *env)
+{
+	struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
+
+	add_uevent_var(env, "LOCKSPACE=%s", ls->ls_name);
+	return 0;
+}
+
+static struct kset_uevent_ops dlm_uevent_ops = {
+	.uevent = dlm_uevent,
+};
 
 int __init dlm_lockspace_init(void)
 {
@@ -199,7 +211,7 @@ int __init dlm_lockspace_init(void)
 	INIT_LIST_HEAD(&lslist);
 	spin_lock_init(&lslist_lock);
 
-	dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj);
+	dlm_kset = kset_create_and_add("dlm", &dlm_uevent_ops, kernel_kobj);
 	if (!dlm_kset) {
 		printk(KERN_WARNING "%s: can not create kset\n", __func__);
 		return -ENOMEM;
-- 
cgit v1.2.3


From b6fa8796b2da0390e9f4115e8789a01004fc1c9b Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 25 Feb 2010 12:20:57 -0600
Subject: dlm: use bastmode in debugfs output

The bast mode that appears in the debugfs output should be
useful on both master and process nodes.  lkb_highbast is
currently printed, and is only useful on the master node.
lkb_bastmode is only useful on the process node.  This
patch sets lkb_bastmode on the master node as well, and
uses that value in the debugfs print.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/debug_fs.c | 2 +-
 fs/dlm/lock.c     | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 375a2359b3bf..29d6139c35fc 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -256,7 +256,7 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
 			lkb->lkb_status,
 			lkb->lkb_grmode,
 			lkb->lkb_rqmode,
-			lkb->lkb_highbast,
+			lkb->lkb_bastmode,
 			rsb_lookup,
 			lkb->lkb_wait_type,
 			lkb->lkb_lvbseq,
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index d0e43a3da887..46ffd3eeaaf7 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -320,10 +320,12 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
 	lkb->lkb_time_bast = ktime_get();
 
-	if (is_master_copy(lkb))
+	if (is_master_copy(lkb)) {
+		lkb->lkb_bastmode = rqmode; /* printed by debugfs */
 		send_bast(r, lkb, rqmode);
-	else
+	} else {
 		dlm_add_ast(lkb, AST_BAST, rqmode);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From b89c54282db0c8634a2d2dc200f196d571750ce5 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Mon, 25 Jan 2010 14:11:06 +0800
Subject: ocfs2: add extent block stealing for ocfs2 v5

This patch add extent block (metadata) stealing mechanism for
extent allocation. This mechanism is same as the inode stealing.
if no room in slot specific extent_alloc, we will try to
allocate extent block from the next slot.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Acked-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/alloc.c        |   5 +-
 fs/ocfs2/dir.c          |   2 +-
 fs/ocfs2/localalloc.c   |   2 +-
 fs/ocfs2/ocfs2.h        |  29 +-------
 fs/ocfs2/refcounttree.c |   6 +-
 fs/ocfs2/suballoc.c     | 171 +++++++++++++++++++++++++++++++++++++-----------
 fs/ocfs2/suballoc.h     |   1 +
 fs/ocfs2/super.c        |  10 ++-
 fs/ocfs2/xattr.c        |   2 +-
 9 files changed, 150 insertions(+), 78 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d17bdc718f74..2bbe1ecc08c0 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1050,7 +1050,8 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
 			eb->h_blkno = cpu_to_le64(first_blkno);
 			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+			eb->h_suballoc_slot =
+				cpu_to_le16(meta_ac->ac_alloc_slot);
 			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 			eb->h_list.l_count =
 				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -6037,7 +6038,7 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
 	if (status < 0)
 		mlog_errno(status);
 	else
-		ocfs2_init_inode_steal_slot(osb);
+		ocfs2_init_steal_slots(osb);
 
 	mlog_exit(status);
 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 28c3ec238796..765d66c70989 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2439,7 +2439,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 	dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
 	memset(dx_root, 0, osb->sb->s_blocksize);
 	strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
-	dx_root->dr_suballoc_slot = cpu_to_le16(osb->slot_num);
+	dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
 	dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
 	dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
 	dx_root->dr_blkno = cpu_to_le64(dr_blkno);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ac10f83edb95..ca992d91f511 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -476,7 +476,7 @@ out_mutex:
 
 out:
 	if (!status)
-		ocfs2_init_inode_steal_slot(osb);
+		ocfs2_init_steal_slots(osb);
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740f448041e2..8857dd724f90 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -305,7 +305,9 @@ struct ocfs2_super
 	u32 s_next_generation;
 	unsigned long osb_flags;
 	s16 s_inode_steal_slot;
+	s16 s_meta_steal_slot;
 	atomic_t s_num_inodes_stolen;
+	atomic_t s_num_meta_stolen;
 
 	unsigned long s_mount_opt;
 	unsigned int s_atime_quantum;
@@ -760,33 +762,6 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
 	return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
 }
 
-static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
-{
-	spin_lock(&osb->osb_lock);
-	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
-	spin_unlock(&osb->osb_lock);
-	atomic_set(&osb->s_num_inodes_stolen, 0);
-}
-
-static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
-					      s16 slot)
-{
-	spin_lock(&osb->osb_lock);
-	osb->s_inode_steal_slot = slot;
-	spin_unlock(&osb->osb_lock);
-}
-
-static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
-{
-	s16 slot;
-
-	spin_lock(&osb->osb_lock);
-	slot = osb->s_inode_steal_slot;
-	spin_unlock(&osb->osb_lock);
-
-	return slot;
-}
-
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 8ae65c9c020c..fb6aa7acf54b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -626,7 +626,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
 	rb = (struct ocfs2_refcount_block *)new_bh->b_data;
 	memset(rb, 0, inode->i_sb->s_blocksize);
 	strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
-	rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
+	rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
 	rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 	rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
 	rb->rf_blkno = cpu_to_le64(first_blkno);
@@ -1330,7 +1330,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
 	memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
 
 	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
-	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+	new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
 	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 	new_rb->rf_blkno = cpu_to_le64(blkno);
 	new_rb->rf_cpos = cpu_to_le32(0);
@@ -1576,7 +1576,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
 	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
 	memset(new_rb, 0, sb->s_blocksize);
 	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
-	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
+	new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
 	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 	new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
 	new_rb->rf_blkno = cpu_to_le64(blkno);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c30b644d9572..c3c60bc3e072 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -51,7 +51,7 @@
 #define ALLOC_NEW_GROUP			0x1
 #define ALLOC_GROUPS_FROM_GLOBAL	0x2
 
-#define OCFS2_MAX_INODES_TO_STEAL	1024
+#define OCFS2_MAX_TO_STEAL		1024
 
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
@@ -637,12 +637,113 @@ bail:
 	return status;
 }
 
+static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->osb_lock);
+	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&osb->osb_lock);
+	atomic_set(&osb->s_num_inodes_stolen, 0);
+}
+
+static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
+{
+	spin_lock(&osb->osb_lock);
+	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
+	spin_unlock(&osb->osb_lock);
+	atomic_set(&osb->s_num_meta_stolen, 0);
+}
+
+void ocfs2_init_steal_slots(struct ocfs2_super *osb)
+{
+	ocfs2_init_inode_steal_slot(osb);
+	ocfs2_init_meta_steal_slot(osb);
+}
+
+static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
+{
+	spin_lock(&osb->osb_lock);
+	if (type == INODE_ALLOC_SYSTEM_INODE)
+		osb->s_inode_steal_slot = slot;
+	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+		osb->s_meta_steal_slot = slot;
+	spin_unlock(&osb->osb_lock);
+}
+
+static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
+{
+	int slot = OCFS2_INVALID_SLOT;
+
+	spin_lock(&osb->osb_lock);
+	if (type == INODE_ALLOC_SYSTEM_INODE)
+		slot = osb->s_inode_steal_slot;
+	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
+		slot = osb->s_meta_steal_slot;
+	spin_unlock(&osb->osb_lock);
+
+	return slot;
+}
+
+static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
+{
+	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
+{
+	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_resource(struct ocfs2_super *osb,
+				struct ocfs2_alloc_context *ac,
+				int type)
+{
+	int i, status = -ENOSPC;
+	int slot = __ocfs2_get_steal_slot(osb, type);
+
+	/* Start to steal resource from the first slot after ours. */
+	if (slot == OCFS2_INVALID_SLOT)
+		slot = osb->slot_num + 1;
+
+	for (i = 0; i < osb->max_slots; i++, slot++) {
+		if (slot == osb->max_slots)
+			slot = 0;
+
+		if (slot == osb->slot_num)
+			continue;
+
+		status = ocfs2_reserve_suballoc_bits(osb, ac,
+						     type,
+						     (u32)slot, NULL,
+						     NOT_ALLOC_NEW_GROUP);
+		if (status >= 0) {
+			__ocfs2_set_steal_slot(osb, slot, type);
+			break;
+		}
+
+		ocfs2_free_ac_resource(ac);
+	}
+
+	return status;
+}
+
+static int ocfs2_steal_inode(struct ocfs2_super *osb,
+			     struct ocfs2_alloc_context *ac)
+{
+	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
+}
+
+static int ocfs2_steal_meta(struct ocfs2_super *osb,
+			    struct ocfs2_alloc_context *ac)
+{
+	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
+}
+
 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 				      int blocks,
 				      struct ocfs2_alloc_context **ac)
 {
 	int status;
-	u32 slot;
+	int slot = ocfs2_get_meta_steal_slot(osb);
 
 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
@@ -653,12 +754,34 @@ int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 
 	(*ac)->ac_bits_wanted = blocks;
 	(*ac)->ac_which = OCFS2_AC_USE_META;
-	slot = osb->slot_num;
 	(*ac)->ac_group_search = ocfs2_block_group_search;
 
+	if (slot != OCFS2_INVALID_SLOT &&
+		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
+		goto extent_steal;
+
+	atomic_set(&osb->s_num_meta_stolen, 0);
 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
 					     EXTENT_ALLOC_SYSTEM_INODE,
-					     slot, NULL, ALLOC_NEW_GROUP);
+					     (u32)osb->slot_num, NULL,
+					     ALLOC_NEW_GROUP);
+
+
+	if (status >= 0) {
+		status = 0;
+		if (slot != OCFS2_INVALID_SLOT)
+			ocfs2_init_meta_steal_slot(osb);
+		goto bail;
+	} else if (status < 0 && status != -ENOSPC) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_free_ac_resource(*ac);
+
+extent_steal:
+	status = ocfs2_steal_meta(osb, *ac);
+	atomic_inc(&osb->s_num_meta_stolen);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -685,43 +808,11 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 					ac);
 }
 
-static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
-					      struct ocfs2_alloc_context *ac)
-{
-	int i, status = -ENOSPC;
-	s16 slot = ocfs2_get_inode_steal_slot(osb);
-
-	/* Start to steal inodes from the first slot after ours. */
-	if (slot == OCFS2_INVALID_SLOT)
-		slot = osb->slot_num + 1;
-
-	for (i = 0; i < osb->max_slots; i++, slot++) {
-		if (slot == osb->max_slots)
-			slot = 0;
-
-		if (slot == osb->slot_num)
-			continue;
-
-		status = ocfs2_reserve_suballoc_bits(osb, ac,
-						     INODE_ALLOC_SYSTEM_INODE,
-						     slot, NULL,
-						     NOT_ALLOC_NEW_GROUP);
-		if (status >= 0) {
-			ocfs2_set_inode_steal_slot(osb, slot);
-			break;
-		}
-
-		ocfs2_free_ac_resource(ac);
-	}
-
-	return status;
-}
-
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 			    struct ocfs2_alloc_context **ac)
 {
 	int status;
-	s16 slot = ocfs2_get_inode_steal_slot(osb);
+	int slot = ocfs2_get_inode_steal_slot(osb);
 	u64 alloc_group;
 
 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
@@ -754,14 +845,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 	 * need to check our slots to see whether there is some space for us.
 	 */
 	if (slot != OCFS2_INVALID_SLOT &&
-	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
+	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
 		goto inode_steal;
 
 	atomic_set(&osb->s_num_inodes_stolen, 0);
 	alloc_group = osb->osb_inode_alloc_group;
 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
 					     INODE_ALLOC_SYSTEM_INODE,
-					     osb->slot_num,
+					     (u32)osb->slot_num,
 					     &alloc_group,
 					     ALLOC_NEW_GROUP |
 					     ALLOC_GROUPS_FROM_GLOBAL);
@@ -789,7 +880,7 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 	ocfs2_free_ac_resource(*ac);
 
 inode_steal:
-	status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
+	status = ocfs2_steal_inode(osb, *ac);
 	atomic_inc(&osb->s_num_inodes_stolen);
 	if (status < 0) {
 		if (status != -ENOSPC)
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8c9a78a43164..fa60723c43e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -56,6 +56,7 @@ struct ocfs2_alloc_context {
 				 is the same as ~0 - unlimited */
 };
 
+void ocfs2_init_steal_slots(struct ocfs2_super *osb);
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
 static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
 {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 755cd49a5ef3..dee03197a494 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -69,6 +69,7 @@
 #include "xattr.h"
 #include "quota.h"
 #include "refcounttree.h"
+#include "suballoc.h"
 
 #include "buffer_head_io.h"
 
@@ -301,9 +302,12 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 
 	spin_lock(&osb->osb_lock);
 	out += snprintf(buf + out, len - out,
-			"%10s => Slot: %d  NumStolen: %d\n", "Steal",
+			"%10s => InodeSlot: %d  StolenInodes: %d, "
+			"MetaSlot: %d  StolenMeta: %d\n", "Steal",
 			osb->s_inode_steal_slot,
-			atomic_read(&osb->s_num_inodes_stolen));
+			atomic_read(&osb->s_num_inodes_stolen),
+			osb->s_meta_steal_slot,
+			atomic_read(&osb->s_num_meta_stolen));
 	spin_unlock(&osb->osb_lock);
 
 	out += snprintf(buf + out, len - out, "OrphanScan => ");
@@ -1997,7 +2001,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->blocked_lock_count = 0;
 	spin_lock_init(&osb->osb_lock);
 	spin_lock_init(&osb->osb_xattr_lock);
-	ocfs2_init_inode_steal_slot(osb);
+	ocfs2_init_steal_slots(osb);
 
 	atomic_set(&osb->alloc_stats.moves, 0);
 	atomic_set(&osb->alloc_stats.local_data, 0);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8fc6fb071c6d..8ae4e5d1f730 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2282,7 +2282,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 	xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
 	memset(xblk, 0, inode->i_sb->s_blocksize);
 	strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
-	xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+	xblk->xb_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
 	xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
 	xblk->xb_blkno = cpu_to_le64(first_blkno);
-- 
cgit v1.2.3


From 96a1cc731adb28dc4feb71701091b80e67d486a7 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Tue, 9 Feb 2010 14:57:45 +0800
Subject: ocfs2: Clean up the checks for CoW and direct I/O.

When ocfs2 has to do CoW for refcounted extents, we disable direct I/O
and go through the buffered I/O path.  This makes the combined check
easier to read.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 558ce0312421..da097bd07b72 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1836,6 +1836,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 							       &meta_level);
 			if (has_refcount)
 				*has_refcount = 1;
+			if (direct_io)
+				*direct_io = 0;
 		}
 
 		if (ret < 0) {
@@ -1859,10 +1861,6 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 			break;
 		}
 
-		if (has_refcount && *has_refcount == 1) {
-			*direct_io = 0;
-			break;
-		}
 		/*
 		 * Allowing concurrent direct writes means
 		 * i_size changes wouldn't be synchronized, so
-- 
cgit v1.2.3


From 8545e03d82b6739461bbd60db7aba144f7dbe80f Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 12 Feb 2010 14:09:06 -0800
Subject: ocfs2: Add current->comm in trace output

Add current->comm to the standard mlog() output to help with debugging.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/cluster/masklog.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 9b4d11726cf2..0442366b3060 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -194,9 +194,9 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
  * previous token if args expands to nothing.
  */
 #define __mlog_printk(level, fmt, args...)				\
-	printk(level "(%u,%lu):%s:%d " fmt, task_pid_nr(current),	\
-	       __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ ,	\
-	       ##args)
+	printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm,		\
+	       task_pid_nr(current), __mlog_cpu_guess,			\
+	       __PRETTY_FUNCTION__, __LINE__ , ##args)
 
 #define mlog(mask, fmt, args...) do {					\
 	u64 __m = MLOG_MASK_PREFIX | (mask);				\
-- 
cgit v1.2.3


From 11179f2c92cb025b1ff0b794f9714b3fb395855f Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 14 Aug 2009 16:07:44 -0700
Subject: ocfs2: Introduce ocfs2_xa_loc

The ocfs2 extended attribute (xattr) code is very flexible.  It can
store xattrs in the inode itself, in an external block, or in a tree of
data structures.  This allows the number of xattrs to be bounded by the
filesystem size.

However, the code that manages each possible storage location is
different.  Maintaining the ocfs2 xattr code requires changing each hunk
separately.

This patch is the start of a series introducing the ocfs2_xa_loc
structure.  This structure wraps the on-disk details of an xattr
entry.  The goal is that the generic xattr routines can use
ocfs2_xa_loc without knowing the underlying storage location.

This first pass merely implements the basic structure, initializing it,
and wiping the name+value pair of the entry.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 226 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8ae4e5d1f730..945ca697eb25 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -137,6 +137,51 @@ struct ocfs2_xattr_search {
 	int not_found;
 };
 
+/* Operations on struct ocfs2_xa_entry */
+struct ocfs2_xa_loc;
+struct ocfs2_xa_loc_operations {
+	/*
+	 * Return a pointer to the appropriate buffer in loc->xl_storage
+	 * at the given offset from loc->xl_header.
+	 */
+	void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
+
+	/*
+	 * Remove the name+value at this location.  Do whatever is
+	 * appropriate with the remaining name+value pairs.
+	 */
+	void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
+};
+
+/*
+ * Describes an xattr entry location.  This is a memory structure
+ * tracking the on-disk structure.
+ */
+struct ocfs2_xa_loc {
+	/* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
+	struct ocfs2_xattr_header *xl_header;
+
+	/* Bytes from xl_header to the end of the storage */
+	int xl_size;
+
+	/*
+	 * The ocfs2_xattr_entry this location describes.  If this is
+	 * NULL, this location describes the on-disk structure where it
+	 * would have been.
+	 */
+	struct ocfs2_xattr_entry *xl_entry;
+
+	/*
+	 * Internal housekeeping
+	 */
+
+	/* Buffer(s) containing this entry */
+	void *xl_storage;
+
+	/* Operations on the storage backing this location */
+	const struct ocfs2_xa_loc_operations *xl_ops;
+};
+
 static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
 					     struct ocfs2_xattr_header *xh,
 					     int index,
@@ -1417,6 +1462,170 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	return ret;
 }
 
+/*
+ * Wipe the name+value pair and allow the storage to reclaim it.  This
+ * must be followed by either removal of the entry or a call to
+ * ocfs2_xa_add_namevalue().
+ */
+static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	loc->xl_ops->xlo_wipe_namevalue(loc);
+}
+
+static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
+					   int offset)
+{
+	BUG_ON(offset >= loc->xl_size);
+	return (char *)loc->xl_header + offset;
+}
+
+/*
+ * Block storage for xattrs keeps the name+value pairs compacted.  When
+ * we remove one, we have to shift any that preceded it towards the end.
+ */
+static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	int i, offset;
+	int namevalue_offset, first_namevalue_offset, namevalue_size;
+	struct ocfs2_xattr_entry *entry = loc->xl_entry;
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	u64 value_size = le64_to_cpu(entry->xe_value_size);
+	int count = le16_to_cpu(xh->xh_count);
+
+	namevalue_offset = le16_to_cpu(entry->xe_name_offset);
+	namevalue_size = OCFS2_XATTR_SIZE(entry->xe_name_len);
+	if (value_size > OCFS2_XATTR_INLINE_SIZE)
+		namevalue_size += OCFS2_XATTR_ROOT_SIZE;
+	else
+		namevalue_size += OCFS2_XATTR_SIZE(value_size);
+
+	for (i = 0, first_namevalue_offset = loc->xl_size;
+	     i < count; i++) {
+		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+		if (offset < first_namevalue_offset)
+			first_namevalue_offset = offset;
+	}
+
+	/* Shift the name+value pairs */
+	memmove((char *)xh + first_namevalue_offset + namevalue_size,
+		(char *)xh + first_namevalue_offset,
+		namevalue_offset - first_namevalue_offset);
+	memset((char *)xh + first_namevalue_offset, 0, namevalue_size);
+
+	/* Now tell xh->xh_entries about it */
+	for (i = 0; i < count; i++) {
+		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+		if (offset < namevalue_offset)
+			le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
+				     namevalue_size);
+	}
+
+	/*
+	 * Note that we don't update xh_free_start or xh_name_value_len
+	 * because they're not used in block-stored xattrs.
+	 */
+}
+
+/*
+ * Operations for xattrs stored in blocks.  This includes inline inode
+ * storage and unindexed ocfs2_xattr_blocks.
+ */
+static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
+	.xlo_offset_pointer	= ocfs2_xa_block_offset_pointer,
+	.xlo_wipe_namevalue	= ocfs2_xa_block_wipe_namevalue,
+};
+
+static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
+					    int offset)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	int block, block_offset;
+
+	BUG_ON(offset >= OCFS2_XATTR_BUCKET_SIZE);
+
+	/* The header is at the front of the bucket */
+	block = offset >> bucket->bu_inode->i_sb->s_blocksize_bits;
+	block_offset = offset % bucket->bu_inode->i_sb->s_blocksize;
+
+	return bucket_block(bucket, block) + block_offset;
+}
+
+static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
+{
+	int namevalue_size;
+	struct ocfs2_xattr_entry *entry = loc->xl_entry;
+	u64 value_size = le64_to_cpu(entry->xe_value_size);
+
+	namevalue_size = OCFS2_XATTR_SIZE(entry->xe_name_len);
+	if (value_size > OCFS2_XATTR_INLINE_SIZE)
+		namevalue_size += OCFS2_XATTR_ROOT_SIZE;
+	else
+		namevalue_size += OCFS2_XATTR_SIZE(value_size);
+
+	le16_add_cpu(&loc->xl_header->xh_name_value_len, -namevalue_size);
+}
+
+/* Operations for xattrs stored in buckets. */
+static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
+	.xlo_offset_pointer	= ocfs2_xa_bucket_offset_pointer,
+	.xlo_wipe_namevalue	= ocfs2_xa_bucket_wipe_namevalue,
+};
+
+static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
+{
+	ocfs2_xa_wipe_namevalue(loc);
+}
+
+static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
+				     struct inode *inode,
+				     struct buffer_head *bh,
+				     struct ocfs2_xattr_entry *entry)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+	loc->xl_ops = &ocfs2_xa_block_loc_ops;
+	loc->xl_storage = bh;
+	loc->xl_entry = entry;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
+		loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
+	else {
+		BUG_ON(entry);
+		loc->xl_size = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+	}
+	loc->xl_header =
+		(struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
+					      loc->xl_size);
+}
+
+static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
+					  struct buffer_head *bh,
+					  struct ocfs2_xattr_entry *entry)
+{
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)bh->b_data;
+
+	BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
+
+	loc->xl_ops = &ocfs2_xa_block_loc_ops;
+	loc->xl_storage = bh;
+	loc->xl_header = &(xb->xb_attrs.xb_header);
+	loc->xl_entry = entry;
+	loc->xl_size = bh->b_size - offsetof(struct ocfs2_xattr_block,
+					     xb_attrs.xb_header);
+}
+
+static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
+					   struct ocfs2_xattr_bucket *bucket,
+					   struct ocfs2_xattr_entry *entry)
+{
+	loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
+	loc->xl_storage = bucket;
+	loc->xl_header = bucket_xh(bucket);
+	loc->xl_entry = entry;
+	loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
+}
+
 /*
  * ocfs2_xattr_set_entry_local()
  *
@@ -1430,7 +1639,14 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 {
 	size_t name_len = strlen(xi->name);
 	int i;
+	struct ocfs2_xa_loc loc;
 
+	if (xs->xattr_bh == xs->inode_bh)
+		ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
+					 xs->not_found ? NULL : xs->here);
+	else
+		ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh,
+					      xs->not_found ? NULL : xs->here);
 	if (xi->value && xs->not_found) {
 		/* Insert the new xattr entry. */
 		le16_add_cpu(&xs->header->xh_count, 1);
@@ -1469,9 +1685,9 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 			       xi->value_len);
 			return;
 		}
+
 		/* Remove the old name+value. */
-		memmove(first_val + size, first_val, val - first_val);
-		memset(first_val, 0, size);
+		ocfs2_xa_wipe_namevalue(&loc);
 		xs->here->xe_name_hash = 0;
 		xs->here->xe_name_offset = 0;
 		ocfs2_xattr_set_local(xs->here, 1);
@@ -1479,23 +1695,15 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 
 		min_offs += size;
 
-		/* Adjust all value offsets. */
-		last = xs->header->xh_entries;
-		for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
-			size_t o = le16_to_cpu(last->xe_name_offset);
-
-			if (o < offs)
-				last->xe_name_offset = cpu_to_le16(o + size);
-			last += 1;
-		}
-
 		if (!xi->value) {
 			/* Remove the old entry. */
-			last -= 1;
+			i = le16_to_cpu(xs->header->xh_count) - 1;
+			last = &xs->header->xh_entries[i];
+			xs->header->xh_count = cpu_to_le16(i);
+
 			memmove(xs->here, xs->here + 1,
 				(void *)last - (void *)xs->here);
 			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
-			le16_add_cpu(&xs->header->xh_count, -1);
 		}
 	}
 	if (xi->value) {
@@ -4769,7 +4977,10 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 	size_t blocksize = inode->i_sb->s_blocksize;
 	char *val;
 	size_t offs, size, new_size;
+	struct ocfs2_xa_loc loc;
 
+	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+				       xs->not_found ? NULL : xs->here);
 	last = &xh->xh_entries[count];
 	if (!xs->not_found) {
 		xe = xs->here;
@@ -4790,7 +5001,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 		new_size = OCFS2_XATTR_SIZE(name_len) +
 			   OCFS2_XATTR_SIZE(xi->value_len);
 
-		le16_add_cpu(&xh->xh_name_value_len, -size);
+		ocfs2_xa_wipe_namevalue(&loc);
 		if (xi->value) {
 			if (new_size > size)
 				goto set_new_name_value;
-- 
cgit v1.2.3


From bde1e5400a1b21ef47932ab00446c7361ff3c139 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 14 Aug 2009 16:58:38 -0700
Subject: ocfs2: Remove xattrs via ocfs2_xa_loc

Add ocfs2_xa_remove_entry(), which will remove an xattr entry from its
storage via the ocfs2_xa_loc descriptor.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 62 ++++++++++++++++++++++++++------------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 945ca697eb25..22a60a7c35ca 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1573,7 +1573,29 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
 
 static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
 {
+	int index, count;
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	struct ocfs2_xattr_entry *entry = loc->xl_entry;
+
 	ocfs2_xa_wipe_namevalue(loc);
+	loc->xl_entry = NULL;
+
+	le16_add_cpu(&xh->xh_count, -1);
+	count = le16_to_cpu(xh->xh_count);
+
+	/*
+	 * Only zero out the entry if there are more remaining.  This is
+	 * important for an empty bucket, as it keeps track of the
+	 * bucket's hash value.  It doesn't hurt empty block storage.
+	 */
+	if (count) {
+		index = ((char *)entry - (char *)&xh->xh_entries) /
+			sizeof(struct ocfs2_xattr_entry);
+		memmove(&xh->xh_entries[index], &xh->xh_entries[index + 1],
+			(count - index) * sizeof(struct ocfs2_xattr_entry));
+		memset(&xh->xh_entries[count], 0,
+		       sizeof(struct ocfs2_xattr_entry));
+	}
 }
 
 static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
@@ -1638,7 +1660,6 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 					size_t min_offs)
 {
 	size_t name_len = strlen(xi->name);
-	int i;
 	struct ocfs2_xa_loc loc;
 
 	if (xs->xattr_bh == xs->inode_bh)
@@ -1686,25 +1707,12 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 			return;
 		}
 
-		/* Remove the old name+value. */
-		ocfs2_xa_wipe_namevalue(&loc);
-		xs->here->xe_name_hash = 0;
-		xs->here->xe_name_offset = 0;
-		ocfs2_xattr_set_local(xs->here, 1);
-		xs->here->xe_value_size = 0;
+		if (!xi->value)
+			ocfs2_xa_remove_entry(&loc);
+		else
+			ocfs2_xa_wipe_namevalue(&loc);
 
 		min_offs += size;
-
-		if (!xi->value) {
-			/* Remove the old entry. */
-			i = le16_to_cpu(xs->header->xh_count) - 1;
-			last = &xs->header->xh_entries[i];
-			xs->header->xh_count = cpu_to_le16(i);
-
-			memmove(xs->here, xs->here + 1,
-				(void *)last - (void *)xs->here);
-			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
-		}
 	}
 	if (xi->value) {
 		/* Insert the new name+value. */
@@ -5001,8 +5009,8 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 		new_size = OCFS2_XATTR_SIZE(name_len) +
 			   OCFS2_XATTR_SIZE(xi->value_len);
 
-		ocfs2_xa_wipe_namevalue(&loc);
 		if (xi->value) {
+			ocfs2_xa_wipe_namevalue(&loc);
 			if (new_size > size)
 				goto set_new_name_value;
 
@@ -5024,20 +5032,8 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 			ocfs2_xattr_set_local(xe, local);
 			return;
 		} else {
-			/*
-			 * Remove the old entry if there is more than one.
-			 * We don't remove the last entry so that we can
-			 * use it to indicate the hash value of the empty
-			 * bucket.
-			 */
-			last -= 1;
-			le16_add_cpu(&xh->xh_count, -1);
-			if (xh->xh_count) {
-				memmove(xe, xe + 1,
-					(void *)last - (void *)xe);
-				memset(last, 0,
-				       sizeof(struct ocfs2_xattr_entry));
-			} else
+			ocfs2_xa_remove_entry(&loc);
+			if (!xh->xh_count)
 				xh->xh_free_start =
 					cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
 
-- 
cgit v1.2.3


From 6b240ff63c9dda93366c61c969b81ca22fe676ac Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 14 Aug 2009 18:02:52 -0700
Subject: ocfs2: Prefix the member fields of struct ocfs2_xattr_info.

struct ocfs2_xattr_info is a useful structure describing an xattr
you'd like to set.  Let's put prefixes on the member fields so it's
easier to read and use.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 212 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 108 insertions(+), 104 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 22a60a7c35ca..c675a6cda0bb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -116,10 +116,10 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 };
 
 struct ocfs2_xattr_info {
-	int name_index;
-	const char *name;
-	const void *value;
-	size_t value_len;
+	int		xi_name_index;
+	const char	*xi_name;
+	const void	*xi_value;
+	size_t		xi_value_len;
 };
 
 struct ocfs2_xattr_search {
@@ -1361,7 +1361,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
 			       size_t offs)
 {
 	int ret = 0;
-	size_t name_len = strlen(xi->name);
+	size_t name_len = strlen(xi->xi_name);
 	void *val = xs->base + offs;
 	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
 
@@ -1401,8 +1401,8 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
 	}
 
 	xs->here->xe_name_offset = cpu_to_le16(offs);
-	xs->here->xe_value_size = cpu_to_le64(xi->value_len);
-	if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
+	xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len);
+	if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE)
 		ocfs2_xattr_set_local(xs->here, 1);
 	else
 		ocfs2_xattr_set_local(xs->here, 0);
@@ -1427,14 +1427,14 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 					 struct ocfs2_xattr_value_buf *vb,
 					 size_t offs)
 {
-	size_t name_len = strlen(xi->name);
+	size_t name_len = strlen(xi->xi_name);
 	void *val = xs->base + offs;
 	struct ocfs2_xattr_value_root *xv = NULL;
 	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
 	int ret = 0;
 
 	memset(val, 0, size);
-	memcpy(val, xi->name, name_len);
+	memcpy(val, xi->xi_name, name_len);
 	xv = (struct ocfs2_xattr_value_root *)
 		(val + OCFS2_XATTR_SIZE(name_len));
 	xv->xr_clusters = 0;
@@ -1444,7 +1444,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	xv->xr_list.l_next_free_rec = 0;
 	vb->vb_xv = xv;
 
-	ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
+	ret = ocfs2_xattr_value_truncate(inode, vb, xi->xi_value_len, ctxt);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1455,7 +1455,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 		return ret;
 	}
 	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
-					      xi->value, xi->value_len);
+					      xi->xi_value, xi->xi_value_len);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -1659,7 +1659,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 					struct ocfs2_xattr_entry *last,
 					size_t min_offs)
 {
-	size_t name_len = strlen(xi->name);
+	size_t name_len = strlen(xi->xi_name);
 	struct ocfs2_xa_loc loc;
 
 	if (xs->xattr_bh == xs->inode_bh)
@@ -1668,10 +1668,10 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 	else
 		ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh,
 					      xs->not_found ? NULL : xs->here);
-	if (xi->value && xs->not_found) {
+	if (xi->xi_value && xs->not_found) {
 		/* Insert the new xattr entry. */
 		le16_add_cpu(&xs->header->xh_count, 1);
-		ocfs2_xattr_set_type(last, xi->name_index);
+		ocfs2_xattr_set_type(last, xi->xi_name_index);
 		ocfs2_xattr_set_local(last, 1);
 		last->xe_name_len = name_len;
 	} else {
@@ -1691,42 +1691,42 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 			size = OCFS2_XATTR_SIZE(name_len) +
 			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
 
-		if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
-				OCFS2_XATTR_SIZE(xi->value_len)) {
+		if (xi->xi_value && size == OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->xi_value_len)) {
 			/* The old and the new value have the
 			   same size. Just replace the value. */
 			ocfs2_xattr_set_local(xs->here, 1);
-			xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+			xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len);
 			/* Clear value bytes. */
 			memset(val + OCFS2_XATTR_SIZE(name_len),
 			       0,
-			       OCFS2_XATTR_SIZE(xi->value_len));
+			       OCFS2_XATTR_SIZE(xi->xi_value_len));
 			memcpy(val + OCFS2_XATTR_SIZE(name_len),
-			       xi->value,
-			       xi->value_len);
+			       xi->xi_value,
+			       xi->xi_value_len);
 			return;
 		}
 
-		if (!xi->value)
+		if (!xi->xi_value)
 			ocfs2_xa_remove_entry(&loc);
 		else
 			ocfs2_xa_wipe_namevalue(&loc);
 
 		min_offs += size;
 	}
-	if (xi->value) {
+	if (xi->xi_value) {
 		/* Insert the new name+value. */
 		size_t size = OCFS2_XATTR_SIZE(name_len) +
-				OCFS2_XATTR_SIZE(xi->value_len);
+				OCFS2_XATTR_SIZE(xi->xi_value_len);
 		void *val = xs->base + min_offs - size;
 
 		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
 		memset(val, 0, size);
-		memcpy(val, xi->name, name_len);
+		memcpy(val, xi->xi_name, name_len);
 		memcpy(val + OCFS2_XATTR_SIZE(name_len),
-		       xi->value,
-		       xi->value_len);
-		xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+		       xi->xi_value,
+		       xi->xi_value_len);
+		xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len);
 		ocfs2_xattr_set_local(xs->here, 1);
 		ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
 	}
@@ -1752,15 +1752,15 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	struct ocfs2_xattr_entry *last;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
+	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->xi_name);
 	size_t size_l = 0;
 	handle_t *handle = ctxt->handle;
 	int free, i, ret;
 	struct ocfs2_xattr_info xi_l = {
-		.name_index = xi->name_index,
-		.name = xi->name,
-		.value = xi->value,
-		.value_len = xi->value_len,
+		.xi_name_index = xi->xi_name_index,
+		.xi_name = xi->xi_name,
+		.xi_value = xi->xi_value,
+		.xi_value_len = xi->xi_value_len,
 	};
 	struct ocfs2_xattr_value_buf vb = {
 		.vb_bh = xs->xattr_bh,
@@ -1798,7 +1798,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		free += (size + sizeof(struct ocfs2_xattr_entry));
 	}
 	/* Check free space in inode or block */
-	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+	if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 		if (free < sizeof(struct ocfs2_xattr_entry) +
 			   OCFS2_XATTR_SIZE(name_len) +
 			   OCFS2_XATTR_ROOT_SIZE) {
@@ -1806,12 +1806,12 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 			goto out;
 		}
 		size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
-		xi_l.value = (void *)&def_xv;
-		xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
-	} else if (xi->value) {
+		xi_l.xi_value = (void *)&def_xv;
+		xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE;
+	} else if (xi->xi_value) {
 		if (free < sizeof(struct ocfs2_xattr_entry) +
 			   OCFS2_XATTR_SIZE(name_len) +
-			   OCFS2_XATTR_SIZE(xi->value_len)) {
+			   OCFS2_XATTR_SIZE(xi->xi_value_len)) {
 			ret = -ENOSPC;
 			goto out;
 		}
@@ -1836,16 +1836,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 			vb.vb_xv = (struct ocfs2_xattr_value_root *)
 				(val + OCFS2_XATTR_SIZE(name_len));
 
-			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+			if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 				/*
 				 * If new value need set outside also,
 				 * first truncate old value to new value,
 				 * then set new value with set_value_outside().
 				 */
 				ret = ocfs2_xattr_value_truncate(inode,
-								 &vb,
-								 xi->value_len,
-								 ctxt);
+							&vb,
+							xi->xi_value_len,
+							ctxt);
 				if (ret < 0) {
 					mlog_errno(ret);
 					goto out;
@@ -1863,10 +1863,10 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				}
 
 				ret = __ocfs2_xattr_set_value_outside(inode,
-								handle,
-								&vb,
-								xi->value,
-								xi->value_len);
+							handle,
+							&vb,
+							xi->xi_value,
+							xi->xi_value_len);
 				if (ret < 0)
 					mlog_errno(ret);
 				goto out;
@@ -1944,7 +1944,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	if (ret < 0)
 		mlog_errno(ret);
 
-	if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+	if (!ret && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 		/*
 		 * Set value outside in B tree.
 		 * This is the second step for value size > INLINE_SIZE.
@@ -2610,13 +2610,13 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 
 	BUG_ON(!xs->not_found);
 
-	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
 		value_size = OCFS2_XATTR_ROOT_SIZE;
 	else
-		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+		value_size = OCFS2_XATTR_SIZE(xi->xi_value_len);
 
 	if (free >= sizeof(struct ocfs2_xattr_entry) +
-		   OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
+		   OCFS2_XATTR_SIZE(strlen(xi->xi_name)) + value_size)
 		return 1;
 
 	return 0;
@@ -2640,7 +2640,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 	char *base = NULL;
 	int name_offset, name_len = 0;
 	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
-						    xi->value_len);
+						    xi->xi_value_len);
 	u64 value_size;
 
 	/*
@@ -2648,14 +2648,14 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 	 * No matter whether we replace an old one or add a new one,
 	 * we need this for writing.
 	 */
-	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
 		credits += new_clusters *
 			   ocfs2_clusters_to_blocks(inode->i_sb, 1);
 
 	if (xis->not_found && xbs->not_found) {
 		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
-		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 			clusters_add += new_clusters;
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
 							&def_xv.xv.xr_list,
@@ -2700,7 +2700,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 	 * The credits for removing the value tree will be extended
 	 * by ocfs2_remove_extent itself.
 	 */
-	if (!xi->value) {
+	if (!xi->xi_value) {
 		if (!ocfs2_xattr_is_local(xe))
 			credits += ocfs2_remove_extent_credits(inode->i_sb);
 
@@ -2730,7 +2730,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		}
 	}
 
-	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 		/* the new values will be stored outside. */
 		u32 old_clusters = 0;
 
@@ -2763,9 +2763,10 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		 * value, we don't need any allocation, otherwise we have
 		 * to guess metadata allocation.
 		 */
-		if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+		if ((ocfs2_xattr_is_local(xe) &&
+		     (value_size >= xi->xi_value_len)) ||
 		    (!ocfs2_xattr_is_local(xe) &&
-		     OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+		     OCFS2_XATTR_ROOT_SIZE >= xi->xi_value_len))
 			goto out;
 	}
 
@@ -2855,7 +2856,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 
 	meta_add += extra_meta;
 	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
-	     "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
+	     "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
 
 	if (meta_add) {
 		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2895,7 +2896,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 {
 	int ret = 0, credits, old_found;
 
-	if (!xi->value) {
+	if (!xi->xi_value) {
 		/* Remove existing extended attribute */
 		if (!xis->not_found)
 			ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
@@ -2909,8 +2910,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 			 * If succeed and that extended attribute existing in
 			 * external block, then we will remove it.
 			 */
-			xi->value = NULL;
-			xi->value_len = 0;
+			xi->xi_value = NULL;
+			xi->xi_value_len = 0;
 
 			old_found = xis->not_found;
 			xis->not_found = -ENODATA;
@@ -2938,8 +2939,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 		} else if (ret == -ENOSPC) {
 			if (di->i_xattr_loc && !xbs->xattr_bh) {
 				ret = ocfs2_xattr_block_find(inode,
-							     xi->name_index,
-							     xi->name, xbs);
+							     xi->xi_name_index,
+							     xi->xi_name, xbs);
 				if (ret)
 					goto out;
 
@@ -2978,8 +2979,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				 * If succeed and that extended attribute
 				 * existing in inode, we will remove it.
 				 */
-				xi->value = NULL;
-				xi->value_len = 0;
+				xi->xi_value = NULL;
+				xi->xi_value_len = 0;
 				xbs->not_found = -ENODATA;
 				ret = ocfs2_calc_xattr_set_need(inode,
 								di,
@@ -3045,10 +3046,10 @@ int ocfs2_xattr_set_handle(handle_t *handle,
 	int ret;
 
 	struct ocfs2_xattr_info xi = {
-		.name_index = name_index,
-		.name = name,
-		.value = value,
-		.value_len = value_len,
+		.xi_name_index = name_index,
+		.xi_name = name,
+		.xi_value = value,
+		.xi_value_len = value_len,
 	};
 
 	struct ocfs2_xattr_search xis = {
@@ -3128,10 +3129,10 @@ int ocfs2_xattr_set(struct inode *inode,
 	struct ocfs2_refcount_tree *ref_tree = NULL;
 
 	struct ocfs2_xattr_info xi = {
-		.name_index = name_index,
-		.name = name,
-		.value = value,
-		.value_len = value_len,
+		.xi_name_index = name_index,
+		.xi_name = name,
+		.xi_value = value,
+		.xi_value_len = value_len,
 	};
 
 	struct ocfs2_xattr_search xis = {
@@ -4979,7 +4980,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 					 int local)
 {
 	struct ocfs2_xattr_entry *last, *xe;
-	int name_len = strlen(xi->name);
+	int name_len = strlen(xi->xi_name);
 	struct ocfs2_xattr_header *xh = xs->header;
 	u16 count = le16_to_cpu(xh->xh_count), start;
 	size_t blocksize = inode->i_sb->s_blocksize;
@@ -5001,22 +5002,24 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 			OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
 
 		/*
-		 * If the new value will be stored outside, xi->value has been
-		 * initalized as an empty ocfs2_xattr_value_root, and the same
-		 * goes with xi->value_len, so we can set new_size safely here.
+		 * If the new value will be stored outside, xi->xi_value has
+		 * been initalized as an empty ocfs2_xattr_value_root, and
+		 * the same goes with xi->xi_value_len, so we can set
+		 * new_size safely here.
 		 * See ocfs2_xattr_set_in_bucket.
 		 */
 		new_size = OCFS2_XATTR_SIZE(name_len) +
-			   OCFS2_XATTR_SIZE(xi->value_len);
+			   OCFS2_XATTR_SIZE(xi->xi_value_len);
 
-		if (xi->value) {
+		if (xi->xi_value) {
 			ocfs2_xa_wipe_namevalue(&loc);
 			if (new_size > size)
 				goto set_new_name_value;
 
 			/* Now replace the old value with new one. */
 			if (local)
-				xe->xe_value_size = cpu_to_le64(xi->value_len);
+				xe->xe_value_size =
+					cpu_to_le64(xi->xi_value_len);
 			else
 				xe->xe_value_size = 0;
 
@@ -5024,9 +5027,9 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 							 xs->bucket, offs);
 			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
 			       size - OCFS2_XATTR_SIZE(name_len));
-			if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
+			if (OCFS2_XATTR_SIZE(xi->xi_value_len) > 0)
 				memcpy(val + OCFS2_XATTR_SIZE(name_len),
-				       xi->value, xi->value_len);
+				       xi->xi_value, xi->xi_value_len);
 
 			le16_add_cpu(&xh->xh_name_value_len, new_size);
 			ocfs2_xattr_set_local(xe, local);
@@ -5067,12 +5070,12 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 		memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
 		xe->xe_name_hash = cpu_to_le32(name_hash);
 		xe->xe_name_len = name_len;
-		ocfs2_xattr_set_type(xe, xi->name_index);
+		ocfs2_xattr_set_type(xe, xi->xi_name_index);
 	}
 
 set_new_name_value:
 	/* Insert the new name+value. */
-	size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
+	size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->xi_value_len);
 
 	/*
 	 * We must make sure that the name/value pair
@@ -5091,10 +5094,11 @@ set_new_name_value:
 	xe->xe_name_offset = cpu_to_le16(offs - size);
 
 	memset(val, 0, size);
-	memcpy(val, xi->name, name_len);
-	memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
+	memcpy(val, xi->xi_name, name_len);
+	memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->xi_value,
+	       xi->xi_value_len);
 
-	xe->xe_value_size = cpu_to_le64(xi->value_len);
+	xe->xe_value_size = cpu_to_le64(xi->xi_value_len);
 	ocfs2_xattr_set_local(xe, local);
 	xs->here = xe;
 	le16_add_cpu(&xh->xh_free_start, -size);
@@ -5119,7 +5123,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 	u64 blkno;
 
 	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
-	     (unsigned long)xi->value_len, xi->name_index,
+	     (unsigned long)xi->xi_value_len, xi->xi_name_index,
 	     (unsigned long long)bucket_blkno(xs->bucket));
 
 	if (!xs->bucket->bu_bhs[1]) {
@@ -5417,10 +5421,10 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 {
 	int ret, local = 1;
 	size_t value_len;
-	char *val = (char *)xi->value;
+	char *val = (char *)xi->xi_value;
 	struct ocfs2_xattr_entry *xe = xs->here;
-	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
-					      strlen(xi->name));
+	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
+					      strlen(xi->xi_name));
 
 	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
 		/*
@@ -5435,8 +5439,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 		 * the modification to the xattr block will be done
 		 * by following steps.
 		 */
-		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
-			value_len = xi->value_len;
+		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
+			value_len = xi->xi_value_len;
 		else
 			value_len = 0;
 
@@ -5450,7 +5454,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 			goto set_value_outside;
 	}
 
-	value_len = xi->value_len;
+	value_len = xi->xi_value_len;
 	/* So we have to handle the inside block change now. */
 	if (value_len > OCFS2_XATTR_INLINE_SIZE) {
 		/*
@@ -5458,8 +5462,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 		 * initalize a new empty value root and insert it first.
 		 */
 		local = 0;
-		xi->value = &def_xv;
-		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
+		xi->xi_value = &def_xv;
+		xi->xi_value_len = OCFS2_XATTR_ROOT_SIZE;
 	}
 
 	ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
@@ -5533,11 +5537,11 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 	struct ocfs2_xattr_entry *xe;
 	u16 count, header_size, xh_free_start;
 	int free, max_free, need, old;
-	size_t value_size = 0, name_len = strlen(xi->name);
+	size_t value_size = 0, name_len = strlen(xi->xi_name);
 	size_t blocksize = inode->i_sb->s_blocksize;
 	int ret, allocation = 0;
 
-	mlog_entry("Set xattr %s in xattr index block\n", xi->name);
+	mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
 
 try_again:
 	xh = xs->header;
@@ -5553,10 +5557,10 @@ try_again:
 			(unsigned long long)bucket_blkno(xs->bucket),
 			header_size);
 
-	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+	if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
 		value_size = OCFS2_XATTR_ROOT_SIZE;
-	else if (xi->value)
-		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+	else if (xi->xi_value)
+		value_size = OCFS2_XATTR_SIZE(xi->xi_value_len);
 
 	if (xs->not_found)
 		need = sizeof(struct ocfs2_xattr_entry) +
@@ -5639,7 +5643,7 @@ try_again:
 		 */
 		ret = ocfs2_check_xattr_bucket_collision(inode,
 							 xs->bucket,
-							 xi->name);
+							 xi->xi_name);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -5663,8 +5667,8 @@ try_again:
 		 */
 		ocfs2_xattr_bucket_relse(xs->bucket);
 		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
-						   xi->name_index,
-						   xi->name, xs);
+						   xi->xi_name_index,
+						   xi->xi_name, xs);
 		if (ret && ret != -ENODATA)
 			goto out;
 		xs->not_found = ret;
@@ -5885,7 +5889,7 @@ static int ocfs2_prepare_refcount_xattr(struct inode *inode,
 	 * refcount tree, and make the original extent become 3. So we will need
 	 * 2 * cluster more extent recs at most.
 	 */
-	if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) {
+	if (!xi->xi_value || xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE) {
 
 		ret = ocfs2_refcounted_xattr_delete_need(inode,
 							 &(*ref_tree)->rf_ci,
-- 
cgit v1.2.3


From 18853b95d1fb964b76c3393a12c4d861e7779460 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 14 Aug 2009 18:17:07 -0700
Subject: ocfs2: Add a name_len field to ocfs2_xattr_info.

Rather than calculating strlen all over the place, let's store the
name length directly on ocfs2_xattr_info.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 84 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 44 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c675a6cda0bb..68126adbf311 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -118,6 +118,7 @@ static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 struct ocfs2_xattr_info {
 	int		xi_name_index;
 	const char	*xi_name;
+	int		xi_name_len;
 	const void	*xi_value;
 	size_t		xi_value_len;
 };
@@ -1361,9 +1362,9 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
 			       size_t offs)
 {
 	int ret = 0;
-	size_t name_len = strlen(xi->xi_name);
 	void *val = xs->base + offs;
-	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
+		OCFS2_XATTR_ROOT_SIZE;
 
 	ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
 			    OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1427,16 +1428,16 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 					 struct ocfs2_xattr_value_buf *vb,
 					 size_t offs)
 {
-	size_t name_len = strlen(xi->xi_name);
 	void *val = xs->base + offs;
 	struct ocfs2_xattr_value_root *xv = NULL;
-	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
+		OCFS2_XATTR_ROOT_SIZE;
 	int ret = 0;
 
 	memset(val, 0, size);
-	memcpy(val, xi->xi_name, name_len);
+	memcpy(val, xi->xi_name, xi->xi_name_len);
 	xv = (struct ocfs2_xattr_value_root *)
-		(val + OCFS2_XATTR_SIZE(name_len));
+		(val + OCFS2_XATTR_SIZE(xi->xi_name_len));
 	xv->xr_clusters = 0;
 	xv->xr_last_eb_blk = 0;
 	xv->xr_list.l_tree_depth = 0;
@@ -1659,7 +1660,6 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 					struct ocfs2_xattr_entry *last,
 					size_t min_offs)
 {
-	size_t name_len = strlen(xi->xi_name);
 	struct ocfs2_xa_loc loc;
 
 	if (xs->xattr_bh == xs->inode_bh)
@@ -1673,7 +1673,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 		le16_add_cpu(&xs->header->xh_count, 1);
 		ocfs2_xattr_set_type(last, xi->xi_name_index);
 		ocfs2_xattr_set_local(last, 1);
-		last->xe_name_len = name_len;
+		last->xe_name_len = xi->xi_name_len;
 	} else {
 		void *first_val;
 		void *val;
@@ -1685,23 +1685,23 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 
 		if (le64_to_cpu(xs->here->xe_value_size) >
 		    OCFS2_XATTR_INLINE_SIZE)
-			size = OCFS2_XATTR_SIZE(name_len) +
+			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 				OCFS2_XATTR_ROOT_SIZE;
 		else
-			size = OCFS2_XATTR_SIZE(name_len) +
+			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
 
-		if (xi->xi_value && size == OCFS2_XATTR_SIZE(name_len) +
+		if (xi->xi_value && size == OCFS2_XATTR_SIZE(xi->xi_name_len) +
 				OCFS2_XATTR_SIZE(xi->xi_value_len)) {
 			/* The old and the new value have the
 			   same size. Just replace the value. */
 			ocfs2_xattr_set_local(xs->here, 1);
 			xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len);
 			/* Clear value bytes. */
-			memset(val + OCFS2_XATTR_SIZE(name_len),
+			memset(val + OCFS2_XATTR_SIZE(xi->xi_name_len),
 			       0,
 			       OCFS2_XATTR_SIZE(xi->xi_value_len));
-			memcpy(val + OCFS2_XATTR_SIZE(name_len),
+			memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len),
 			       xi->xi_value,
 			       xi->xi_value_len);
 			return;
@@ -1716,14 +1716,14 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 	}
 	if (xi->xi_value) {
 		/* Insert the new name+value. */
-		size_t size = OCFS2_XATTR_SIZE(name_len) +
+		size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 				OCFS2_XATTR_SIZE(xi->xi_value_len);
 		void *val = xs->base + min_offs - size;
 
 		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
 		memset(val, 0, size);
-		memcpy(val, xi->xi_name, name_len);
-		memcpy(val + OCFS2_XATTR_SIZE(name_len),
+		memcpy(val, xi->xi_name, xi->xi_name_len);
+		memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len),
 		       xi->xi_value,
 		       xi->xi_value_len);
 		xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len);
@@ -1752,13 +1752,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	struct ocfs2_xattr_entry *last;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->xi_name);
+	size_t min_offs = xs->end - xs->base;
 	size_t size_l = 0;
 	handle_t *handle = ctxt->handle;
 	int free, i, ret;
 	struct ocfs2_xattr_info xi_l = {
 		.xi_name_index = xi->xi_name_index,
 		.xi_name = xi->xi_name,
+		.xi_name_len = xi->xi_name_len,
 		.xi_value = xi->xi_value,
 		.xi_value_len = xi->xi_value_len,
 	};
@@ -1790,27 +1791,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	if (!xs->not_found) {
 		size_t size = 0;
 		if (ocfs2_xattr_is_local(xs->here))
-			size = OCFS2_XATTR_SIZE(name_len) +
+			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
 		else
-			size = OCFS2_XATTR_SIZE(name_len) +
+			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 				OCFS2_XATTR_ROOT_SIZE;
 		free += (size + sizeof(struct ocfs2_xattr_entry));
 	}
 	/* Check free space in inode or block */
 	if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 		if (free < sizeof(struct ocfs2_xattr_entry) +
-			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->xi_name_len) +
 			   OCFS2_XATTR_ROOT_SIZE) {
 			ret = -ENOSPC;
 			goto out;
 		}
-		size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+		size_l = OCFS2_XATTR_SIZE(xi->xi_name_len) +
+			OCFS2_XATTR_ROOT_SIZE;
 		xi_l.xi_value = (void *)&def_xv;
 		xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE;
 	} else if (xi->xi_value) {
 		if (free < sizeof(struct ocfs2_xattr_entry) +
-			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->xi_name_len) +
 			   OCFS2_XATTR_SIZE(xi->xi_value_len)) {
 			ret = -ENOSPC;
 			goto out;
@@ -1819,7 +1821,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 
 	if (!xs->not_found) {
 		/* For existing extended attribute */
-		size_t size = OCFS2_XATTR_SIZE(name_len) +
+		size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
 		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
 		void *val = xs->base + offs;
@@ -1834,7 +1836,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		} else if (!ocfs2_xattr_is_local(xs->here)) {
 			/* For existing xattr which has value outside */
 			vb.vb_xv = (struct ocfs2_xattr_value_root *)
-				(val + OCFS2_XATTR_SIZE(name_len));
+				(val + OCFS2_XATTR_SIZE(xi->xi_name_len));
 
 			if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 				/*
@@ -2616,7 +2618,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 		value_size = OCFS2_XATTR_SIZE(xi->xi_value_len);
 
 	if (free >= sizeof(struct ocfs2_xattr_entry) +
-		   OCFS2_XATTR_SIZE(strlen(xi->xi_name)) + value_size)
+		   OCFS2_XATTR_SIZE(xi->xi_name_len) + value_size)
 		return 1;
 
 	return 0;
@@ -3048,6 +3050,7 @@ int ocfs2_xattr_set_handle(handle_t *handle,
 	struct ocfs2_xattr_info xi = {
 		.xi_name_index = name_index,
 		.xi_name = name,
+		.xi_name_len = strlen(name),
 		.xi_value = value,
 		.xi_value_len = value_len,
 	};
@@ -3131,6 +3134,7 @@ int ocfs2_xattr_set(struct inode *inode,
 	struct ocfs2_xattr_info xi = {
 		.xi_name_index = name_index,
 		.xi_name = name,
+		.xi_name_len = strlen(name),
 		.xi_value = value,
 		.xi_value_len = value_len,
 	};
@@ -4980,7 +4984,6 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 					 int local)
 {
 	struct ocfs2_xattr_entry *last, *xe;
-	int name_len = strlen(xi->xi_name);
 	struct ocfs2_xattr_header *xh = xs->header;
 	u16 count = le16_to_cpu(xh->xh_count), start;
 	size_t blocksize = inode->i_sb->s_blocksize;
@@ -4995,10 +4998,10 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 		xe = xs->here;
 		offs = le16_to_cpu(xe->xe_name_offset);
 		if (ocfs2_xattr_is_local(xe))
-			size = OCFS2_XATTR_SIZE(name_len) +
+			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 			OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
 		else
-			size = OCFS2_XATTR_SIZE(name_len) +
+			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 			OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
 
 		/*
@@ -5008,7 +5011,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 		 * new_size safely here.
 		 * See ocfs2_xattr_set_in_bucket.
 		 */
-		new_size = OCFS2_XATTR_SIZE(name_len) +
+		new_size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
 			   OCFS2_XATTR_SIZE(xi->xi_value_len);
 
 		if (xi->xi_value) {
@@ -5025,10 +5028,10 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 
 			val = ocfs2_xattr_bucket_get_val(inode,
 							 xs->bucket, offs);
-			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
-			       size - OCFS2_XATTR_SIZE(name_len));
+			memset(val + OCFS2_XATTR_SIZE(xi->xi_name_len), 0,
+			       size - OCFS2_XATTR_SIZE(xi->xi_name_len));
 			if (OCFS2_XATTR_SIZE(xi->xi_value_len) > 0)
-				memcpy(val + OCFS2_XATTR_SIZE(name_len),
+				memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len),
 				       xi->xi_value, xi->xi_value_len);
 
 			le16_add_cpu(&xh->xh_name_value_len, new_size);
@@ -5069,13 +5072,14 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 		le16_add_cpu(&xh->xh_count, 1);
 		memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
 		xe->xe_name_hash = cpu_to_le32(name_hash);
-		xe->xe_name_len = name_len;
+		xe->xe_name_len = xi->xi_name_len;
 		ocfs2_xattr_set_type(xe, xi->xi_name_index);
 	}
 
 set_new_name_value:
 	/* Insert the new name+value. */
-	size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->xi_value_len);
+	size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
+		OCFS2_XATTR_SIZE(xi->xi_value_len);
 
 	/*
 	 * We must make sure that the name/value pair
@@ -5094,8 +5098,8 @@ set_new_name_value:
 	xe->xe_name_offset = cpu_to_le16(offs - size);
 
 	memset(val, 0, size);
-	memcpy(val, xi->xi_name, name_len);
-	memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->xi_value,
+	memcpy(val, xi->xi_name, xi->xi_name_len);
+	memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), xi->xi_value,
 	       xi->xi_value_len);
 
 	xe->xe_value_size = cpu_to_le64(xi->xi_value_len);
@@ -5424,7 +5428,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 	char *val = (char *)xi->xi_value;
 	struct ocfs2_xattr_entry *xe = xs->here;
 	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
-					      strlen(xi->xi_name));
+					      xi->xi_name_len);
 
 	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
 		/*
@@ -5537,7 +5541,7 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 	struct ocfs2_xattr_entry *xe;
 	u16 count, header_size, xh_free_start;
 	int free, max_free, need, old;
-	size_t value_size = 0, name_len = strlen(xi->xi_name);
+	size_t value_size = 0;
 	size_t blocksize = inode->i_sb->s_blocksize;
 	int ret, allocation = 0;
 
@@ -5564,9 +5568,9 @@ try_again:
 
 	if (xs->not_found)
 		need = sizeof(struct ocfs2_xattr_entry) +
-			OCFS2_XATTR_SIZE(name_len) + value_size;
+			OCFS2_XATTR_SIZE(xi->xi_name_len) + value_size;
 	else {
-		need = value_size + OCFS2_XATTR_SIZE(name_len);
+		need = value_size + OCFS2_XATTR_SIZE(xi->xi_name_len);
 
 		/*
 		 * We only replace the old value if the new length is smaller
-- 
cgit v1.2.3


From 199799a3609f6d5bb231a75c2e702afaac805431 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 14 Aug 2009 19:04:15 -0700
Subject: ocfs2: Wrap calculation of name+value pair size.

An ocfs2 xattr entry stores the text name and value as a pair in the
storage area.  Obviously names and values can be variable-sized.  If a
value is too large for the entry storage, a tree root is stored instead.
The name+value pair is also padded.

Because of this, there are a million places in the code that do:

	if (needs_external_tree(value_size)
		namevalue_size = pad(name_size) + tree_root_size;
	else
		namevalue_size = pad(name_size) + pad(value_size);

Let's create some convenience functions to make the code more readable.
There are three forms.  The first takes the raw sizes.  The second takes
an ocfs2_xattr_info structure.  The third takes an existing
ocfs2_xattr_entry.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 170 +++++++++++++++++++++----------------------------------
 1 file changed, 65 insertions(+), 105 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 68126adbf311..064ec6d6c23c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -183,6 +183,33 @@ struct ocfs2_xa_loc {
 	const struct ocfs2_xa_loc_operations *xl_ops;
 };
 
+/*
+ * Convenience functions to calculate how much space is needed for a
+ * given name+value pair
+ */
+static int namevalue_size(int name_len, uint64_t value_len)
+{
+	if (value_len > OCFS2_XATTR_INLINE_SIZE)
+		return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	else
+		return OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
+}
+
+static int namevalue_size_xi(struct ocfs2_xattr_info *xi)
+{
+	return namevalue_size(xi->xi_name_len, xi->xi_value_len);
+}
+
+static int namevalue_size_xe(struct ocfs2_xattr_entry *xe)
+{
+	u64 value_len = le64_to_cpu(xe->xe_value_size);
+
+	BUG_ON((value_len > OCFS2_XATTR_INLINE_SIZE) &&
+	       ocfs2_xattr_is_local(xe));
+	return namevalue_size(xe->xe_name_len, value_len);
+}
+
+
 static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
 					     struct ocfs2_xattr_header *xh,
 					     int index,
@@ -529,15 +556,20 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
 
 static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
 {
-	int size = 0;
+	return namevalue_size(name_len, value_len) +
+		sizeof(struct ocfs2_xattr_entry);
+}
 
-	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
-		size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
-	else
-		size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
-	size += sizeof(struct ocfs2_xattr_entry);
+static int ocfs2_xi_entry_usage(struct ocfs2_xattr_info *xi)
+{
+	return namevalue_size_xi(xi) +
+		sizeof(struct ocfs2_xattr_entry);
+}
 
-	return size;
+static int ocfs2_xe_entry_usage(struct ocfs2_xattr_entry *xe)
+{
+	return namevalue_size_xe(xe) +
+		sizeof(struct ocfs2_xattr_entry);
 }
 
 int ocfs2_calc_security_init(struct inode *dir,
@@ -1363,8 +1395,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
 {
 	int ret = 0;
 	void *val = xs->base + offs;
-	size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-		OCFS2_XATTR_ROOT_SIZE;
+	size_t size = namevalue_size_xi(xi);
 
 	ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
 			    OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1430,8 +1461,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 {
 	void *val = xs->base + offs;
 	struct ocfs2_xattr_value_root *xv = NULL;
-	size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-		OCFS2_XATTR_ROOT_SIZE;
+	size_t size = namevalue_size_xi(xi);
 	int ret = 0;
 
 	memset(val, 0, size);
@@ -1490,15 +1520,10 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
 	int namevalue_offset, first_namevalue_offset, namevalue_size;
 	struct ocfs2_xattr_entry *entry = loc->xl_entry;
 	struct ocfs2_xattr_header *xh = loc->xl_header;
-	u64 value_size = le64_to_cpu(entry->xe_value_size);
 	int count = le16_to_cpu(xh->xh_count);
 
 	namevalue_offset = le16_to_cpu(entry->xe_name_offset);
-	namevalue_size = OCFS2_XATTR_SIZE(entry->xe_name_len);
-	if (value_size > OCFS2_XATTR_INLINE_SIZE)
-		namevalue_size += OCFS2_XATTR_ROOT_SIZE;
-	else
-		namevalue_size += OCFS2_XATTR_SIZE(value_size);
+	namevalue_size = namevalue_size_xe(entry);
 
 	for (i = 0, first_namevalue_offset = loc->xl_size;
 	     i < count; i++) {
@@ -1553,17 +1578,8 @@ static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
 
 static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
 {
-	int namevalue_size;
-	struct ocfs2_xattr_entry *entry = loc->xl_entry;
-	u64 value_size = le64_to_cpu(entry->xe_value_size);
-
-	namevalue_size = OCFS2_XATTR_SIZE(entry->xe_name_len);
-	if (value_size > OCFS2_XATTR_INLINE_SIZE)
-		namevalue_size += OCFS2_XATTR_ROOT_SIZE;
-	else
-		namevalue_size += OCFS2_XATTR_SIZE(value_size);
-
-	le16_add_cpu(&loc->xl_header->xh_name_value_len, -namevalue_size);
+	le16_add_cpu(&loc->xl_header->xh_name_value_len,
+		     -namevalue_size_xe(loc->xl_entry));
 }
 
 /* Operations for xattrs stored in buckets. */
@@ -1683,16 +1699,8 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 		offs = le16_to_cpu(xs->here->xe_name_offset);
 		val = xs->base + offs;
 
-		if (le64_to_cpu(xs->here->xe_value_size) >
-		    OCFS2_XATTR_INLINE_SIZE)
-			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-				OCFS2_XATTR_ROOT_SIZE;
-		else
-			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
-
-		if (xi->xi_value && size == OCFS2_XATTR_SIZE(xi->xi_name_len) +
-				OCFS2_XATTR_SIZE(xi->xi_value_len)) {
+		size = namevalue_size_xe(xs->here);
+		if (xi->xi_value && (size == namevalue_size_xi(xi))) {
 			/* The old and the new value have the
 			   same size. Just replace the value. */
 			ocfs2_xattr_set_local(xs->here, 1);
@@ -1716,8 +1724,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 	}
 	if (xi->xi_value) {
 		/* Insert the new name+value. */
-		size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-				OCFS2_XATTR_SIZE(xi->xi_value_len);
+		size_t size = namevalue_size_xi(xi);
 		void *val = xs->base + min_offs - size;
 
 		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
@@ -1788,41 +1795,25 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	if (free < 0)
 		return -EIO;
 
-	if (!xs->not_found) {
-		size_t size = 0;
-		if (ocfs2_xattr_is_local(xs->here))
-			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
-		else
-			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-				OCFS2_XATTR_ROOT_SIZE;
-		free += (size + sizeof(struct ocfs2_xattr_entry));
-	}
+	if (!xs->not_found)
+		free += ocfs2_xe_entry_usage(xs->here);
+
 	/* Check free space in inode or block */
-	if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
-		if (free < sizeof(struct ocfs2_xattr_entry) +
-			   OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			   OCFS2_XATTR_ROOT_SIZE) {
+	if (xi->xi_value) {
+		if (free < ocfs2_xi_entry_usage(xi)) {
 			ret = -ENOSPC;
 			goto out;
 		}
-		size_l = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			OCFS2_XATTR_ROOT_SIZE;
-		xi_l.xi_value = (void *)&def_xv;
-		xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE;
-	} else if (xi->xi_value) {
-		if (free < sizeof(struct ocfs2_xattr_entry) +
-			   OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			   OCFS2_XATTR_SIZE(xi->xi_value_len)) {
-			ret = -ENOSPC;
-			goto out;
+		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+			size_l = namevalue_size_xi(xi);
+			xi_l.xi_value = (void *)&def_xv;
+			xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE;
 		}
 	}
 
 	if (!xs->not_found) {
 		/* For existing extended attribute */
-		size_t size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		size_t size = namevalue_size_xe(xs->here);
 		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
 		void *val = xs->base + offs;
 
@@ -2589,7 +2580,6 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 				       struct ocfs2_xattr_info *xi,
 				       struct ocfs2_xattr_search *xs)
 {
-	u64 value_size;
 	struct ocfs2_xattr_entry *last;
 	int free, i;
 	size_t min_offs = xs->end - xs->base;
@@ -2612,13 +2602,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 
 	BUG_ON(!xs->not_found);
 
-	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
-		value_size = OCFS2_XATTR_ROOT_SIZE;
-	else
-		value_size = OCFS2_XATTR_SIZE(xi->xi_value_len);
-
-	if (free >= sizeof(struct ocfs2_xattr_entry) +
-		   OCFS2_XATTR_SIZE(xi->xi_name_len) + value_size)
+	if (free >= (sizeof(struct ocfs2_xattr_entry) + namevalue_size_xi(xi)))
 		return 1;
 
 	return 0;
@@ -3980,7 +3964,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 				     struct ocfs2_xattr_bucket *bucket)
 {
 	int ret, i;
-	size_t end, offset, len, value_len;
+	size_t end, offset, len;
 	struct ocfs2_xattr_header *xh;
 	char *entries, *buf, *bucket_buf = NULL;
 	u64 blkno = bucket_blkno(bucket);
@@ -4034,12 +4018,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	end = OCFS2_XATTR_BUCKET_SIZE;
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
 		offset = le16_to_cpu(xe->xe_name_offset);
-		if (ocfs2_xattr_is_local(xe))
-			value_len = OCFS2_XATTR_SIZE(
-					le64_to_cpu(xe->xe_value_size));
-		else
-			value_len = OCFS2_XATTR_ROOT_SIZE;
-		len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+		len = namevalue_size_xe(xe);
 
 		/*
 		 * We must make sure that the name/value pair
@@ -4228,7 +4207,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 				    int new_bucket_head)
 {
 	int ret, i;
-	int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
+	int count, start, len, name_value_len = 0, name_offset = 0;
 	struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
@@ -4319,13 +4298,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	name_value_len = 0;
 	for (i = 0; i < start; i++) {
 		xe = &xh->xh_entries[i];
-		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
-		if (ocfs2_xattr_is_local(xe))
-			xe_len +=
-			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
-		else
-			xe_len += OCFS2_XATTR_ROOT_SIZE;
-		name_value_len += xe_len;
+		name_value_len += namevalue_size_xe(xe);
 		if (le16_to_cpu(xe->xe_name_offset) < name_offset)
 			name_offset = le16_to_cpu(xe->xe_name_offset);
 	}
@@ -4355,12 +4328,6 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
 		xe = &xh->xh_entries[i];
-		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
-		if (ocfs2_xattr_is_local(xe))
-			xe_len +=
-			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
-		else
-			xe_len += OCFS2_XATTR_ROOT_SIZE;
 		if (le16_to_cpu(xe->xe_name_offset) <
 		    le16_to_cpu(xh->xh_free_start))
 			xh->xh_free_start = xe->xe_name_offset;
@@ -4997,12 +4964,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 	if (!xs->not_found) {
 		xe = xs->here;
 		offs = le16_to_cpu(xe->xe_name_offset);
-		if (ocfs2_xattr_is_local(xe))
-			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
-		else
-			size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+		size = namevalue_size_xe(xe);
 
 		/*
 		 * If the new value will be stored outside, xi->xi_value has
@@ -5011,8 +4973,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 		 * new_size safely here.
 		 * See ocfs2_xattr_set_in_bucket.
 		 */
-		new_size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-			   OCFS2_XATTR_SIZE(xi->xi_value_len);
+		new_size = namevalue_size_xi(xi);
 
 		if (xi->xi_value) {
 			ocfs2_xa_wipe_namevalue(&loc);
@@ -5078,8 +5039,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 
 set_new_name_value:
 	/* Insert the new name+value. */
-	size = OCFS2_XATTR_SIZE(xi->xi_name_len) +
-		OCFS2_XATTR_SIZE(xi->xi_value_len);
+	size = namevalue_size_xi(xi);
 
 	/*
 	 * We must make sure that the name/value pair
-- 
cgit v1.2.3


From 69a3e539d083ac09aec92b8705b8ff2c2e5c810c Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 17 Aug 2009 12:24:39 -0700
Subject: ocfs2: Set the xattr name+value pair in one place

We create two new functions on ocfs2_xa_loc, ocfs2_xa_prepare_entry()
and ocfs2_xa_store_inline_value().

ocfs2_xa_prepare_entry() makes sure that the xl_entry field of
ocfs2_xa_loc is ready to receive an xattr.  The entry will point to an
appropriately sized name+value region in storage.  If an existing entry
can be reused, it will be.  If no entry already exists, it will be
allocated.  If there isn't space to allocate it, -ENOSPC will be
returned.

ocfs2_xa_store_inline_value() stores the data that goes into the 'value'
part of the name+value pair.  For values that don't fit directly, this
stores the value tree root.

A number of operations are added to ocfs2_xa_loc_operations to support
these functions.  This reflects the disparate behaviors of xattr blocks
and buckets.

With these functions, the overlapping ocfs2_xattr_set_entry_local() and
ocfs2_xattr_set_entry_normal() can be replaced with a single call
scheme.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 634 ++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 411 insertions(+), 223 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 064ec6d6c23c..4bf6ec849e19 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -147,11 +147,31 @@ struct ocfs2_xa_loc_operations {
 	 */
 	void *(*xlo_offset_pointer)(struct ocfs2_xa_loc *loc, int offset);
 
+	/* Can we reuse the existing entry for the new value? */
+	int (*xlo_can_reuse)(struct ocfs2_xa_loc *loc,
+			     struct ocfs2_xattr_info *xi);
+
+	/* How much space is needed for the new value? */
+	int (*xlo_check_space)(struct ocfs2_xa_loc *loc,
+			       struct ocfs2_xattr_info *xi);
+
+	/*
+	 * Return the offset of the first name+value pair.  This is
+	 * the start of our downward-filling free space.
+	 */
+	int (*xlo_get_free_start)(struct ocfs2_xa_loc *loc);
+
 	/*
 	 * Remove the name+value at this location.  Do whatever is
 	 * appropriate with the remaining name+value pairs.
 	 */
 	void (*xlo_wipe_namevalue)(struct ocfs2_xa_loc *loc);
+
+	/* Fill xl_entry with a new entry */
+	void (*xlo_add_entry)(struct ocfs2_xa_loc *loc, u32 name_hash);
+
+	/* Add name+value storage to an entry */
+	void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
 };
 
 /*
@@ -1493,6 +1513,33 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	return ret;
 }
 
+static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
+				       int num_entries)
+{
+	int free_space;
+
+	if (!needed_space)
+		return 0;
+
+	free_space = free_start -
+		sizeof(struct ocfs2_xattr_header) -
+		(num_entries * sizeof(struct ocfs2_xattr_entry)) -
+		OCFS2_XATTR_HEADER_GAP;
+	if (free_space < 0)
+		return -EIO;
+	if (free_space < needed_space)
+		return -ENOSPC;
+
+	return 0;
+}
+
+/* Give a pointer into the storage for the given offset */
+static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
+{
+	BUG_ON(offset >= loc->xl_size);
+	return loc->xl_ops->xlo_offset_pointer(loc, offset);
+}
+
 /*
  * Wipe the name+value pair and allow the storage to reclaim it.  This
  * must be followed by either removal of the entry or a call to
@@ -1503,13 +1550,117 @@ static void ocfs2_xa_wipe_namevalue(struct ocfs2_xa_loc *loc)
 	loc->xl_ops->xlo_wipe_namevalue(loc);
 }
 
+/*
+ * Find lowest offset to a name+value pair.  This is the start of our
+ * downward-growing free space.
+ */
+static int ocfs2_xa_get_free_start(struct ocfs2_xa_loc *loc)
+{
+	return loc->xl_ops->xlo_get_free_start(loc);
+}
+
+/* Can we reuse loc->xl_entry for xi? */
+static int ocfs2_xa_can_reuse_entry(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_info *xi)
+{
+	return loc->xl_ops->xlo_can_reuse(loc, xi);
+}
+
+/* How much free space is needed to set the new value */
+static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi)
+{
+	return loc->xl_ops->xlo_check_space(loc, xi);
+}
+
+static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	loc->xl_ops->xlo_add_entry(loc, name_hash);
+	loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
+	/*
+	 * We can't leave the new entry's xe_name_offset at zero or
+	 * add_namevalue() will go nuts.  We set it to the size of our
+	 * storage so that it can never be less than any other entry.
+	 */
+	loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
+}
+
+static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
+				   struct ocfs2_xattr_info *xi)
+{
+	int size = namevalue_size_xi(xi);
+	int nameval_offset;
+	char *nameval_buf;
+
+	loc->xl_ops->xlo_add_namevalue(loc, size);
+	loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+	loc->xl_entry->xe_name_len = xi->xi_name_len;
+	ocfs2_xattr_set_type(loc->xl_entry, xi->xi_name_index);
+	ocfs2_xattr_set_local(loc->xl_entry,
+			      xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE);
+
+	nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+	memset(nameval_buf, 0, size);
+	memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
+}
+
 static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
 					   int offset)
 {
-	BUG_ON(offset >= loc->xl_size);
 	return (char *)loc->xl_header + offset;
 }
 
+static int ocfs2_xa_block_can_reuse(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_info *xi)
+{
+	/*
+	 * Block storage is strict.  If the sizes aren't exact, we will
+	 * remove the old one and reinsert the new.
+	 */
+	return namevalue_size_xe(loc->xl_entry) ==
+		namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_block_get_free_start(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	int i, count = le16_to_cpu(xh->xh_count);
+	int offset, free_start = loc->xl_size;
+
+	for (i = 0; i < count; i++) {
+		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
+		if (offset < free_start)
+			free_start = offset;
+	}
+
+	return free_start;
+}
+
+static int ocfs2_xa_block_check_space(struct ocfs2_xa_loc *loc,
+				      struct ocfs2_xattr_info *xi)
+{
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	int free_start = ocfs2_xa_get_free_start(loc);
+	int needed_space = ocfs2_xi_entry_usage(xi);
+
+	/*
+	 * Block storage will reclaim the original entry before inserting
+	 * the new value, so we only need the difference.  If the new
+	 * entry is smaller than the old one, we don't need anything.
+	 */
+	if (loc->xl_entry) {
+		/* Don't need space if we're reusing! */
+		if (ocfs2_xa_can_reuse_entry(loc, xi))
+			needed_space = 0;
+		else
+			needed_space -= ocfs2_xe_entry_usage(loc->xl_entry);
+	}
+	if (needed_space < 0)
+		needed_space = 0;
+	return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
+
 /*
  * Block storage for xattrs keeps the name+value pairs compacted.  When
  * we remove one, we have to shift any that preceded it towards the end.
@@ -1524,13 +1675,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
 
 	namevalue_offset = le16_to_cpu(entry->xe_name_offset);
 	namevalue_size = namevalue_size_xe(entry);
-
-	for (i = 0, first_namevalue_offset = loc->xl_size;
-	     i < count; i++) {
-		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
-		if (offset < first_namevalue_offset)
-			first_namevalue_offset = offset;
-	}
+	first_namevalue_offset = ocfs2_xa_get_free_start(loc);
 
 	/* Shift the name+value pairs */
 	memmove((char *)xh + first_namevalue_offset + namevalue_size,
@@ -1552,13 +1697,33 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
 	 */
 }
 
+static void ocfs2_xa_block_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	loc->xl_entry = &(loc->xl_header->xh_entries[count]);
+	le16_add_cpu(&loc->xl_header->xh_count, 1);
+	memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+	int free_start = ocfs2_xa_get_free_start(loc);
+
+	loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
+}
+
 /*
  * Operations for xattrs stored in blocks.  This includes inline inode
  * storage and unindexed ocfs2_xattr_blocks.
  */
 static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
 	.xlo_offset_pointer	= ocfs2_xa_block_offset_pointer,
+	.xlo_check_space	= ocfs2_xa_block_check_space,
+	.xlo_can_reuse		= ocfs2_xa_block_can_reuse,
+	.xlo_get_free_start	= ocfs2_xa_block_get_free_start,
 	.xlo_wipe_namevalue	= ocfs2_xa_block_wipe_namevalue,
+	.xlo_add_entry		= ocfs2_xa_block_add_entry,
+	.xlo_add_namevalue	= ocfs2_xa_block_add_namevalue,
 };
 
 static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
@@ -1567,8 +1732,6 @@ static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
 	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
 	int block, block_offset;
 
-	BUG_ON(offset >= OCFS2_XATTR_BUCKET_SIZE);
-
 	/* The header is at the front of the bucket */
 	block = offset >> bucket->bu_inode->i_sb->s_blocksize_bits;
 	block_offset = offset % bucket->bu_inode->i_sb->s_blocksize;
@@ -1576,16 +1739,145 @@ static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
 	return bucket_block(bucket, block) + block_offset;
 }
 
+static int ocfs2_xa_bucket_can_reuse(struct ocfs2_xa_loc *loc,
+				     struct ocfs2_xattr_info *xi)
+{
+	return namevalue_size_xe(loc->xl_entry) >=
+		namevalue_size_xi(xi);
+}
+
+static int ocfs2_xa_bucket_get_free_start(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	return le16_to_cpu(bucket_xh(bucket)->xh_free_start);
+}
+
+static int ocfs2_bucket_align_free_start(struct super_block *sb,
+					 int free_start, int size)
+{
+	/*
+	 * We need to make sure that the name+value pair fits within
+	 * one block.
+	 */
+	if (((free_start - size) >> sb->s_blocksize_bits) !=
+	    ((free_start - 1) >> sb->s_blocksize_bits))
+		free_start -= free_start % sb->s_blocksize;
+
+	return free_start;
+}
+
+static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
+				       struct ocfs2_xattr_info *xi)
+{
+	int rc;
+	int count = le16_to_cpu(loc->xl_header->xh_count);
+	int free_start = ocfs2_xa_get_free_start(loc);
+	int needed_space = ocfs2_xi_entry_usage(xi);
+	int size = namevalue_size_xi(xi);
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	struct super_block *sb = bucket->bu_inode->i_sb;
+
+	/*
+	 * Bucket storage does not reclaim name+value pairs it cannot
+	 * reuse.  They live as holes until the bucket fills, and then
+	 * the bucket is defragmented.  However, the bucket can reclaim
+	 * the ocfs2_xattr_entry.
+	 */
+	if (loc->xl_entry) {
+		/* Don't need space if we're reusing! */
+		if (ocfs2_xa_can_reuse_entry(loc, xi))
+			needed_space = 0;
+		else
+			needed_space -= sizeof(struct ocfs2_xattr_entry);
+	}
+	BUG_ON(needed_space < 0);
+
+	if (free_start < size) {
+		if (needed_space)
+			return -ENOSPC;
+	} else {
+		/*
+		 * First we check if it would fit in the first place.
+		 * Below, we align the free start to a block.  This may
+		 * slide us below the minimum gap.  By checking unaligned
+		 * first, we avoid that error.
+		 */
+		rc = ocfs2_xa_check_space_helper(needed_space, free_start,
+						 count);
+		if (rc)
+			return rc;
+		free_start = ocfs2_bucket_align_free_start(sb, free_start,
+							   size);
+	}
+	return ocfs2_xa_check_space_helper(needed_space, free_start, count);
+}
+
 static void ocfs2_xa_bucket_wipe_namevalue(struct ocfs2_xa_loc *loc)
 {
 	le16_add_cpu(&loc->xl_header->xh_name_value_len,
 		     -namevalue_size_xe(loc->xl_entry));
 }
 
+static void ocfs2_xa_bucket_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	int count = le16_to_cpu(xh->xh_count);
+	int low = 0, high = count - 1, tmp;
+	struct ocfs2_xattr_entry *tmp_xe;
+
+	/*
+	 * We keep buckets sorted by name_hash, so we need to find
+	 * our insert place.
+	 */
+	while (low <= high && count) {
+		tmp = (low + high) / 2;
+		tmp_xe = &xh->xh_entries[tmp];
+
+		if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+			low = tmp + 1;
+		else if (name_hash < le32_to_cpu(tmp_xe->xe_name_hash))
+			high = tmp - 1;
+		else {
+			low = tmp;
+			break;
+		}
+	}
+
+	if (low != count)
+		memmove(&xh->xh_entries[low + 1],
+			&xh->xh_entries[low],
+			((count - low) * sizeof(struct ocfs2_xattr_entry)));
+
+	le16_add_cpu(&xh->xh_count, 1);
+	loc->xl_entry = &xh->xh_entries[low];
+	memset(loc->xl_entry, 0, sizeof(struct ocfs2_xattr_entry));
+}
+
+static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
+{
+	int free_start = ocfs2_xa_get_free_start(loc);
+	struct ocfs2_xattr_header *xh = loc->xl_header;
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	struct super_block *sb = bucket->bu_inode->i_sb;
+	int nameval_offset;
+
+	free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
+	nameval_offset = free_start - size;
+	loc->xl_entry->xe_name_offset = cpu_to_le16(nameval_offset);
+	xh->xh_free_start = cpu_to_le16(nameval_offset);
+	le16_add_cpu(&xh->xh_name_value_len, size);
+
+}
+
 /* Operations for xattrs stored in buckets. */
 static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
 	.xlo_offset_pointer	= ocfs2_xa_bucket_offset_pointer,
+	.xlo_check_space	= ocfs2_xa_bucket_check_space,
+	.xlo_can_reuse		= ocfs2_xa_bucket_can_reuse,
+	.xlo_get_free_start	= ocfs2_xa_bucket_get_free_start,
 	.xlo_wipe_namevalue	= ocfs2_xa_bucket_wipe_namevalue,
+	.xlo_add_entry		= ocfs2_xa_bucket_add_entry,
+	.xlo_add_namevalue	= ocfs2_xa_bucket_add_namevalue,
 };
 
 static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
@@ -1615,6 +1907,77 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
 	}
 }
 
+/*
+ * Prepares loc->xl_entry to receive the new xattr.  This includes
+ * properly setting up the name+value pair region.  If loc->xl_entry
+ * already exists, it will take care of modifying it appropriately.
+ * This also includes deleting entries, but don't call this to remove
+ * a non-existant entry.  That's just a bug.
+ *
+ * Note that this modifies the data.  You did journal_access already,
+ * right?
+ */
+static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
+				  struct ocfs2_xattr_info *xi,
+				  u32 name_hash)
+{
+	int rc = 0;
+	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	char *nameval_buf;
+
+	if (!xi->xi_value) {
+		ocfs2_xa_remove_entry(loc);
+		goto out;
+	}
+
+	rc = ocfs2_xa_check_space(loc, xi);
+	if (rc)
+		goto out;
+
+	if (loc->xl_entry) {
+		if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+			nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+			memset(nameval_buf + name_size, 0,
+			       namevalue_size_xe(loc->xl_entry) - name_size);
+			loc->xl_entry->xe_value_size =
+				cpu_to_le64(xi->xi_value_len);
+			goto out;
+		}
+
+		ocfs2_xa_wipe_namevalue(loc);
+	} else
+		ocfs2_xa_add_entry(loc, name_hash);
+
+	/*
+	 * If we get here, we have a blank entry.  Fill it.  We grow our
+	 * name+value pair back from the end.
+	 */
+	ocfs2_xa_add_namevalue(loc, xi);
+
+out:
+	return rc;
+}
+
+/*
+ * Store the value portion of the name+value pair.  This is either an
+ * inline value or the tree root of an external value.
+ */
+static void ocfs2_xa_store_inline_value(struct ocfs2_xa_loc *loc,
+					struct ocfs2_xattr_info *xi)
+{
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	int size = namevalue_size_xi(xi);
+	char *nameval_buf;
+
+	if (!xi->xi_value)
+		return;
+
+	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
+	memcpy(nameval_buf + name_size, xi->xi_value, size - name_size);
+}
+
 static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
 				     struct inode *inode,
 				     struct buffer_head *bh,
@@ -1665,81 +2028,6 @@ static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
 	loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
 }
 
-/*
- * ocfs2_xattr_set_entry_local()
- *
- * Set, replace or remove extended attribute in local.
- */
-static void ocfs2_xattr_set_entry_local(struct inode *inode,
-					struct ocfs2_xattr_info *xi,
-					struct ocfs2_xattr_search *xs,
-					struct ocfs2_xattr_entry *last,
-					size_t min_offs)
-{
-	struct ocfs2_xa_loc loc;
-
-	if (xs->xattr_bh == xs->inode_bh)
-		ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
-					 xs->not_found ? NULL : xs->here);
-	else
-		ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh,
-					      xs->not_found ? NULL : xs->here);
-	if (xi->xi_value && xs->not_found) {
-		/* Insert the new xattr entry. */
-		le16_add_cpu(&xs->header->xh_count, 1);
-		ocfs2_xattr_set_type(last, xi->xi_name_index);
-		ocfs2_xattr_set_local(last, 1);
-		last->xe_name_len = xi->xi_name_len;
-	} else {
-		void *first_val;
-		void *val;
-		size_t offs, size;
-
-		first_val = xs->base + min_offs;
-		offs = le16_to_cpu(xs->here->xe_name_offset);
-		val = xs->base + offs;
-
-		size = namevalue_size_xe(xs->here);
-		if (xi->xi_value && (size == namevalue_size_xi(xi))) {
-			/* The old and the new value have the
-			   same size. Just replace the value. */
-			ocfs2_xattr_set_local(xs->here, 1);
-			xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len);
-			/* Clear value bytes. */
-			memset(val + OCFS2_XATTR_SIZE(xi->xi_name_len),
-			       0,
-			       OCFS2_XATTR_SIZE(xi->xi_value_len));
-			memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len),
-			       xi->xi_value,
-			       xi->xi_value_len);
-			return;
-		}
-
-		if (!xi->xi_value)
-			ocfs2_xa_remove_entry(&loc);
-		else
-			ocfs2_xa_wipe_namevalue(&loc);
-
-		min_offs += size;
-	}
-	if (xi->xi_value) {
-		/* Insert the new name+value. */
-		size_t size = namevalue_size_xi(xi);
-		void *val = xs->base + min_offs - size;
-
-		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
-		memset(val, 0, size);
-		memcpy(val, xi->xi_name, xi->xi_name_len);
-		memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len),
-		       xi->xi_value,
-		       xi->xi_value_len);
-		xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len);
-		ocfs2_xattr_set_local(xs->here, 1);
-		ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
-	}
-
-	return;
-}
 
 /*
  * ocfs2_xattr_set_entry()
@@ -1747,7 +2035,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
  * Set extended attribute entry into inode or block.
  *
  * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
- * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
+ * We first insert tree root(ocfs2_xattr_value_root) like a normal value,
  * then set value in B tree with set_value_outside().
  */
 static int ocfs2_xattr_set_entry(struct inode *inode,
@@ -1763,6 +2051,9 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	size_t size_l = 0;
 	handle_t *handle = ctxt->handle;
 	int free, i, ret;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
+					      xi->xi_name_len);
+	struct ocfs2_xa_loc loc;
 	struct ocfs2_xattr_info xi_l = {
 		.xi_name_index = xi->xi_name_index,
 		.xi_name = xi->xi_name,
@@ -1894,11 +2185,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		}
 	}
 
+	if (xs->xattr_bh == xs->inode_bh)
+		ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
+					 xs->not_found ? NULL : xs->here);
+	else
+		ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh,
+					      xs->not_found ? NULL : xs->here);
+
 	/*
-	 * Set value in local, include set tree root in local.
-	 * This is the first step for value size >INLINE_SIZE.
+	 * Prepare our entry and insert the inline value.  This will
+	 * be a value tree root for values that are larger than
+	 * OCFS2_XATTR_INLINE_SIZE.
 	 */
-	ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
+	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+	/* XXX For now, until we make ocfs2_xa_prepare_entry() primary */
+	BUG_ON(ret == -ENOSPC);
+	ocfs2_xa_store_inline_value(&loc, xi);
+	xs->here = loc.xl_entry;
 
 	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
 		ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
@@ -4938,139 +5246,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
 	return bucket_block(bucket, block_off) + offs;
 }
 
-/*
- * Handle the normal xattr set, including replace, delete and new.
- *
- * Note: "local" indicates the real data's locality. So we can't
- * just its bucket locality by its length.
- */
-static void ocfs2_xattr_set_entry_normal(struct inode *inode,
-					 struct ocfs2_xattr_info *xi,
-					 struct ocfs2_xattr_search *xs,
-					 u32 name_hash,
-					 int local)
-{
-	struct ocfs2_xattr_entry *last, *xe;
-	struct ocfs2_xattr_header *xh = xs->header;
-	u16 count = le16_to_cpu(xh->xh_count), start;
-	size_t blocksize = inode->i_sb->s_blocksize;
-	char *val;
-	size_t offs, size, new_size;
-	struct ocfs2_xa_loc loc;
-
-	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
-				       xs->not_found ? NULL : xs->here);
-	last = &xh->xh_entries[count];
-	if (!xs->not_found) {
-		xe = xs->here;
-		offs = le16_to_cpu(xe->xe_name_offset);
-		size = namevalue_size_xe(xe);
-
-		/*
-		 * If the new value will be stored outside, xi->xi_value has
-		 * been initalized as an empty ocfs2_xattr_value_root, and
-		 * the same goes with xi->xi_value_len, so we can set
-		 * new_size safely here.
-		 * See ocfs2_xattr_set_in_bucket.
-		 */
-		new_size = namevalue_size_xi(xi);
-
-		if (xi->xi_value) {
-			ocfs2_xa_wipe_namevalue(&loc);
-			if (new_size > size)
-				goto set_new_name_value;
-
-			/* Now replace the old value with new one. */
-			if (local)
-				xe->xe_value_size =
-					cpu_to_le64(xi->xi_value_len);
-			else
-				xe->xe_value_size = 0;
-
-			val = ocfs2_xattr_bucket_get_val(inode,
-							 xs->bucket, offs);
-			memset(val + OCFS2_XATTR_SIZE(xi->xi_name_len), 0,
-			       size - OCFS2_XATTR_SIZE(xi->xi_name_len));
-			if (OCFS2_XATTR_SIZE(xi->xi_value_len) > 0)
-				memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len),
-				       xi->xi_value, xi->xi_value_len);
-
-			le16_add_cpu(&xh->xh_name_value_len, new_size);
-			ocfs2_xattr_set_local(xe, local);
-			return;
-		} else {
-			ocfs2_xa_remove_entry(&loc);
-			if (!xh->xh_count)
-				xh->xh_free_start =
-					cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
-
-			return;
-		}
-	} else {
-		/* find a new entry for insert. */
-		int low = 0, high = count - 1, tmp;
-		struct ocfs2_xattr_entry *tmp_xe;
-
-		while (low <= high && count) {
-			tmp = (low + high) / 2;
-			tmp_xe = &xh->xh_entries[tmp];
-
-			if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
-				low = tmp + 1;
-			else if (name_hash <
-				 le32_to_cpu(tmp_xe->xe_name_hash))
-				high = tmp - 1;
-			else {
-				low = tmp;
-				break;
-			}
-		}
-
-		xe = &xh->xh_entries[low];
-		if (low != count)
-			memmove(xe + 1, xe, (void *)last - (void *)xe);
-
-		le16_add_cpu(&xh->xh_count, 1);
-		memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
-		xe->xe_name_hash = cpu_to_le32(name_hash);
-		xe->xe_name_len = xi->xi_name_len;
-		ocfs2_xattr_set_type(xe, xi->xi_name_index);
-	}
-
-set_new_name_value:
-	/* Insert the new name+value. */
-	size = namevalue_size_xi(xi);
-
-	/*
-	 * We must make sure that the name/value pair
-	 * exists in the same block.
-	 */
-	offs = le16_to_cpu(xh->xh_free_start);
-	start = offs - size;
-
-	if (start >> inode->i_sb->s_blocksize_bits !=
-	    (offs - 1) >> inode->i_sb->s_blocksize_bits) {
-		offs = offs - offs % blocksize;
-		xh->xh_free_start = cpu_to_le16(offs);
-	}
-
-	val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
-	xe->xe_name_offset = cpu_to_le16(offs - size);
-
-	memset(val, 0, size);
-	memcpy(val, xi->xi_name, xi->xi_name_len);
-	memcpy(val + OCFS2_XATTR_SIZE(xi->xi_name_len), xi->xi_value,
-	       xi->xi_value_len);
-
-	xe->xe_value_size = cpu_to_le64(xi->xi_value_len);
-	ocfs2_xattr_set_local(xe, local);
-	xs->here = xe;
-	le16_add_cpu(&xh->xh_free_start, -size);
-	le16_add_cpu(&xh->xh_name_value_len, size);
-
-	return;
-}
-
 /*
  * Set the xattr entry in the specified bucket.
  * The bucket is indicated by xs->bucket and it should have the enough
@@ -5085,6 +5260,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 {
 	int ret;
 	u64 blkno;
+	struct ocfs2_xa_loc loc;
 
 	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
 	     (unsigned long)xi->xi_value_len, xi->xi_name_index,
@@ -5107,7 +5283,19 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+				       xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+	/* XXX For now, until we make ocfs2_xa_prepare_entry() primary */
+	BUG_ON(ret == -ENOSPC);
+	ocfs2_xa_store_inline_value(&loc, xi);
+	xs->here = loc.xl_entry;
+
 	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
 
 out:
-- 
cgit v1.2.3


From 9dc474005d0e34cf21d4b510f347e3942f24b021 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 17 Aug 2009 19:56:01 -0700
Subject: ocfs2: Handle value tree roots in ocfs2_xa_set_inline_value()

Previously the xattr code would send in a fake value, containing a tree
root, to the function that installed name+value pairs.  Instead, we pass
the real value to ocfs2_xa_set_inline_value(), and it notices that the
value cannot fit.  Thus, it installs a tree root.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 54 ++++++++++++++++--------------------------------------
 1 file changed, 16 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4bf6ec849e19..6d362e2d3960 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1968,14 +1968,19 @@ static void ocfs2_xa_store_inline_value(struct ocfs2_xa_loc *loc,
 {
 	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
 	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
-	int size = namevalue_size_xi(xi);
+	int inline_value_size = namevalue_size_xi(xi) - name_size;
+	const void *value = xi->xi_value;
 	char *nameval_buf;
 
 	if (!xi->xi_value)
 		return;
 
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		value = &def_xv;
+		inline_value_size = OCFS2_XATTR_ROOT_SIZE;
+	}
 	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
-	memcpy(nameval_buf + name_size, xi->xi_value, size - name_size);
+	memcpy(nameval_buf + name_size, value, inline_value_size);
 }
 
 static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
@@ -2054,13 +2059,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
 					      xi->xi_name_len);
 	struct ocfs2_xa_loc loc;
-	struct ocfs2_xattr_info xi_l = {
-		.xi_name_index = xi->xi_name_index,
-		.xi_name = xi->xi_name,
-		.xi_name_len = xi->xi_name_len,
-		.xi_value = xi->xi_value,
-		.xi_value_len = xi->xi_value_len,
-	};
 	struct ocfs2_xattr_value_buf vb = {
 		.vb_bh = xs->xattr_bh,
 		.vb_access = ocfs2_journal_access_di,
@@ -2090,16 +2088,9 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		free += ocfs2_xe_entry_usage(xs->here);
 
 	/* Check free space in inode or block */
-	if (xi->xi_value) {
-		if (free < ocfs2_xi_entry_usage(xi)) {
-			ret = -ENOSPC;
-			goto out;
-		}
-		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
-			size_l = namevalue_size_xi(xi);
-			xi_l.xi_value = (void *)&def_xv;
-			xi_l.xi_value_len = OCFS2_XATTR_ROOT_SIZE;
-		}
+	if (xi->xi_value && (free < ocfs2_xi_entry_usage(xi))) {
+		ret = -ENOSPC;
+		goto out;
 	}
 
 	if (!xs->not_found) {
@@ -5255,8 +5246,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 					   handle_t *handle,
 					   struct ocfs2_xattr_info *xi,
 					   struct ocfs2_xattr_search *xs,
-					   u32 name_hash,
-					   int local)
+					   u32 name_hash)
 {
 	int ret;
 	u64 blkno;
@@ -5571,13 +5561,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 				     struct ocfs2_xattr_search *xs,
 				     struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int ret, local = 1;
+	int ret;
 	size_t value_len;
 	char *val = (char *)xi->xi_value;
 	struct ocfs2_xattr_entry *xe = xs->here;
 	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
 					      xi->xi_name_len);
 
+	value_len = xi->xi_value_len;
 	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
 		/*
 		 * We need to truncate the xattr storage first.
@@ -5591,9 +5582,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 		 * the modification to the xattr block will be done
 		 * by following steps.
 		 */
-		if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
-			value_len = xi->xi_value_len;
-		else
+		if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE)
 			value_len = 0;
 
 		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
@@ -5606,26 +5595,15 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 			goto set_value_outside;
 	}
 
-	value_len = xi->xi_value_len;
 	/* So we have to handle the inside block change now. */
-	if (value_len > OCFS2_XATTR_INLINE_SIZE) {
-		/*
-		 * If the new value will be stored outside of block,
-		 * initalize a new empty value root and insert it first.
-		 */
-		local = 0;
-		xi->xi_value = &def_xv;
-		xi->xi_value_len = OCFS2_XATTR_ROOT_SIZE;
-	}
-
 	ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
-					      name_hash, local);
+					      name_hash);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+	if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE)
 		goto out;
 
 	/* allocate the space now for the outside block storage. */
-- 
cgit v1.2.3


From 3fc12afa0cea5dc8845487b685165c89934ca1eb Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 18 Aug 2009 13:20:27 -0700
Subject: ocfs2: Provide ocfs2_xa_fill_value_buf() for external value
 processing

We use the ocfs2_xattr_value_buf structure to manage external values.
It lets the value tree code do its work regardless of the containing
storage.  ocfs2_xa_fill_value_buf() initializes a value buf from an
ocfs2_xa_loc entry.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6d362e2d3960..e0d0fa23a19b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -172,6 +172,13 @@ struct ocfs2_xa_loc_operations {
 
 	/* Add name+value storage to an entry */
 	void (*xlo_add_namevalue)(struct ocfs2_xa_loc *loc, int size);
+
+	/*
+	 * Initialize the value buf's access and bh fields for this entry.
+	 * ocfs2_xa_fill_value_buf() will handle the xv pointer.
+	 */
+	void (*xlo_fill_value_buf)(struct ocfs2_xa_loc *loc,
+				   struct ocfs2_xattr_value_buf *vb);
 };
 
 /*
@@ -1605,6 +1612,23 @@ static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
 	memcpy(nameval_buf, xi->xi_name, xi->xi_name_len);
 }
 
+static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
+				    struct ocfs2_xattr_value_buf *vb)
+{
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+
+	/* Value bufs are for value trees */
+	BUG_ON(namevalue_size_xe(loc->xl_entry) !=
+	       (name_size + OCFS2_XATTR_ROOT_SIZE));
+
+	loc->xl_ops->xlo_fill_value_buf(loc, vb);
+	vb->vb_xv =
+		(struct ocfs2_xattr_value_root *)ocfs2_xa_offset_pointer(loc,
+							nameval_offset +
+							name_size);
+}
+
 static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
 					   int offset)
 {
@@ -1712,6 +1736,20 @@ static void ocfs2_xa_block_add_namevalue(struct ocfs2_xa_loc *loc, int size)
 	loc->xl_entry->xe_name_offset = cpu_to_le16(free_start - size);
 }
 
+static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
+					  struct ocfs2_xattr_value_buf *vb)
+{
+	struct buffer_head *bh = loc->xl_storage;
+
+	if (loc->xl_size == (bh->b_size -
+			     offsetof(struct ocfs2_xattr_block,
+				      xb_attrs.xb_header)))
+		vb->vb_access = ocfs2_journal_access_xb;
+	else
+		vb->vb_access = ocfs2_journal_access_di;
+	vb->vb_bh = bh;
+}
+
 /*
  * Operations for xattrs stored in blocks.  This includes inline inode
  * storage and unindexed ocfs2_xattr_blocks.
@@ -1724,6 +1762,7 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
 	.xlo_wipe_namevalue	= ocfs2_xa_block_wipe_namevalue,
 	.xlo_add_entry		= ocfs2_xa_block_add_entry,
 	.xlo_add_namevalue	= ocfs2_xa_block_add_namevalue,
+	.xlo_fill_value_buf	= ocfs2_xa_block_fill_value_buf,
 };
 
 static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
@@ -1869,6 +1908,25 @@ static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
 
 }
 
+static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
+					   struct ocfs2_xattr_value_buf *vb)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+	struct super_block *sb = bucket->bu_inode->i_sb;
+	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
+	int size = namevalue_size_xe(loc->xl_entry);
+	int block_offset = nameval_offset >> sb->s_blocksize_bits;
+
+	/* Values are not allowed to straddle block boundaries */
+	BUG_ON(block_offset !=
+	       ((nameval_offset + size - 1) >> sb->s_blocksize_bits));
+	/* We expect the bucket to be filled in */
+	BUG_ON(!bucket->bu_bhs[block_offset]);
+
+	vb->vb_access = ocfs2_journal_access;
+	vb->vb_bh = bucket->bu_bhs[block_offset];
+}
+
 /* Operations for xattrs stored in buckets. */
 static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
 	.xlo_offset_pointer	= ocfs2_xa_bucket_offset_pointer,
@@ -1878,6 +1936,7 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
 	.xlo_wipe_namevalue	= ocfs2_xa_bucket_wipe_namevalue,
 	.xlo_add_entry		= ocfs2_xa_bucket_add_entry,
 	.xlo_add_namevalue	= ocfs2_xa_bucket_add_namevalue,
+	.xlo_fill_value_buf	= ocfs2_xa_bucket_fill_value_buf,
 };
 
 static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
-- 
cgit v1.2.3


From cf2bc809403ae48a4a2bb5cc551d2ec35f2e4a47 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 18 Aug 2009 13:52:38 -0700
Subject: ocfs2: Teach ocfs2_xa_loc how to do its own journal work

We're going to want to make sure our buffers get accessed and dirtied
correctly.  So have the xa_loc do the work.  This includes storing the
inode on ocfs2_xa_loc.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 115 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 86 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e0d0fa23a19b..fbec11610223 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -141,6 +141,13 @@ struct ocfs2_xattr_search {
 /* Operations on struct ocfs2_xa_entry */
 struct ocfs2_xa_loc;
 struct ocfs2_xa_loc_operations {
+	/*
+	 * Journal functions
+	 */
+	int (*xlo_journal_access)(handle_t *handle, struct ocfs2_xa_loc *loc,
+				  int type);
+	void (*xlo_journal_dirty)(handle_t *handle, struct ocfs2_xa_loc *loc);
+
 	/*
 	 * Return a pointer to the appropriate buffer in loc->xl_storage
 	 * at the given offset from loc->xl_header.
@@ -186,6 +193,9 @@ struct ocfs2_xa_loc_operations {
  * tracking the on-disk structure.
  */
 struct ocfs2_xa_loc {
+	/* This xattr belongs to this inode */
+	struct inode *xl_inode;
+
 	/* The ocfs2_xattr_header inside the on-disk storage. Not NULL. */
 	struct ocfs2_xattr_header *xl_header;
 
@@ -1540,6 +1550,17 @@ static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
 	return 0;
 }
 
+static int ocfs2_xa_journal_access(handle_t *handle, struct ocfs2_xa_loc *loc,
+				   int type)
+{
+	return loc->xl_ops->xlo_journal_access(handle, loc, type);
+}
+
+static void ocfs2_xa_journal_dirty(handle_t *handle, struct ocfs2_xa_loc *loc)
+{
+	loc->xl_ops->xlo_journal_dirty(handle, loc);
+}
+
 /* Give a pointer into the storage for the given offset */
 static void *ocfs2_xa_offset_pointer(struct ocfs2_xa_loc *loc, int offset)
 {
@@ -1629,6 +1650,29 @@ static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
 							name_size);
 }
 
+static int ocfs2_xa_block_journal_access(handle_t *handle,
+					 struct ocfs2_xa_loc *loc, int type)
+{
+	struct buffer_head *bh = loc->xl_storage;
+	ocfs2_journal_access_func access;
+
+	if (loc->xl_size == (bh->b_size -
+			     offsetof(struct ocfs2_xattr_block,
+				      xb_attrs.xb_header)))
+		access = ocfs2_journal_access_xb;
+	else
+		access = ocfs2_journal_access_di;
+	return access(handle, INODE_CACHE(loc->xl_inode), bh, type);
+}
+
+static void ocfs2_xa_block_journal_dirty(handle_t *handle,
+					 struct ocfs2_xa_loc *loc)
+{
+	struct buffer_head *bh = loc->xl_storage;
+
+	ocfs2_journal_dirty(handle, bh);
+}
+
 static void *ocfs2_xa_block_offset_pointer(struct ocfs2_xa_loc *loc,
 					   int offset)
 {
@@ -1755,6 +1799,8 @@ static void ocfs2_xa_block_fill_value_buf(struct ocfs2_xa_loc *loc,
  * storage and unindexed ocfs2_xattr_blocks.
  */
 static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
+	.xlo_journal_access	= ocfs2_xa_block_journal_access,
+	.xlo_journal_dirty	= ocfs2_xa_block_journal_dirty,
 	.xlo_offset_pointer	= ocfs2_xa_block_offset_pointer,
 	.xlo_check_space	= ocfs2_xa_block_check_space,
 	.xlo_can_reuse		= ocfs2_xa_block_can_reuse,
@@ -1765,6 +1811,22 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_block_loc_ops = {
 	.xlo_fill_value_buf	= ocfs2_xa_block_fill_value_buf,
 };
 
+static int ocfs2_xa_bucket_journal_access(handle_t *handle,
+					  struct ocfs2_xa_loc *loc, int type)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+
+	return ocfs2_xattr_bucket_journal_access(handle, bucket, type);
+}
+
+static void ocfs2_xa_bucket_journal_dirty(handle_t *handle,
+					  struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
+
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
+}
+
 static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
 					    int offset)
 {
@@ -1772,8 +1834,8 @@ static void *ocfs2_xa_bucket_offset_pointer(struct ocfs2_xa_loc *loc,
 	int block, block_offset;
 
 	/* The header is at the front of the bucket */
-	block = offset >> bucket->bu_inode->i_sb->s_blocksize_bits;
-	block_offset = offset % bucket->bu_inode->i_sb->s_blocksize;
+	block = offset >> loc->xl_inode->i_sb->s_blocksize_bits;
+	block_offset = offset % loc->xl_inode->i_sb->s_blocksize;
 
 	return bucket_block(bucket, block) + block_offset;
 }
@@ -1813,8 +1875,7 @@ static int ocfs2_xa_bucket_check_space(struct ocfs2_xa_loc *loc,
 	int free_start = ocfs2_xa_get_free_start(loc);
 	int needed_space = ocfs2_xi_entry_usage(xi);
 	int size = namevalue_size_xi(xi);
-	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
-	struct super_block *sb = bucket->bu_inode->i_sb;
+	struct super_block *sb = loc->xl_inode->i_sb;
 
 	/*
 	 * Bucket storage does not reclaim name+value pairs it cannot
@@ -1896,8 +1957,7 @@ static void ocfs2_xa_bucket_add_namevalue(struct ocfs2_xa_loc *loc, int size)
 {
 	int free_start = ocfs2_xa_get_free_start(loc);
 	struct ocfs2_xattr_header *xh = loc->xl_header;
-	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
-	struct super_block *sb = bucket->bu_inode->i_sb;
+	struct super_block *sb = loc->xl_inode->i_sb;
 	int nameval_offset;
 
 	free_start = ocfs2_bucket_align_free_start(sb, free_start, size);
@@ -1912,7 +1972,7 @@ static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
 					   struct ocfs2_xattr_value_buf *vb)
 {
 	struct ocfs2_xattr_bucket *bucket = loc->xl_storage;
-	struct super_block *sb = bucket->bu_inode->i_sb;
+	struct super_block *sb = loc->xl_inode->i_sb;
 	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
 	int size = namevalue_size_xe(loc->xl_entry);
 	int block_offset = nameval_offset >> sb->s_blocksize_bits;
@@ -1929,6 +1989,8 @@ static void ocfs2_xa_bucket_fill_value_buf(struct ocfs2_xa_loc *loc,
 
 /* Operations for xattrs stored in buckets. */
 static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
+	.xlo_journal_access	= ocfs2_xa_bucket_journal_access,
+	.xlo_journal_dirty	= ocfs2_xa_bucket_journal_dirty,
 	.xlo_offset_pointer	= ocfs2_xa_bucket_offset_pointer,
 	.xlo_check_space	= ocfs2_xa_bucket_check_space,
 	.xlo_can_reuse		= ocfs2_xa_bucket_can_reuse,
@@ -2049,6 +2111,7 @@ static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
 {
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
+	loc->xl_inode = inode;
 	loc->xl_ops = &ocfs2_xa_block_loc_ops;
 	loc->xl_storage = bh;
 	loc->xl_entry = entry;
@@ -2065,6 +2128,7 @@ static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
 }
 
 static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
+					  struct inode *inode,
 					  struct buffer_head *bh,
 					  struct ocfs2_xattr_entry *entry)
 {
@@ -2073,6 +2137,7 @@ static void ocfs2_init_xattr_block_xa_loc(struct ocfs2_xa_loc *loc,
 
 	BUG_ON(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED);
 
+	loc->xl_inode = inode;
 	loc->xl_ops = &ocfs2_xa_block_loc_ops;
 	loc->xl_storage = bh;
 	loc->xl_header = &(xb->xb_attrs.xb_header);
@@ -2085,6 +2150,7 @@ static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
 					   struct ocfs2_xattr_bucket *bucket,
 					   struct ocfs2_xattr_entry *entry)
 {
+	loc->xl_inode = bucket->bu_inode;
 	loc->xl_ops = &ocfs2_xa_bucket_loc_ops;
 	loc->xl_storage = bucket;
 	loc->xl_header = bucket_xh(bucket);
@@ -2226,21 +2292,18 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		goto out;
 	}
 
-	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
 	if (xs->xattr_bh == xs->inode_bh)
 		ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
 					 xs->not_found ? NULL : xs->here);
 	else
-		ocfs2_init_xattr_block_xa_loc(&loc, xs->xattr_bh,
+		ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
 					      xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_journal_access(handle, &loc,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
 	/*
 	 * Prepare our entry and insert the inline value.  This will
@@ -2258,13 +2321,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	ocfs2_xa_store_inline_value(&loc, xi);
 	xs->here = loc.xl_entry;
 
-	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
+	ocfs2_xa_journal_dirty(handle, &loc);
 
 	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
 	    (flag & OCFS2_INLINE_XATTR_FL)) {
@@ -5325,15 +5382,15 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 		}
 	}
 
-	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
-						OCFS2_JOURNAL_ACCESS_WRITE);
+	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+				       xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_journal_access(handle, &loc,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
-				       xs->not_found ? NULL : xs->here);
 	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash);
 	if (ret) {
 		if (ret != -ENOSPC)
@@ -5345,7 +5402,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 	ocfs2_xa_store_inline_value(&loc, xi);
 	xs->here = loc.xl_entry;
 
-	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
+	ocfs2_xa_journal_dirty(handle, &loc);
 
 out:
 	return ret;
-- 
cgit v1.2.3


From 73857ee0b548017f9632a0d0e6fe2dabbdc11d31 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 18 Aug 2009 20:26:41 -0700
Subject: ocfs2: Allocation in ocfs2_xa_prepare_entry(), values in
 ocfs2_xa_store_value()

ocfs2_xa_prepare_entry() gets all the logic to add, remove, or modify
external value trees.  Now, when it exits, the entry is ready to receive
a value of any size.

ocfs2_xa_remove() is added to handle the complete removal of an entry.
It truncates the external value tree before calling
ocfs2_xa_remove_entry().

ocfs2_xa_store_inline_value() becomes ocfs2_xa_store_value().  It can
store any value.

ocfs2_xattr_set_entry() loses all the allocation logic and just uses
these functions.  ocfs2_xattr_set_value_outside() disappears.

ocfs2_xattr_set_in_bucket() uses these functions and makes
ocfs2_xattr_set_entry_in_bucket() obsolete.  That goes away, as does
ocfs2_xattr_bucket_set_value_outside() and
ocfs2_xattr_bucket_value_truncate().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 661 ++++++++++++++++---------------------------------------
 1 file changed, 186 insertions(+), 475 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fbec11610223..550a3e86c971 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -573,24 +573,6 @@ static u32 ocfs2_xattr_name_hash(struct inode *inode,
 	return hash;
 }
 
-/*
- * ocfs2_xattr_hash_entry()
- *
- * Compute the hash of an extended attribute.
- */
-static void ocfs2_xattr_hash_entry(struct inode *inode,
-				   struct ocfs2_xattr_header *header,
-				   struct ocfs2_xattr_entry *entry)
-{
-	u32 hash = 0;
-	char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
-
-	hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
-	entry->xe_name_hash = cpu_to_le32(hash);
-
-	return;
-}
-
 static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
 {
 	return namevalue_size(name_len, value_len) +
@@ -1423,113 +1405,6 @@ out:
 	return ret;
 }
 
-static int ocfs2_xattr_cleanup(struct inode *inode,
-			       handle_t *handle,
-			       struct ocfs2_xattr_info *xi,
-			       struct ocfs2_xattr_search *xs,
-			       struct ocfs2_xattr_value_buf *vb,
-			       size_t offs)
-{
-	int ret = 0;
-	void *val = xs->base + offs;
-	size_t size = namevalue_size_xi(xi);
-
-	ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-			    OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-	/* Decrease xattr count */
-	le16_add_cpu(&xs->header->xh_count, -1);
-	/* Remove the xattr entry and tree root which has already be set*/
-	memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
-	memset(val, 0, size);
-
-	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (ret < 0)
-		mlog_errno(ret);
-out:
-	return ret;
-}
-
-static int ocfs2_xattr_update_entry(struct inode *inode,
-				    handle_t *handle,
-				    struct ocfs2_xattr_info *xi,
-				    struct ocfs2_xattr_search *xs,
-				    struct ocfs2_xattr_value_buf *vb,
-				    size_t offs)
-{
-	int ret;
-
-	ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-			    OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	xs->here->xe_name_offset = cpu_to_le16(offs);
-	xs->here->xe_value_size = cpu_to_le64(xi->xi_value_len);
-	if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE)
-		ocfs2_xattr_set_local(xs->here, 1);
-	else
-		ocfs2_xattr_set_local(xs->here, 0);
-	ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
-
-	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
-	if (ret < 0)
-		mlog_errno(ret);
-out:
-	return ret;
-}
-
-/*
- * ocfs2_xattr_set_value_outside()
- *
- * Set large size value in B tree.
- */
-static int ocfs2_xattr_set_value_outside(struct inode *inode,
-					 struct ocfs2_xattr_info *xi,
-					 struct ocfs2_xattr_search *xs,
-					 struct ocfs2_xattr_set_ctxt *ctxt,
-					 struct ocfs2_xattr_value_buf *vb,
-					 size_t offs)
-{
-	void *val = xs->base + offs;
-	struct ocfs2_xattr_value_root *xv = NULL;
-	size_t size = namevalue_size_xi(xi);
-	int ret = 0;
-
-	memset(val, 0, size);
-	memcpy(val, xi->xi_name, xi->xi_name_len);
-	xv = (struct ocfs2_xattr_value_root *)
-		(val + OCFS2_XATTR_SIZE(xi->xi_name_len));
-	xv->xr_clusters = 0;
-	xv->xr_last_eb_blk = 0;
-	xv->xr_list.l_tree_depth = 0;
-	xv->xr_list.l_count = cpu_to_le16(1);
-	xv->xr_list.l_next_free_rec = 0;
-	vb->vb_xv = xv;
-
-	ret = ocfs2_xattr_value_truncate(inode, vb, xi->xi_value_len, ctxt);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
-	ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
-	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
-					      xi->xi_value, xi->xi_value_len);
-	if (ret < 0)
-		mlog_errno(ret);
-
-	return ret;
-}
-
 static int ocfs2_xa_check_space_helper(int needed_space, int free_start,
 				       int num_entries)
 {
@@ -1640,6 +1515,7 @@ static void ocfs2_xa_fill_value_buf(struct ocfs2_xa_loc *loc,
 	int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
 
 	/* Value bufs are for value trees */
+	BUG_ON(ocfs2_xattr_is_local(loc->xl_entry));
 	BUG_ON(namevalue_size_xe(loc->xl_entry) !=
 	       (name_size + OCFS2_XATTR_ROOT_SIZE));
 
@@ -2001,6 +1877,33 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
 	.xlo_fill_value_buf	= ocfs2_xa_bucket_fill_value_buf,
 };
 
+static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
+				   struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int trunc_rc, access_rc;
+	struct ocfs2_xattr_value_buf vb;
+
+	ocfs2_xa_fill_value_buf(loc, &vb);
+	trunc_rc = ocfs2_xattr_value_truncate(loc->xl_inode, &vb, bytes,
+					      ctxt);
+
+	/*
+	 * The caller of ocfs2_xa_value_truncate() has already called
+	 * ocfs2_xa_journal_access on the loc.  However, The truncate code
+	 * calls ocfs2_extend_trans().  This may commit the previous
+	 * transaction and open a new one.  If this is a bucket, truncate
+	 * could leave only vb->vb_bh set up for journaling.  Meanwhile,
+	 * the caller is expecting to dirty the entire bucket.  So we must
+	 * reset the journal work.  We do this even if truncate has failed,
+	 * as it could have failed after committing the extend.
+	 */
+	access_rc = ocfs2_xa_journal_access(ctxt->handle, loc,
+					    OCFS2_JOURNAL_ACCESS_WRITE);
+
+	/* Errors in truncate take precedence */
+	return trunc_rc ? trunc_rc : access_rc;
+}
+
 static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
 {
 	int index, count;
@@ -2028,6 +1931,88 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
 	}
 }
 
+static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
+			   struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+
+	if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+		rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+		if (rc) {
+			mlog_errno(rc);
+			goto out;
+		}
+	}
+
+	ocfs2_xa_remove_entry(loc);
+
+out:
+	return rc;
+}
+
+static void ocfs2_xa_install_value_root(struct ocfs2_xa_loc *loc)
+{
+	int name_size = OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len);
+	char *nameval_buf;
+
+	nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+	memcpy(nameval_buf + name_size, &def_xv, OCFS2_XATTR_ROOT_SIZE);
+}
+
+/*
+ * Take an existing entry and make it ready for the new value.  This
+ * won't allocate space, but it may free space.  It should be ready for
+ * ocfs2_xa_prepare_entry() to finish the work.
+ */
+static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi,
+				struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int rc = 0;
+	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	char *nameval_buf;
+	int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
+	int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
+
+	BUG_ON(OCFS2_XATTR_SIZE(loc->xl_entry->xe_name_len) !=
+	       name_size);
+
+	nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+	if (xe_local) {
+		memset(nameval_buf + name_size, 0,
+		       namevalue_size_xe(loc->xl_entry) - name_size);
+		if (!xi_local)
+			ocfs2_xa_install_value_root(loc);
+	} else {
+		if (xi_local) {
+			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+			if (rc < 0) {
+				mlog_errno(rc);
+				goto out;
+			}
+			memset(nameval_buf + name_size, 0,
+			       namevalue_size_xe(loc->xl_entry) -
+			       name_size);
+		} else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
+			   xi->xi_value_len) {
+			rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
+						     ctxt);
+			if (rc < 0) {
+				mlog_errno(rc);
+				goto out;
+			}
+		}
+	}
+
+	loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
+	ocfs2_xattr_set_local(loc->xl_entry, xi_local);
+
+out:
+	return rc;
+}
+
 /*
  * Prepares loc->xl_entry to receive the new xattr.  This includes
  * properly setting up the name+value pair region.  If loc->xl_entry
@@ -2040,14 +2025,13 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
  */
 static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 				  struct ocfs2_xattr_info *xi,
-				  u32 name_hash)
+				  u32 name_hash,
+				  struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int rc = 0;
-	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
-	char *nameval_buf;
 
 	if (!xi->xi_value) {
-		ocfs2_xa_remove_entry(loc);
+		rc = ocfs2_xa_remove(loc, ctxt);
 		goto out;
 	}
 
@@ -2057,15 +2041,19 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 
 	if (loc->xl_entry) {
 		if (ocfs2_xa_can_reuse_entry(loc, xi)) {
-			nameval_buf = ocfs2_xa_offset_pointer(loc,
-				le16_to_cpu(loc->xl_entry->xe_name_offset));
-			memset(nameval_buf + name_size, 0,
-			       namevalue_size_xe(loc->xl_entry) - name_size);
-			loc->xl_entry->xe_value_size =
-				cpu_to_le64(xi->xi_value_len);
-			goto out;
+			rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+			if (rc)
+				goto out;
+			goto alloc_value;
 		}
 
+		if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+			if (rc) {
+				mlog_errno(rc);
+				goto out;
+			}
+		}
 		ocfs2_xa_wipe_namevalue(loc);
 	} else
 		ocfs2_xa_add_entry(loc, name_hash);
@@ -2075,33 +2063,50 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 	 * name+value pair back from the end.
 	 */
 	ocfs2_xa_add_namevalue(loc, xi);
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
+		ocfs2_xa_install_value_root(loc);
+
+alloc_value:
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
+		if (rc < 0)
+			mlog_errno(rc);
+	}
 
 out:
 	return rc;
 }
 
 /*
- * Store the value portion of the name+value pair.  This is either an
- * inline value or the tree root of an external value.
+ * Store the value portion of the name+value pair.  This will skip
+ * values that are stored externally.  Their tree roots were set up
+ * by ocfs2_xa_prepare_entry().
  */
-static void ocfs2_xa_store_inline_value(struct ocfs2_xa_loc *loc,
-					struct ocfs2_xattr_info *xi)
+static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
+				struct ocfs2_xattr_info *xi,
+				struct ocfs2_xattr_set_ctxt *ctxt)
 {
+	int rc = 0;
 	int nameval_offset = le16_to_cpu(loc->xl_entry->xe_name_offset);
 	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
-	int inline_value_size = namevalue_size_xi(xi) - name_size;
-	const void *value = xi->xi_value;
 	char *nameval_buf;
+	struct ocfs2_xattr_value_buf vb;
 
 	if (!xi->xi_value)
-		return;
+		goto out;
 
-	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
-		value = &def_xv;
-		inline_value_size = OCFS2_XATTR_ROOT_SIZE;
-	}
 	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
-	memcpy(nameval_buf + name_size, value, inline_value_size);
+	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		ocfs2_xa_fill_value_buf(loc, &vb);
+		rc = __ocfs2_xattr_set_value_outside(loc->xl_inode,
+						     ctxt->handle, &vb,
+						     xi->xi_value,
+						     xi->xi_value_len);
+	} else
+		memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
+
+out:
+	return rc;
 }
 
 static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
@@ -2174,117 +2179,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				 struct ocfs2_xattr_set_ctxt *ctxt,
 				 int flag)
 {
-	struct ocfs2_xattr_entry *last;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	size_t min_offs = xs->end - xs->base;
-	size_t size_l = 0;
 	handle_t *handle = ctxt->handle;
-	int free, i, ret;
+	int ret;
 	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
 					      xi->xi_name_len);
 	struct ocfs2_xa_loc loc;
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_bh = xs->xattr_bh,
-		.vb_access = ocfs2_journal_access_di,
-	};
 
-	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+	if (!(flag & OCFS2_INLINE_XATTR_FL))
 		BUG_ON(xs->xattr_bh == xs->inode_bh);
-		vb.vb_access = ocfs2_journal_access_xb;
-	} else
+	else
 		BUG_ON(xs->xattr_bh != xs->inode_bh);
 
-	/* Compute min_offs, last and free space. */
-	last = xs->header->xh_entries;
-
-	for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
-		size_t offs = le16_to_cpu(last->xe_name_offset);
-		if (offs < min_offs)
-			min_offs = offs;
-		last += 1;
-	}
-
-	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
-	if (free < 0)
-		return -EIO;
-
-	if (!xs->not_found)
-		free += ocfs2_xe_entry_usage(xs->here);
-
-	/* Check free space in inode or block */
-	if (xi->xi_value && (free < ocfs2_xi_entry_usage(xi))) {
-		ret = -ENOSPC;
-		goto out;
-	}
-
-	if (!xs->not_found) {
-		/* For existing extended attribute */
-		size_t size = namevalue_size_xe(xs->here);
-		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-		void *val = xs->base + offs;
-
-		if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
-			/* Replace existing local xattr with tree root */
-			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
-							    ctxt, &vb, offs);
-			if (ret < 0)
-				mlog_errno(ret);
-			goto out;
-		} else if (!ocfs2_xattr_is_local(xs->here)) {
-			/* For existing xattr which has value outside */
-			vb.vb_xv = (struct ocfs2_xattr_value_root *)
-				(val + OCFS2_XATTR_SIZE(xi->xi_name_len));
-
-			if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
-				/*
-				 * If new value need set outside also,
-				 * first truncate old value to new value,
-				 * then set new value with set_value_outside().
-				 */
-				ret = ocfs2_xattr_value_truncate(inode,
-							&vb,
-							xi->xi_value_len,
-							ctxt);
-				if (ret < 0) {
-					mlog_errno(ret);
-					goto out;
-				}
-
-				ret = ocfs2_xattr_update_entry(inode,
-							       handle,
-							       xi,
-							       xs,
-							       &vb,
-							       offs);
-				if (ret < 0) {
-					mlog_errno(ret);
-					goto out;
-				}
-
-				ret = __ocfs2_xattr_set_value_outside(inode,
-							handle,
-							&vb,
-							xi->xi_value,
-							xi->xi_value_len);
-				if (ret < 0)
-					mlog_errno(ret);
-				goto out;
-			} else {
-				/*
-				 * If new value need set in local,
-				 * just trucate old value to zero.
-				 */
-				 ret = ocfs2_xattr_value_truncate(inode,
-								  &vb,
-								  0,
-								  ctxt);
-				if (ret < 0)
-					mlog_errno(ret);
-			}
-		}
-	}
-
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
@@ -2305,22 +2212,20 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		goto out;
 	}
 
-	/*
-	 * Prepare our entry and insert the inline value.  This will
-	 * be a value tree root for values that are larger than
-	 * OCFS2_XATTR_INLINE_SIZE.
-	 */
-	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash);
+	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash, ctxt);
 	if (ret) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
 		goto out;
 	}
-	/* XXX For now, until we make ocfs2_xa_prepare_entry() primary */
-	BUG_ON(ret == -ENOSPC);
-	ocfs2_xa_store_inline_value(&loc, xi);
 	xs->here = loc.xl_entry;
 
+	ret = ocfs2_xa_store_value(&loc, xi, ctxt);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ocfs2_xa_journal_dirty(handle, &loc);
 
 	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
@@ -2352,28 +2257,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	if (ret < 0)
 		mlog_errno(ret);
 
-	if (!ret && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
-		/*
-		 * Set value outside in B tree.
-		 * This is the second step for value size > INLINE_SIZE.
-		 */
-		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
-						    &vb, offs);
-		if (ret < 0) {
-			int ret2;
-
-			mlog_errno(ret);
-			/*
-			 * If set value outside failed, we have to clean
-			 * the junk tree root we have already set in local.
-			 */
-			ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
-						   xi, xs, &vb, offs);
-			if (ret2 < 0)
-				mlog_errno(ret2);
-		}
-	}
 out:
 	return ret;
 }
@@ -5353,61 +5236,6 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
 	return bucket_block(bucket, block_off) + offs;
 }
 
-/*
- * Set the xattr entry in the specified bucket.
- * The bucket is indicated by xs->bucket and it should have the enough
- * space for the xattr insertion.
- */
-static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
-					   handle_t *handle,
-					   struct ocfs2_xattr_info *xi,
-					   struct ocfs2_xattr_search *xs,
-					   u32 name_hash)
-{
-	int ret;
-	u64 blkno;
-	struct ocfs2_xa_loc loc;
-
-	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
-	     (unsigned long)xi->xi_value_len, xi->xi_name_index,
-	     (unsigned long long)bucket_blkno(xs->bucket));
-
-	if (!xs->bucket->bu_bhs[1]) {
-		blkno = bucket_blkno(xs->bucket);
-		ocfs2_xattr_bucket_relse(xs->bucket);
-		ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
-				       xs->not_found ? NULL : xs->here);
-	ret = ocfs2_xa_journal_access(handle, &loc,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash);
-	if (ret) {
-		if (ret != -ENOSPC)
-			mlog_errno(ret);
-		goto out;
-	}
-	/* XXX For now, until we make ocfs2_xa_prepare_entry() primary */
-	BUG_ON(ret == -ENOSPC);
-	ocfs2_xa_store_inline_value(&loc, xi);
-	xs->here = loc.xl_entry;
-
-	ocfs2_xa_journal_dirty(handle, &loc);
-
-out:
-	return ret;
-}
-
 /*
  * Truncate the specified xe_off entry in xattr bucket.
  * bucket is indicated by header_bh and len is the new length.
@@ -5478,66 +5306,6 @@ out:
 	return ret;
 }
 
-static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
-					struct ocfs2_xattr_search *xs,
-					int len,
-					struct ocfs2_xattr_set_ctxt *ctxt)
-{
-	int ret, offset;
-	struct ocfs2_xattr_entry *xe = xs->here;
-	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
-
-	BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
-
-	offset = xe - xh->xh_entries;
-	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
-						offset, len, ctxt);
-	if (ret)
-		mlog_errno(ret);
-
-	return ret;
-}
-
-static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
-						handle_t *handle,
-						struct ocfs2_xattr_search *xs,
-						char *val,
-						int value_len)
-{
-	int ret, offset, block_off;
-	struct ocfs2_xattr_value_root *xv;
-	struct ocfs2_xattr_entry *xe = xs->here;
-	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
-	void *base;
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_access = ocfs2_journal_access,
-	};
-
-	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
-
-	ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
-						xe - xh->xh_entries,
-						&block_off,
-						&offset);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	base = bucket_block(xs->bucket, block_off);
-	xv = (struct ocfs2_xattr_value_root *)(base + offset +
-		 OCFS2_XATTR_SIZE(xe->xe_name_len));
-
-	vb.vb_xv = xv;
-	vb.vb_bh = xs->bucket->bu_bhs[block_off];
-	ret = __ocfs2_xattr_set_value_outside(inode, handle,
-					      &vb, val, value_len);
-	if (ret)
-		mlog_errno(ret);
-out:
-	return ret;
-}
-
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
 				  struct buffer_head *root_bh,
 				  u64 blkno,
@@ -5636,41 +5404,8 @@ out:
 	return ret;
 }
 
-static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
-					 handle_t *handle,
-					 struct ocfs2_xattr_search *xs)
-{
-	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
-	struct ocfs2_xattr_entry *last = &xh->xh_entries[
-						le16_to_cpu(xh->xh_count) - 1];
-	int ret = 0;
-
-	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
-						OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		return;
-	}
-
-	/* Remove the old entry. */
-	memmove(xs->here, xs->here + 1,
-		(void *)last - (void *)xs->here);
-	memset(last, 0, sizeof(struct ocfs2_xattr_entry));
-	le16_add_cpu(&xh->xh_count, -1);
-
-	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-}
-
 /*
  * Set the xattr name/value in the bucket specified in xs.
- *
- * As the new value in xi may be stored in the bucket or in an outside cluster,
- * we divide the whole process into 3 steps:
- * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
- * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
- * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
- * 4. If the clusters for the new outside value can't be allocated, we need
- *    to free the xattr we allocated in set.
  */
 static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 				     struct ocfs2_xattr_info *xi,
@@ -5678,70 +5413,46 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 				     struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret;
-	size_t value_len;
-	char *val = (char *)xi->xi_value;
-	struct ocfs2_xattr_entry *xe = xs->here;
+	u64 blkno;
+	struct ocfs2_xa_loc loc;
 	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
 					      xi->xi_name_len);
 
-	value_len = xi->xi_value_len;
-	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
-		/*
-		 * We need to truncate the xattr storage first.
-		 *
-		 * If both the old and new value are stored to
-		 * outside block, we only need to truncate
-		 * the storage and then set the value outside.
-		 *
-		 * If the new value should be stored within block,
-		 * we should free all the outside block first and
-		 * the modification to the xattr block will be done
-		 * by following steps.
-		 */
-		if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE)
-			value_len = 0;
-
-		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-							   value_len,
-							   ctxt);
-		if (ret)
+	if (!xs->bucket->bu_bhs[1]) {
+		blkno = bucket_blkno(xs->bucket);
+		ocfs2_xattr_bucket_relse(xs->bucket);
+		ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
+		if (ret) {
+			mlog_errno(ret);
 			goto out;
-
-		if (value_len)
-			goto set_value_outside;
+		}
 	}
 
-	/* So we have to handle the inside block change now. */
-	ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
-					      name_hash);
-	if (ret) {
+	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+				       xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_journal_access(ctxt->handle, &loc,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	if (xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE)
+	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash, ctxt);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
 		goto out;
+	}
+	xs->here = loc.xl_entry;
 
-	/* allocate the space now for the outside block storage. */
-	ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-						   value_len, ctxt);
+	ret = ocfs2_xa_store_value(&loc, xi, ctxt);
 	if (ret) {
 		mlog_errno(ret);
-
-		if (xs->not_found) {
-			/*
-			 * We can't allocate enough clusters for outside
-			 * storage and we have allocated xattr already,
-			 * so need to remove it.
-			 */
-			ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
-		}
 		goto out;
 	}
 
-set_value_outside:
-	ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
-						   xs, val, value_len);
+	ocfs2_xa_journal_dirty(ctxt->handle, &loc);
+
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From bca5e9bd1eb2a9422a2ff29e822a956310653754 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 18 Aug 2009 20:40:14 -0700
Subject: ocfs2: Gell into ocfs2_xa_set()

ocfs2_xa_set() wraps the ocfs2_xa_prepare_entry()/ocfs2_xa_store_value()
logic.  Both callers can now use the same routine.  ocfs2_xa_remove()
moves directly into ocfs2_xa_set().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 88 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 42 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 550a3e86c971..98c18fbfc289 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2017,8 +2017,6 @@ out:
  * Prepares loc->xl_entry to receive the new xattr.  This includes
  * properly setting up the name+value pair region.  If loc->xl_entry
  * already exists, it will take care of modifying it appropriately.
- * This also includes deleting entries, but don't call this to remove
- * a non-existant entry.  That's just a bug.
  *
  * Note that this modifies the data.  You did journal_access already,
  * right?
@@ -2030,11 +2028,6 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 {
 	int rc = 0;
 
-	if (!xi->xi_value) {
-		rc = ocfs2_xa_remove(loc, ctxt);
-		goto out;
-	}
-
 	rc = ocfs2_xa_check_space(loc, xi);
 	if (rc)
 		goto out;
@@ -2092,9 +2085,6 @@ static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
 	char *nameval_buf;
 	struct ocfs2_xattr_value_buf vb;
 
-	if (!xi->xi_value)
-		goto out;
-
 	nameval_buf = ocfs2_xa_offset_pointer(loc, nameval_offset);
 	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
 		ocfs2_xa_fill_value_buf(loc, &vb);
@@ -2105,10 +2095,49 @@ static int ocfs2_xa_store_value(struct ocfs2_xa_loc *loc,
 	} else
 		memcpy(nameval_buf + name_size, xi->xi_value, xi->xi_value_len);
 
-out:
 	return rc;
 }
 
+static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
+			struct ocfs2_xattr_info *xi,
+			struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	u32 name_hash = ocfs2_xattr_name_hash(loc->xl_inode, xi->xi_name,
+					      xi->xi_name_len);
+
+	ret = ocfs2_xa_journal_access(ctxt->handle, loc,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Don't worry, we are never called with !xi_value and !xl_entry */
+	if (!xi->xi_value) {
+		ret = ocfs2_xa_remove(loc, ctxt);
+		goto out;
+	}
+
+	ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xa_store_value(loc, xi, ctxt);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_xa_journal_dirty(ctxt->handle, loc);
+
+out:
+	return ret;
+}
+
 static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
 				     struct inode *inode,
 				     struct buffer_head *bh,
@@ -2183,8 +2212,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	handle_t *handle = ctxt->handle;
 	int ret;
-	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
-					      xi->xi_name_len);
 	struct ocfs2_xa_loc loc;
 
 	if (!(flag & OCFS2_INLINE_XATTR_FL))
@@ -2205,14 +2232,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	else
 		ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
 					      xs->not_found ? NULL : xs->here);
-	ret = ocfs2_xa_journal_access(handle, &loc,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
 
-	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash, ctxt);
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
 	if (ret) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
@@ -2220,14 +2241,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	}
 	xs->here = loc.xl_entry;
 
-	ret = ocfs2_xa_store_value(&loc, xi, ctxt);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ocfs2_xa_journal_dirty(handle, &loc);
-
 	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
 	    (flag & OCFS2_INLINE_XATTR_FL)) {
 		struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -5415,8 +5428,6 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 	int ret;
 	u64 blkno;
 	struct ocfs2_xa_loc loc;
-	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->xi_name,
-					      xi->xi_name_len);
 
 	if (!xs->bucket->bu_bhs[1]) {
 		blkno = bucket_blkno(xs->bucket);
@@ -5430,14 +5441,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 
 	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
 				       xs->not_found ? NULL : xs->here);
-	ret = ocfs2_xa_journal_access(ctxt->handle, &loc,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_xa_prepare_entry(&loc, xi, name_hash, ctxt);
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
 	if (ret) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
@@ -5445,14 +5449,6 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 	}
 	xs->here = loc.xl_entry;
 
-	ret = ocfs2_xa_store_value(&loc, xi, ctxt);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ocfs2_xa_journal_dirty(ctxt->handle, &loc);
-
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From c5d95df5f78312c879f3058059c98a08821897a5 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 18 Aug 2009 21:03:24 -0700
Subject: ocfs2: Let ocfs2_xa_prepare_entry() do space checks.

ocfs2_xattr_set_in_bucket() doesn't need to do its own hacky space
checking.  Let's let ocfs2_xa_prepare_entry() (via ocfs2_xa_set()) do
the more accurate work.  Whenever it doesn't have space,
ocfs2_xattr_set_in_bucket() can try to get more space.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 270 +++++++++++++++++++------------------------------------
 1 file changed, 93 insertions(+), 177 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 98c18fbfc289..e7630a77a6c8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -322,14 +322,6 @@ static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
 	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
 }
 
-static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
-{
-	u16 len = sb->s_blocksize -
-		 offsetof(struct ocfs2_xattr_header, xh_entries);
-
-	return len / sizeof(struct ocfs2_xattr_entry);
-}
-
 #define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
 #define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
 #define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
@@ -5417,42 +5409,6 @@ out:
 	return ret;
 }
 
-/*
- * Set the xattr name/value in the bucket specified in xs.
- */
-static int ocfs2_xattr_set_in_bucket(struct inode *inode,
-				     struct ocfs2_xattr_info *xi,
-				     struct ocfs2_xattr_search *xs,
-				     struct ocfs2_xattr_set_ctxt *ctxt)
-{
-	int ret;
-	u64 blkno;
-	struct ocfs2_xa_loc loc;
-
-	if (!xs->bucket->bu_bhs[1]) {
-		blkno = bucket_blkno(xs->bucket);
-		ocfs2_xattr_bucket_relse(xs->bucket);
-		ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
-				       xs->not_found ? NULL : xs->here);
-	ret = ocfs2_xa_set(&loc, xi, ctxt);
-	if (ret) {
-		if (ret != -ENOSPC)
-			mlog_errno(ret);
-		goto out;
-	}
-	xs->here = loc.xl_entry;
-
-out:
-	return ret;
-}
-
 /*
  * check whether the xattr bucket is filled up with the same hash value.
  * If we want to insert the xattr with the same hash, return -ENOSPC.
@@ -5481,156 +5437,116 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 	return 0;
 }
 
-static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
-					     struct ocfs2_xattr_info *xi,
-					     struct ocfs2_xattr_search *xs,
-					     struct ocfs2_xattr_set_ctxt *ctxt)
+/*
+ * Try to set the entry in the current bucket.  If we fail, the caller
+ * will handle getting us another bucket.
+ */
+static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xs,
+					struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	struct ocfs2_xattr_header *xh;
-	struct ocfs2_xattr_entry *xe;
-	u16 count, header_size, xh_free_start;
-	int free, max_free, need, old;
-	size_t value_size = 0;
-	size_t blocksize = inode->i_sb->s_blocksize;
-	int ret, allocation = 0;
-
-	mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
-
-try_again:
-	xh = xs->header;
-	count = le16_to_cpu(xh->xh_count);
-	xh_free_start = le16_to_cpu(xh->xh_free_start);
-	header_size = sizeof(struct ocfs2_xattr_header) +
-			count * sizeof(struct ocfs2_xattr_entry);
-	max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
-		le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
-
-	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
-			"of %u which exceed block size\n",
-			(unsigned long long)bucket_blkno(xs->bucket),
-			header_size);
+	int ret;
+	struct ocfs2_xa_loc loc;
 
-	if (xi->xi_value && xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE)
-		value_size = OCFS2_XATTR_ROOT_SIZE;
-	else if (xi->xi_value)
-		value_size = OCFS2_XATTR_SIZE(xi->xi_value_len);
+	mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
 
-	if (xs->not_found)
-		need = sizeof(struct ocfs2_xattr_entry) +
-			OCFS2_XATTR_SIZE(xi->xi_name_len) + value_size;
-	else {
-		need = value_size + OCFS2_XATTR_SIZE(xi->xi_name_len);
+	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
+				       xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (!ret) {
+		xs->here = loc.xl_entry;
+		goto out;
+	}
+	if (ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		/*
-		 * We only replace the old value if the new length is smaller
-		 * than the old one. Otherwise we will allocate new space in the
-		 * bucket to store it.
-		 */
-		xe = xs->here;
-		if (ocfs2_xattr_is_local(xe))
-			old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
-		else
-			old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+	/* Ok, we need space.  Let's try defragmenting the bucket. */
+	ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+					xs->bucket);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		if (old >= value_size)
-			need = 0;
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (!ret) {
+		xs->here = loc.xl_entry;
+		goto out;
 	}
+	if (ret != -ENOSPC)
+		mlog_errno(ret);
 
-	free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
-	/*
-	 * We need to make sure the new name/value pair
-	 * can exist in the same block.
-	 */
-	if (xh_free_start % blocksize < need)
-		free -= xh_free_start % blocksize;
-
-	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
-	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
-	     " %u\n", xs->not_found,
-	     (unsigned long long)bucket_blkno(xs->bucket),
-	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
-	     le16_to_cpu(xh->xh_name_value_len));
-
-	if (free < need ||
-	    (xs->not_found &&
-	     count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
-		if (need <= max_free &&
-		    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
-			/*
-			 * We can create the space by defragment. Since only the
-			 * name/value will be moved, the xe shouldn't be changed
-			 * in xs.
-			 */
-			ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
-							xs->bucket);
-			if (ret) {
-				mlog_errno(ret);
-				goto out;
-			}
 
-			xh_free_start = le16_to_cpu(xh->xh_free_start);
-			free = xh_free_start - header_size
-				- OCFS2_XATTR_HEADER_GAP;
-			if (xh_free_start % blocksize < need)
-				free -= xh_free_start % blocksize;
+out:
+	mlog_exit(ret);
+	return ret;
+}
 
-			if (free >= need)
-				goto xattr_set;
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs,
+					     struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
 
-			mlog(0, "Can't get enough space for xattr insert by "
-			     "defragment. Need %u bytes, but we have %d, so "
-			     "allocate new bucket for it.\n", need, free);
-		}
+	mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
 
-		/*
-		 * We have to add new buckets or clusters and one
-		 * allocation should leave us enough space for insert.
-		 */
-		BUG_ON(allocation);
+	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+	if (!ret)
+		goto out;
+	if (ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		/*
-		 * We do not allow for overlapping ranges between buckets. And
-		 * the maximum number of collisions we will allow for then is
-		 * one bucket's worth, so check it here whether we need to
-		 * add a new bucket for the insert.
-		 */
-		ret = ocfs2_check_xattr_bucket_collision(inode,
-							 xs->bucket,
-							 xi->xi_name);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
+	/* Ack, need more space.  Let's try to get another bucket! */
 
-		ret = ocfs2_add_new_xattr_bucket(inode,
-						 xs->xattr_bh,
+	/*
+	 * We do not allow for overlapping ranges between buckets. And
+	 * the maximum number of collisions we will allow for then is
+	 * one bucket's worth, so check it here whether we need to
+	 * add a new bucket for the insert.
+	 */
+	ret = ocfs2_check_xattr_bucket_collision(inode,
 						 xs->bucket,
-						 ctxt);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
+						 xi->xi_name);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		/*
-		 * ocfs2_add_new_xattr_bucket() will have updated
-		 * xs->bucket if it moved, but it will not have updated
-		 * any of the other search fields.  Thus, we drop it and
-		 * re-search.  Everything should be cached, so it'll be
-		 * quick.
-		 */
-		ocfs2_xattr_bucket_relse(xs->bucket);
-		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
-						   xi->xi_name_index,
-						   xi->xi_name, xs);
-		if (ret && ret != -ENODATA)
-			goto out;
-		xs->not_found = ret;
-		allocation = 1;
-		goto try_again;
+	ret = ocfs2_add_new_xattr_bucket(inode,
+					 xs->xattr_bh,
+					 xs->bucket,
+					 ctxt);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
-xattr_set:
-	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
+	/*
+	 * ocfs2_add_new_xattr_bucket() will have updated
+	 * xs->bucket if it moved, but it will not have updated
+	 * any of the other search fields.  Thus, we drop it and
+	 * re-search.  Everything should be cached, so it'll be
+	 * quick.
+	 */
+	ocfs2_xattr_bucket_relse(xs->bucket);
+	ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+					   xi->xi_name_index,
+					   xi->xi_name, xs);
+	if (ret && ret != -ENODATA)
+		goto out;
+	xs->not_found = ret;
+
+	/* Ok, we have a new bucket, let's try again */
+	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
+	if (ret && (ret != -ENOSPC))
+		mlog_errno(ret);
+
 out:
 	mlog_exit(ret);
 	return ret;
-- 
cgit v1.2.3


From d3981544d7a4ed276966cdd58fe2f5d6e8a585d9 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 19 Aug 2009 02:13:50 -0700
Subject: ocfs2: Set xattr block entries with ocfs2_xa_set()

ocfs2_xattr_block_set() calls into ocfs2_xattr_set_entry() with just the
HAS_XATTR flag.  Most of the machinery of ocfs2_xattr_set_entry() is
skipped.  All that really happens other than the call to ocfs2_xa_set()
is making sure the HAS_XATTR flag is set on the inode.

But HAS_XATTR should be set when we also set di->i_xattr_loc.  And
that's done in ocfs2_create_xattr_block().  So let's move it there, and
then ocfs2_xattr_block_set() can just call ocfs2_xa_set().

While we're there, ocfs2_create_xattr_block() can take the set_ctxt for
a smaller argument list.  It also learns to set HAS_XATTR_FL, because it
knows for sure.  ocfs2_create_empty_xatttr_block() in the reflink path
fakes a set_ctxt to call ocfs2_create_xattr_block().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 99 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 49 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e7630a77a6c8..fb8568d1e8a1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2206,10 +2206,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	int ret;
 	struct ocfs2_xa_loc loc;
 
-	if (!(flag & OCFS2_INLINE_XATTR_FL))
-		BUG_ON(xs->xattr_bh == xs->inode_bh);
-	else
-		BUG_ON(xs->xattr_bh != xs->inode_bh);
+	BUG_ON(!(flag & OCFS2_INLINE_XATTR_FL));
+	BUG_ON(xs->xattr_bh != xs->inode_bh);
 
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2218,13 +2216,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		goto out;
 	}
 
-	if (xs->xattr_bh == xs->inode_bh)
-		ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
-					 xs->not_found ? NULL : xs->here);
-	else
-		ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
-					      xs->not_found ? NULL : xs->here);
-
+	ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
+				 xs->not_found ? NULL : xs->here);
 	ret = ocfs2_xa_set(&loc, xi, ctxt);
 	if (ret) {
 		if (ret != -ENOSPC)
@@ -2233,8 +2226,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	}
 	xs->here = loc.xl_entry;
 
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
-	    (flag & OCFS2_INLINE_XATTR_FL)) {
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
 		struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 		unsigned int xattrsize = osb->s_xattr_inline_size;
 
@@ -2254,7 +2246,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	}
 	/* Update xattr flag */
 	spin_lock(&oi->ip_lock);
-	oi->ip_dyn_features |= flag;
+	oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL;
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
@@ -2748,12 +2740,11 @@ cleanup:
 	return ret;
 }
 
-static int ocfs2_create_xattr_block(handle_t *handle,
-				    struct inode *inode,
+static int ocfs2_create_xattr_block(struct inode *inode,
 				    struct buffer_head *inode_bh,
-				    struct ocfs2_alloc_context *meta_ac,
-				    struct buffer_head **ret_bh,
-				    int indexed)
+				    struct ocfs2_xattr_set_ctxt *ctxt,
+				    int indexed,
+				    struct buffer_head **ret_bh)
 {
 	int ret;
 	u16 suballoc_bit_start;
@@ -2764,14 +2755,14 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_xattr_block *xblk;
 
-	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
+				      inode_bh, OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto end;
 	}
 
-	ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+	ret = ocfs2_claim_metadata(osb, ctxt->handle, ctxt->meta_ac, 1,
 				   &suballoc_bit_start, &num_got,
 				   &first_blkno);
 	if (ret < 0) {
@@ -2782,7 +2773,7 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 	new_bh = sb_getblk(inode->i_sb, first_blkno);
 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
 
-	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
+	ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
 				      new_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret < 0) {
@@ -2794,11 +2785,10 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 	xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
 	memset(xblk, 0, inode->i_sb->s_blocksize);
 	strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
-	xblk->xb_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
+	xblk->xb_suballoc_slot = cpu_to_le16(ctxt->meta_ac->ac_alloc_slot);
 	xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 	xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
 	xblk->xb_blkno = cpu_to_le64(first_blkno);
-
 	if (indexed) {
 		struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
 		xr->xt_clusters = cpu_to_le32(1);
@@ -2809,14 +2799,17 @@ static int ocfs2_create_xattr_block(handle_t *handle,
 		xr->xt_list.l_next_free_rec = cpu_to_le16(1);
 		xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
 	}
+	ocfs2_journal_dirty(ctxt->handle, new_bh);
 
-	ret = ocfs2_journal_dirty(handle, new_bh);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto end;
-	}
+	/* Add it to the inode */
 	di->i_xattr_loc = cpu_to_le64(first_blkno);
-	ocfs2_journal_dirty(handle, inode_bh);
+
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
+	di->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+
+	ocfs2_journal_dirty(ctxt->handle, inode_bh);
 
 	*ret_bh = new_bh;
 	new_bh = NULL;
@@ -2838,13 +2831,13 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct buffer_head *new_bh = NULL;
-	handle_t *handle = ctxt->handle;
 	struct ocfs2_xattr_block *xblk = NULL;
 	int ret;
+	struct ocfs2_xa_loc loc;
 
 	if (!xs->xattr_bh) {
-		ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
-					       ctxt->meta_ac, &new_bh, 0);
+		ret = ocfs2_create_xattr_block(inode, xs->inode_bh, ctxt,
+					       0, &new_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto end;
@@ -2860,21 +2853,25 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 
 	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
-		/* Set extended attribute into external block */
-		ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
-					    OCFS2_HAS_XATTR_FL);
-		if (!ret || ret != -ENOSPC)
-			goto end;
+		ocfs2_init_xattr_block_xa_loc(&loc, inode, xs->xattr_bh,
+					      xs->not_found ? NULL : xs->here);
 
-		ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
-		if (ret)
+		ret = ocfs2_xa_set(&loc, xi, ctxt);
+		if (!ret)
+			xs->here = loc.xl_entry;
+		else if (ret != -ENOSPC)
 			goto end;
+		else {
+			ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
+			if (ret)
+				goto end;
+		}
 	}
 
-	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
+	if (le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)
+		ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
 
 end:
-
 	return ret;
 }
 
@@ -6434,9 +6431,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
 					  int indexed)
 {
 	int ret;
-	handle_t *handle;
 	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt = {
+		.meta_ac = meta_ac,
+	};
 
 	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
 	if (ret < 0) {
@@ -6444,21 +6443,21 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
 		return ret;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
+	ctxt.handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
 		mlog_errno(ret);
 		goto out;
 	}
 
 	mlog(0, "create new xattr block for inode %llu, index = %d\n",
 	     (unsigned long long)fe_bh->b_blocknr, indexed);
-	ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
-				       meta_ac, ret_bh, indexed);
+	ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
+				       ret_bh);
 	if (ret)
 		mlog_errno(ret);
 
-	ocfs2_commit_trans(osb, handle);
+	ocfs2_commit_trans(osb, ctxt.handle);
 out:
 	ocfs2_free_alloc_context(meta_ac);
 	return ret;
-- 
cgit v1.2.3


From 139ffacebf5fe2cd9e2ae40d325a9661a679ad4f Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 19 Aug 2009 11:09:17 -0700
Subject: ocfs2: Set inline xattr entries with ocfs2_xa_set()

ocfs2_xattr_ibody_set() is the only remaining user of
ocfs2_xattr_set_entry().  ocfs2_xattr_set_entry() actually does two
things: it calls ocfs2_xa_set(), and it initializes the inline xattrs.
Initializing the inline space really belongs in its own call.

We lift the initialization to ocfs2_xattr_ibody_init(), called from
ocfs2_xattr_ibody_set() only when necessary.  Now
ocfs2_xattr_ibody_set() can call ocfs2_xa_set() directly.
ocfs2_xattr_set_entry() goes away.

Another nice fact is that ocfs2_init_dinode_xa_loc() can trust
i_xattr_inline_size.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 157 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 73 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fb8568d1e8a1..a2d912a92dd7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2137,17 +2137,13 @@ static void ocfs2_init_dinode_xa_loc(struct ocfs2_xa_loc *loc,
 {
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL));
+
 	loc->xl_inode = inode;
 	loc->xl_ops = &ocfs2_xa_block_loc_ops;
 	loc->xl_storage = bh;
 	loc->xl_entry = entry;
-
-	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
-		loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
-	else {
-		BUG_ON(entry);
-		loc->xl_size = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
-	}
+	loc->xl_size = le16_to_cpu(di->i_xattr_inline_size);
 	loc->xl_header =
 		(struct ocfs2_xattr_header *)(bh->b_data + bh->b_size -
 					      loc->xl_size);
@@ -2184,80 +2180,6 @@ static void ocfs2_init_xattr_bucket_xa_loc(struct ocfs2_xa_loc *loc,
 	loc->xl_size = OCFS2_XATTR_BUCKET_SIZE;
 }
 
-
-/*
- * ocfs2_xattr_set_entry()
- *
- * Set extended attribute entry into inode or block.
- *
- * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
- * We first insert tree root(ocfs2_xattr_value_root) like a normal value,
- * then set value in B tree with set_value_outside().
- */
-static int ocfs2_xattr_set_entry(struct inode *inode,
-				 struct ocfs2_xattr_info *xi,
-				 struct ocfs2_xattr_search *xs,
-				 struct ocfs2_xattr_set_ctxt *ctxt,
-				 int flag)
-{
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	handle_t *handle = ctxt->handle;
-	int ret;
-	struct ocfs2_xa_loc loc;
-
-	BUG_ON(!(flag & OCFS2_INLINE_XATTR_FL));
-	BUG_ON(xs->xattr_bh != xs->inode_bh);
-
-	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
-				 xs->not_found ? NULL : xs->here);
-	ret = ocfs2_xa_set(&loc, xi, ctxt);
-	if (ret) {
-		if (ret != -ENOSPC)
-			mlog_errno(ret);
-		goto out;
-	}
-	xs->here = loc.xl_entry;
-
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
-		struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-		unsigned int xattrsize = osb->s_xattr_inline_size;
-
-		/*
-		 * Adjust extent record count or inline data size
-		 * to reserve space for extended attribute.
-		 */
-		if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-			struct ocfs2_inline_data *idata = &di->id2.i_data;
-			le16_add_cpu(&idata->id_count, -xattrsize);
-		} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
-			struct ocfs2_extent_list *el = &di->id2.i_list;
-			le16_add_cpu(&el->l_count, -(xattrsize /
-					sizeof(struct ocfs2_extent_rec)));
-		}
-		di->i_xattr_inline_size = cpu_to_le16(xattrsize);
-	}
-	/* Update xattr flag */
-	spin_lock(&oi->ip_lock);
-	oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL;
-	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
-	spin_unlock(&oi->ip_lock);
-
-	ret = ocfs2_journal_dirty(handle, xs->inode_bh);
-	if (ret < 0)
-		mlog_errno(ret);
-
-out:
-	return ret;
-}
-
 /*
  * In xattr remove, if it is stored outside and refcounted, we may have
  * the chance to split the refcount tree. So need the allocators.
@@ -2653,6 +2575,55 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
 	return 0;
 }
 
+static int ocfs2_xattr_ibody_init(struct inode *inode,
+				  struct buffer_head *di_bh,
+				  struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int xattrsize = osb->s_xattr_inline_size;
+
+	if (!ocfs2_xattr_has_space_inline(inode, di)) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Adjust extent record count or inline data size
+	 * to reserve space for extended attribute.
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		struct ocfs2_inline_data *idata = &di->id2.i_data;
+		le16_add_cpu(&idata->id_count, -xattrsize);
+	} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+		struct ocfs2_extent_list *el = &di->id2.i_list;
+		le16_add_cpu(&el->l_count, -(xattrsize /
+					     sizeof(struct ocfs2_extent_rec)));
+	}
+	di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_INLINE_XATTR_FL|OCFS2_HAS_XATTR_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ret = ocfs2_journal_dirty(ctxt->handle, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
 /*
  * ocfs2_xattr_ibody_set()
  *
@@ -2664,9 +2635,10 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 				 struct ocfs2_xattr_search *xs,
 				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
+	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	int ret;
+	struct ocfs2_xa_loc loc;
 
 	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
 		return -ENOSPC;
@@ -2679,8 +2651,25 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 		}
 	}
 
-	ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
-				(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ocfs2_init_dinode_xa_loc(&loc, inode, xs->inode_bh,
+				 xs->not_found ? NULL : xs->here);
+	ret = ocfs2_xa_set(&loc, xi, ctxt);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+	xs->here = loc.xl_entry;
+
 out:
 	up_write(&oi->ip_alloc_sem);
 
-- 
cgit v1.2.3


From 399ff3a748cf4c8c853e96dd477153202636527b Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 1 Sep 2009 18:38:27 -0700
Subject: ocfs2: Handle errors while setting external xattr values.

ocfs2 can store extended attribute values as large as a single file.  It
does this using a standard ocfs2 btree for the large value.  However,
the previous code did not handle all error cases cleanly.

There are multiple problems to have.

1) We have trouble allocating space for a new xattr.  This leaves us
   with an empty xattr.
2) We overwrote an existing local xattr with a value root, and now we
   have an error allocating the storage.  This leaves us an empty xattr.
   where there used to be a value.  The value is lost.
3) We have trouble truncating a reused value.  This leaves us with the
   original entry pointing to the truncated original value.  The value
   is lost.
4) We have trouble extending the storage on a reused value.  This leaves
   us with the original value safely in place, but with more storage
   allocated when needed.

This doesn't consider storing local xattrs (values that don't require a
btree).  Those only fail when the journal fails.

Case (1) is easy.  We just remove the xattr we added.  We leak the
storage because we can't safely remove it, but otherwise everything is
happy.  We'll print a warning about the leak.

Case (4) is easy.  We still have the original value in place.  We can
just leave the extra storage attached to this xattr.  We return the
error, but the old value is untouched.  We print a warning about the
storage.

Case (2) and (3) are hard because we've lost the original values.  In
the old code, we ended up with values that could be partially read.
That's not good.  Instead, we just wipe the xattr entry and leak the
storage.  It stinks that the original value is lost, but now there isn't
a partial value to be read.  We'll print a big fat warning.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index a2d912a92dd7..d1b0d386f6d1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1869,6 +1869,17 @@ static const struct ocfs2_xa_loc_operations ocfs2_xa_bucket_loc_ops = {
 	.xlo_fill_value_buf	= ocfs2_xa_bucket_fill_value_buf,
 };
 
+static unsigned int ocfs2_xa_value_clusters(struct ocfs2_xa_loc *loc)
+{
+	struct ocfs2_xattr_value_buf vb;
+
+	if (ocfs2_xattr_is_local(loc->xl_entry))
+		return 0;
+
+	ocfs2_xa_fill_value_buf(loc, &vb);
+	return le32_to_cpu(vb.vb_xv->xr_clusters);
+}
+
 static int ocfs2_xa_value_truncate(struct ocfs2_xa_loc *loc, u64 bytes,
 				   struct ocfs2_xattr_set_ctxt *ctxt)
 {
@@ -1923,16 +1934,85 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc)
 	}
 }
 
+/*
+ * If we have a problem adjusting the size of an external value during
+ * ocfs2_xa_prepare_entry() or ocfs2_xa_remove(), we may have an xattr
+ * in an intermediate state.  For example, the value may be partially
+ * truncated.
+ *
+ * If the value tree hasn't changed, the extend/truncate went nowhere.
+ * We have nothing to do.  The caller can treat it as a straight error.
+ *
+ * If the value tree got partially truncated, we now have a corrupted
+ * extended attribute.  We're going to wipe its entry and leak the
+ * clusters.  Better to leak some storage than leave a corrupt entry.
+ *
+ * If the value tree grew, it obviously didn't grow enough for the
+ * new entry.  We're not going to try and reclaim those clusters either.
+ * If there was already an external value there (orig_clusters != 0),
+ * the new clusters are attached safely and we can just leave the old
+ * value in place.  If there was no external value there, we remove
+ * the entry.
+ *
+ * This way, the xattr block we store in the journal will be consistent.
+ * If the size change broke because of the journal, no changes will hit
+ * disk anyway.
+ */
+static void ocfs2_xa_cleanup_value_truncate(struct ocfs2_xa_loc *loc,
+					    const char *what,
+					    unsigned int orig_clusters)
+{
+	unsigned int new_clusters = ocfs2_xa_value_clusters(loc);
+	char *nameval_buf = ocfs2_xa_offset_pointer(loc,
+				le16_to_cpu(loc->xl_entry->xe_name_offset));
+
+	if (new_clusters < orig_clusters) {
+		mlog(ML_ERROR,
+		     "Partial truncate while %s xattr %.*s.  Leaking "
+		     "%u clusters and removing the entry\n",
+		     what, loc->xl_entry->xe_name_len, nameval_buf,
+		     orig_clusters - new_clusters);
+		ocfs2_xa_remove_entry(loc);
+	} else if (!orig_clusters) {
+		mlog(ML_ERROR,
+		     "Unable to allocate an external value for xattr "
+		     "%.*s safely.  Leaking %u clusters and removing the "
+		     "entry\n",
+		     loc->xl_entry->xe_name_len, nameval_buf,
+		     new_clusters - orig_clusters);
+		ocfs2_xa_remove_entry(loc);
+	} else if (new_clusters > orig_clusters)
+		mlog(ML_ERROR,
+		     "Unable to grow xattr %.*s safely.  %u new clusters "
+		     "have been added, but the value will not be "
+		     "modified\n",
+		     loc->xl_entry->xe_name_len, nameval_buf,
+		     new_clusters - orig_clusters);
+}
+
 static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
 			   struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int rc = 0;
+	unsigned int orig_clusters;
 
 	if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
 		rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
 		if (rc) {
 			mlog_errno(rc);
-			goto out;
+			/*
+			 * Since this is remove, we can return 0 if
+			 * ocfs2_xa_cleanup_value_truncate() is going to
+			 * wipe the entry anyway.  So we check the
+			 * cluster count as well.
+			 */
+			if (orig_clusters != ocfs2_xa_value_clusters(loc))
+				rc = 0;
+			ocfs2_xa_cleanup_value_truncate(loc, "removing",
+							orig_clusters);
+			if (rc)
+				goto out;
 		}
 	}
 
@@ -1963,6 +2043,7 @@ static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
 {
 	int rc = 0;
 	int name_size = OCFS2_XATTR_SIZE(xi->xi_name_len);
+	unsigned int orig_clusters;
 	char *nameval_buf;
 	int xe_local = ocfs2_xattr_is_local(loc->xl_entry);
 	int xi_local = xi->xi_value_len <= OCFS2_XATTR_INLINE_SIZE;
@@ -1978,23 +2059,27 @@ static int ocfs2_xa_reuse_entry(struct ocfs2_xa_loc *loc,
 		if (!xi_local)
 			ocfs2_xa_install_value_root(loc);
 	} else {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
 		if (xi_local) {
 			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
-			if (rc < 0) {
+			if (rc < 0)
 				mlog_errno(rc);
-				goto out;
-			}
-			memset(nameval_buf + name_size, 0,
-			       namevalue_size_xe(loc->xl_entry) -
-			       name_size);
+			else
+				memset(nameval_buf + name_size, 0,
+				       namevalue_size_xe(loc->xl_entry) -
+				       name_size);
 		} else if (le64_to_cpu(loc->xl_entry->xe_value_size) >
 			   xi->xi_value_len) {
 			rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len,
 						     ctxt);
-			if (rc < 0) {
+			if (rc < 0)
 				mlog_errno(rc);
-				goto out;
-			}
+		}
+
+		if (rc) {
+			ocfs2_xa_cleanup_value_truncate(loc, "reusing",
+							orig_clusters);
+			goto out;
 		}
 	}
 
@@ -2019,6 +2104,8 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 				  struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int rc = 0;
+	unsigned int orig_clusters;
+	__le64 orig_value_size = 0;
 
 	rc = ocfs2_xa_check_space(loc, xi);
 	if (rc)
@@ -2026,6 +2113,7 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 
 	if (loc->xl_entry) {
 		if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+			orig_value_size = loc->xl_entry->xe_value_size;
 			rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
 			if (rc)
 				goto out;
@@ -2033,9 +2121,13 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 		}
 
 		if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+			orig_clusters = ocfs2_xa_value_clusters(loc);
 			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
 			if (rc) {
 				mlog_errno(rc);
+				ocfs2_xa_cleanup_value_truncate(loc,
+								"overwriting",
+								orig_clusters);
 				goto out;
 			}
 		}
@@ -2053,9 +2145,20 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 
 alloc_value:
 	if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
 		rc = ocfs2_xa_value_truncate(loc, xi->xi_value_len, ctxt);
-		if (rc < 0)
+		if (rc < 0) {
+			/*
+			 * If we tried to grow an existing external value,
+			 * ocfs2_xa_cleanuP-value_truncate() is going to
+			 * let it stand.  We have to restore its original
+			 * value size.
+			 */
+			loc->xl_entry->xe_value_size = orig_value_size;
+			ocfs2_xa_cleanup_value_truncate(loc, "growing",
+							orig_clusters);
 			mlog_errno(rc);
+		}
 	}
 
 out:
@@ -2105,25 +2208,30 @@ static int ocfs2_xa_set(struct ocfs2_xa_loc *loc,
 		goto out;
 	}
 
+	/*
+	 * From here on out, everything is going to modify the buffer a
+	 * little.  Errors are going to leave the xattr header in a
+	 * sane state.  Thus, even with errors we dirty the sucker.
+	 */
+
 	/* Don't worry, we are never called with !xi_value and !xl_entry */
 	if (!xi->xi_value) {
 		ret = ocfs2_xa_remove(loc, ctxt);
-		goto out;
+		goto out_dirty;
 	}
 
 	ret = ocfs2_xa_prepare_entry(loc, xi, name_hash, ctxt);
 	if (ret) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
-		goto out;
+		goto out_dirty;
 	}
 
 	ret = ocfs2_xa_store_value(loc, xi, ctxt);
-	if (ret) {
+	if (ret)
 		mlog_errno(ret);
-		goto out;
-	}
 
+out_dirty:
 	ocfs2_xa_journal_dirty(ctxt->handle, loc);
 
 out:
-- 
cgit v1.2.3


From 14a437c2b67aeee2a989a5d9c7e19094355d2fed Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 4 Feb 2010 15:37:27 -0800
Subject: ocfs2_dlmfs: Add capabilities parameter.

Over time, dlmfs has added some features that were not part of the
initial ABI.  Unfortunately, some of these features are not detectable
via standard usage.  For example, Linux's default poll always returns
POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
added poll support.  Instead, we provide this list of new capabilities.

Capabilities is a read-only attribute.  We do it as a module parameter
so we can discover it whether dlmfs is built in, loaded, or even not
loaded (via modinfo).

The ABI features are local to this machine's dlmfs mount.  This is
distinct from the locking protocol, which is concerned with inter-node
interaction.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmfs.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 02bf17808bdc..77d0df42fe02 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -81,6 +81,42 @@ static const struct dlm_protocol_version user_locking_protocol = {
 	.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
 };
 
+
+/*
+ * These are the ABI capabilities of dlmfs.
+ *
+ * Over time, dlmfs has added some features that were not part of the
+ * initial ABI.  Unfortunately, some of these features are not detectable
+ * via standard usage.  For example, Linux's default poll always returns
+ * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
+ * added poll support.  Instead, we provide this list of new capabilities.
+ *
+ * Capabilities is a read-only attribute.  We do it as a module parameter
+ * so we can discover it whether dlmfs is built in, loaded, or even not
+ * loaded.
+ *
+ * The ABI features are local to this machine's dlmfs mount.  This is
+ * distinct from the locking protocol, which is concerned with inter-node
+ * interaction.
+ */
+#define DLMFS_CAPABILITIES ""
+extern int param_set_dlmfs_capabilities(const char *val,
+					struct kernel_param *kp)
+{
+	printk(KERN_ERR "%s: readonly parameter\n", kp->name);
+	return -EINVAL;
+}
+static int param_get_dlmfs_capabilities(char *buffer,
+					struct kernel_param *kp)
+{
+	return strlcpy(buffer, DLMFS_CAPABILITIES,
+		       strlen(DLMFS_CAPABILITIES) + 1);
+}
+module_param_call(capabilities, param_set_dlmfs_capabilities,
+		  param_get_dlmfs_capabilities, NULL, 0444);
+MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
+
+
 /*
  * decodes a set of open flags into a valid lock level and a set of flags.
  * returns < 0 if we have invalid flags
-- 
cgit v1.2.3


From 65b6f3403431cd43ef7b0dab679a50f770124a65 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 26 Jan 2010 22:14:20 -0800
Subject: ocfs2_dlmfs: Use poll() to signify BASTs.

o2dlm's userspace filesystem is an easy way to use the DLM from
userspace.  It is intentionally simple. For example, it does not allow
for asynchronous behavior or lock conversion.  This is intentional to
keep the interface simple.

Because there is no asynchronous notification, there is no way for a
process holding a lock to know another node needs the lock.  This is the
number one complaint of ocfs2_dlmfs users.  Turns out, we can solve this
very easily.  We add poll() support to ocfs2_dlmfs.  When a BAST is
received, the lock's file descriptor will receive POLLIN.

This is trivial to implement.  Userdlm already has an appropriate
waitqueue, and the lock knows when it is blocked.

We add the "bast" capability to tell userspace this is available.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmfs.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 77d0df42fe02..ddf55dafba2d 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -43,6 +43,7 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/backing-dev.h>
+#include <linux/poll.h>
 
 #include <asm/uaccess.h>
 
@@ -98,8 +99,12 @@ static const struct dlm_protocol_version user_locking_protocol = {
  * The ABI features are local to this machine's dlmfs mount.  This is
  * distinct from the locking protocol, which is concerned with inter-node
  * interaction.
+ *
+ * Capabilities:
+ * - bast	: POLLIN against the file descriptor of a held lock
+ *		  signifies a bast fired on the lock.
  */
-#define DLMFS_CAPABILITIES ""
+#define DLMFS_CAPABILITIES "bast"
 extern int param_set_dlmfs_capabilities(const char *val,
 					struct kernel_param *kp)
 {
@@ -215,6 +220,22 @@ static int dlmfs_file_release(struct inode *inode,
 	return 0;
 }
 
+static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
+{
+	int event = 0;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+
+	poll_wait(file, &ip->ip_lockres.l_event, wait);
+
+	spin_lock(&ip->ip_lockres.l_lock);
+	if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
+		event = POLLIN | POLLRDNORM;
+	spin_unlock(&ip->ip_lockres.l_lock);
+
+	return event;
+}
+
 static ssize_t dlmfs_file_read(struct file *filp,
 			       char __user *buf,
 			       size_t count,
@@ -585,6 +606,7 @@ static int dlmfs_fill_super(struct super_block * sb,
 static const struct file_operations dlmfs_file_operations = {
 	.open		= dlmfs_file_open,
 	.release	= dlmfs_file_release,
+	.poll		= dlmfs_file_poll,
 	.read		= dlmfs_file_read,
 	.write		= dlmfs_file_write,
 };
-- 
cgit v1.2.3


From 34a9dd7e29e9129fec40c645a03f1bbbe810e771 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 28 Jan 2010 15:00:49 -0800
Subject: ocfs2_dlmfs: Move to its own directory

We're going to remove the tie between ocfs2_dlmfs and o2dlm.
ocfs2_dlmfs doesn't belong in the fs/ocfs2/dlm directory anymore.  Here
we move it to fs/ocfs2/dlmfs.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/Makefile         |   1 +
 fs/ocfs2/dlm/Makefile     |   3 +-
 fs/ocfs2/dlm/dlmfs.c      | 710 ----------------------------------------------
 fs/ocfs2/dlm/dlmfsver.c   |  42 ---
 fs/ocfs2/dlm/dlmfsver.h   |  31 --
 fs/ocfs2/dlm/userdlm.c    | 676 -------------------------------------------
 fs/ocfs2/dlm/userdlm.h    | 113 --------
 fs/ocfs2/dlmfs/Makefile   |   5 +
 fs/ocfs2/dlmfs/dlmfs.c    | 710 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlmfs/dlmfsver.c |  42 +++
 fs/ocfs2/dlmfs/dlmfsver.h |  31 ++
 fs/ocfs2/dlmfs/userdlm.c  | 676 +++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/dlmfs/userdlm.h  | 113 ++++++++
 13 files changed, 1579 insertions(+), 1574 deletions(-)
 delete mode 100644 fs/ocfs2/dlm/dlmfs.c
 delete mode 100644 fs/ocfs2/dlm/dlmfsver.c
 delete mode 100644 fs/ocfs2/dlm/dlmfsver.h
 delete mode 100644 fs/ocfs2/dlm/userdlm.c
 delete mode 100644 fs/ocfs2/dlm/userdlm.h
 create mode 100644 fs/ocfs2/dlmfs/Makefile
 create mode 100644 fs/ocfs2/dlmfs/dlmfs.c
 create mode 100644 fs/ocfs2/dlmfs/dlmfsver.c
 create mode 100644 fs/ocfs2/dlmfs/dlmfsver.h
 create mode 100644 fs/ocfs2/dlmfs/userdlm.c
 create mode 100644 fs/ocfs2/dlmfs/userdlm.h

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 600d2d2ade11..791c0886c060 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -46,6 +46,7 @@ ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
 
+obj-$(CONFIG_OCFS2_FS) += dlmfs/
 # cluster/ is always needed when OCFS2_FS for masklog support
 obj-$(CONFIG_OCFS2_FS) += cluster/
 obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 190361375700..dcebf0d920fa 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,8 +1,7 @@
 EXTRA_CFLAGS += -Ifs/ocfs2
 
-obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
+obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
 	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
 
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
deleted file mode 100644
index ddf55dafba2d..000000000000
--- a/fs/ocfs2/dlm/dlmfs.c
+++ /dev/null
@@ -1,710 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfs.c
- *
- * Code which implements the kernel side of a minimal userspace
- * interface to our DLM. This file handles the virtual file system
- * used for communication with userspace. Credit should go to ramfs,
- * which was a template for the fs side of this module.
- *
- * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-/* Simple VFS hooks based on: */
-/*
- * Resizable simple ram filesystem for Linux.
- *
- * Copyright (C) 2000 Linus Torvalds.
- *               2000 Transmeta Corp.
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/backing-dev.h>
-#include <linux/poll.h>
-
-#include <asm/uaccess.h>
-
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlmapi.h"
-
-#include "userdlm.h"
-
-#include "dlmfsver.h"
-
-#define MLOG_MASK_PREFIX ML_DLMFS
-#include "cluster/masklog.h"
-
-#include "ocfs2_lockingver.h"
-
-static const struct super_operations dlmfs_ops;
-static const struct file_operations dlmfs_file_operations;
-static const struct inode_operations dlmfs_dir_inode_operations;
-static const struct inode_operations dlmfs_root_inode_operations;
-static const struct inode_operations dlmfs_file_inode_operations;
-static struct kmem_cache *dlmfs_inode_cache;
-
-struct workqueue_struct *user_dlm_worker;
-
-/*
- * This is the userdlmfs locking protocol version.
- *
- * See fs/ocfs2/dlmglue.c for more details on locking versions.
- */
-static const struct dlm_protocol_version user_locking_protocol = {
-	.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
-	.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-};
-
-
-/*
- * These are the ABI capabilities of dlmfs.
- *
- * Over time, dlmfs has added some features that were not part of the
- * initial ABI.  Unfortunately, some of these features are not detectable
- * via standard usage.  For example, Linux's default poll always returns
- * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
- * added poll support.  Instead, we provide this list of new capabilities.
- *
- * Capabilities is a read-only attribute.  We do it as a module parameter
- * so we can discover it whether dlmfs is built in, loaded, or even not
- * loaded.
- *
- * The ABI features are local to this machine's dlmfs mount.  This is
- * distinct from the locking protocol, which is concerned with inter-node
- * interaction.
- *
- * Capabilities:
- * - bast	: POLLIN against the file descriptor of a held lock
- *		  signifies a bast fired on the lock.
- */
-#define DLMFS_CAPABILITIES "bast"
-extern int param_set_dlmfs_capabilities(const char *val,
-					struct kernel_param *kp)
-{
-	printk(KERN_ERR "%s: readonly parameter\n", kp->name);
-	return -EINVAL;
-}
-static int param_get_dlmfs_capabilities(char *buffer,
-					struct kernel_param *kp)
-{
-	return strlcpy(buffer, DLMFS_CAPABILITIES,
-		       strlen(DLMFS_CAPABILITIES) + 1);
-}
-module_param_call(capabilities, param_set_dlmfs_capabilities,
-		  param_get_dlmfs_capabilities, NULL, 0444);
-MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
-
-
-/*
- * decodes a set of open flags into a valid lock level and a set of flags.
- * returns < 0 if we have invalid flags
- * flags which mean something to us:
- * O_RDONLY -> PRMODE level
- * O_WRONLY -> EXMODE level
- *
- * O_NONBLOCK -> LKM_NOQUEUE
- */
-static int dlmfs_decode_open_flags(int open_flags,
-				   int *level,
-				   int *flags)
-{
-	if (open_flags & (O_WRONLY|O_RDWR))
-		*level = LKM_EXMODE;
-	else
-		*level = LKM_PRMODE;
-
-	*flags = 0;
-	if (open_flags & O_NONBLOCK)
-		*flags |= LKM_NOQUEUE;
-
-	return 0;
-}
-
-static int dlmfs_file_open(struct inode *inode,
-			   struct file *file)
-{
-	int status, level, flags;
-	struct dlmfs_filp_private *fp = NULL;
-	struct dlmfs_inode_private *ip;
-
-	if (S_ISDIR(inode->i_mode))
-		BUG();
-
-	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
-		file->f_flags);
-
-	status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
-	if (status < 0)
-		goto bail;
-
-	/* We don't want to honor O_APPEND at read/write time as it
-	 * doesn't make sense for LVB writes. */
-	file->f_flags &= ~O_APPEND;
-
-	fp = kmalloc(sizeof(*fp), GFP_NOFS);
-	if (!fp) {
-		status = -ENOMEM;
-		goto bail;
-	}
-	fp->fp_lock_level = level;
-
-	ip = DLMFS_I(inode);
-
-	status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
-	if (status < 0) {
-		/* this is a strange error to return here but I want
-		 * to be able userspace to be able to distinguish a
-		 * valid lock request from one that simply couldn't be
-		 * granted. */
-		if (flags & LKM_NOQUEUE && status == -EAGAIN)
-			status = -ETXTBSY;
-		kfree(fp);
-		goto bail;
-	}
-
-	file->private_data = fp;
-bail:
-	return status;
-}
-
-static int dlmfs_file_release(struct inode *inode,
-			      struct file *file)
-{
-	int level, status;
-	struct dlmfs_inode_private *ip = DLMFS_I(inode);
-	struct dlmfs_filp_private *fp =
-		(struct dlmfs_filp_private *) file->private_data;
-
-	if (S_ISDIR(inode->i_mode))
-		BUG();
-
-	mlog(0, "close called on inode %lu\n", inode->i_ino);
-
-	status = 0;
-	if (fp) {
-		level = fp->fp_lock_level;
-		if (level != LKM_IVMODE)
-			user_dlm_cluster_unlock(&ip->ip_lockres, level);
-
-		kfree(fp);
-		file->private_data = NULL;
-	}
-
-	return 0;
-}
-
-static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
-{
-	int event = 0;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct dlmfs_inode_private *ip = DLMFS_I(inode);
-
-	poll_wait(file, &ip->ip_lockres.l_event, wait);
-
-	spin_lock(&ip->ip_lockres.l_lock);
-	if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
-		event = POLLIN | POLLRDNORM;
-	spin_unlock(&ip->ip_lockres.l_lock);
-
-	return event;
-}
-
-static ssize_t dlmfs_file_read(struct file *filp,
-			       char __user *buf,
-			       size_t count,
-			       loff_t *ppos)
-{
-	int bytes_left;
-	ssize_t readlen;
-	char *lvb_buf;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-
-	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
-		inode->i_ino, count, *ppos);
-
-	if (*ppos >= i_size_read(inode))
-		return 0;
-
-	if (!count)
-		return 0;
-
-	if (!access_ok(VERIFY_WRITE, buf, count))
-		return -EFAULT;
-
-	/* don't read past the lvb */
-	if ((count + *ppos) > i_size_read(inode))
-		readlen = i_size_read(inode) - *ppos;
-	else
-		readlen = count - *ppos;
-
-	lvb_buf = kmalloc(readlen, GFP_NOFS);
-	if (!lvb_buf)
-		return -ENOMEM;
-
-	user_dlm_read_lvb(inode, lvb_buf, readlen);
-	bytes_left = __copy_to_user(buf, lvb_buf, readlen);
-	readlen -= bytes_left;
-
-	kfree(lvb_buf);
-
-	*ppos = *ppos + readlen;
-
-	mlog(0, "read %zd bytes\n", readlen);
-	return readlen;
-}
-
-static ssize_t dlmfs_file_write(struct file *filp,
-				const char __user *buf,
-				size_t count,
-				loff_t *ppos)
-{
-	int bytes_left;
-	ssize_t writelen;
-	char *lvb_buf;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-
-	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
-		inode->i_ino, count, *ppos);
-
-	if (*ppos >= i_size_read(inode))
-		return -ENOSPC;
-
-	if (!count)
-		return 0;
-
-	if (!access_ok(VERIFY_READ, buf, count))
-		return -EFAULT;
-
-	/* don't write past the lvb */
-	if ((count + *ppos) > i_size_read(inode))
-		writelen = i_size_read(inode) - *ppos;
-	else
-		writelen = count - *ppos;
-
-	lvb_buf = kmalloc(writelen, GFP_NOFS);
-	if (!lvb_buf)
-		return -ENOMEM;
-
-	bytes_left = copy_from_user(lvb_buf, buf, writelen);
-	writelen -= bytes_left;
-	if (writelen)
-		user_dlm_write_lvb(inode, lvb_buf, writelen);
-
-	kfree(lvb_buf);
-
-	*ppos = *ppos + writelen;
-	mlog(0, "wrote %zd bytes\n", writelen);
-	return writelen;
-}
-
-static void dlmfs_init_once(void *foo)
-{
-	struct dlmfs_inode_private *ip =
-		(struct dlmfs_inode_private *) foo;
-
-	ip->ip_dlm = NULL;
-	ip->ip_parent = NULL;
-
-	inode_init_once(&ip->ip_vfs_inode);
-}
-
-static struct inode *dlmfs_alloc_inode(struct super_block *sb)
-{
-	struct dlmfs_inode_private *ip;
-
-	ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
-	if (!ip)
-		return NULL;
-
-	return &ip->ip_vfs_inode;
-}
-
-static void dlmfs_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
-}
-
-static void dlmfs_clear_inode(struct inode *inode)
-{
-	int status;
-	struct dlmfs_inode_private *ip;
-
-	if (!inode)
-		return;
-
-	mlog(0, "inode %lu\n", inode->i_ino);
-
-	ip = DLMFS_I(inode);
-
-	if (S_ISREG(inode->i_mode)) {
-		status = user_dlm_destroy_lock(&ip->ip_lockres);
-		if (status < 0)
-			mlog_errno(status);
-		iput(ip->ip_parent);
-		goto clear_fields;
-	}
-
-	mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
-	/* we must be a directory. If required, lets unregister the
-	 * dlm context now. */
-	if (ip->ip_dlm)
-		user_dlm_unregister_context(ip->ip_dlm);
-clear_fields:
-	ip->ip_parent = NULL;
-	ip->ip_dlm = NULL;
-}
-
-static struct backing_dev_info dlmfs_backing_dev_info = {
-	.name		= "ocfs2-dlmfs",
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static struct inode *dlmfs_get_root_inode(struct super_block *sb)
-{
-	struct inode *inode = new_inode(sb);
-	int mode = S_IFDIR | 0755;
-	struct dlmfs_inode_private *ip;
-
-	if (inode) {
-		ip = DLMFS_I(inode);
-
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = current_fsgid();
-		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inc_nlink(inode);
-
-		inode->i_fop = &simple_dir_operations;
-		inode->i_op = &dlmfs_root_inode_operations;
-	}
-
-	return inode;
-}
-
-static struct inode *dlmfs_get_inode(struct inode *parent,
-				     struct dentry *dentry,
-				     int mode)
-{
-	struct super_block *sb = parent->i_sb;
-	struct inode * inode = new_inode(sb);
-	struct dlmfs_inode_private *ip;
-
-	if (!inode)
-		return NULL;
-
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
-	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-
-	ip = DLMFS_I(inode);
-	ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
-
-	switch (mode & S_IFMT) {
-	default:
-		/* for now we don't support anything other than
-		 * directories and regular files. */
-		BUG();
-		break;
-	case S_IFREG:
-		inode->i_op = &dlmfs_file_inode_operations;
-		inode->i_fop = &dlmfs_file_operations;
-
-		i_size_write(inode,  DLM_LVB_LEN);
-
-		user_dlm_lock_res_init(&ip->ip_lockres, dentry);
-
-		/* released at clear_inode time, this insures that we
-		 * get to drop the dlm reference on each lock *before*
-		 * we call the unregister code for releasing parent
-		 * directories. */
-		ip->ip_parent = igrab(parent);
-		BUG_ON(!ip->ip_parent);
-		break;
-	case S_IFDIR:
-		inode->i_op = &dlmfs_dir_inode_operations;
-		inode->i_fop = &simple_dir_operations;
-
-		/* directory inodes start off with i_nlink ==
-		 * 2 (for "." entry) */
-		inc_nlink(inode);
-		break;
-	}
-
-	if (parent->i_mode & S_ISGID) {
-		inode->i_gid = parent->i_gid;
-		if (S_ISDIR(mode))
-			inode->i_mode |= S_ISGID;
-	}
-
-	return inode;
-}
-
-/*
- * File creation. Allocate an inode, and we're done..
- */
-/* SMP-safe */
-static int dlmfs_mkdir(struct inode * dir,
-		       struct dentry * dentry,
-		       int mode)
-{
-	int status;
-	struct inode *inode = NULL;
-	struct qstr *domain = &dentry->d_name;
-	struct dlmfs_inode_private *ip;
-	struct dlm_ctxt *dlm;
-	struct dlm_protocol_version proto = user_locking_protocol;
-
-	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
-
-	/* verify that we have a proper domain */
-	if (domain->len >= O2NM_MAX_NAME_LEN) {
-		status = -EINVAL;
-		mlog(ML_ERROR, "invalid domain name for directory.\n");
-		goto bail;
-	}
-
-	inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
-	if (!inode) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
-	}
-
-	ip = DLMFS_I(inode);
-
-	dlm = user_dlm_register_context(domain, &proto);
-	if (IS_ERR(dlm)) {
-		status = PTR_ERR(dlm);
-		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
-		     status, domain->len, domain->name);
-		goto bail;
-	}
-	ip->ip_dlm = dlm;
-
-	inc_nlink(dir);
-	d_instantiate(dentry, inode);
-	dget(dentry);	/* Extra count - pin the dentry in core */
-
-	status = 0;
-bail:
-	if (status < 0)
-		iput(inode);
-	return status;
-}
-
-static int dlmfs_create(struct inode *dir,
-			struct dentry *dentry,
-			int mode,
-			struct nameidata *nd)
-{
-	int status = 0;
-	struct inode *inode;
-	struct qstr *name = &dentry->d_name;
-
-	mlog(0, "create %.*s\n", name->len, name->name);
-
-	/* verify name is valid and doesn't contain any dlm reserved
-	 * characters */
-	if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
-	    name->name[0] == '$') {
-		status = -EINVAL;
-		mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
-		     name->name);
-		goto bail;
-	}
-
-	inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
-	if (!inode) {
-		status = -ENOMEM;
-		mlog_errno(status);
-		goto bail;
-	}
-
-	d_instantiate(dentry, inode);
-	dget(dentry);	/* Extra count - pin the dentry in core */
-bail:
-	return status;
-}
-
-static int dlmfs_unlink(struct inode *dir,
-			struct dentry *dentry)
-{
-	int status;
-	struct inode *inode = dentry->d_inode;
-
-	mlog(0, "unlink inode %lu\n", inode->i_ino);
-
-	/* if there are no current holders, or none that are waiting
-	 * to acquire a lock, this basically destroys our lockres. */
-	status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
-	if (status < 0) {
-		mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
-		     dentry->d_name.len, dentry->d_name.name, status);
-		goto bail;
-	}
-	status = simple_unlink(dir, dentry);
-bail:
-	return status;
-}
-
-static int dlmfs_fill_super(struct super_block * sb,
-			    void * data,
-			    int silent)
-{
-	struct inode * inode;
-	struct dentry * root;
-
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = DLMFS_MAGIC;
-	sb->s_op = &dlmfs_ops;
-	inode = dlmfs_get_root_inode(sb);
-	if (!inode)
-		return -ENOMEM;
-
-	root = d_alloc_root(inode);
-	if (!root) {
-		iput(inode);
-		return -ENOMEM;
-	}
-	sb->s_root = root;
-	return 0;
-}
-
-static const struct file_operations dlmfs_file_operations = {
-	.open		= dlmfs_file_open,
-	.release	= dlmfs_file_release,
-	.poll		= dlmfs_file_poll,
-	.read		= dlmfs_file_read,
-	.write		= dlmfs_file_write,
-};
-
-static const struct inode_operations dlmfs_dir_inode_operations = {
-	.create		= dlmfs_create,
-	.lookup		= simple_lookup,
-	.unlink		= dlmfs_unlink,
-};
-
-/* this way we can restrict mkdir to only the toplevel of the fs. */
-static const struct inode_operations dlmfs_root_inode_operations = {
-	.lookup		= simple_lookup,
-	.mkdir		= dlmfs_mkdir,
-	.rmdir		= simple_rmdir,
-};
-
-static const struct super_operations dlmfs_ops = {
-	.statfs		= simple_statfs,
-	.alloc_inode	= dlmfs_alloc_inode,
-	.destroy_inode	= dlmfs_destroy_inode,
-	.clear_inode	= dlmfs_clear_inode,
-	.drop_inode	= generic_delete_inode,
-};
-
-static const struct inode_operations dlmfs_file_inode_operations = {
-	.getattr	= simple_getattr,
-};
-
-static int dlmfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
-}
-
-static struct file_system_type dlmfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ocfs2_dlmfs",
-	.get_sb		= dlmfs_get_sb,
-	.kill_sb	= kill_litter_super,
-};
-
-static int __init init_dlmfs_fs(void)
-{
-	int status;
-	int cleanup_inode = 0, cleanup_worker = 0;
-
-	dlmfs_print_version();
-
-	status = bdi_init(&dlmfs_backing_dev_info);
-	if (status)
-		return status;
-
-	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
-				sizeof(struct dlmfs_inode_private),
-				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD),
-				dlmfs_init_once);
-	if (!dlmfs_inode_cache) {
-		status = -ENOMEM;
-		goto bail;
-	}
-	cleanup_inode = 1;
-
-	user_dlm_worker = create_singlethread_workqueue("user_dlm");
-	if (!user_dlm_worker) {
-		status = -ENOMEM;
-		goto bail;
-	}
-	cleanup_worker = 1;
-
-	status = register_filesystem(&dlmfs_fs_type);
-bail:
-	if (status) {
-		if (cleanup_inode)
-			kmem_cache_destroy(dlmfs_inode_cache);
-		if (cleanup_worker)
-			destroy_workqueue(user_dlm_worker);
-		bdi_destroy(&dlmfs_backing_dev_info);
-	} else
-		printk("OCFS2 User DLM kernel interface loaded\n");
-	return status;
-}
-
-static void __exit exit_dlmfs_fs(void)
-{
-	unregister_filesystem(&dlmfs_fs_type);
-
-	flush_workqueue(user_dlm_worker);
-	destroy_workqueue(user_dlm_worker);
-
-	kmem_cache_destroy(dlmfs_inode_cache);
-
-	bdi_destroy(&dlmfs_backing_dev_info);
-}
-
-MODULE_AUTHOR("Oracle");
-MODULE_LICENSE("GPL");
-
-module_init(init_dlmfs_fs)
-module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
deleted file mode 100644
index a733b3321f83..000000000000
--- a/fs/ocfs2/dlm/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
-	printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h
deleted file mode 100644
index f35eadbed25c..000000000000
--- a/fs/ocfs2/dlm/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
-
-void dlmfs_print_version(void);
-
-#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
deleted file mode 100644
index 4cb1d3dae250..000000000000
--- a/fs/ocfs2/dlm/userdlm.c
+++ /dev/null
@@ -1,676 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * userdlm.c
- *
- * Code which implements the kernel side of a minimal userspace
- * interface to our DLM.
- *
- * Many of the functions here are pared down versions of dlmglue.c
- * functions.
- *
- * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/signal.h>
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/crc32.h>
-
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlmapi.h"
-
-#include "userdlm.h"
-
-#define MLOG_MASK_PREFIX ML_DLMFS
-#include "cluster/masklog.h"
-
-static inline int user_check_wait_flag(struct user_lock_res *lockres,
-				       int flag)
-{
-	int ret;
-
-	spin_lock(&lockres->l_lock);
-	ret = lockres->l_flags & flag;
-	spin_unlock(&lockres->l_lock);
-
-	return ret;
-}
-
-static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
-
-{
-	wait_event(lockres->l_event,
-		   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
-}
-
-static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
-
-{
-	wait_event(lockres->l_event,
-		   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
-}
-
-/* I heart container_of... */
-static inline struct dlm_ctxt *
-dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
-{
-	struct dlmfs_inode_private *ip;
-
-	ip = container_of(lockres,
-			  struct dlmfs_inode_private,
-			  ip_lockres);
-	return ip->ip_dlm;
-}
-
-static struct inode *
-user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
-{
-	struct dlmfs_inode_private *ip;
-
-	ip = container_of(lockres,
-			  struct dlmfs_inode_private,
-			  ip_lockres);
-	return &ip->ip_vfs_inode;
-}
-
-static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
-{
-	spin_lock(&lockres->l_lock);
-	lockres->l_flags &= ~USER_LOCK_BUSY;
-	spin_unlock(&lockres->l_lock);
-}
-
-#define user_log_dlm_error(_func, _stat, _lockres) do {			\
-	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "		\
-		"resource %.*s: %s\n", dlm_errname(_stat), _func,	\
-		_lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \
-} while (0)
-
-/* WARNING: This function lives in a world where the only three lock
- * levels are EX, PR, and NL. It *will* have to be adjusted when more
- * lock types are added. */
-static inline int user_highest_compat_lock_level(int level)
-{
-	int new_level = LKM_EXMODE;
-
-	if (level == LKM_EXMODE)
-		new_level = LKM_NLMODE;
-	else if (level == LKM_PRMODE)
-		new_level = LKM_PRMODE;
-	return new_level;
-}
-
-static void user_ast(void *opaque)
-{
-	struct user_lock_res *lockres = opaque;
-	struct dlm_lockstatus *lksb;
-
-	mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
-
-	spin_lock(&lockres->l_lock);
-
-	lksb = &(lockres->l_lksb);
-	if (lksb->status != DLM_NORMAL) {
-		mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
-		     lksb->status, lockres->l_namelen, lockres->l_name);
-		spin_unlock(&lockres->l_lock);
-		return;
-	}
-
-	mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
-			"Lockres %.*s, requested ivmode. flags 0x%x\n",
-			lockres->l_namelen, lockres->l_name, lockres->l_flags);
-
-	/* we're downconverting. */
-	if (lockres->l_requested < lockres->l_level) {
-		if (lockres->l_requested <=
-		    user_highest_compat_lock_level(lockres->l_blocking)) {
-			lockres->l_blocking = LKM_NLMODE;
-			lockres->l_flags &= ~USER_LOCK_BLOCKED;
-		}
-	}
-
-	lockres->l_level = lockres->l_requested;
-	lockres->l_requested = LKM_IVMODE;
-	lockres->l_flags |= USER_LOCK_ATTACHED;
-	lockres->l_flags &= ~USER_LOCK_BUSY;
-
-	spin_unlock(&lockres->l_lock);
-
-	wake_up(&lockres->l_event);
-}
-
-static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
-{
-	struct inode *inode;
-	inode = user_dlm_inode_from_user_lockres(lockres);
-	if (!igrab(inode))
-		BUG();
-}
-
-static void user_dlm_unblock_lock(struct work_struct *work);
-
-static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
-{
-	if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
-		user_dlm_grab_inode_ref(lockres);
-
-		INIT_WORK(&lockres->l_work, user_dlm_unblock_lock);
-
-		queue_work(user_dlm_worker, &lockres->l_work);
-		lockres->l_flags |= USER_LOCK_QUEUED;
-	}
-}
-
-static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
-{
-	int queue = 0;
-
-	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
-		return;
-
-	switch (lockres->l_blocking) {
-	case LKM_EXMODE:
-		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
-			queue = 1;
-		break;
-	case LKM_PRMODE:
-		if (!lockres->l_ex_holders)
-			queue = 1;
-		break;
-	default:
-		BUG();
-	}
-
-	if (queue)
-		__user_dlm_queue_lockres(lockres);
-}
-
-static void user_bast(void *opaque, int level)
-{
-	struct user_lock_res *lockres = opaque;
-
-	mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
-	     lockres->l_namelen, lockres->l_name, level);
-
-	spin_lock(&lockres->l_lock);
-	lockres->l_flags |= USER_LOCK_BLOCKED;
-	if (level > lockres->l_blocking)
-		lockres->l_blocking = level;
-
-	__user_dlm_queue_lockres(lockres);
-	spin_unlock(&lockres->l_lock);
-
-	wake_up(&lockres->l_event);
-}
-
-static void user_unlock_ast(void *opaque, enum dlm_status status)
-{
-	struct user_lock_res *lockres = opaque;
-
-	mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
-
-	if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
-		mlog(ML_ERROR, "Dlm returns status %d\n", status);
-
-	spin_lock(&lockres->l_lock);
-	/* The teardown flag gets set early during the unlock process,
-	 * so test the cancel flag to make sure that this ast isn't
-	 * for a concurrent cancel. */
-	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
-	    && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
-		lockres->l_level = LKM_IVMODE;
-	} else if (status == DLM_CANCELGRANT) {
-		/* We tried to cancel a convert request, but it was
-		 * already granted. Don't clear the busy flag - the
-		 * ast should've done this already. */
-		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
-		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
-		goto out_noclear;
-	} else {
-		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
-		/* Cancel succeeded, we want to re-queue */
-		lockres->l_requested = LKM_IVMODE; /* cancel an
-						    * upconvert
-						    * request. */
-		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
-		/* we want the unblock thread to look at it again
-		 * now. */
-		if (lockres->l_flags & USER_LOCK_BLOCKED)
-			__user_dlm_queue_lockres(lockres);
-	}
-
-	lockres->l_flags &= ~USER_LOCK_BUSY;
-out_noclear:
-	spin_unlock(&lockres->l_lock);
-
-	wake_up(&lockres->l_event);
-}
-
-static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
-{
-	struct inode *inode;
-	inode = user_dlm_inode_from_user_lockres(lockres);
-	iput(inode);
-}
-
-static void user_dlm_unblock_lock(struct work_struct *work)
-{
-	int new_level, status;
-	struct user_lock_res *lockres =
-		container_of(work, struct user_lock_res, l_work);
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
-
-	mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
-
-	spin_lock(&lockres->l_lock);
-
-	mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
-			"Lockres %.*s, flags 0x%x\n",
-			lockres->l_namelen, lockres->l_name, lockres->l_flags);
-
-	/* notice that we don't clear USER_LOCK_BLOCKED here. If it's
-	 * set, we want user_ast clear it. */
-	lockres->l_flags &= ~USER_LOCK_QUEUED;
-
-	/* It's valid to get here and no longer be blocked - if we get
-	 * several basts in a row, we might be queued by the first
-	 * one, the unblock thread might run and clear the queued
-	 * flag, and finally we might get another bast which re-queues
-	 * us before our ast for the downconvert is called. */
-	if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
-		spin_unlock(&lockres->l_lock);
-		goto drop_ref;
-	}
-
-	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
-		spin_unlock(&lockres->l_lock);
-		goto drop_ref;
-	}
-
-	if (lockres->l_flags & USER_LOCK_BUSY) {
-		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
-			spin_unlock(&lockres->l_lock);
-			goto drop_ref;
-		}
-
-		lockres->l_flags |= USER_LOCK_IN_CANCEL;
-		spin_unlock(&lockres->l_lock);
-
-		status = dlmunlock(dlm,
-				   &lockres->l_lksb,
-				   LKM_CANCEL,
-				   user_unlock_ast,
-				   lockres);
-		if (status != DLM_NORMAL)
-			user_log_dlm_error("dlmunlock", status, lockres);
-		goto drop_ref;
-	}
-
-	/* If there are still incompat holders, we can exit safely
-	 * without worrying about re-queueing this lock as that will
-	 * happen on the last call to user_cluster_unlock. */
-	if ((lockres->l_blocking == LKM_EXMODE)
-	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
-		spin_unlock(&lockres->l_lock);
-		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
-			lockres->l_ro_holders, lockres->l_ex_holders);
-		goto drop_ref;
-	}
-
-	if ((lockres->l_blocking == LKM_PRMODE)
-	    && lockres->l_ex_holders) {
-		spin_unlock(&lockres->l_lock);
-		mlog(0, "can't downconvert for pr: ex = %u\n",
-			lockres->l_ex_holders);
-		goto drop_ref;
-	}
-
-	/* yay, we can downconvert now. */
-	new_level = user_highest_compat_lock_level(lockres->l_blocking);
-	lockres->l_requested = new_level;
-	lockres->l_flags |= USER_LOCK_BUSY;
-	mlog(0, "Downconvert lock from %d to %d\n",
-		lockres->l_level, new_level);
-	spin_unlock(&lockres->l_lock);
-
-	/* need lock downconvert request now... */
-	status = dlmlock(dlm,
-			 new_level,
-			 &lockres->l_lksb,
-			 LKM_CONVERT|LKM_VALBLK,
-			 lockres->l_name,
-			 lockres->l_namelen,
-			 user_ast,
-			 lockres,
-			 user_bast);
-	if (status != DLM_NORMAL) {
-		user_log_dlm_error("dlmlock", status, lockres);
-		user_recover_from_dlm_error(lockres);
-	}
-
-drop_ref:
-	user_dlm_drop_inode_ref(lockres);
-}
-
-static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
-					int level)
-{
-	switch(level) {
-	case LKM_EXMODE:
-		lockres->l_ex_holders++;
-		break;
-	case LKM_PRMODE:
-		lockres->l_ro_holders++;
-		break;
-	default:
-		BUG();
-	}
-}
-
-/* predict what lock level we'll be dropping down to on behalf
- * of another node, and return true if the currently wanted
- * level will be compatible with it. */
-static inline int
-user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
-				  int wanted)
-{
-	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
-
-	return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
-}
-
-int user_dlm_cluster_lock(struct user_lock_res *lockres,
-			  int level,
-			  int lkm_flags)
-{
-	int status, local_flags;
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
-
-	if (level != LKM_EXMODE &&
-	    level != LKM_PRMODE) {
-		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
-		     lockres->l_namelen, lockres->l_name);
-		status = -EINVAL;
-		goto bail;
-	}
-
-	mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
-	     lockres->l_namelen, lockres->l_name,
-	     (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
-	     lkm_flags);
-
-again:
-	if (signal_pending(current)) {
-		status = -ERESTARTSYS;
-		goto bail;
-	}
-
-	spin_lock(&lockres->l_lock);
-
-	/* We only compare against the currently granted level
-	 * here. If the lock is blocked waiting on a downconvert,
-	 * we'll get caught below. */
-	if ((lockres->l_flags & USER_LOCK_BUSY) &&
-	    (level > lockres->l_level)) {
-		/* is someone sitting in dlm_lock? If so, wait on
-		 * them. */
-		spin_unlock(&lockres->l_lock);
-
-		user_wait_on_busy_lock(lockres);
-		goto again;
-	}
-
-	if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
-	    (!user_may_continue_on_blocked_lock(lockres, level))) {
-		/* is the lock is currently blocked on behalf of
-		 * another node */
-		spin_unlock(&lockres->l_lock);
-
-		user_wait_on_blocked_lock(lockres);
-		goto again;
-	}
-
-	if (level > lockres->l_level) {
-		local_flags = lkm_flags | LKM_VALBLK;
-		if (lockres->l_level != LKM_IVMODE)
-			local_flags |= LKM_CONVERT;
-
-		lockres->l_requested = level;
-		lockres->l_flags |= USER_LOCK_BUSY;
-		spin_unlock(&lockres->l_lock);
-
-		BUG_ON(level == LKM_IVMODE);
-		BUG_ON(level == LKM_NLMODE);
-
-		/* call dlm_lock to upgrade lock now */
-		status = dlmlock(dlm,
-				 level,
-				 &lockres->l_lksb,
-				 local_flags,
-				 lockres->l_name,
-				 lockres->l_namelen,
-				 user_ast,
-				 lockres,
-				 user_bast);
-		if (status != DLM_NORMAL) {
-			if ((lkm_flags & LKM_NOQUEUE) &&
-			    (status == DLM_NOTQUEUED))
-				status = -EAGAIN;
-			else {
-				user_log_dlm_error("dlmlock", status, lockres);
-				status = -EINVAL;
-			}
-			user_recover_from_dlm_error(lockres);
-			goto bail;
-		}
-
-		user_wait_on_busy_lock(lockres);
-		goto again;
-	}
-
-	user_dlm_inc_holders(lockres, level);
-	spin_unlock(&lockres->l_lock);
-
-	status = 0;
-bail:
-	return status;
-}
-
-static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
-					int level)
-{
-	switch(level) {
-	case LKM_EXMODE:
-		BUG_ON(!lockres->l_ex_holders);
-		lockres->l_ex_holders--;
-		break;
-	case LKM_PRMODE:
-		BUG_ON(!lockres->l_ro_holders);
-		lockres->l_ro_holders--;
-		break;
-	default:
-		BUG();
-	}
-}
-
-void user_dlm_cluster_unlock(struct user_lock_res *lockres,
-			     int level)
-{
-	if (level != LKM_EXMODE &&
-	    level != LKM_PRMODE) {
-		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
-		     lockres->l_namelen, lockres->l_name);
-		return;
-	}
-
-	spin_lock(&lockres->l_lock);
-	user_dlm_dec_holders(lockres, level);
-	__user_dlm_cond_queue_lockres(lockres);
-	spin_unlock(&lockres->l_lock);
-}
-
-void user_dlm_write_lvb(struct inode *inode,
-			const char *val,
-			unsigned int len)
-{
-	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
-	char *lvb = lockres->l_lksb.lvb;
-
-	BUG_ON(len > DLM_LVB_LEN);
-
-	spin_lock(&lockres->l_lock);
-
-	BUG_ON(lockres->l_level < LKM_EXMODE);
-	memcpy(lvb, val, len);
-
-	spin_unlock(&lockres->l_lock);
-}
-
-void user_dlm_read_lvb(struct inode *inode,
-		       char *val,
-		       unsigned int len)
-{
-	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
-	char *lvb = lockres->l_lksb.lvb;
-
-	BUG_ON(len > DLM_LVB_LEN);
-
-	spin_lock(&lockres->l_lock);
-
-	BUG_ON(lockres->l_level < LKM_PRMODE);
-	memcpy(val, lvb, len);
-
-	spin_unlock(&lockres->l_lock);
-}
-
-void user_dlm_lock_res_init(struct user_lock_res *lockres,
-			    struct dentry *dentry)
-{
-	memset(lockres, 0, sizeof(*lockres));
-
-	spin_lock_init(&lockres->l_lock);
-	init_waitqueue_head(&lockres->l_event);
-	lockres->l_level = LKM_IVMODE;
-	lockres->l_requested = LKM_IVMODE;
-	lockres->l_blocking = LKM_IVMODE;
-
-	/* should have been checked before getting here. */
-	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
-
-	memcpy(lockres->l_name,
-	       dentry->d_name.name,
-	       dentry->d_name.len);
-	lockres->l_namelen = dentry->d_name.len;
-}
-
-int user_dlm_destroy_lock(struct user_lock_res *lockres)
-{
-	int status = -EBUSY;
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
-
-	mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
-
-	spin_lock(&lockres->l_lock);
-	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
-		spin_unlock(&lockres->l_lock);
-		return 0;
-	}
-
-	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
-
-	while (lockres->l_flags & USER_LOCK_BUSY) {
-		spin_unlock(&lockres->l_lock);
-
-		user_wait_on_busy_lock(lockres);
-
-		spin_lock(&lockres->l_lock);
-	}
-
-	if (lockres->l_ro_holders || lockres->l_ex_holders) {
-		spin_unlock(&lockres->l_lock);
-		goto bail;
-	}
-
-	status = 0;
-	if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
-		spin_unlock(&lockres->l_lock);
-		goto bail;
-	}
-
-	lockres->l_flags &= ~USER_LOCK_ATTACHED;
-	lockres->l_flags |= USER_LOCK_BUSY;
-	spin_unlock(&lockres->l_lock);
-
-	status = dlmunlock(dlm,
-			   &lockres->l_lksb,
-			   LKM_VALBLK,
-			   user_unlock_ast,
-			   lockres);
-	if (status != DLM_NORMAL) {
-		user_log_dlm_error("dlmunlock", status, lockres);
-		status = -EINVAL;
-		goto bail;
-	}
-
-	user_wait_on_busy_lock(lockres);
-
-	status = 0;
-bail:
-	return status;
-}
-
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
-					   struct dlm_protocol_version *proto)
-{
-	struct dlm_ctxt *dlm;
-	u32 dlm_key;
-	char *domain;
-
-	domain = kmalloc(name->len + 1, GFP_NOFS);
-	if (!domain) {
-		mlog_errno(-ENOMEM);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	dlm_key = crc32_le(0, name->name, name->len);
-
-	snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
-
-	dlm = dlm_register_domain(domain, dlm_key, proto);
-	if (IS_ERR(dlm))
-		mlog_errno(PTR_ERR(dlm));
-
-	kfree(domain);
-	return dlm;
-}
-
-void user_dlm_unregister_context(struct dlm_ctxt *dlm)
-{
-	dlm_unregister_domain(dlm);
-}
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
deleted file mode 100644
index 0c3cc03c61fa..000000000000
--- a/fs/ocfs2/dlm/userdlm.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * userdlm.h
- *
- * Userspace dlm defines
- *
- * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-
-#ifndef USERDLM_H
-#define USERDLM_H
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/types.h>
-#include <linux/workqueue.h>
-
-/* user_lock_res->l_flags flags. */
-#define USER_LOCK_ATTACHED      (0x00000001) /* we have initialized
-					       * the lvb */
-#define USER_LOCK_BUSY          (0x00000002) /* we are currently in
-					       * dlm_lock */
-#define USER_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
-					      * downconvert*/
-#define USER_LOCK_IN_TEARDOWN   (0x00000008) /* we're currently
-					      * destroying this
-					      * lock. */
-#define USER_LOCK_QUEUED        (0x00000010) /* lock is on the
-					      * workqueue */
-#define USER_LOCK_IN_CANCEL     (0x00000020)
-
-struct user_lock_res {
-	spinlock_t               l_lock;
-
-	int                      l_flags;
-
-#define USER_DLM_LOCK_ID_MAX_LEN  32
-	char                     l_name[USER_DLM_LOCK_ID_MAX_LEN];
-	int                      l_namelen;
-	int                      l_level;
-	unsigned int             l_ro_holders;
-	unsigned int             l_ex_holders;
-	struct dlm_lockstatus    l_lksb;
-
-	int                      l_requested;
-	int                      l_blocking;
-
-	wait_queue_head_t        l_event;
-
-	struct work_struct       l_work;
-};
-
-extern struct workqueue_struct *user_dlm_worker;
-
-void user_dlm_lock_res_init(struct user_lock_res *lockres,
-			    struct dentry *dentry);
-int user_dlm_destroy_lock(struct user_lock_res *lockres);
-int user_dlm_cluster_lock(struct user_lock_res *lockres,
-			  int level,
-			  int lkm_flags);
-void user_dlm_cluster_unlock(struct user_lock_res *lockres,
-			     int level);
-void user_dlm_write_lvb(struct inode *inode,
-			const char *val,
-			unsigned int len);
-void user_dlm_read_lvb(struct inode *inode,
-		       char *val,
-		       unsigned int len);
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
-					   struct dlm_protocol_version *proto);
-void user_dlm_unregister_context(struct dlm_ctxt *dlm);
-
-struct dlmfs_inode_private {
-	struct dlm_ctxt             *ip_dlm;
-
-	struct user_lock_res ip_lockres; /* unused for directories. */
-	struct inode         *ip_parent;
-
-	struct inode         ip_vfs_inode;
-};
-
-static inline struct dlmfs_inode_private *
-DLMFS_I(struct inode *inode)
-{
-        return container_of(inode,
-			    struct dlmfs_inode_private,
-			    ip_vfs_inode);
-}
-
-struct dlmfs_filp_private {
-	int                  fp_lock_level;
-};
-
-#define DLMFS_MAGIC	0x76a9f425
-
-#endif /* USERDLM_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
new file mode 100644
index 000000000000..df69b4856d0d
--- /dev/null
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -0,0 +1,5 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
+
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
new file mode 100644
index 000000000000..e21ce0e5fc42
--- /dev/null
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -0,0 +1,710 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfs.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM. This file handles the virtual file system
+ * used for communication with userspace. Credit should go to ramfs,
+ * which was a template for the fs side of this module.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/* Simple VFS hooks based on: */
+/*
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/poll.h>
+
+#include <asm/uaccess.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlm/dlmapi.h"
+
+#include "userdlm.h"
+
+#include "dlmfsver.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+#include "ocfs2_lockingver.h"
+
+static const struct super_operations dlmfs_ops;
+static const struct file_operations dlmfs_file_operations;
+static const struct inode_operations dlmfs_dir_inode_operations;
+static const struct inode_operations dlmfs_root_inode_operations;
+static const struct inode_operations dlmfs_file_inode_operations;
+static struct kmem_cache *dlmfs_inode_cache;
+
+struct workqueue_struct *user_dlm_worker;
+
+/*
+ * This is the userdlmfs locking protocol version.
+ *
+ * See fs/ocfs2/dlmglue.c for more details on locking versions.
+ */
+static const struct dlm_protocol_version user_locking_protocol = {
+	.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+	.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+};
+
+
+/*
+ * These are the ABI capabilities of dlmfs.
+ *
+ * Over time, dlmfs has added some features that were not part of the
+ * initial ABI.  Unfortunately, some of these features are not detectable
+ * via standard usage.  For example, Linux's default poll always returns
+ * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
+ * added poll support.  Instead, we provide this list of new capabilities.
+ *
+ * Capabilities is a read-only attribute.  We do it as a module parameter
+ * so we can discover it whether dlmfs is built in, loaded, or even not
+ * loaded.
+ *
+ * The ABI features are local to this machine's dlmfs mount.  This is
+ * distinct from the locking protocol, which is concerned with inter-node
+ * interaction.
+ *
+ * Capabilities:
+ * - bast	: POLLIN against the file descriptor of a held lock
+ *		  signifies a bast fired on the lock.
+ */
+#define DLMFS_CAPABILITIES "bast"
+extern int param_set_dlmfs_capabilities(const char *val,
+					struct kernel_param *kp)
+{
+	printk(KERN_ERR "%s: readonly parameter\n", kp->name);
+	return -EINVAL;
+}
+static int param_get_dlmfs_capabilities(char *buffer,
+					struct kernel_param *kp)
+{
+	return strlcpy(buffer, DLMFS_CAPABILITIES,
+		       strlen(DLMFS_CAPABILITIES) + 1);
+}
+module_param_call(capabilities, param_set_dlmfs_capabilities,
+		  param_get_dlmfs_capabilities, NULL, 0444);
+MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
+
+
+/*
+ * decodes a set of open flags into a valid lock level and a set of flags.
+ * returns < 0 if we have invalid flags
+ * flags which mean something to us:
+ * O_RDONLY -> PRMODE level
+ * O_WRONLY -> EXMODE level
+ *
+ * O_NONBLOCK -> LKM_NOQUEUE
+ */
+static int dlmfs_decode_open_flags(int open_flags,
+				   int *level,
+				   int *flags)
+{
+	if (open_flags & (O_WRONLY|O_RDWR))
+		*level = LKM_EXMODE;
+	else
+		*level = LKM_PRMODE;
+
+	*flags = 0;
+	if (open_flags & O_NONBLOCK)
+		*flags |= LKM_NOQUEUE;
+
+	return 0;
+}
+
+static int dlmfs_file_open(struct inode *inode,
+			   struct file *file)
+{
+	int status, level, flags;
+	struct dlmfs_filp_private *fp = NULL;
+	struct dlmfs_inode_private *ip;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
+		file->f_flags);
+
+	status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
+	if (status < 0)
+		goto bail;
+
+	/* We don't want to honor O_APPEND at read/write time as it
+	 * doesn't make sense for LVB writes. */
+	file->f_flags &= ~O_APPEND;
+
+	fp = kmalloc(sizeof(*fp), GFP_NOFS);
+	if (!fp) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	fp->fp_lock_level = level;
+
+	ip = DLMFS_I(inode);
+
+	status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
+	if (status < 0) {
+		/* this is a strange error to return here but I want
+		 * to be able userspace to be able to distinguish a
+		 * valid lock request from one that simply couldn't be
+		 * granted. */
+		if (flags & LKM_NOQUEUE && status == -EAGAIN)
+			status = -ETXTBSY;
+		kfree(fp);
+		goto bail;
+	}
+
+	file->private_data = fp;
+bail:
+	return status;
+}
+
+static int dlmfs_file_release(struct inode *inode,
+			      struct file *file)
+{
+	int level, status;
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+	struct dlmfs_filp_private *fp =
+		(struct dlmfs_filp_private *) file->private_data;
+
+	if (S_ISDIR(inode->i_mode))
+		BUG();
+
+	mlog(0, "close called on inode %lu\n", inode->i_ino);
+
+	status = 0;
+	if (fp) {
+		level = fp->fp_lock_level;
+		if (level != LKM_IVMODE)
+			user_dlm_cluster_unlock(&ip->ip_lockres, level);
+
+		kfree(fp);
+		file->private_data = NULL;
+	}
+
+	return 0;
+}
+
+static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
+{
+	int event = 0;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct dlmfs_inode_private *ip = DLMFS_I(inode);
+
+	poll_wait(file, &ip->ip_lockres.l_event, wait);
+
+	spin_lock(&ip->ip_lockres.l_lock);
+	if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
+		event = POLLIN | POLLRDNORM;
+	spin_unlock(&ip->ip_lockres.l_lock);
+
+	return event;
+}
+
+static ssize_t dlmfs_file_read(struct file *filp,
+			       char __user *buf,
+			       size_t count,
+			       loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t readlen;
+	char *lvb_buf;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return 0;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	/* don't read past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		readlen = i_size_read(inode) - *ppos;
+	else
+		readlen = count - *ppos;
+
+	lvb_buf = kmalloc(readlen, GFP_NOFS);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	user_dlm_read_lvb(inode, lvb_buf, readlen);
+	bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+	readlen -= bytes_left;
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + readlen;
+
+	mlog(0, "read %zd bytes\n", readlen);
+	return readlen;
+}
+
+static ssize_t dlmfs_file_write(struct file *filp,
+				const char __user *buf,
+				size_t count,
+				loff_t *ppos)
+{
+	int bytes_left;
+	ssize_t writelen;
+	char *lvb_buf;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
+		inode->i_ino, count, *ppos);
+
+	if (*ppos >= i_size_read(inode))
+		return -ENOSPC;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	/* don't write past the lvb */
+	if ((count + *ppos) > i_size_read(inode))
+		writelen = i_size_read(inode) - *ppos;
+	else
+		writelen = count - *ppos;
+
+	lvb_buf = kmalloc(writelen, GFP_NOFS);
+	if (!lvb_buf)
+		return -ENOMEM;
+
+	bytes_left = copy_from_user(lvb_buf, buf, writelen);
+	writelen -= bytes_left;
+	if (writelen)
+		user_dlm_write_lvb(inode, lvb_buf, writelen);
+
+	kfree(lvb_buf);
+
+	*ppos = *ppos + writelen;
+	mlog(0, "wrote %zd bytes\n", writelen);
+	return writelen;
+}
+
+static void dlmfs_init_once(void *foo)
+{
+	struct dlmfs_inode_private *ip =
+		(struct dlmfs_inode_private *) foo;
+
+	ip->ip_dlm = NULL;
+	ip->ip_parent = NULL;
+
+	inode_init_once(&ip->ip_vfs_inode);
+}
+
+static struct inode *dlmfs_alloc_inode(struct super_block *sb)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
+	if (!ip)
+		return NULL;
+
+	return &ip->ip_vfs_inode;
+}
+
+static void dlmfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
+}
+
+static void dlmfs_clear_inode(struct inode *inode)
+{
+	int status;
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return;
+
+	mlog(0, "inode %lu\n", inode->i_ino);
+
+	ip = DLMFS_I(inode);
+
+	if (S_ISREG(inode->i_mode)) {
+		status = user_dlm_destroy_lock(&ip->ip_lockres);
+		if (status < 0)
+			mlog_errno(status);
+		iput(ip->ip_parent);
+		goto clear_fields;
+	}
+
+	mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+	/* we must be a directory. If required, lets unregister the
+	 * dlm context now. */
+	if (ip->ip_dlm)
+		user_dlm_unregister_context(ip->ip_dlm);
+clear_fields:
+	ip->ip_parent = NULL;
+	ip->ip_dlm = NULL;
+}
+
+static struct backing_dev_info dlmfs_backing_dev_info = {
+	.name		= "ocfs2-dlmfs",
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+static struct inode *dlmfs_get_root_inode(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+	int mode = S_IFDIR | 0755;
+	struct dlmfs_inode_private *ip;
+
+	if (inode) {
+		ip = DLMFS_I(inode);
+
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = current_fsgid();
+		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inc_nlink(inode);
+
+		inode->i_fop = &simple_dir_operations;
+		inode->i_op = &dlmfs_root_inode_operations;
+	}
+
+	return inode;
+}
+
+static struct inode *dlmfs_get_inode(struct inode *parent,
+				     struct dentry *dentry,
+				     int mode)
+{
+	struct super_block *sb = parent->i_sb;
+	struct inode * inode = new_inode(sb);
+	struct dlmfs_inode_private *ip;
+
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	ip = DLMFS_I(inode);
+	ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+
+	switch (mode & S_IFMT) {
+	default:
+		/* for now we don't support anything other than
+		 * directories and regular files. */
+		BUG();
+		break;
+	case S_IFREG:
+		inode->i_op = &dlmfs_file_inode_operations;
+		inode->i_fop = &dlmfs_file_operations;
+
+		i_size_write(inode,  DLM_LVB_LEN);
+
+		user_dlm_lock_res_init(&ip->ip_lockres, dentry);
+
+		/* released at clear_inode time, this insures that we
+		 * get to drop the dlm reference on each lock *before*
+		 * we call the unregister code for releasing parent
+		 * directories. */
+		ip->ip_parent = igrab(parent);
+		BUG_ON(!ip->ip_parent);
+		break;
+	case S_IFDIR:
+		inode->i_op = &dlmfs_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* directory inodes start off with i_nlink ==
+		 * 2 (for "." entry) */
+		inc_nlink(inode);
+		break;
+	}
+
+	if (parent->i_mode & S_ISGID) {
+		inode->i_gid = parent->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	}
+
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+/* SMP-safe */
+static int dlmfs_mkdir(struct inode * dir,
+		       struct dentry * dentry,
+		       int mode)
+{
+	int status;
+	struct inode *inode = NULL;
+	struct qstr *domain = &dentry->d_name;
+	struct dlmfs_inode_private *ip;
+	struct dlm_ctxt *dlm;
+	struct dlm_protocol_version proto = user_locking_protocol;
+
+	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
+
+	/* verify that we have a proper domain */
+	if (domain->len >= O2NM_MAX_NAME_LEN) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid domain name for directory.\n");
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ip = DLMFS_I(inode);
+
+	dlm = user_dlm_register_context(domain, &proto);
+	if (IS_ERR(dlm)) {
+		status = PTR_ERR(dlm);
+		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
+		     status, domain->len, domain->name);
+		goto bail;
+	}
+	ip->ip_dlm = dlm;
+
+	inc_nlink(dir);
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+
+	status = 0;
+bail:
+	if (status < 0)
+		iput(inode);
+	return status;
+}
+
+static int dlmfs_create(struct inode *dir,
+			struct dentry *dentry,
+			int mode,
+			struct nameidata *nd)
+{
+	int status = 0;
+	struct inode *inode;
+	struct qstr *name = &dentry->d_name;
+
+	mlog(0, "create %.*s\n", name->len, name->name);
+
+	/* verify name is valid and doesn't contain any dlm reserved
+	 * characters */
+	if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
+	    name->name[0] == '$') {
+		status = -EINVAL;
+		mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
+		     name->name);
+		goto bail;
+	}
+
+	inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+bail:
+	return status;
+}
+
+static int dlmfs_unlink(struct inode *dir,
+			struct dentry *dentry)
+{
+	int status;
+	struct inode *inode = dentry->d_inode;
+
+	mlog(0, "unlink inode %lu\n", inode->i_ino);
+
+	/* if there are no current holders, or none that are waiting
+	 * to acquire a lock, this basically destroys our lockres. */
+	status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
+	if (status < 0) {
+		mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
+		     dentry->d_name.len, dentry->d_name.name, status);
+		goto bail;
+	}
+	status = simple_unlink(dir, dentry);
+bail:
+	return status;
+}
+
+static int dlmfs_fill_super(struct super_block * sb,
+			    void * data,
+			    int silent)
+{
+	struct inode * inode;
+	struct dentry * root;
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = DLMFS_MAGIC;
+	sb->s_op = &dlmfs_ops;
+	inode = dlmfs_get_root_inode(sb);
+	if (!inode)
+		return -ENOMEM;
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+	return 0;
+}
+
+static const struct file_operations dlmfs_file_operations = {
+	.open		= dlmfs_file_open,
+	.release	= dlmfs_file_release,
+	.poll		= dlmfs_file_poll,
+	.read		= dlmfs_file_read,
+	.write		= dlmfs_file_write,
+};
+
+static const struct inode_operations dlmfs_dir_inode_operations = {
+	.create		= dlmfs_create,
+	.lookup		= simple_lookup,
+	.unlink		= dlmfs_unlink,
+};
+
+/* this way we can restrict mkdir to only the toplevel of the fs. */
+static const struct inode_operations dlmfs_root_inode_operations = {
+	.lookup		= simple_lookup,
+	.mkdir		= dlmfs_mkdir,
+	.rmdir		= simple_rmdir,
+};
+
+static const struct super_operations dlmfs_ops = {
+	.statfs		= simple_statfs,
+	.alloc_inode	= dlmfs_alloc_inode,
+	.destroy_inode	= dlmfs_destroy_inode,
+	.clear_inode	= dlmfs_clear_inode,
+	.drop_inode	= generic_delete_inode,
+};
+
+static const struct inode_operations dlmfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
+
+static int dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
+}
+
+static struct file_system_type dlmfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ocfs2_dlmfs",
+	.get_sb		= dlmfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+static int __init init_dlmfs_fs(void)
+{
+	int status;
+	int cleanup_inode = 0, cleanup_worker = 0;
+
+	dlmfs_print_version();
+
+	status = bdi_init(&dlmfs_backing_dev_info);
+	if (status)
+		return status;
+
+	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
+				sizeof(struct dlmfs_inode_private),
+				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+					SLAB_MEM_SPREAD),
+				dlmfs_init_once);
+	if (!dlmfs_inode_cache) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	cleanup_inode = 1;
+
+	user_dlm_worker = create_singlethread_workqueue("user_dlm");
+	if (!user_dlm_worker) {
+		status = -ENOMEM;
+		goto bail;
+	}
+	cleanup_worker = 1;
+
+	status = register_filesystem(&dlmfs_fs_type);
+bail:
+	if (status) {
+		if (cleanup_inode)
+			kmem_cache_destroy(dlmfs_inode_cache);
+		if (cleanup_worker)
+			destroy_workqueue(user_dlm_worker);
+		bdi_destroy(&dlmfs_backing_dev_info);
+	} else
+		printk("OCFS2 User DLM kernel interface loaded\n");
+	return status;
+}
+
+static void __exit exit_dlmfs_fs(void)
+{
+	unregister_filesystem(&dlmfs_fs_type);
+
+	flush_workqueue(user_dlm_worker);
+	destroy_workqueue(user_dlm_worker);
+
+	kmem_cache_destroy(dlmfs_inode_cache);
+
+	bdi_destroy(&dlmfs_backing_dev_info);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+
+module_init(init_dlmfs_fs)
+module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
new file mode 100644
index 000000000000..a733b3321f83
--- /dev/null
+++ b/fs/ocfs2/dlmfs/dlmfsver.c
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmfsver.c
+ *
+ * version string
+ *
+ * Copyright (C) 2002, 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include "dlmfsver.h"
+
+#define DLM_BUILD_VERSION "1.5.0"
+
+#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
+
+void dlmfs_print_version(void)
+{
+	printk(KERN_INFO "%s\n", VERSION_STR);
+}
+
+MODULE_DESCRIPTION(VERSION_STR);
+
+MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
new file mode 100644
index 000000000000..f35eadbed25c
--- /dev/null
+++ b/fs/ocfs2/dlmfs/dlmfsver.h
@@ -0,0 +1,31 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * dlmver.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef DLMFS_VER_H
+#define DLMFS_VER_H
+
+void dlmfs_print_version(void);
+
+#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
new file mode 100644
index 000000000000..6adae70cee8e
--- /dev/null
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -0,0 +1,676 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.c
+ *
+ * Code which implements the kernel side of a minimal userspace
+ * interface to our DLM.
+ *
+ * Many of the functions here are pared down versions of dlmglue.c
+ * functions.
+ *
+ * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/signal.h>
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+
+
+#include "cluster/nodemanager.h"
+#include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
+
+#include "dlm/dlmapi.h"
+
+#include "userdlm.h"
+
+#define MLOG_MASK_PREFIX ML_DLMFS
+#include "cluster/masklog.h"
+
+static inline int user_check_wait_flag(struct user_lock_res *lockres,
+				       int flag)
+{
+	int ret;
+
+	spin_lock(&lockres->l_lock);
+	ret = lockres->l_flags & flag;
+	spin_unlock(&lockres->l_lock);
+
+	return ret;
+}
+
+static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BUSY));
+}
+
+static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
+
+{
+	wait_event(lockres->l_event,
+		   !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
+}
+
+/* I heart container_of... */
+static inline struct dlm_ctxt *
+dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return ip->ip_dlm;
+}
+
+static struct inode *
+user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
+{
+	struct dlmfs_inode_private *ip;
+
+	ip = container_of(lockres,
+			  struct dlmfs_inode_private,
+			  ip_lockres);
+	return &ip->ip_vfs_inode;
+}
+
+static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
+{
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+}
+
+#define user_log_dlm_error(_func, _stat, _lockres) do {			\
+	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "		\
+		"resource %.*s: %s\n", dlm_errname(_stat), _func,	\
+		_lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \
+} while (0)
+
+/* WARNING: This function lives in a world where the only three lock
+ * levels are EX, PR, and NL. It *will* have to be adjusted when more
+ * lock types are added. */
+static inline int user_highest_compat_lock_level(int level)
+{
+	int new_level = LKM_EXMODE;
+
+	if (level == LKM_EXMODE)
+		new_level = LKM_NLMODE;
+	else if (level == LKM_PRMODE)
+		new_level = LKM_PRMODE;
+	return new_level;
+}
+
+static void user_ast(void *opaque)
+{
+	struct user_lock_res *lockres = opaque;
+	struct dlm_lockstatus *lksb;
+
+	mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
+	     lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	lksb = &(lockres->l_lksb);
+	if (lksb->status != DLM_NORMAL) {
+		mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
+		     lksb->status, lockres->l_namelen, lockres->l_name);
+		spin_unlock(&lockres->l_lock);
+		return;
+	}
+
+	mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
+			"Lockres %.*s, requested ivmode. flags 0x%x\n",
+			lockres->l_namelen, lockres->l_name, lockres->l_flags);
+
+	/* we're downconverting. */
+	if (lockres->l_requested < lockres->l_level) {
+		if (lockres->l_requested <=
+		    user_highest_compat_lock_level(lockres->l_blocking)) {
+			lockres->l_blocking = LKM_NLMODE;
+			lockres->l_flags &= ~USER_LOCK_BLOCKED;
+		}
+	}
+
+	lockres->l_level = lockres->l_requested;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_flags |= USER_LOCK_ATTACHED;
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	if (!igrab(inode))
+		BUG();
+}
+
+static void user_dlm_unblock_lock(struct work_struct *work);
+
+static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
+{
+	if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
+		user_dlm_grab_inode_ref(lockres);
+
+		INIT_WORK(&lockres->l_work, user_dlm_unblock_lock);
+
+		queue_work(user_dlm_worker, &lockres->l_work);
+		lockres->l_flags |= USER_LOCK_QUEUED;
+	}
+}
+
+static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
+{
+	int queue = 0;
+
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED))
+		return;
+
+	switch (lockres->l_blocking) {
+	case LKM_EXMODE:
+		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
+			queue = 1;
+		break;
+	case LKM_PRMODE:
+		if (!lockres->l_ex_holders)
+			queue = 1;
+		break;
+	default:
+		BUG();
+	}
+
+	if (queue)
+		__user_dlm_queue_lockres(lockres);
+}
+
+static void user_bast(void *opaque, int level)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
+	     lockres->l_namelen, lockres->l_name, level);
+
+	spin_lock(&lockres->l_lock);
+	lockres->l_flags |= USER_LOCK_BLOCKED;
+	if (level > lockres->l_blocking)
+		lockres->l_blocking = level;
+
+	__user_dlm_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static void user_unlock_ast(void *opaque, enum dlm_status status)
+{
+	struct user_lock_res *lockres = opaque;
+
+	mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
+	     lockres->l_name);
+
+	if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
+		mlog(ML_ERROR, "Dlm returns status %d\n", status);
+
+	spin_lock(&lockres->l_lock);
+	/* The teardown flag gets set early during the unlock process,
+	 * so test the cancel flag to make sure that this ast isn't
+	 * for a concurrent cancel. */
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
+	    && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
+		lockres->l_level = LKM_IVMODE;
+	} else if (status == DLM_CANCELGRANT) {
+		/* We tried to cancel a convert request, but it was
+		 * already granted. Don't clear the busy flag - the
+		 * ast should've done this already. */
+		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		goto out_noclear;
+	} else {
+		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
+		/* Cancel succeeded, we want to re-queue */
+		lockres->l_requested = LKM_IVMODE; /* cancel an
+						    * upconvert
+						    * request. */
+		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+		/* we want the unblock thread to look at it again
+		 * now. */
+		if (lockres->l_flags & USER_LOCK_BLOCKED)
+			__user_dlm_queue_lockres(lockres);
+	}
+
+	lockres->l_flags &= ~USER_LOCK_BUSY;
+out_noclear:
+	spin_unlock(&lockres->l_lock);
+
+	wake_up(&lockres->l_event);
+}
+
+static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
+{
+	struct inode *inode;
+	inode = user_dlm_inode_from_user_lockres(lockres);
+	iput(inode);
+}
+
+static void user_dlm_unblock_lock(struct work_struct *work)
+{
+	int new_level, status;
+	struct user_lock_res *lockres =
+		container_of(work, struct user_lock_res, l_work);
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
+	     lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+
+	mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
+			"Lockres %.*s, flags 0x%x\n",
+			lockres->l_namelen, lockres->l_name, lockres->l_flags);
+
+	/* notice that we don't clear USER_LOCK_BLOCKED here. If it's
+	 * set, we want user_ast clear it. */
+	lockres->l_flags &= ~USER_LOCK_QUEUED;
+
+	/* It's valid to get here and no longer be blocked - if we get
+	 * several basts in a row, we might be queued by the first
+	 * one, the unblock thread might run and clear the queued
+	 * flag, and finally we might get another bast which re-queues
+	 * us before our ast for the downconvert is called. */
+	if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		spin_unlock(&lockres->l_lock);
+		goto drop_ref;
+	}
+
+	if (lockres->l_flags & USER_LOCK_BUSY) {
+		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+			spin_unlock(&lockres->l_lock);
+			goto drop_ref;
+		}
+
+		lockres->l_flags |= USER_LOCK_IN_CANCEL;
+		spin_unlock(&lockres->l_lock);
+
+		status = dlmunlock(dlm,
+				   &lockres->l_lksb,
+				   LKM_CANCEL,
+				   user_unlock_ast,
+				   lockres);
+		if (status != DLM_NORMAL)
+			user_log_dlm_error("dlmunlock", status, lockres);
+		goto drop_ref;
+	}
+
+	/* If there are still incompat holders, we can exit safely
+	 * without worrying about re-queueing this lock as that will
+	 * happen on the last call to user_cluster_unlock. */
+	if ((lockres->l_blocking == LKM_EXMODE)
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
+			lockres->l_ro_holders, lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	if ((lockres->l_blocking == LKM_PRMODE)
+	    && lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		mlog(0, "can't downconvert for pr: ex = %u\n",
+			lockres->l_ex_holders);
+		goto drop_ref;
+	}
+
+	/* yay, we can downconvert now. */
+	new_level = user_highest_compat_lock_level(lockres->l_blocking);
+	lockres->l_requested = new_level;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	mlog(0, "Downconvert lock from %d to %d\n",
+		lockres->l_level, new_level);
+	spin_unlock(&lockres->l_lock);
+
+	/* need lock downconvert request now... */
+	status = dlmlock(dlm,
+			 new_level,
+			 &lockres->l_lksb,
+			 LKM_CONVERT|LKM_VALBLK,
+			 lockres->l_name,
+			 lockres->l_namelen,
+			 user_ast,
+			 lockres,
+			 user_bast);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmlock", status, lockres);
+		user_recover_from_dlm_error(lockres);
+	}
+
+drop_ref:
+	user_dlm_drop_inode_ref(lockres);
+}
+
+static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch(level) {
+	case LKM_EXMODE:
+		lockres->l_ex_holders++;
+		break;
+	case LKM_PRMODE:
+		lockres->l_ro_holders++;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* predict what lock level we'll be dropping down to on behalf
+ * of another node, and return true if the currently wanted
+ * level will be compatible with it. */
+static inline int
+user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
+				  int wanted)
+{
+	BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
+
+	return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
+}
+
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags)
+{
+	int status, local_flags;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
+		     lockres->l_namelen, lockres->l_name);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
+	     lockres->l_namelen, lockres->l_name,
+	     (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
+	     lkm_flags);
+
+again:
+	if (signal_pending(current)) {
+		status = -ERESTARTSYS;
+		goto bail;
+	}
+
+	spin_lock(&lockres->l_lock);
+
+	/* We only compare against the currently granted level
+	 * here. If the lock is blocked waiting on a downconvert,
+	 * we'll get caught below. */
+	if ((lockres->l_flags & USER_LOCK_BUSY) &&
+	    (level > lockres->l_level)) {
+		/* is someone sitting in dlm_lock? If so, wait on
+		 * them. */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
+	    (!user_may_continue_on_blocked_lock(lockres, level))) {
+		/* is the lock is currently blocked on behalf of
+		 * another node */
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_blocked_lock(lockres);
+		goto again;
+	}
+
+	if (level > lockres->l_level) {
+		local_flags = lkm_flags | LKM_VALBLK;
+		if (lockres->l_level != LKM_IVMODE)
+			local_flags |= LKM_CONVERT;
+
+		lockres->l_requested = level;
+		lockres->l_flags |= USER_LOCK_BUSY;
+		spin_unlock(&lockres->l_lock);
+
+		BUG_ON(level == LKM_IVMODE);
+		BUG_ON(level == LKM_NLMODE);
+
+		/* call dlm_lock to upgrade lock now */
+		status = dlmlock(dlm,
+				 level,
+				 &lockres->l_lksb,
+				 local_flags,
+				 lockres->l_name,
+				 lockres->l_namelen,
+				 user_ast,
+				 lockres,
+				 user_bast);
+		if (status != DLM_NORMAL) {
+			if ((lkm_flags & LKM_NOQUEUE) &&
+			    (status == DLM_NOTQUEUED))
+				status = -EAGAIN;
+			else {
+				user_log_dlm_error("dlmlock", status, lockres);
+				status = -EINVAL;
+			}
+			user_recover_from_dlm_error(lockres);
+			goto bail;
+		}
+
+		user_wait_on_busy_lock(lockres);
+		goto again;
+	}
+
+	user_dlm_inc_holders(lockres, level);
+	spin_unlock(&lockres->l_lock);
+
+	status = 0;
+bail:
+	return status;
+}
+
+static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
+					int level)
+{
+	switch(level) {
+	case LKM_EXMODE:
+		BUG_ON(!lockres->l_ex_holders);
+		lockres->l_ex_holders--;
+		break;
+	case LKM_PRMODE:
+		BUG_ON(!lockres->l_ro_holders);
+		lockres->l_ro_holders--;
+		break;
+	default:
+		BUG();
+	}
+}
+
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level)
+{
+	if (level != LKM_EXMODE &&
+	    level != LKM_PRMODE) {
+		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
+		     lockres->l_namelen, lockres->l_name);
+		return;
+	}
+
+	spin_lock(&lockres->l_lock);
+	user_dlm_dec_holders(lockres, level);
+	__user_dlm_cond_queue_lockres(lockres);
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb = lockres->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < LKM_EXMODE);
+	memcpy(lvb, val, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len)
+{
+	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
+	char *lvb = lockres->l_lksb.lvb;
+
+	BUG_ON(len > DLM_LVB_LEN);
+
+	spin_lock(&lockres->l_lock);
+
+	BUG_ON(lockres->l_level < LKM_PRMODE);
+	memcpy(val, lvb, len);
+
+	spin_unlock(&lockres->l_lock);
+}
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry)
+{
+	memset(lockres, 0, sizeof(*lockres));
+
+	spin_lock_init(&lockres->l_lock);
+	init_waitqueue_head(&lockres->l_event);
+	lockres->l_level = LKM_IVMODE;
+	lockres->l_requested = LKM_IVMODE;
+	lockres->l_blocking = LKM_IVMODE;
+
+	/* should have been checked before getting here. */
+	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
+
+	memcpy(lockres->l_name,
+	       dentry->d_name.name,
+	       dentry->d_name.len);
+	lockres->l_namelen = dentry->d_name.len;
+}
+
+int user_dlm_destroy_lock(struct user_lock_res *lockres)
+{
+	int status = -EBUSY;
+	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+
+	mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
+
+	spin_lock(&lockres->l_lock);
+	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		spin_unlock(&lockres->l_lock);
+		return 0;
+	}
+
+	lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
+
+	while (lockres->l_flags & USER_LOCK_BUSY) {
+		spin_unlock(&lockres->l_lock);
+
+		user_wait_on_busy_lock(lockres);
+
+		spin_lock(&lockres->l_lock);
+	}
+
+	if (lockres->l_ro_holders || lockres->l_ex_holders) {
+		spin_unlock(&lockres->l_lock);
+		goto bail;
+	}
+
+	status = 0;
+	if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+		spin_unlock(&lockres->l_lock);
+		goto bail;
+	}
+
+	lockres->l_flags &= ~USER_LOCK_ATTACHED;
+	lockres->l_flags |= USER_LOCK_BUSY;
+	spin_unlock(&lockres->l_lock);
+
+	status = dlmunlock(dlm,
+			   &lockres->l_lksb,
+			   LKM_VALBLK,
+			   user_unlock_ast,
+			   lockres);
+	if (status != DLM_NORMAL) {
+		user_log_dlm_error("dlmunlock", status, lockres);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	user_wait_on_busy_lock(lockres);
+
+	status = 0;
+bail:
+	return status;
+}
+
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
+					   struct dlm_protocol_version *proto)
+{
+	struct dlm_ctxt *dlm;
+	u32 dlm_key;
+	char *domain;
+
+	domain = kmalloc(name->len + 1, GFP_NOFS);
+	if (!domain) {
+		mlog_errno(-ENOMEM);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dlm_key = crc32_le(0, name->name, name->len);
+
+	snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
+
+	dlm = dlm_register_domain(domain, dlm_key, proto);
+	if (IS_ERR(dlm))
+		mlog_errno(PTR_ERR(dlm));
+
+	kfree(domain);
+	return dlm;
+}
+
+void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+{
+	dlm_unregister_domain(dlm);
+}
diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
new file mode 100644
index 000000000000..0c3cc03c61fa
--- /dev/null
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -0,0 +1,113 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * userdlm.h
+ *
+ * Userspace dlm defines
+ *
+ * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+
+#ifndef USERDLM_H
+#define USERDLM_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/* user_lock_res->l_flags flags. */
+#define USER_LOCK_ATTACHED      (0x00000001) /* we have initialized
+					       * the lvb */
+#define USER_LOCK_BUSY          (0x00000002) /* we are currently in
+					       * dlm_lock */
+#define USER_LOCK_BLOCKED       (0x00000004) /* blocked waiting to
+					      * downconvert*/
+#define USER_LOCK_IN_TEARDOWN   (0x00000008) /* we're currently
+					      * destroying this
+					      * lock. */
+#define USER_LOCK_QUEUED        (0x00000010) /* lock is on the
+					      * workqueue */
+#define USER_LOCK_IN_CANCEL     (0x00000020)
+
+struct user_lock_res {
+	spinlock_t               l_lock;
+
+	int                      l_flags;
+
+#define USER_DLM_LOCK_ID_MAX_LEN  32
+	char                     l_name[USER_DLM_LOCK_ID_MAX_LEN];
+	int                      l_namelen;
+	int                      l_level;
+	unsigned int             l_ro_holders;
+	unsigned int             l_ex_holders;
+	struct dlm_lockstatus    l_lksb;
+
+	int                      l_requested;
+	int                      l_blocking;
+
+	wait_queue_head_t        l_event;
+
+	struct work_struct       l_work;
+};
+
+extern struct workqueue_struct *user_dlm_worker;
+
+void user_dlm_lock_res_init(struct user_lock_res *lockres,
+			    struct dentry *dentry);
+int user_dlm_destroy_lock(struct user_lock_res *lockres);
+int user_dlm_cluster_lock(struct user_lock_res *lockres,
+			  int level,
+			  int lkm_flags);
+void user_dlm_cluster_unlock(struct user_lock_res *lockres,
+			     int level);
+void user_dlm_write_lvb(struct inode *inode,
+			const char *val,
+			unsigned int len);
+void user_dlm_read_lvb(struct inode *inode,
+		       char *val,
+		       unsigned int len);
+struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
+					   struct dlm_protocol_version *proto);
+void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+
+struct dlmfs_inode_private {
+	struct dlm_ctxt             *ip_dlm;
+
+	struct user_lock_res ip_lockres; /* unused for directories. */
+	struct inode         *ip_parent;
+
+	struct inode         ip_vfs_inode;
+};
+
+static inline struct dlmfs_inode_private *
+DLMFS_I(struct inode *inode)
+{
+        return container_of(inode,
+			    struct dlmfs_inode_private,
+			    ip_vfs_inode);
+}
+
+struct dlmfs_filp_private {
+	int                  fp_lock_level;
+};
+
+#define DLMFS_MAGIC	0x76a9f425
+
+#endif /* USERDLM_H */
-- 
cgit v1.2.3


From a796d2862aed8117acc9f470f3429a5ee852912e Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 28 Jan 2010 19:22:39 -0800
Subject: ocfs2: Pass lksbs back from stackglue ast/bast functions.

The stackglue ast and bast functions tried to maintain the fiction that
their arguments were void pointers.  In reality, stack_user.c had to
know that the argument was an ocfs2_lock_res in order to get the status
off of the lksb.  That's ugly.

This changes stackglue to always pass the lksb as the argument to ast
and bast functions.  The caller can always use container_of() to get the
ocfs2_lock_res or user_dlm_lock_res.  The net effect to the caller is
zero.  They still get back the lockres in their ast.  stackglue gets
cleaner, and now can use the lksb itself.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmglue.c    | 34 +++++++++++++++++-----------------
 fs/ocfs2/stack_o2cb.c | 22 +++++++++++++---------
 fs/ocfs2/stack_user.c | 29 +++++++++++------------------
 fs/ocfs2/stackglue.c  | 19 ++++++++-----------
 fs/ocfs2/stackglue.h  | 42 +++++++++++++++++++++---------------------
 5 files changed, 70 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e044019cb3b1..1ba67df00924 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,6 +297,11 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 		lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
 
+static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(union ocfs2_dlm_lksb *lksb)
+{
+	return container_of(lksb, struct ocfs2_lock_res, l_lksb);
+}
+
 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
 {
 	BUG_ON(!ocfs2_is_inode_lock(lockres));
@@ -1041,9 +1046,9 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
 }
 
 
-static void ocfs2_blocking_ast(void *opaque, int level)
+static void ocfs2_blocking_ast(union ocfs2_dlm_lksb *lksb, int level)
 {
-	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
 	int needs_downconvert;
 	unsigned long flags;
@@ -1072,9 +1077,9 @@ static void ocfs2_blocking_ast(void *opaque, int level)
 	ocfs2_wake_downconvert_thread(osb);
 }
 
-static void ocfs2_locking_ast(void *opaque)
+static void ocfs2_locking_ast(union ocfs2_dlm_lksb *lksb)
 {
-	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
 	unsigned long flags;
 	int status;
@@ -1189,8 +1194,7 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 			     &lockres->l_lksb,
 			     dlm_flags,
 			     lockres->l_name,
-			     OCFS2_LOCK_ID_MAX_LEN - 1,
-			     lockres);
+			     OCFS2_LOCK_ID_MAX_LEN - 1);
 	lockres_clear_pending(lockres, gen, osb);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -1421,8 +1425,7 @@ again:
 				     &lockres->l_lksb,
 				     lkm_flags,
 				     lockres->l_name,
-				     OCFS2_LOCK_ID_MAX_LEN - 1,
-				     lockres);
+				     OCFS2_LOCK_ID_MAX_LEN - 1);
 		lockres_clear_pending(lockres, gen, osb);
 		if (ret) {
 			if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
@@ -1859,8 +1862,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
-			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
-			     lockres);
+			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
 	if (ret) {
 		if (!trylock || (ret != -EAGAIN)) {
 			ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3056,9 +3058,9 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 	mlog_exit_void();
 }
 
-static void ocfs2_unlock_ast(void *opaque, int error)
+static void ocfs2_unlock_ast(union ocfs2_dlm_lksb *lksb, int error)
 {
-	struct ocfs2_lock_res *lockres = opaque;
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	unsigned long flags;
 
 	mlog_entry_void();
@@ -3167,8 +3169,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
 	mlog(0, "lock %s\n", lockres->l_name);
 
-	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
-			       lockres);
+	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
@@ -3309,8 +3310,7 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 			     &lockres->l_lksb,
 			     dlm_flags,
 			     lockres->l_name,
-			     OCFS2_LOCK_ID_MAX_LEN - 1,
-			     lockres);
+			     OCFS2_LOCK_ID_MAX_LEN - 1);
 	lockres_clear_pending(lockres, generation, osb);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
@@ -3365,7 +3365,7 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 	mlog(0, "lock %s\n", lockres->l_name);
 
 	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
-			       DLM_LKF_CANCEL, lockres);
+			       DLM_LKF_CANCEL);
 	if (ret) {
 		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
 		ocfs2_recover_from_dlm_error(lockres, 0);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3038c92af493..c4cedff365df 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -161,20 +161,26 @@ static int dlm_status_to_errno(enum dlm_status status)
 
 static void o2dlm_lock_ast_wrapper(void *astarg)
 {
+	union ocfs2_dlm_lksb *lksb = astarg;
+
 	BUG_ON(o2cb_stack.sp_proto == NULL);
 
-	o2cb_stack.sp_proto->lp_lock_ast(astarg);
+	o2cb_stack.sp_proto->lp_lock_ast(lksb);
 }
 
 static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
 {
+	union ocfs2_dlm_lksb *lksb = astarg;
+
 	BUG_ON(o2cb_stack.sp_proto == NULL);
 
-	o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
+	o2cb_stack.sp_proto->lp_blocking_ast(lksb, level);
 }
 
 static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 {
+	union ocfs2_dlm_lksb *lksb = astarg;
+
 	int error = dlm_status_to_errno(status);
 
 	BUG_ON(o2cb_stack.sp_proto == NULL);
@@ -193,7 +199,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 	if (status == DLM_CANCELGRANT)
 		return;
 
-	o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
+	o2cb_stack.sp_proto->lp_unlock_ast(lksb, error);
 }
 
 static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
@@ -201,8 +207,7 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
 			 union ocfs2_dlm_lksb *lksb,
 			 u32 flags,
 			 void *name,
-			 unsigned int namelen,
-			 void *astarg)
+			 unsigned int namelen)
 {
 	enum dlm_status status;
 	int o2dlm_mode = mode_to_o2dlm(mode);
@@ -211,7 +216,7 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
 
 	status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
 			 o2dlm_flags, name, namelen,
-			 o2dlm_lock_ast_wrapper, astarg,
+			 o2dlm_lock_ast_wrapper, lksb,
 			 o2dlm_blocking_ast_wrapper);
 	ret = dlm_status_to_errno(status);
 	return ret;
@@ -219,15 +224,14 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
 
 static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
 			   union ocfs2_dlm_lksb *lksb,
-			   u32 flags,
-			   void *astarg)
+			   u32 flags)
 {
 	enum dlm_status status;
 	int o2dlm_flags = flags_to_o2dlm(flags);
 	int ret;
 
 	status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
-			   o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
+			   o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb);
 	ret = dlm_status_to_errno(status);
 	return ret;
 }
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index da78a2a334fd..129b93159cca 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -25,7 +25,6 @@
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
 
-#include "ocfs2.h"  /* For struct ocfs2_lock_res */
 #include "stackglue.h"
 
 #include <linux/dlm_plock.h>
@@ -664,16 +663,10 @@ static void ocfs2_control_exit(void)
 		       -rc);
 }
 
-static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
-{
-	struct ocfs2_lock_res *res = astarg;
-	return &res->l_lksb.lksb_fsdlm;
-}
-
 static void fsdlm_lock_ast_wrapper(void *astarg)
 {
-	struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
-	int status = lksb->sb_status;
+	union ocfs2_dlm_lksb *lksb = astarg;
+	int status = lksb->lksb_fsdlm.sb_status;
 
 	BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
 
@@ -688,16 +681,18 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
 	 */
 
 	if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
-		ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0);
+		ocfs2_user_plugin.sp_proto->lp_unlock_ast(lksb, 0);
 	else
-		ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg);
+		ocfs2_user_plugin.sp_proto->lp_lock_ast(lksb);
 }
 
 static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
 {
+	union ocfs2_dlm_lksb *lksb = astarg;
+
 	BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
 
-	ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level);
+	ocfs2_user_plugin.sp_proto->lp_blocking_ast(lksb, level);
 }
 
 static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
@@ -705,8 +700,7 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
 			 union ocfs2_dlm_lksb *lksb,
 			 u32 flags,
 			 void *name,
-			 unsigned int namelen,
-			 void *astarg)
+			 unsigned int namelen)
 {
 	int ret;
 
@@ -716,20 +710,19 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
 
 	ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
 		       flags|DLM_LKF_NODLCKWT, name, namelen, 0,
-		       fsdlm_lock_ast_wrapper, astarg,
+		       fsdlm_lock_ast_wrapper, lksb,
 		       fsdlm_blocking_ast_wrapper);
 	return ret;
 }
 
 static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
 			   union ocfs2_dlm_lksb *lksb,
-			   u32 flags,
-			   void *astarg)
+			   u32 flags)
 {
 	int ret;
 
 	ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
-			 flags, &lksb->lksb_fsdlm, astarg);
+			 flags, &lksb->lksb_fsdlm, lksb);
 	return ret;
 }
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index f3df0baa9a48..3500d9839d76 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -233,35 +233,32 @@ EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
 
 
 /*
- * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
- * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
- * underlying stack plugins need to pilfer the lksb off of the lock_res.
- * If some other structure needs to be passed as an astarg, the plugins
- * will need to be given a different avenue to the lksb.
+ * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument
+ * for the ast and bast functions.  They will pass the lksb to the ast
+ * and bast.  The caller can wrap the lksb with their own structure to
+ * get more information.
  */
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
 		   union ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
-		   unsigned int namelen,
-		   struct ocfs2_lock_res *astarg)
+		   unsigned int namelen)
 {
 	BUG_ON(lproto == NULL);
 
 	return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
-					      name, namelen, astarg);
+					      name, namelen);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
 
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     union ocfs2_dlm_lksb *lksb,
-		     u32 flags,
-		     struct ocfs2_lock_res *astarg)
+		     u32 flags)
 {
 	BUG_ON(lproto == NULL);
 
-	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
+	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
 
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 03a44d60eac9..d699117fb851 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -55,17 +55,6 @@ struct ocfs2_protocol_version {
 	u8 pv_minor;
 };
 
-/*
- * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
- */
-struct ocfs2_locking_protocol {
-	struct ocfs2_protocol_version lp_max_version;
-	void (*lp_lock_ast)(void *astarg);
-	void (*lp_blocking_ast)(void *astarg, int level);
-	void (*lp_unlock_ast)(void *astarg, int error);
-};
-
-
 /*
  * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
  * has a pointer to separately allocated lvb space.  This struct exists only to
@@ -87,6 +76,17 @@ union ocfs2_dlm_lksb {
 	struct fsdlm_lksb_plus_lvb padding;
 };
 
+/*
+ * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
+ */
+struct ocfs2_locking_protocol {
+	struct ocfs2_protocol_version lp_max_version;
+	void (*lp_lock_ast)(union ocfs2_dlm_lksb *lksb);
+	void (*lp_blocking_ast)(union ocfs2_dlm_lksb *lksb, int level);
+	void (*lp_unlock_ast)(union ocfs2_dlm_lksb *lksb, int error);
+};
+
+
 /*
  * A cluster connection.  Mostly opaque to ocfs2, the connection holds
  * state for the underlying stack.  ocfs2 does use cc_version to determine
@@ -155,27 +155,29 @@ struct ocfs2_stack_operations {
 	 *
 	 * ast and bast functions are not part of the call because the
 	 * stack will likely want to wrap ast and bast calls before passing
-	 * them to stack->sp_proto.
+	 * them to stack->sp_proto.  There is no astarg.  The lksb will
+	 * be passed back to the ast and bast functions.  The caller can
+	 * use this to find their object.
 	 */
 	int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
 			int mode,
 			union ocfs2_dlm_lksb *lksb,
 			u32 flags,
 			void *name,
-			unsigned int namelen,
-			void *astarg);
+			unsigned int namelen);
 
 	/*
 	 * Call the underlying dlm unlock function.  The ->dlm_unlock()
 	 * function should convert the flags as appropriate.
 	 *
 	 * The unlock ast is not passed, as the stack will want to wrap
-	 * it before calling stack->sp_proto->lp_unlock_ast().
+	 * it before calling stack->sp_proto->lp_unlock_ast().  There is
+	 * no astarg.  The lksb will be passed back to the unlock ast
+	 * function.  The caller can use this to find their object.
 	 */
 	int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
 			  union ocfs2_dlm_lksb *lksb,
-			  u32 flags,
-			  void *astarg);
+			  u32 flags);
 
 	/*
 	 * Return the status of the current lock status block.  The fs
@@ -249,12 +251,10 @@ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   union ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
-		   unsigned int namelen,
-		   struct ocfs2_lock_res *astarg);
+		   unsigned int namelen);
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     union ocfs2_dlm_lksb *lksb,
-		     u32 flags,
-		     struct ocfs2_lock_res *astarg);
+		     u32 flags);
 
 int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
 int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb);
-- 
cgit v1.2.3


From c0e4133851ed94c73ee3d34a2f2a245fcd0a60a1 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 29 Jan 2010 14:46:44 -0800
Subject: ocfs2: Attach the connection to the lksb

We're going to want it in the ast functions, so we convert union
ocfs2_dlm_lksb to struct ocfs2_dlm_lksb and let it carry the connection.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmglue.c    |  8 ++++----
 fs/ocfs2/ocfs2.h      |  2 +-
 fs/ocfs2/stack_o2cb.c | 18 +++++++++---------
 fs/ocfs2/stack_user.c | 16 ++++++++--------
 fs/ocfs2/stackglue.c  | 17 +++++++++++------
 fs/ocfs2/stackglue.h  | 42 +++++++++++++++++++++++-------------------
 6 files changed, 56 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1ba67df00924..2bb868b7b44f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -297,7 +297,7 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 		lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
 
-static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(union ocfs2_dlm_lksb *lksb)
+static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
 {
 	return container_of(lksb, struct ocfs2_lock_res, l_lksb);
 }
@@ -1046,7 +1046,7 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
 }
 
 
-static void ocfs2_blocking_ast(union ocfs2_dlm_lksb *lksb, int level)
+static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
 {
 	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
@@ -1077,7 +1077,7 @@ static void ocfs2_blocking_ast(union ocfs2_dlm_lksb *lksb, int level)
 	ocfs2_wake_downconvert_thread(osb);
 }
 
-static void ocfs2_locking_ast(union ocfs2_dlm_lksb *lksb)
+static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
 {
 	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
@@ -3058,7 +3058,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 	mlog_exit_void();
 }
 
-static void ocfs2_unlock_ast(union ocfs2_dlm_lksb *lksb, int error)
+static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
 {
 	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	unsigned long flags;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8857dd724f90..b27fe2489e0c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -159,7 +159,7 @@ struct ocfs2_lock_res {
 	int                      l_level;
 	unsigned int             l_ro_holders;
 	unsigned int             l_ex_holders;
-	union ocfs2_dlm_lksb     l_lksb;
+	struct ocfs2_dlm_lksb    l_lksb;
 
 	/* used from AST/BAST funcs. */
 	enum ocfs2_ast_action    l_action;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index c4cedff365df..fa9dd79c3615 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -161,7 +161,7 @@ static int dlm_status_to_errno(enum dlm_status status)
 
 static void o2dlm_lock_ast_wrapper(void *astarg)
 {
-	union ocfs2_dlm_lksb *lksb = astarg;
+	struct ocfs2_dlm_lksb *lksb = astarg;
 
 	BUG_ON(o2cb_stack.sp_proto == NULL);
 
@@ -170,7 +170,7 @@ static void o2dlm_lock_ast_wrapper(void *astarg)
 
 static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
 {
-	union ocfs2_dlm_lksb *lksb = astarg;
+	struct ocfs2_dlm_lksb *lksb = astarg;
 
 	BUG_ON(o2cb_stack.sp_proto == NULL);
 
@@ -179,7 +179,7 @@ static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
 
 static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 {
-	union ocfs2_dlm_lksb *lksb = astarg;
+	struct ocfs2_dlm_lksb *lksb = astarg;
 
 	int error = dlm_status_to_errno(status);
 
@@ -204,7 +204,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 
 static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
 			 int mode,
-			 union ocfs2_dlm_lksb *lksb,
+			 struct ocfs2_dlm_lksb *lksb,
 			 u32 flags,
 			 void *name,
 			 unsigned int namelen)
@@ -223,7 +223,7 @@ static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
 }
 
 static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
-			   union ocfs2_dlm_lksb *lksb,
+			   struct ocfs2_dlm_lksb *lksb,
 			   u32 flags)
 {
 	enum dlm_status status;
@@ -236,7 +236,7 @@ static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
 	return ret;
 }
 
-static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
 {
 	return dlm_status_to_errno(lksb->lksb_o2dlm.status);
 }
@@ -246,17 +246,17 @@ static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
  * contents, it will zero out the LVB.  Thus the caller can always trust
  * the contents.
  */
-static int o2cb_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
+static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
 {
 	return 1;
 }
 
-static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
 {
 	return (void *)(lksb->lksb_o2dlm.lvb);
 }
 
-static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
+static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
 {
 	dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
 }
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 129b93159cca..31276bac78f5 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -665,7 +665,7 @@ static void ocfs2_control_exit(void)
 
 static void fsdlm_lock_ast_wrapper(void *astarg)
 {
-	union ocfs2_dlm_lksb *lksb = astarg;
+	struct ocfs2_dlm_lksb *lksb = astarg;
 	int status = lksb->lksb_fsdlm.sb_status;
 
 	BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
@@ -688,7 +688,7 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
 
 static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
 {
-	union ocfs2_dlm_lksb *lksb = astarg;
+	struct ocfs2_dlm_lksb *lksb = astarg;
 
 	BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
 
@@ -697,7 +697,7 @@ static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
 
 static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
 			 int mode,
-			 union ocfs2_dlm_lksb *lksb,
+			 struct ocfs2_dlm_lksb *lksb,
 			 u32 flags,
 			 void *name,
 			 unsigned int namelen)
@@ -716,7 +716,7 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
 }
 
 static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
-			   union ocfs2_dlm_lksb *lksb,
+			   struct ocfs2_dlm_lksb *lksb,
 			   u32 flags)
 {
 	int ret;
@@ -726,19 +726,19 @@ static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
 	return ret;
 }
 
-static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
 {
 	return lksb->lksb_fsdlm.sb_status;
 }
 
-static int user_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
+static int user_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
 {
 	int invalid = lksb->lksb_fsdlm.sb_flags & DLM_SBF_VALNOTVALID;
 
 	return !invalid;
 }
 
-static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+static void *user_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
 {
 	if (!lksb->lksb_fsdlm.sb_lvbptr)
 		lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
@@ -746,7 +746,7 @@ static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
 	return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
 }
 
-static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+static void user_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
 {
 }
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 3500d9839d76..8ef9a574315e 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -240,47 +240,52 @@ EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
  */
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
-		   union ocfs2_dlm_lksb *lksb,
+		   struct ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
 		   unsigned int namelen)
 {
 	BUG_ON(lproto == NULL);
 
+	if (!lksb->lksb_conn)
+		lksb->lksb_conn = conn;
+	else
+		BUG_ON(lksb->lksb_conn != conn);
 	return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
 					      name, namelen);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
 
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
-		     union ocfs2_dlm_lksb *lksb,
+		     struct ocfs2_dlm_lksb *lksb,
 		     u32 flags)
 {
 	BUG_ON(lproto == NULL);
+	BUG_ON(lksb->lksb_conn == NULL);
 
 	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
 
-int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
 {
 	return active_stack->sp_ops->lock_status(lksb);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
 
-int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb)
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb)
 {
 	return active_stack->sp_ops->lvb_valid(lksb);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid);
 
-void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb)
 {
 	return active_stack->sp_ops->lock_lvb(lksb);
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
 
-void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb)
 {
 	active_stack->sp_ops->dump_lksb(lksb);
 }
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index d699117fb851..bb32926912ba 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -70,10 +70,14 @@ struct fsdlm_lksb_plus_lvb {
  * size of the union is known.  Lock status structures are embedded in
  * ocfs2 inodes.
  */
-union ocfs2_dlm_lksb {
-	struct dlm_lockstatus lksb_o2dlm;
-	struct dlm_lksb lksb_fsdlm;
-	struct fsdlm_lksb_plus_lvb padding;
+struct ocfs2_cluster_connection;
+struct ocfs2_dlm_lksb {
+	 union {
+		 struct dlm_lockstatus lksb_o2dlm;
+		 struct dlm_lksb lksb_fsdlm;
+		 struct fsdlm_lksb_plus_lvb padding;
+	 };
+	 struct ocfs2_cluster_connection *lksb_conn;
 };
 
 /*
@@ -81,9 +85,9 @@ union ocfs2_dlm_lksb {
  */
 struct ocfs2_locking_protocol {
 	struct ocfs2_protocol_version lp_max_version;
-	void (*lp_lock_ast)(union ocfs2_dlm_lksb *lksb);
-	void (*lp_blocking_ast)(union ocfs2_dlm_lksb *lksb, int level);
-	void (*lp_unlock_ast)(union ocfs2_dlm_lksb *lksb, int error);
+	void (*lp_lock_ast)(struct ocfs2_dlm_lksb *lksb);
+	void (*lp_blocking_ast)(struct ocfs2_dlm_lksb *lksb, int level);
+	void (*lp_unlock_ast)(struct ocfs2_dlm_lksb *lksb, int error);
 };
 
 
@@ -161,7 +165,7 @@ struct ocfs2_stack_operations {
 	 */
 	int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
 			int mode,
-			union ocfs2_dlm_lksb *lksb,
+			struct ocfs2_dlm_lksb *lksb,
 			u32 flags,
 			void *name,
 			unsigned int namelen);
@@ -176,7 +180,7 @@ struct ocfs2_stack_operations {
 	 * function.  The caller can use this to find their object.
 	 */
 	int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
-			  union ocfs2_dlm_lksb *lksb,
+			  struct ocfs2_dlm_lksb *lksb,
 			  u32 flags);
 
 	/*
@@ -185,17 +189,17 @@ struct ocfs2_stack_operations {
 	 * callback pulls out the stack-specific lksb, converts the status
 	 * to a proper errno, and returns it.
 	 */
-	int (*lock_status)(union ocfs2_dlm_lksb *lksb);
+	int (*lock_status)(struct ocfs2_dlm_lksb *lksb);
 
 	/*
 	 * Return non-zero if the LVB is valid.
 	 */
-	int (*lvb_valid)(union ocfs2_dlm_lksb *lksb);
+	int (*lvb_valid)(struct ocfs2_dlm_lksb *lksb);
 
 	/*
 	 * Pull the lvb pointer off of the stack-specific lksb.
 	 */
-	void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
+	void *(*lock_lvb)(struct ocfs2_dlm_lksb *lksb);
 
 	/*
 	 * Cluster-aware posix locks
@@ -212,7 +216,7 @@ struct ocfs2_stack_operations {
 	 * This is an optoinal debugging hook.  If provided, the
 	 * stack can dump debugging information about this lock.
 	 */
-	void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
+	void (*dump_lksb)(struct ocfs2_dlm_lksb *lksb);
 };
 
 /*
@@ -248,18 +252,18 @@ int ocfs2_cluster_this_node(unsigned int *node);
 struct ocfs2_lock_res;
 int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   int mode,
-		   union ocfs2_dlm_lksb *lksb,
+		   struct ocfs2_dlm_lksb *lksb,
 		   u32 flags,
 		   void *name,
 		   unsigned int namelen);
 int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
-		     union ocfs2_dlm_lksb *lksb,
+		     struct ocfs2_dlm_lksb *lksb,
 		     u32 flags);
 
-int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
-int ocfs2_dlm_lvb_valid(union ocfs2_dlm_lksb *lksb);
-void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
-void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb);
+int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb);
+void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb);
+void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb);
 
 int ocfs2_stack_supports_plocks(void);
 int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
-- 
cgit v1.2.3


From 110946c8fb23c1e1e23312afed0977ad4aa37c95 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 29 Jan 2010 15:46:23 -0800
Subject: ocfs2: Hang the locking proto on the cluster conn and use it in asts.

With the ocfs2_cluster_connection hanging off of the ocfs2_dlm_lksb, we
have access to it in the ast and bast wrapper functions.  Attach the
ocfs2_locking_protocol to the conn.

Now, instead of refering to a static variable for ast/bast pointers, the
wrappers can look at the connection.  This means different connections
can have different ast/bast pointers, and it reduces the need for the
static pointer.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/stack_o2cb.c | 15 ++++-----------
 fs/ocfs2/stack_user.c | 10 +++-------
 fs/ocfs2/stackglue.c  |  1 +
 fs/ocfs2/stackglue.h  |  1 +
 4 files changed, 9 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index fa9dd79c3615..7020e1253ffa 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -163,28 +163,21 @@ static void o2dlm_lock_ast_wrapper(void *astarg)
 {
 	struct ocfs2_dlm_lksb *lksb = astarg;
 
-	BUG_ON(o2cb_stack.sp_proto == NULL);
-
-	o2cb_stack.sp_proto->lp_lock_ast(lksb);
+	lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
 }
 
 static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
 {
 	struct ocfs2_dlm_lksb *lksb = astarg;
 
-	BUG_ON(o2cb_stack.sp_proto == NULL);
-
-	o2cb_stack.sp_proto->lp_blocking_ast(lksb, level);
+	lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
 }
 
 static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 {
 	struct ocfs2_dlm_lksb *lksb = astarg;
-
 	int error = dlm_status_to_errno(status);
 
-	BUG_ON(o2cb_stack.sp_proto == NULL);
-
 	/*
 	 * In o2dlm, you can get both the lock_ast() for the lock being
 	 * granted and the unlock_ast() for the CANCEL failing.  A
@@ -199,7 +192,7 @@ static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
 	if (status == DLM_CANCELGRANT)
 		return;
 
-	o2cb_stack.sp_proto->lp_unlock_ast(lksb, error);
+	lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error);
 }
 
 static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
@@ -284,7 +277,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
 	struct dlm_protocol_version fs_version;
 
 	BUG_ON(conn == NULL);
-	BUG_ON(o2cb_stack.sp_proto == NULL);
+	BUG_ON(conn->cc_proto == NULL);
 
 	/* for now we only have one cluster/node, make sure we see it
 	 * in the heartbeat universe */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 31276bac78f5..b4cf616ef423 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -668,8 +668,6 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
 	struct ocfs2_dlm_lksb *lksb = astarg;
 	int status = lksb->lksb_fsdlm.sb_status;
 
-	BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
-
 	/*
 	 * For now we're punting on the issue of other non-standard errors
 	 * where we can't tell if the unlock_ast or lock_ast should be called.
@@ -681,18 +679,16 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
 	 */
 
 	if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
-		ocfs2_user_plugin.sp_proto->lp_unlock_ast(lksb, 0);
+		lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, 0);
 	else
-		ocfs2_user_plugin.sp_proto->lp_lock_ast(lksb);
+		lksb->lksb_conn->cc_proto->lp_lock_ast(lksb);
 }
 
 static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
 {
 	struct ocfs2_dlm_lksb *lksb = astarg;
 
-	BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
-
-	ocfs2_user_plugin.sp_proto->lp_blocking_ast(lksb, level);
+	lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level);
 }
 
 static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 8ef9a574315e..010ecabbdeb5 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -343,6 +343,7 @@ int ocfs2_cluster_connect(const char *stack_name,
 	new_conn->cc_recovery_handler = recovery_handler;
 	new_conn->cc_recovery_data = recovery_data;
 
+	new_conn->cc_proto = lproto;
 	/* Start the new connection at our maximum compatibility level */
 	new_conn->cc_version = lproto->lp_max_version;
 
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index bb32926912ba..cf8bac23ae09 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -100,6 +100,7 @@ struct ocfs2_cluster_connection {
 	char cc_name[GROUP_NAME_MAX];
 	int cc_namelen;
 	struct ocfs2_protocol_version cc_version;
+	struct ocfs2_locking_protocol *cc_proto;
 	void (*cc_recovery_handler)(int node_num, void *recovery_data);
 	void *cc_recovery_data;
 	void *cc_lockspace;
-- 
cgit v1.2.3


From e603cfb074e150736814ef093a411df32c02ba9f Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 29 Jan 2010 16:06:29 -0800
Subject: ocfs2: Remove the ast pointers from ocfs2_stack_plugins

With the full ocfs2_locking_protocol hanging off of the
ocfs2_cluster_connection, ast wrappers can get the ast/bast pointers
there.  They don't need to get them from their plugin structure.

The user plugin still needs the maximum locking protocol version,
though.  This changes the plugin structure so that it only holds the max
version, not the entire ocfs2_locking_protocol pointer.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/stack_user.c | 6 +++---
 fs/ocfs2/stackglue.c  | 4 ++--
 fs/ocfs2/stackglue.h  | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index b4cf616ef423..5ae8812b2864 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -62,8 +62,8 @@
  * negotiated by the client.  The client negotiates based on the maximum
  * version advertised in /sys/fs/ocfs2/max_locking_protocol.  The major
  * number from the "SETV" message must match
- * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number
- * must be less than or equal to ...->lp_max_version.pv_minor.
+ * ocfs2_user_plugin.sp_max_proto.pv_major, and the minor number
+ * must be less than or equal to ...sp_max_version.pv_minor.
  *
  * Once this information has been set, mounts will be allowed.  From this
  * point on, the "DOWN" message can be sent for node down notification.
@@ -400,7 +400,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
 	char *ptr = NULL;
 	struct ocfs2_control_private *p = file->private_data;
 	struct ocfs2_protocol_version *max =
-		&ocfs2_user_plugin.sp_proto->lp_max_version;
+		&ocfs2_user_plugin.sp_max_proto;
 
 	if (ocfs2_control_get_handshake_state(file) !=
 	    OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 010ecabbdeb5..fc184c762700 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
 	spin_lock(&ocfs2_stack_lock);
 	if (!ocfs2_stack_lookup(plugin->sp_name)) {
 		plugin->sp_count = 0;
-		plugin->sp_proto = lproto;
+		plugin->sp_max_proto = lproto->lp_max_version;
 		list_add(&plugin->sp_list, &ocfs2_stack_list);
 		printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
 		       plugin->sp_name);
@@ -224,7 +224,7 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
 
 	lproto = proto;
 	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
-		p->sp_proto = lproto;
+		p->sp_max_proto = lproto->lp_max_version;
 	}
 
 	spin_unlock(&ocfs2_stack_lock);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index cf8bac23ae09..77a7a9aeba73 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -233,7 +233,7 @@ struct ocfs2_stack_plugin {
 	/* These are managed by the stackglue code. */
 	struct list_head sp_list;
 	unsigned int sp_count;
-	struct ocfs2_locking_protocol *sp_proto;
+	struct ocfs2_protocol_version sp_max_proto;
 };
 
 
-- 
cgit v1.2.3


From 553b5eb91abd5f8e679d23ae547b92c589726814 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 29 Jan 2010 17:19:06 -0800
Subject: ocfs2: Pass the locking protocol into ocfs2_cluster_connect().

Inside the stackglue, the locking protocol structure is hanging off of
the ocfs2_cluster_connection.  This takes it one further; the locking
protocol is passed into ocfs2_cluster_connect().  Now different cluster
connections can have different locking protocols with distinct asts.
Note that all locking protocols have to keep their maximum protocol
version in lock-step.

With the protocol structure set in ocfs2_cluster_connect(), there is no
need for the stackglue to have a static pointer to a specific protocol
structure.  We can change initialization to only pass in the maximum
protocol version.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmglue.c   | 168 +++++++++++++++++++++++++--------------------------
 fs/ocfs2/stackglue.c |  43 +++++++------
 fs/ocfs2/stackglue.h |   3 +-
 3 files changed, 110 insertions(+), 104 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 2bb868b7b44f..d009d7744d63 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1045,7 +1045,6 @@ static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
 	return lockres->l_pending_gen;
 }
 
-
 static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
 {
 	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
@@ -1139,6 +1138,88 @@ out:
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 }
 
+static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
+{
+	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
+	unsigned long flags;
+
+	mlog_entry_void();
+
+	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
+	     lockres->l_unlock_action);
+
+	spin_lock_irqsave(&lockres->l_lock, flags);
+	if (error) {
+		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
+		     "unlock_action %d\n", error, lockres->l_name,
+		     lockres->l_unlock_action);
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		mlog_exit_void();
+		return;
+	}
+
+	switch(lockres->l_unlock_action) {
+	case OCFS2_UNLOCK_CANCEL_CONVERT:
+		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
+		lockres->l_action = OCFS2_AST_INVALID;
+		/* Downconvert thread may have requeued this lock, we
+		 * need to wake it. */
+		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
+		break;
+	case OCFS2_UNLOCK_DROP_LOCK:
+		lockres->l_level = DLM_LOCK_IV;
+		break;
+	default:
+		BUG();
+	}
+
+	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
+	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
+	wake_up(&lockres->l_event);
+	spin_unlock_irqrestore(&lockres->l_lock, flags);
+
+	mlog_exit_void();
+}
+
+/*
+ * This is the filesystem locking protocol.  It provides the lock handling
+ * hooks for the underlying DLM.  It has a maximum version number.
+ * The version number allows interoperability with systems running at
+ * the same major number and an equal or smaller minor number.
+ *
+ * Whenever the filesystem does new things with locks (adds or removes a
+ * lock, orders them differently, does different things underneath a lock),
+ * the version must be changed.  The protocol is negotiated when joining
+ * the dlm domain.  A node may join the domain if its major version is
+ * identical to all other nodes and its minor version is greater than
+ * or equal to all other nodes.  When its minor version is greater than
+ * the other nodes, it will run at the minor version specified by the
+ * other nodes.
+ *
+ * If a locking change is made that will not be compatible with older
+ * versions, the major number must be increased and the minor version set
+ * to zero.  If a change merely adds a behavior that can be disabled when
+ * speaking to older versions, the minor version must be increased.  If a
+ * change adds a fully backwards compatible change (eg, LVB changes that
+ * are just ignored by older versions), the version does not need to be
+ * updated.
+ */
+static struct ocfs2_locking_protocol lproto = {
+	.lp_max_version = {
+		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+	},
+	.lp_lock_ast		= ocfs2_locking_ast,
+	.lp_blocking_ast	= ocfs2_blocking_ast,
+	.lp_unlock_ast		= ocfs2_unlock_ast,
+};
+
+void ocfs2_set_locking_protocol(void)
+{
+	ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
+}
+
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 						int convert)
 {
@@ -2991,7 +3072,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
 				       osb->uuid_str,
 				       strlen(osb->uuid_str),
-				       ocfs2_do_node_down, osb,
+				       &lproto, ocfs2_do_node_down, osb,
 				       &conn);
 	if (status) {
 		mlog_errno(status);
@@ -3058,50 +3139,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 	mlog_exit_void();
 }
 
-static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
-{
-	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
-	unsigned long flags;
-
-	mlog_entry_void();
-
-	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
-	     lockres->l_unlock_action);
-
-	spin_lock_irqsave(&lockres->l_lock, flags);
-	if (error) {
-		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
-		     "unlock_action %d\n", error, lockres->l_name,
-		     lockres->l_unlock_action);
-		spin_unlock_irqrestore(&lockres->l_lock, flags);
-		mlog_exit_void();
-		return;
-	}
-
-	switch(lockres->l_unlock_action) {
-	case OCFS2_UNLOCK_CANCEL_CONVERT:
-		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
-		lockres->l_action = OCFS2_AST_INVALID;
-		/* Downconvert thread may have requeued this lock, we
-		 * need to wake it. */
-		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
-			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
-		break;
-	case OCFS2_UNLOCK_DROP_LOCK:
-		lockres->l_level = DLM_LOCK_IV;
-		break;
-	default:
-		BUG();
-	}
-
-	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
-	wake_up(&lockres->l_event);
-	spin_unlock_irqrestore(&lockres->l_lock, flags);
-
-	mlog_exit_void();
-}
-
 static int ocfs2_drop_lock(struct ocfs2_super *osb,
 			   struct ocfs2_lock_res *lockres)
 {
@@ -3910,45 +3947,6 @@ void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
 		ocfs2_cluster_unlock(osb, lockres, level);
 }
 
-/*
- * This is the filesystem locking protocol.  It provides the lock handling
- * hooks for the underlying DLM.  It has a maximum version number.
- * The version number allows interoperability with systems running at
- * the same major number and an equal or smaller minor number.
- *
- * Whenever the filesystem does new things with locks (adds or removes a
- * lock, orders them differently, does different things underneath a lock),
- * the version must be changed.  The protocol is negotiated when joining
- * the dlm domain.  A node may join the domain if its major version is
- * identical to all other nodes and its minor version is greater than
- * or equal to all other nodes.  When its minor version is greater than
- * the other nodes, it will run at the minor version specified by the
- * other nodes.
- *
- * If a locking change is made that will not be compatible with older
- * versions, the major number must be increased and the minor version set
- * to zero.  If a change merely adds a behavior that can be disabled when
- * speaking to older versions, the minor version must be increased.  If a
- * change adds a fully backwards compatible change (eg, LVB changes that
- * are just ignored by older versions), the version does not need to be
- * updated.
- */
-static struct ocfs2_locking_protocol lproto = {
-	.lp_max_version = {
-		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
-		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-	},
-	.lp_lock_ast		= ocfs2_locking_ast,
-	.lp_blocking_ast	= ocfs2_blocking_ast,
-	.lp_unlock_ast		= ocfs2_unlock_ast,
-};
-
-void ocfs2_set_locking_protocol(void)
-{
-	ocfs2_stack_glue_set_locking_protocol(&lproto);
-}
-
-
 static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 				       struct ocfs2_lock_res *lockres)
 {
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index fc184c762700..31db2e87cfd4 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -36,7 +36,7 @@
 #define OCFS2_STACK_PLUGIN_USER		"user"
 #define OCFS2_MAX_HB_CTL_PATH		256
 
-static struct ocfs2_locking_protocol *lproto;
+static struct ocfs2_protocol_version locking_max_version;
 static DEFINE_SPINLOCK(ocfs2_stack_lock);
 static LIST_HEAD(ocfs2_stack_list);
 static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
@@ -176,7 +176,7 @@ int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
 	spin_lock(&ocfs2_stack_lock);
 	if (!ocfs2_stack_lookup(plugin->sp_name)) {
 		plugin->sp_count = 0;
-		plugin->sp_max_proto = lproto->lp_max_version;
+		plugin->sp_max_proto = locking_max_version;
 		list_add(&plugin->sp_list, &ocfs2_stack_list);
 		printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
 		       plugin->sp_name);
@@ -213,23 +213,23 @@ void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
 }
 EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
 
-void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto)
 {
 	struct ocfs2_stack_plugin *p;
 
-	BUG_ON(proto == NULL);
-
 	spin_lock(&ocfs2_stack_lock);
-	BUG_ON(active_stack != NULL);
+	if (memcmp(max_proto, &locking_max_version,
+		   sizeof(struct ocfs2_protocol_version))) {
+		BUG_ON(locking_max_version.pv_major != 0);
 
-	lproto = proto;
-	list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
-		p->sp_max_proto = lproto->lp_max_version;
+		locking_max_version = *max_proto;
+		list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
+			p->sp_max_proto = locking_max_version;
+		}
 	}
-
 	spin_unlock(&ocfs2_stack_lock);
 }
-EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
+EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version);
 
 
 /*
@@ -245,8 +245,6 @@ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
 		   void *name,
 		   unsigned int namelen)
 {
-	BUG_ON(lproto == NULL);
-
 	if (!lksb->lksb_conn)
 		lksb->lksb_conn = conn;
 	else
@@ -260,7 +258,6 @@ int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
 		     struct ocfs2_dlm_lksb *lksb,
 		     u32 flags)
 {
-	BUG_ON(lproto == NULL);
 	BUG_ON(lksb->lksb_conn == NULL);
 
 	return active_stack->sp_ops->dlm_unlock(conn, lksb, flags);
@@ -314,6 +311,7 @@ EXPORT_SYMBOL_GPL(ocfs2_plock);
 int ocfs2_cluster_connect(const char *stack_name,
 			  const char *group,
 			  int grouplen,
+			  struct ocfs2_locking_protocol *lproto,
 			  void (*recovery_handler)(int node_num,
 						   void *recovery_data),
 			  void *recovery_data,
@@ -331,6 +329,12 @@ int ocfs2_cluster_connect(const char *stack_name,
 		goto out;
 	}
 
+	if (memcmp(&lproto->lp_max_version, &locking_max_version,
+		   sizeof(struct ocfs2_protocol_version))) {
+		rc = -EINVAL;
+		goto out;
+	}
+
 	new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
 			   GFP_KERNEL);
 	if (!new_conn) {
@@ -456,10 +460,10 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
 	ssize_t ret = 0;
 
 	spin_lock(&ocfs2_stack_lock);
-	if (lproto)
+	if (locking_max_version.pv_major)
 		ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
-			       lproto->lp_max_version.pv_major,
-			       lproto->lp_max_version.pv_minor);
+			       locking_max_version.pv_major,
+			       locking_max_version.pv_minor);
 	spin_unlock(&ocfs2_stack_lock);
 
 	return ret;
@@ -688,7 +692,10 @@ static int __init ocfs2_stack_glue_init(void)
 
 static void __exit ocfs2_stack_glue_exit(void)
 {
-	lproto = NULL;
+	memset(&locking_max_version, 0,
+	       sizeof(struct ocfs2_protocol_version));
+	locking_max_version.pv_major = 0;
+	locking_max_version.pv_minor = 0;
 	ocfs2_sysfs_exit();
 	if (ocfs2_table_header)
 		unregister_sysctl_table(ocfs2_table_header);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 77a7a9aeba73..b1981ba4c91f 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -241,6 +241,7 @@ struct ocfs2_stack_plugin {
 int ocfs2_cluster_connect(const char *stack_name,
 			  const char *group,
 			  int grouplen,
+			  struct ocfs2_locking_protocol *lproto,
 			  void (*recovery_handler)(int node_num,
 						   void *recovery_data),
 			  void *recovery_data,
@@ -270,7 +271,7 @@ int ocfs2_stack_supports_plocks(void);
 int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
 		struct file *file, int cmd, struct file_lock *fl);
 
-void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
+void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto);
 
 
 /* Used by stack plugins */
-- 
cgit v1.2.3


From e8fce482f3702c1ad27c97b26db5022aa1fa64c7 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Feb 2010 17:52:13 -0800
Subject: ocfs2_dlmfs: Don't honor truncate.  The size of a dlmfs file is
 LVB_LEN

We want folks using dlmfs to be able to use the LVB in places other than
just write(2)/read(2).  By ignoring truncate requests, we allow 'echo
"contents" > /dlm/space/lockname' to work.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmfs/dlmfs.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index e21ce0e5fc42..13ac2bffb05d 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -220,6 +220,23 @@ static int dlmfs_file_release(struct inode *inode,
 	return 0;
 }
 
+/*
+ * We do ->setattr() just to override size changes.  Our size is the size
+ * of the LVB and nothing else.
+ */
+static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	int error;
+	struct inode *inode = dentry->d_inode;
+
+	attr->ia_valid &= ~ATTR_SIZE;
+	error = inode_change_ok(inode, attr);
+	if (!error)
+		error = inode_setattr(inode, attr);
+
+	return error;
+}
+
 static unsigned int dlmfs_file_poll(struct file *file, poll_table *wait)
 {
 	int event = 0;
@@ -634,6 +651,7 @@ static const struct super_operations dlmfs_ops = {
 
 static const struct inode_operations dlmfs_file_inode_operations = {
 	.getattr	= simple_getattr,
+	.setattr	= dlmfs_file_setattr,
 };
 
 static int dlmfs_get_sb(struct file_system_type *fs_type,
-- 
cgit v1.2.3


From 0016eedc4185a3cd7e578b027a6e69001b85d6c4 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Sat, 30 Jan 2010 04:33:50 -0800
Subject: ocfs2_dlmfs: Use the stackglue.

Rather than directly using o2dlm, dlmfs can now use the stackglue.  This
allows it to use userspace cluster stacks and fs/dlm.  This commit
forces o2cb for now.  A latter commit will bump the protocol version and
allow non-o2cb stacks.

This is one big sed, really.  LKM_xxMODE becomes DLM_LOCK_xx.  LKM_flag
becomes DLM_LKF_flag.

We also learn to check that the LVB is valid before reading it.  Any DLM
can lose the contents of the LVB during a complicated recovery.  userdlm
should be checking this.  Now it does.  dlmfs will return 0 from read(2)
if the LVB was invalid.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmfs/dlmfs.c   |  57 ++++------
 fs/ocfs2/dlmfs/userdlm.c | 266 ++++++++++++++++++++++++-----------------------
 fs/ocfs2/dlmfs/userdlm.h |  16 +--
 3 files changed, 166 insertions(+), 173 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 13ac2bffb05d..8697366b63ad 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -47,21 +47,13 @@
 
 #include <asm/uaccess.h>
 
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlm/dlmapi.h"
-
+#include "stackglue.h"
 #include "userdlm.h"
-
 #include "dlmfsver.h"
 
 #define MLOG_MASK_PREFIX ML_DLMFS
 #include "cluster/masklog.h"
 
-#include "ocfs2_lockingver.h"
 
 static const struct super_operations dlmfs_ops;
 static const struct file_operations dlmfs_file_operations;
@@ -72,15 +64,6 @@ static struct kmem_cache *dlmfs_inode_cache;
 
 struct workqueue_struct *user_dlm_worker;
 
-/*
- * This is the userdlmfs locking protocol version.
- *
- * See fs/ocfs2/dlmglue.c for more details on locking versions.
- */
-static const struct dlm_protocol_version user_locking_protocol = {
-	.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
-	.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
-};
 
 
 /*
@@ -259,7 +242,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
 			       loff_t *ppos)
 {
 	int bytes_left;
-	ssize_t readlen;
+	ssize_t readlen, got;
 	char *lvb_buf;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
@@ -285,9 +268,13 @@ static ssize_t dlmfs_file_read(struct file *filp,
 	if (!lvb_buf)
 		return -ENOMEM;
 
-	user_dlm_read_lvb(inode, lvb_buf, readlen);
-	bytes_left = __copy_to_user(buf, lvb_buf, readlen);
-	readlen -= bytes_left;
+	got = user_dlm_read_lvb(inode, lvb_buf, readlen);
+	if (got) {
+		BUG_ON(got != readlen);
+		bytes_left = __copy_to_user(buf, lvb_buf, readlen);
+		readlen -= bytes_left;
+	} else
+		readlen = 0;
 
 	kfree(lvb_buf);
 
@@ -346,7 +333,7 @@ static void dlmfs_init_once(void *foo)
 	struct dlmfs_inode_private *ip =
 		(struct dlmfs_inode_private *) foo;
 
-	ip->ip_dlm = NULL;
+	ip->ip_conn = NULL;
 	ip->ip_parent = NULL;
 
 	inode_init_once(&ip->ip_vfs_inode);
@@ -388,14 +375,14 @@ static void dlmfs_clear_inode(struct inode *inode)
 		goto clear_fields;
 	}
 
-	mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
+	mlog(0, "we're a directory, ip->ip_conn = 0x%p\n", ip->ip_conn);
 	/* we must be a directory. If required, lets unregister the
 	 * dlm context now. */
-	if (ip->ip_dlm)
-		user_dlm_unregister_context(ip->ip_dlm);
+	if (ip->ip_conn)
+		user_dlm_unregister(ip->ip_conn);
 clear_fields:
 	ip->ip_parent = NULL;
-	ip->ip_dlm = NULL;
+	ip->ip_conn = NULL;
 }
 
 static struct backing_dev_info dlmfs_backing_dev_info = {
@@ -445,7 +432,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	ip = DLMFS_I(inode);
-	ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
+	ip->ip_conn = DLMFS_I(parent)->ip_conn;
 
 	switch (mode & S_IFMT) {
 	default:
@@ -499,13 +486,12 @@ static int dlmfs_mkdir(struct inode * dir,
 	struct inode *inode = NULL;
 	struct qstr *domain = &dentry->d_name;
 	struct dlmfs_inode_private *ip;
-	struct dlm_ctxt *dlm;
-	struct dlm_protocol_version proto = user_locking_protocol;
+	struct ocfs2_cluster_connection *conn;
 
 	mlog(0, "mkdir %.*s\n", domain->len, domain->name);
 
 	/* verify that we have a proper domain */
-	if (domain->len >= O2NM_MAX_NAME_LEN) {
+	if (domain->len >= GROUP_NAME_MAX) {
 		status = -EINVAL;
 		mlog(ML_ERROR, "invalid domain name for directory.\n");
 		goto bail;
@@ -520,14 +506,14 @@ static int dlmfs_mkdir(struct inode * dir,
 
 	ip = DLMFS_I(inode);
 
-	dlm = user_dlm_register_context(domain, &proto);
-	if (IS_ERR(dlm)) {
-		status = PTR_ERR(dlm);
+	conn = user_dlm_register(domain);
+	if (IS_ERR(conn)) {
+		status = PTR_ERR(conn);
 		mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
 		     status, domain->len, domain->name);
 		goto bail;
 	}
-	ip->ip_dlm = dlm;
+	ip->ip_conn = conn;
 
 	inc_nlink(dir);
 	d_instantiate(dentry, inode);
@@ -696,6 +682,7 @@ static int __init init_dlmfs_fs(void)
 	}
 	cleanup_worker = 1;
 
+	user_dlm_set_locking_protocol();
 	status = register_filesystem(&dlmfs_fs_type);
 bail:
 	if (status) {
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 6adae70cee8e..c1b6a56a268f 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -34,18 +34,19 @@
 #include <linux/types.h>
 #include <linux/crc32.h>
 
-
-#include "cluster/nodemanager.h"
-#include "cluster/heartbeat.h"
-#include "cluster/tcp.h"
-
-#include "dlm/dlmapi.h"
-
+#include "ocfs2_lockingver.h"
+#include "stackglue.h"
 #include "userdlm.h"
 
 #define MLOG_MASK_PREFIX ML_DLMFS
 #include "cluster/masklog.h"
 
+
+static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
+{
+	return container_of(lksb, struct user_lock_res, l_lksb);
+}
+
 static inline int user_check_wait_flag(struct user_lock_res *lockres,
 				       int flag)
 {
@@ -73,15 +74,15 @@ static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
 }
 
 /* I heart container_of... */
-static inline struct dlm_ctxt *
-dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
+static inline struct ocfs2_cluster_connection *
+cluster_connection_from_user_lockres(struct user_lock_res *lockres)
 {
 	struct dlmfs_inode_private *ip;
 
 	ip = container_of(lockres,
 			  struct dlmfs_inode_private,
 			  ip_lockres);
-	return ip->ip_dlm;
+	return ip->ip_conn;
 }
 
 static struct inode *
@@ -103,9 +104,9 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
 }
 
 #define user_log_dlm_error(_func, _stat, _lockres) do {			\
-	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "		\
-		"resource %.*s: %s\n", dlm_errname(_stat), _func,	\
-		_lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \
+	mlog(ML_ERROR, "Dlm error %d while calling %s on "		\
+		"resource %.*s\n", _stat, _func,			\
+		_lockres->l_namelen, _lockres->l_name); 		\
 } while (0)
 
 /* WARNING: This function lives in a world where the only three lock
@@ -113,34 +114,34 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
  * lock types are added. */
 static inline int user_highest_compat_lock_level(int level)
 {
-	int new_level = LKM_EXMODE;
+	int new_level = DLM_LOCK_EX;
 
-	if (level == LKM_EXMODE)
-		new_level = LKM_NLMODE;
-	else if (level == LKM_PRMODE)
-		new_level = LKM_PRMODE;
+	if (level == DLM_LOCK_EX)
+		new_level = DLM_LOCK_NL;
+	else if (level == DLM_LOCK_PR)
+		new_level = DLM_LOCK_PR;
 	return new_level;
 }
 
-static void user_ast(void *opaque)
+static void user_ast(struct ocfs2_dlm_lksb *lksb)
 {
-	struct user_lock_res *lockres = opaque;
-	struct dlm_lockstatus *lksb;
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
+	int status;
 
 	mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
 	     lockres->l_name);
 
 	spin_lock(&lockres->l_lock);
 
-	lksb = &(lockres->l_lksb);
-	if (lksb->status != DLM_NORMAL) {
+	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
+	if (status) {
 		mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
-		     lksb->status, lockres->l_namelen, lockres->l_name);
+		     status, lockres->l_namelen, lockres->l_name);
 		spin_unlock(&lockres->l_lock);
 		return;
 	}
 
-	mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
+	mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV,
 			"Lockres %.*s, requested ivmode. flags 0x%x\n",
 			lockres->l_namelen, lockres->l_name, lockres->l_flags);
 
@@ -148,13 +149,13 @@ static void user_ast(void *opaque)
 	if (lockres->l_requested < lockres->l_level) {
 		if (lockres->l_requested <=
 		    user_highest_compat_lock_level(lockres->l_blocking)) {
-			lockres->l_blocking = LKM_NLMODE;
+			lockres->l_blocking = DLM_LOCK_NL;
 			lockres->l_flags &= ~USER_LOCK_BLOCKED;
 		}
 	}
 
 	lockres->l_level = lockres->l_requested;
-	lockres->l_requested = LKM_IVMODE;
+	lockres->l_requested = DLM_LOCK_IV;
 	lockres->l_flags |= USER_LOCK_ATTACHED;
 	lockres->l_flags &= ~USER_LOCK_BUSY;
 
@@ -193,11 +194,11 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
 		return;
 
 	switch (lockres->l_blocking) {
-	case LKM_EXMODE:
+	case DLM_LOCK_EX:
 		if (!lockres->l_ex_holders && !lockres->l_ro_holders)
 			queue = 1;
 		break;
-	case LKM_PRMODE:
+	case DLM_LOCK_PR:
 		if (!lockres->l_ex_holders)
 			queue = 1;
 		break;
@@ -209,9 +210,9 @@ static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
 		__user_dlm_queue_lockres(lockres);
 }
 
-static void user_bast(void *opaque, int level)
+static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
 {
-	struct user_lock_res *lockres = opaque;
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
 
 	mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
 	     lockres->l_namelen, lockres->l_name, level);
@@ -227,15 +228,15 @@ static void user_bast(void *opaque, int level)
 	wake_up(&lockres->l_event);
 }
 
-static void user_unlock_ast(void *opaque, enum dlm_status status)
+static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
 {
-	struct user_lock_res *lockres = opaque;
+	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
 
 	mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
 	     lockres->l_name);
 
-	if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
-		mlog(ML_ERROR, "Dlm returns status %d\n", status);
+	if (status)
+		mlog(ML_ERROR, "dlm returns status %d\n", status);
 
 	spin_lock(&lockres->l_lock);
 	/* The teardown flag gets set early during the unlock process,
@@ -243,7 +244,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
 	 * for a concurrent cancel. */
 	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
 	    && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
-		lockres->l_level = LKM_IVMODE;
+		lockres->l_level = DLM_LOCK_IV;
 	} else if (status == DLM_CANCELGRANT) {
 		/* We tried to cancel a convert request, but it was
 		 * already granted. Don't clear the busy flag - the
@@ -254,7 +255,7 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
 	} else {
 		BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
 		/* Cancel succeeded, we want to re-queue */
-		lockres->l_requested = LKM_IVMODE; /* cancel an
+		lockres->l_requested = DLM_LOCK_IV; /* cancel an
 						    * upconvert
 						    * request. */
 		lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
@@ -271,6 +272,21 @@ out_noclear:
 	wake_up(&lockres->l_event);
 }
 
+/*
+ * This is the userdlmfs locking protocol version.
+ *
+ * See fs/ocfs2/dlmglue.c for more details on locking versions.
+ */
+static struct ocfs2_locking_protocol user_dlm_lproto = {
+	.lp_max_version = {
+		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
+		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
+	},
+	.lp_lock_ast		= user_ast,
+	.lp_blocking_ast	= user_bast,
+	.lp_unlock_ast		= user_unlock_ast,
+};
+
 static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
 {
 	struct inode *inode;
@@ -283,7 +299,8 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	int new_level, status;
 	struct user_lock_res *lockres =
 		container_of(work, struct user_lock_res, l_work);
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
 
 	mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
 	     lockres->l_name);
@@ -322,20 +339,17 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 		lockres->l_flags |= USER_LOCK_IN_CANCEL;
 		spin_unlock(&lockres->l_lock);
 
-		status = dlmunlock(dlm,
-				   &lockres->l_lksb,
-				   LKM_CANCEL,
-				   user_unlock_ast,
-				   lockres);
-		if (status != DLM_NORMAL)
-			user_log_dlm_error("dlmunlock", status, lockres);
+		status = ocfs2_dlm_unlock(conn, &lockres->l_lksb,
+					  DLM_LKF_CANCEL);
+		if (status)
+			user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
 		goto drop_ref;
 	}
 
 	/* If there are still incompat holders, we can exit safely
 	 * without worrying about re-queueing this lock as that will
 	 * happen on the last call to user_cluster_unlock. */
-	if ((lockres->l_blocking == LKM_EXMODE)
+	if ((lockres->l_blocking == DLM_LOCK_EX)
 	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
 		spin_unlock(&lockres->l_lock);
 		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
@@ -343,7 +357,7 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 		goto drop_ref;
 	}
 
-	if ((lockres->l_blocking == LKM_PRMODE)
+	if ((lockres->l_blocking == DLM_LOCK_PR)
 	    && lockres->l_ex_holders) {
 		spin_unlock(&lockres->l_lock);
 		mlog(0, "can't downconvert for pr: ex = %u\n",
@@ -360,17 +374,12 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	spin_unlock(&lockres->l_lock);
 
 	/* need lock downconvert request now... */
-	status = dlmlock(dlm,
-			 new_level,
-			 &lockres->l_lksb,
-			 LKM_CONVERT|LKM_VALBLK,
-			 lockres->l_name,
-			 lockres->l_namelen,
-			 user_ast,
-			 lockres,
-			 user_bast);
-	if (status != DLM_NORMAL) {
-		user_log_dlm_error("dlmlock", status, lockres);
+	status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb,
+				DLM_LKF_CONVERT|DLM_LKF_VALBLK,
+				lockres->l_name,
+				lockres->l_namelen);
+	if (status) {
+		user_log_dlm_error("ocfs2_dlm_lock", status, lockres);
 		user_recover_from_dlm_error(lockres);
 	}
 
@@ -382,10 +391,10 @@ static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
 					int level)
 {
 	switch(level) {
-	case LKM_EXMODE:
+	case DLM_LOCK_EX:
 		lockres->l_ex_holders++;
 		break;
-	case LKM_PRMODE:
+	case DLM_LOCK_PR:
 		lockres->l_ro_holders++;
 		break;
 	default:
@@ -410,10 +419,11 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
 			  int lkm_flags)
 {
 	int status, local_flags;
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
 
-	if (level != LKM_EXMODE &&
-	    level != LKM_PRMODE) {
+	if (level != DLM_LOCK_EX &&
+	    level != DLM_LOCK_PR) {
 		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
 		     lockres->l_namelen, lockres->l_name);
 		status = -EINVAL;
@@ -422,7 +432,7 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
 
 	mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
 	     lockres->l_namelen, lockres->l_name,
-	     (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
+	     (level == DLM_LOCK_EX) ? "DLM_LOCK_EX" : "DLM_LOCK_PR",
 	     lkm_flags);
 
 again:
@@ -457,35 +467,26 @@ again:
 	}
 
 	if (level > lockres->l_level) {
-		local_flags = lkm_flags | LKM_VALBLK;
-		if (lockres->l_level != LKM_IVMODE)
-			local_flags |= LKM_CONVERT;
+		local_flags = lkm_flags | DLM_LKF_VALBLK;
+		if (lockres->l_level != DLM_LOCK_IV)
+			local_flags |= DLM_LKF_CONVERT;
 
 		lockres->l_requested = level;
 		lockres->l_flags |= USER_LOCK_BUSY;
 		spin_unlock(&lockres->l_lock);
 
-		BUG_ON(level == LKM_IVMODE);
-		BUG_ON(level == LKM_NLMODE);
+		BUG_ON(level == DLM_LOCK_IV);
+		BUG_ON(level == DLM_LOCK_NL);
 
 		/* call dlm_lock to upgrade lock now */
-		status = dlmlock(dlm,
-				 level,
-				 &lockres->l_lksb,
-				 local_flags,
-				 lockres->l_name,
-				 lockres->l_namelen,
-				 user_ast,
-				 lockres,
-				 user_bast);
-		if (status != DLM_NORMAL) {
-			if ((lkm_flags & LKM_NOQUEUE) &&
-			    (status == DLM_NOTQUEUED))
-				status = -EAGAIN;
-			else {
-				user_log_dlm_error("dlmlock", status, lockres);
-				status = -EINVAL;
-			}
+		status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb,
+					local_flags, lockres->l_name,
+					lockres->l_namelen);
+		if (status) {
+			if ((lkm_flags & DLM_LKF_NOQUEUE) &&
+			    (status != -EAGAIN))
+				user_log_dlm_error("ocfs2_dlm_lock",
+						   status, lockres);
 			user_recover_from_dlm_error(lockres);
 			goto bail;
 		}
@@ -506,11 +507,11 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
 					int level)
 {
 	switch(level) {
-	case LKM_EXMODE:
+	case DLM_LOCK_EX:
 		BUG_ON(!lockres->l_ex_holders);
 		lockres->l_ex_holders--;
 		break;
-	case LKM_PRMODE:
+	case DLM_LOCK_PR:
 		BUG_ON(!lockres->l_ro_holders);
 		lockres->l_ro_holders--;
 		break;
@@ -522,8 +523,8 @@ static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
 void user_dlm_cluster_unlock(struct user_lock_res *lockres,
 			     int level)
 {
-	if (level != LKM_EXMODE &&
-	    level != LKM_PRMODE) {
+	if (level != DLM_LOCK_EX &&
+	    level != DLM_LOCK_PR) {
 		mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
 		     lockres->l_namelen, lockres->l_name);
 		return;
@@ -540,33 +541,40 @@ void user_dlm_write_lvb(struct inode *inode,
 			unsigned int len)
 {
 	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
-	char *lvb = lockres->l_lksb.lvb;
+	char *lvb;
 
 	BUG_ON(len > DLM_LVB_LEN);
 
 	spin_lock(&lockres->l_lock);
 
-	BUG_ON(lockres->l_level < LKM_EXMODE);
+	BUG_ON(lockres->l_level < DLM_LOCK_EX);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 	memcpy(lvb, val, len);
 
 	spin_unlock(&lockres->l_lock);
 }
 
-void user_dlm_read_lvb(struct inode *inode,
-		       char *val,
-		       unsigned int len)
+ssize_t user_dlm_read_lvb(struct inode *inode,
+			  char *val,
+			  unsigned int len)
 {
 	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
-	char *lvb = lockres->l_lksb.lvb;
+	char *lvb;
+	ssize_t ret = len;
 
 	BUG_ON(len > DLM_LVB_LEN);
 
 	spin_lock(&lockres->l_lock);
 
-	BUG_ON(lockres->l_level < LKM_PRMODE);
-	memcpy(val, lvb, len);
+	BUG_ON(lockres->l_level < DLM_LOCK_PR);
+	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) {
+		lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+		memcpy(val, lvb, len);
+	} else
+		ret = 0;
 
 	spin_unlock(&lockres->l_lock);
+	return ret;
 }
 
 void user_dlm_lock_res_init(struct user_lock_res *lockres,
@@ -576,9 +584,9 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
 
 	spin_lock_init(&lockres->l_lock);
 	init_waitqueue_head(&lockres->l_event);
-	lockres->l_level = LKM_IVMODE;
-	lockres->l_requested = LKM_IVMODE;
-	lockres->l_blocking = LKM_IVMODE;
+	lockres->l_level = DLM_LOCK_IV;
+	lockres->l_requested = DLM_LOCK_IV;
+	lockres->l_blocking = DLM_LOCK_IV;
 
 	/* should have been checked before getting here. */
 	BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
@@ -592,7 +600,8 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres,
 int user_dlm_destroy_lock(struct user_lock_res *lockres)
 {
 	int status = -EBUSY;
-	struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
+	struct ocfs2_cluster_connection *conn =
+		cluster_connection_from_user_lockres(lockres);
 
 	mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
 
@@ -627,14 +636,9 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
 	lockres->l_flags |= USER_LOCK_BUSY;
 	spin_unlock(&lockres->l_lock);
 
-	status = dlmunlock(dlm,
-			   &lockres->l_lksb,
-			   LKM_VALBLK,
-			   user_unlock_ast,
-			   lockres);
-	if (status != DLM_NORMAL) {
-		user_log_dlm_error("dlmunlock", status, lockres);
-		status = -EINVAL;
+	status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
+	if (status) {
+		user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
 		goto bail;
 	}
 
@@ -645,32 +649,34 @@ bail:
 	return status;
 }
 
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
-					   struct dlm_protocol_version *proto)
+static void user_dlm_recovery_handler_noop(int node_num,
+					   void *recovery_data)
 {
-	struct dlm_ctxt *dlm;
-	u32 dlm_key;
-	char *domain;
-
-	domain = kmalloc(name->len + 1, GFP_NOFS);
-	if (!domain) {
-		mlog_errno(-ENOMEM);
-		return ERR_PTR(-ENOMEM);
-	}
+	/* We ignore recovery events */
+	return;
+}
 
-	dlm_key = crc32_le(0, name->name, name->len);
+void user_dlm_set_locking_protocol(void)
+{
+	ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version);
+}
 
-	snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
+{
+	int rc;
+	struct ocfs2_cluster_connection *conn;
 
-	dlm = dlm_register_domain(domain, dlm_key, proto);
-	if (IS_ERR(dlm))
-		mlog_errno(PTR_ERR(dlm));
+	rc = ocfs2_cluster_connect("o2cb", name->name, name->len,
+				   &user_dlm_lproto,
+				   user_dlm_recovery_handler_noop,
+				   NULL, &conn);
+	if (rc)
+		mlog_errno(rc);
 
-	kfree(domain);
-	return dlm;
+	return rc ? ERR_PTR(rc) : conn;
 }
 
-void user_dlm_unregister_context(struct dlm_ctxt *dlm)
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn)
 {
-	dlm_unregister_domain(dlm);
+	ocfs2_cluster_disconnect(conn, 0);
 }
diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h
index 0c3cc03c61fa..3b42d79531d7 100644
--- a/fs/ocfs2/dlmfs/userdlm.h
+++ b/fs/ocfs2/dlmfs/userdlm.h
@@ -57,7 +57,7 @@ struct user_lock_res {
 	int                      l_level;
 	unsigned int             l_ro_holders;
 	unsigned int             l_ex_holders;
-	struct dlm_lockstatus    l_lksb;
+	struct ocfs2_dlm_lksb    l_lksb;
 
 	int                      l_requested;
 	int                      l_blocking;
@@ -80,15 +80,15 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres,
 void user_dlm_write_lvb(struct inode *inode,
 			const char *val,
 			unsigned int len);
-void user_dlm_read_lvb(struct inode *inode,
-		       char *val,
-		       unsigned int len);
-struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
-					   struct dlm_protocol_version *proto);
-void user_dlm_unregister_context(struct dlm_ctxt *dlm);
+ssize_t user_dlm_read_lvb(struct inode *inode,
+			  char *val,
+			  unsigned int len);
+struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name);
+void user_dlm_unregister(struct ocfs2_cluster_connection *conn);
+void user_dlm_set_locking_protocol(void);
 
 struct dlmfs_inode_private {
-	struct dlm_ctxt             *ip_dlm;
+	struct ocfs2_cluster_connection	*ip_conn;
 
 	struct user_lock_res ip_lockres; /* unused for directories. */
 	struct inode         *ip_parent;
-- 
cgit v1.2.3


From cbe0e331fdbdb256943499358c75bc098a2134c1 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Sat, 30 Jan 2010 06:02:10 -0800
Subject: ocfs2_dlmfs: Enable the use of user cluster stacks.

Unlike ocfs2, dlmfs has no permanent storage.  It can't store off a
cluster stack it is supposed to be using.  So it can't specify the stack
name in ocfs2_cluster_connect().

Instead, we create ocfs2_cluster_connect_agnostic(), which simply uses
the stack that is currently enabled.  This is find for dlmfs, which will
rely on the stack initialization.

We add the "stackglue" capability to dlmfs's capability list.  This lets
userspace know dlmfs can be used with all cluster stacks.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmfs/dlmfs.c      |  2 +-
 fs/ocfs2/dlmfs/userdlm.c    |  8 ++++----
 fs/ocfs2/ocfs2_lockingver.h |  2 ++
 fs/ocfs2/stackglue.c        | 18 ++++++++++++++++++
 fs/ocfs2/stackglue.h        | 11 +++++++++++
 5 files changed, 36 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 8697366b63ad..1b0de157a08c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -87,7 +87,7 @@ struct workqueue_struct *user_dlm_worker;
  * - bast	: POLLIN against the file descriptor of a held lock
  *		  signifies a bast fired on the lock.
  */
-#define DLMFS_CAPABILITIES "bast"
+#define DLMFS_CAPABILITIES "bast stackglue"
 extern int param_set_dlmfs_capabilities(const char *val,
 					struct kernel_param *kp)
 {
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index c1b6a56a268f..2858ee6003c7 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -666,10 +666,10 @@ struct ocfs2_cluster_connection *user_dlm_register(struct qstr *name)
 	int rc;
 	struct ocfs2_cluster_connection *conn;
 
-	rc = ocfs2_cluster_connect("o2cb", name->name, name->len,
-				   &user_dlm_lproto,
-				   user_dlm_recovery_handler_noop,
-				   NULL, &conn);
+	rc = ocfs2_cluster_connect_agnostic(name->name, name->len,
+					    &user_dlm_lproto,
+					    user_dlm_recovery_handler_noop,
+					    NULL, &conn);
 	if (rc)
 		mlog_errno(rc);
 
diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h
index 82d5eeac0fff..2e45c8d2ea7e 100644
--- a/fs/ocfs2/ocfs2_lockingver.h
+++ b/fs/ocfs2/ocfs2_lockingver.h
@@ -23,6 +23,8 @@
 /*
  * The protocol version for ocfs2 cluster locking.  See dlmglue.c for
  * more details.
+ *
+ * 1.0 - Initial locking version from ocfs2 1.4.
  */
 #define OCFS2_LOCKING_PROTOCOL_MAJOR 1
 #define OCFS2_LOCKING_PROTOCOL_MINOR 0
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 31db2e87cfd4..39abf89697ed 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -373,6 +373,24 @@ out:
 }
 EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
 
+/* The caller will ensure all nodes have the same cluster stack */
+int ocfs2_cluster_connect_agnostic(const char *group,
+				   int grouplen,
+				   struct ocfs2_locking_protocol *lproto,
+				   void (*recovery_handler)(int node_num,
+							    void *recovery_data),
+				   void *recovery_data,
+				   struct ocfs2_cluster_connection **conn)
+{
+	char *stack_name = NULL;
+
+	if (cluster_stack_name[0])
+		stack_name = cluster_stack_name;
+	return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
+				     recovery_handler, recovery_data, conn);
+}
+EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
+
 /* If hangup_pending is 0, the stack driver will be dropped */
 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
 			     int hangup_pending)
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index b1981ba4c91f..8ce7398ae1d2 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -246,6 +246,17 @@ int ocfs2_cluster_connect(const char *stack_name,
 						   void *recovery_data),
 			  void *recovery_data,
 			  struct ocfs2_cluster_connection **conn);
+/*
+ * Used by callers that don't store their stack name.  They must ensure
+ * all nodes have the same stack.
+ */
+int ocfs2_cluster_connect_agnostic(const char *group,
+				   int grouplen,
+				   struct ocfs2_locking_protocol *lproto,
+				   void (*recovery_handler)(int node_num,
+							    void *recovery_data),
+				   void *recovery_data,
+				   struct ocfs2_cluster_connection **conn);
 int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
 			     int hangup_pending);
 void ocfs2_cluster_hangup(const char *group, int grouplen);
-- 
cgit v1.2.3


From 66b116c9d8f70baadf5b2145dddb35af222df041 Mon Sep 17 00:00:00 2001
From: Coly Li <coly.li@suse.de>
Date: Thu, 25 Feb 2010 14:57:13 +0800
Subject: ocfs2: fix warning in ocfs2_file_aio_write()

This patch fixes a compiling warning in ocfs2_file_aio_write().

Signed-off-by: Coly Li <coly.li@suse.de>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index da097bd07b72..c8a4a2939e55 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2041,7 +2041,7 @@ out_dio:
 	 * async dio is going to do it in the future or an end_io after an
 	 * error has already done it.
 	 */
-	if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
+	if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
 		rw_level = -1;
 		have_alloc_sem = 0;
 	}
-- 
cgit v1.2.3


From cbaee472f274ea9a98aabe47025f6e5551acadcb Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 26 Feb 2010 10:54:52 +0800
Subject: ocfs2: Only bug out in direct io write for reflinked extent.

In ocfs2_direct_IO_get_blocks, we only need to bug out
in case of we are going to write a recounted extent rec.

What a silly bug introduced by me!

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
---
 fs/ocfs2/aops.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7e9df11260f4..4c2a6d282c4d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -577,8 +577,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	/* We should already CoW the refcounted extent. */
-	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
+	/* We should already CoW the refcounted extent in case of create. */
+	BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
+
 	/*
 	 * get_more_blocks() expects us to describe a hole by clearing
 	 * the mapped bit on bh_result().
-- 
cgit v1.2.3


From bc9838c4d44a1713ab1bf24aa6675bc3a02b6a88 Mon Sep 17 00:00:00 2001
From: Srinivas Eeda <srinivas.eeda@oracle.com>
Date: Fri, 26 Feb 2010 12:53:51 -0800
Subject: dlm: allow dlm do recovery during shutdown

If a node down event happens while dlm shutdown in progress, dlm recovery
should be done before dlm is shutdown.  We can't migrate unrecovered locks,
obviously.  But dlm_reco_thread only does recovery if the dlm_state is
in DLM_CTXT_JOINED.

dlm_reco_thread should do recovery if dlm_state is in DLM_CTXT_JOINED or
DLM_CTXT_IN_SHUTDOWN.

Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 344bcf90cbf4..b4f99de2caf3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -310,7 +310,7 @@ static int dlm_recovery_thread(void *data)
 	mlog(0, "dlm thread running for %s...\n", dlm->name);
 
 	while (!kthread_should_stop()) {
-		if (dlm_joined(dlm)) {
+		if (dlm_domain_fully_joined(dlm)) {
 			status = dlm_do_recovery(dlm);
 			if (status == -EAGAIN) {
 				/* do not sleep, recheck immediately. */
-- 
cgit v1.2.3


From 4912002fffa377e66c5caefc2c311732a4ad5fb8 Mon Sep 17 00:00:00 2001
From: Christian Kujau <lists@nerdbynature.de>
Date: Fri, 26 Feb 2010 17:25:14 +0000
Subject: Remove EXPERIMENTAL from NFS_FSCACHE

There's currently an open Ubuntu bug[0], with the intent to compile NFS_FSCACHE
(and possibly AFS_FSCACHE, 9P_FSCACHE) into the standard Ubuntu kernel.
However, since *_FSCACHE still depends on EXPERIMENTAL, this won't happen.

As Arjan van de Ven pointed out[1], the EXPERIMENTAL flag doesn't mean that
much any more, I propose the following patch to fs/nfs/Kconfig.  I'd do the
same for fs/9p/Kconfig and fs/afs/Kconfig, but as I did not test 9p or AFS, I
feel it would not be appropriate for me to remove the flag.

[0] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/440522/comments/5
[1] http://lkml.org/lkml/2010/1/23/145

Signed-off-by: Christian Kujau <lists@nerdbynature.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 59e5673b4597..a43d07e7b924 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -95,8 +95,7 @@ config ROOT_NFS
 	  Most people say N here.
 
 config NFS_FSCACHE
-	bool "Provide NFS client caching support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Provide NFS client caching support"
 	depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
 	help
 	  Say Y here if you want NFS data to be cached locally on disc through
-- 
cgit v1.2.3


From 9b915181af0a99fe94ef0152e6a4ca9990c3b6d0 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 26 Feb 2010 19:42:44 -0800
Subject: ocfs2: Use a separate masklog for AST and BASTs

This patch adds a new masklog and uses it allow tracing ASTs and BASTs
in the dlmglue layer. This has been found to be very useful in debugging
cluster locking issues.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/cluster/masklog.c |  1 +
 fs/ocfs2/cluster/masklog.h |  1 +
 fs/ocfs2/dlmglue.c         | 90 +++++++++++++++++++++++++++++++++-------------
 3 files changed, 67 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 1cd2934de615..b39da877b12f 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -112,6 +112,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(XATTR),
 	define_mask(QUOTA),
 	define_mask(REFCOUNT),
+	define_mask(BASTS),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 0442366b3060..3dfddbec32f2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -114,6 +114,7 @@
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
 #define ML_REFCOUNT	0x0000000080000000ULL /* refcount tree operations */
+#define ML_BASTS	0x0000001000000000ULL /* dlmglue asts and basts */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index d009d7744d63..8298608d4165 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -932,6 +932,10 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
 		lockres->l_blocking = level;
 	}
 
+	mlog(ML_BASTS, "lockres %s, block %d, level %d, l_block %d, dwn %d\n",
+	     lockres->l_name, level, lockres->l_level, lockres->l_blocking,
+	     needs_downconvert);
+
 	if (needs_downconvert)
 		lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
 
@@ -1054,8 +1058,8 @@ static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
 
 	BUG_ON(level <= DLM_LOCK_NL);
 
-	mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
-	     lockres->l_name, level, lockres->l_level,
+	mlog(ML_BASTS, "BAST fired for lockres %s, blocking %d, level %d, "
+	     "type %s\n", lockres->l_name, level, lockres->l_level,
 	     ocfs2_lock_type_string(lockres->l_type));
 
 	/*
@@ -1099,6 +1103,10 @@ static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
 		return;
 	}
 
+	mlog(ML_BASTS, "AST fired for lockres %s, action %d, unlock %d, "
+	     "level %d => %d\n", lockres->l_name, lockres->l_action,
+	     lockres->l_unlock_action, lockres->l_level, lockres->l_requested);
+
 	switch(lockres->l_action) {
 	case OCFS2_AST_ATTACH:
 		ocfs2_generic_handle_attach_action(lockres);
@@ -1111,8 +1119,8 @@ static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
 		ocfs2_generic_handle_downconvert_action(lockres);
 		break;
 	default:
-		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
-		     "lockres flags = 0x%lx, unlock action: %u\n",
+		mlog(ML_ERROR, "lockres %s: AST fired with invalid action: %u, "
+		     "flags 0x%lx, unlock: %u\n",
 		     lockres->l_name, lockres->l_action, lockres->l_flags,
 		     lockres->l_unlock_action);
 		BUG();
@@ -1145,8 +1153,8 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
 
 	mlog_entry_void();
 
-	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
-	     lockres->l_unlock_action);
+	mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
+	     lockres->l_name, lockres->l_unlock_action);
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	if (error) {
@@ -1497,7 +1505,7 @@ again:
 		BUG_ON(level == DLM_LOCK_IV);
 		BUG_ON(level == DLM_LOCK_NL);
 
-		mlog(0, "lock %s, convert from %d to level = %d\n",
+		mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
 		     lockres->l_name, lockres->l_level, level);
 
 		/* call dlm_lock to upgrade lock now */
@@ -3314,13 +3322,20 @@ static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
 	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
 
 	if (lockres->l_level <= new_level) {
-		mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
-		     lockres->l_level, new_level);
+		mlog(ML_ERROR, "lockres %s, lvl %d <= %d, blcklst %d, mask %d, "
+		     "type %d, flags 0x%lx, hold %d %d, act %d %d, req %d, "
+		     "block %d, pgen %d\n", lockres->l_name, lockres->l_level,
+		     new_level, list_empty(&lockres->l_blocked_list),
+		     list_empty(&lockres->l_mask_waiters), lockres->l_type,
+		     lockres->l_flags, lockres->l_ro_holders,
+		     lockres->l_ex_holders, lockres->l_action,
+		     lockres->l_unlock_action, lockres->l_requested,
+		     lockres->l_blocking, lockres->l_pending_gen);
 		BUG();
 	}
 
-	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
-	     lockres->l_name, new_level, lockres->l_blocking);
+	mlog(ML_BASTS, "lockres %s, level %d => %d, blocking %d\n",
+	     lockres->l_name, lockres->l_level, new_level, lockres->l_blocking);
 
 	lockres->l_action = OCFS2_AST_DOWNCONVERT;
 	lockres->l_requested = new_level;
@@ -3339,6 +3354,9 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
+	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
+	     lockres->l_level, new_level);
+
 	if (lvb)
 		dlm_flags |= DLM_LKF_VALBLK;
 
@@ -3368,14 +3386,12 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
 	assert_spin_locked(&lockres->l_lock);
 
 	mlog_entry_void();
-	mlog(0, "lock %s\n", lockres->l_name);
 
 	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
 		/* If we're already trying to cancel a lock conversion
 		 * then just drop the spinlock and allow the caller to
 		 * requeue this lock. */
-
-		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
+		mlog(ML_BASTS, "lockres %s, skip convert\n", lockres->l_name);
 		return 0;
 	}
 
@@ -3390,6 +3406,8 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
 			"lock %s, invalid flags: 0x%lx\n",
 			lockres->l_name, lockres->l_flags);
 
+	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
+
 	return 1;
 }
 
@@ -3399,7 +3417,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 	int ret;
 
 	mlog_entry_void();
-	mlog(0, "lock %s\n", lockres->l_name);
 
 	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
 			       DLM_LKF_CANCEL);
@@ -3408,7 +3425,7 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 		ocfs2_recover_from_dlm_error(lockres, 0);
 	}
 
-	mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
+	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
 
 	mlog_exit(ret);
 	return ret;
@@ -3465,8 +3482,11 @@ recheck:
 		 * at the same time they set OCFS2_DLM_BUSY.  They must
 		 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
 		 */
-		if (lockres->l_flags & OCFS2_LOCK_PENDING)
+		if (lockres->l_flags & OCFS2_LOCK_PENDING) {
+			mlog(ML_BASTS, "lockres %s, ReQ: Pending\n",
+			     lockres->l_name);
 			goto leave_requeue;
+		}
 
 		ctl->requeue = 1;
 		ret = ocfs2_prepare_cancel_convert(osb, lockres);
@@ -3498,6 +3518,7 @@ recheck:
 	 */
 	if (lockres->l_level == DLM_LOCK_NL) {
 		BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
+		mlog(ML_BASTS, "lockres %s, Aborting dc\n", lockres->l_name);
 		lockres->l_blocking = DLM_LOCK_NL;
 		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3507,28 +3528,41 @@ recheck:
 	/* if we're blocking an exclusive and we have *any* holders,
 	 * then requeue. */
 	if ((lockres->l_blocking == DLM_LOCK_EX)
-	    && (lockres->l_ex_holders || lockres->l_ro_holders))
+	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: EX/PR Holders %u,%u\n",
+		     lockres->l_name, lockres->l_ex_holders,
+		     lockres->l_ro_holders);
 		goto leave_requeue;
+	}
 
 	/* If it's a PR we're blocking, then only
 	 * requeue if we've got any EX holders */
 	if (lockres->l_blocking == DLM_LOCK_PR &&
-	    lockres->l_ex_holders)
+	    lockres->l_ex_holders) {
+		mlog(ML_BASTS, "lockres %s, ReQ: EX Holders %u\n",
+		     lockres->l_name, lockres->l_ex_holders);
 		goto leave_requeue;
+	}
 
 	/*
 	 * Can we get a lock in this state if the holder counts are
 	 * zero? The meta data unblock code used to check this.
 	 */
 	if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
-	    && (lockres->l_flags & OCFS2_LOCK_REFRESHING))
+	    && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: Lock Refreshing\n",
+		     lockres->l_name);
 		goto leave_requeue;
+	}
 
 	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
 
 	if (lockres->l_ops->check_downconvert
-	    && !lockres->l_ops->check_downconvert(lockres, new_level))
+	    && !lockres->l_ops->check_downconvert(lockres, new_level)) {
+		mlog(ML_BASTS, "lockres %s, ReQ: Checkpointing\n",
+		     lockres->l_name);
 		goto leave_requeue;
+	}
 
 	/* If we get here, then we know that there are no more
 	 * incompatible holders (and anyone asking for an incompatible
@@ -3546,13 +3580,19 @@ recheck:
 
 	ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);
 
-	if (ctl->unblock_action == UNBLOCK_STOP_POST)
+	if (ctl->unblock_action == UNBLOCK_STOP_POST) {
+		mlog(ML_BASTS, "lockres %s, UNBLOCK_STOP_POST\n",
+		     lockres->l_name);
 		goto leave;
+	}
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
 		/* If this changed underneath us, then we can't drop
 		 * it just yet. */
+		mlog(ML_BASTS, "lockres %s, block=%d:%d, level=%d:%d, "
+		     "Recheck\n", lockres->l_name, blocking,
+		     lockres->l_blocking, level, lockres->l_level);
 		goto recheck;
 	}
 
@@ -3963,7 +4003,7 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 	BUG_ON(!lockres);
 	BUG_ON(!lockres->l_ops);
 
-	mlog(0, "lockres %s blocked.\n", lockres->l_name);
+	mlog(ML_BASTS, "lockres %s blocked\n", lockres->l_name);
 
 	/* Detect whether a lock has been marked as going away while
 	 * the downconvert thread was processing other things. A lock can
@@ -3986,7 +4026,7 @@ unqueue:
 	} else
 		ocfs2_schedule_blocked_lock(osb, lockres);
 
-	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
+	mlog(ML_BASTS, "lockres %s, requeue = %s.\n", lockres->l_name,
 	     ctl.requeue ? "yes" : "no");
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
@@ -4008,7 +4048,7 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 		/* Do not schedule a lock for downconvert when it's on
 		 * the way to destruction - any nodes wanting access
 		 * to the resource will get it soon. */
-		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
+		mlog(ML_BASTS, "lockres %s won't be scheduled: flags 0x%lx\n",
 		     lockres->l_name, lockres->l_flags);
 		return;
 	}
-- 
cgit v1.2.3


From 6fcef3f04a1a0f8d7a086147d2f2e650c8cc2754 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Fri, 26 Feb 2010 19:42:45 -0800
Subject: ocfs2/userdlm: Add tracing in userdlm

Make use of the newly added BASTS masklog to trace ASTs and BASTs in userdlm.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmfs/userdlm.c | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 2858ee6003c7..0499e3fb7bdb 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -128,8 +128,9 @@ static void user_ast(struct ocfs2_dlm_lksb *lksb)
 	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
 	int status;
 
-	mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
+	mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_level,
+	     lockres->l_requested);
 
 	spin_lock(&lockres->l_lock);
 
@@ -214,8 +215,8 @@ static void user_bast(struct ocfs2_dlm_lksb *lksb, int level)
 {
 	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
 
-	mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
-	     lockres->l_namelen, lockres->l_name, level);
+	mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n",
+	     lockres->l_namelen, lockres->l_name, level, lockres->l_level);
 
 	spin_lock(&lockres->l_lock);
 	lockres->l_flags |= USER_LOCK_BLOCKED;
@@ -232,8 +233,8 @@ static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status)
 {
 	struct user_lock_res *lockres = user_lksb_to_lock_res(lksb);
 
-	mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
+	mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_flags);
 
 	if (status)
 		mlog(ML_ERROR, "dlm returns status %d\n", status);
@@ -302,8 +303,7 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	struct ocfs2_cluster_connection *conn =
 		cluster_connection_from_user_lockres(lockres);
 
-	mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
-	     lockres->l_name);
+	mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
 
 	spin_lock(&lockres->l_lock);
 
@@ -321,17 +321,23 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	 * flag, and finally we might get another bast which re-queues
 	 * us before our ast for the downconvert is called. */
 	if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
+		mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n",
+		     lockres->l_namelen, lockres->l_name);
 		spin_unlock(&lockres->l_lock);
 		goto drop_ref;
 	}
 
 	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+		mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n",
+		     lockres->l_namelen, lockres->l_name);
 		spin_unlock(&lockres->l_lock);
 		goto drop_ref;
 	}
 
 	if (lockres->l_flags & USER_LOCK_BUSY) {
 		if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
+			mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n",
+			     lockres->l_namelen, lockres->l_name);
 			spin_unlock(&lockres->l_lock);
 			goto drop_ref;
 		}
@@ -352,16 +358,18 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	if ((lockres->l_blocking == DLM_LOCK_EX)
 	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
 		spin_unlock(&lockres->l_lock);
-		mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
-			lockres->l_ro_holders, lockres->l_ex_holders);
+		mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n",
+		     lockres->l_namelen, lockres->l_name,
+		     lockres->l_ex_holders, lockres->l_ro_holders);
 		goto drop_ref;
 	}
 
 	if ((lockres->l_blocking == DLM_LOCK_PR)
 	    && lockres->l_ex_holders) {
 		spin_unlock(&lockres->l_lock);
-		mlog(0, "can't downconvert for pr: ex = %u\n",
-			lockres->l_ex_holders);
+		mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n",
+		     lockres->l_namelen, lockres->l_name,
+		     lockres->l_ex_holders);
 		goto drop_ref;
 	}
 
@@ -369,8 +377,8 @@ static void user_dlm_unblock_lock(struct work_struct *work)
 	new_level = user_highest_compat_lock_level(lockres->l_blocking);
 	lockres->l_requested = new_level;
 	lockres->l_flags |= USER_LOCK_BUSY;
-	mlog(0, "Downconvert lock from %d to %d\n",
-		lockres->l_level, new_level);
+	mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n",
+	     lockres->l_namelen, lockres->l_name, lockres->l_level, new_level);
 	spin_unlock(&lockres->l_lock);
 
 	/* need lock downconvert request now... */
@@ -430,10 +438,8 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres,
 		goto bail;
 	}
 
-	mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
-	     lockres->l_namelen, lockres->l_name,
-	     (level == DLM_LOCK_EX) ? "DLM_LOCK_EX" : "DLM_LOCK_PR",
-	     lkm_flags);
+	mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n",
+	     lockres->l_namelen, lockres->l_name, level, lkm_flags);
 
 again:
 	if (signal_pending(current)) {
@@ -603,7 +609,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
 	struct ocfs2_cluster_connection *conn =
 		cluster_connection_from_user_lockres(lockres);
 
-	mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
+	mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name);
 
 	spin_lock(&lockres->l_lock);
 	if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
-- 
cgit v1.2.3


From 5051f76883897ea3d3d034c92e7b84236da2ec57 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Fri, 26 Feb 2010 18:18:25 +0800
Subject: ocfs2: send SIGXFSZ if new filesize exceeds limit -v2

This patch makes ocfs2 send SIGXFSZ if new file size exceeds the rlimit.
Processes may get SIGXFSZ on one node (in the cluster) while others will
not on another if file size limits are different on the two nodes.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c8a4a2939e55..5b52547d6299 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -993,10 +993,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (size_change && attr->ia_size != i_size_read(inode)) {
-		if (attr->ia_size > sb->s_maxbytes) {
-			status = -EFBIG;
+		status = inode_newsize_ok(inode, attr->ia_size);
+		if (status)
 			goto bail_unlock;
-		}
 
 		if (i_size_read(inode) > attr->ia_size) {
 			if (ocfs2_should_order_data(inode)) {
-- 
cgit v1.2.3


From 34ce4e7c23e3da578e459b05c6fb17edecb19e6b Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 15 Dec 2009 19:34:17 +0200
Subject: exofs: debug print even less

* Last debug trimming left in some stupid print, remove them.
  Fixup some other prints
* Shift printing from inode.c to ios.c
* Add couple of prints when memory allocation fails.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/inode.c | 23 +++++++++++++----------
 fs/exofs/ios.c   | 38 ++++++++++++++++++++++++++++++++------
 2 files changed, 45 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2afbcebeda71..c88a0c5250cb 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -193,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 	else
 		good_bytes = pcol->length - resid;
 
-	EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
+	EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
@@ -222,7 +222,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 	}
 
 	pcol_free(pcol);
-	EXOFS_DBGMSG("readpages_done END\n");
+	EXOFS_DBGMSG2("readpages_done END\n");
 	return ret;
 }
 
@@ -290,7 +290,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
 
-	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+	EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
 		  ios->obj.id, _LLU(ios->offset), pcol->length);
 
 	/* pages ownership was passed to pcol_copy */
@@ -462,7 +462,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 	else
 		good_bytes = pcol->length - resid;
 
-	EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+	EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx"
 		     " length=0x%lx nr_pages=%u\n",
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
@@ -490,7 +490,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 
 	pcol_free(pcol);
 	kfree(pcol);
-	EXOFS_DBGMSG("writepages_done END\n");
+	EXOFS_DBGMSG2("writepages_done END\n");
 }
 
 static int write_exec(struct page_collect *pcol)
@@ -527,7 +527,7 @@ static int write_exec(struct page_collect *pcol)
 	}
 
 	atomic_inc(&pcol->sbi->s_curr_pending);
-	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+	EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
 		  pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
 		  pcol->length);
 	/* pages ownership was passed to pcol_copy */
@@ -616,7 +616,7 @@ try_again:
 
 	ret = pcol_add_page(pcol, page, len);
 	if (unlikely(ret)) {
-		EXOFS_DBGMSG("Failed pcol_add_page "
+		EXOFS_DBGMSG2("Failed pcol_add_page "
 			     "nr_pages=%u total_length=0x%lx\n",
 			     pcol->nr_pages, pcol->length);
 
@@ -663,7 +663,7 @@ static int exofs_writepages(struct address_space *mapping,
 	if (expected_pages < 32L)
 		expected_pages = 32L;
 
-	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
+	EXOFS_DBGMSG2("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
 		     "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
 		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
 		     mapping->nrpages, start, end, expected_pages);
@@ -1170,8 +1170,10 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	int ret;
 
 	args = kzalloc(sizeof(*args), GFP_KERNEL);
-	if (!args)
+	if (!args) {
+		EXOFS_DBGMSG("Faild kzalloc of args\n");
 		return -ENOMEM;
+	}
 
 	fcb = &args->fcb;
 
@@ -1234,7 +1236,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 free_args:
 	kfree(args);
 out:
-	EXOFS_DBGMSG("ret=>%d\n", ret);
+	EXOFS_DBGMSG("(0x%lx) do_sync=%d ret=>%d\n",
+		     inode->i_ino, do_sync, ret);
 	return ret;
 }
 
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5bad01fa1f9f..3cc0dd3f0eb2 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -26,6 +26,9 @@
 
 #include "exofs.h"
 
+#define EXOFS_DBGMSG2(M...) do {} while (0)
+/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
+
 void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
 {
 	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
@@ -73,6 +76,8 @@ int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
 	 */
 	ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
+		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
+			     exofs_io_state_size(sbi->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
 	}
@@ -276,6 +281,9 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 				bio = bio_kmalloc(GFP_KERNEL,
 						  ios->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
+					EXOFS_DBGMSG(
+					      "Faild to allocate BIO size=%u\n",
+					      ios->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
 				}
@@ -290,14 +298,21 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 
 			osd_req_write(or, &ios->obj, ios->offset, bio,
 				      ios->length);
-/*			EXOFS_DBGMSG("write sync=%d\n", sync);*/
+			EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(ios->obj.id), _LLU(ios->offset),
+				     _LLU(ios->length), i);
 		} else if (ios->kern_buff) {
 			osd_req_write_kern(or, &ios->obj, ios->offset,
 					   ios->kern_buff, ios->length);
-/*			EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(ios->obj.id), _LLU(ios->offset),
+				     _LLU(ios->length), i);
 		} else {
 			osd_req_set_attributes(or, &ios->obj);
-/*			EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
+				     _LLU(ios->obj.id), ios->out_attr_len, i);
 		}
 
 		if (ios->out_attr)
@@ -335,14 +350,25 @@ int exofs_sbi_read(struct exofs_io_state *ios)
 		if (ios->bio) {
 			osd_req_read(or, &ios->obj, ios->offset, ios->bio,
 				     ios->length);
-/*			EXOFS_DBGMSG("read sync=%d\n", sync);*/
+			EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
+				      " dev=%d\n", _LLU(ios->obj.id),
+				     _LLU(ios->offset),
+				     _LLU(ios->length),
+				     first_dev);
 		} else if (ios->kern_buff) {
 			osd_req_read_kern(or, &ios->obj, ios->offset,
 					   ios->kern_buff, ios->length);
-/*			EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
+				      "length=0x%llx dev=%d\n",
+				     _LLU(ios->obj.id),
+				     _LLU(ios->offset),
+				     _LLU(ios->length),
+				     first_dev);
 		} else {
 			osd_req_get_attributes(or, &ios->obj);
-/*			EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
+			EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
+				     _LLU(ios->obj.id), ios->in_attr_len,
+				     first_dev);
 		}
 
 		if (ios->out_attr)
-- 
cgit v1.2.3


From 518f167a37b3c53f3cf44d27800455ca24e920f6 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 21 Jan 2010 20:00:02 +0200
Subject: exofs: Micro-optimize exofs_i_info

optimize the exofs_i_info struct usage by moving the embedded
vfs_inode to be first. A compiler might optimize away an "add"
operation with constant zero. (Which it cannot with other constants)

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c35fd4623986..13663da2b119 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -78,13 +78,13 @@ struct exofs_sb_info {
  * our extension to the in-memory inode
  */
 struct exofs_i_info {
+	struct inode   vfs_inode;          /* normal in-memory inode          */
+	wait_queue_head_t i_wq;            /* wait queue for inode            */
 	unsigned long  i_flags;            /* various atomic flags            */
 	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
 	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
-	wait_queue_head_t i_wq;            /* wait queue for inode            */
 	uint64_t       i_commit_size;      /* the object's written length     */
 	uint8_t        i_cred[OSD_CAP_LEN];/* all-powerful credential         */
-	struct inode   vfs_inode;          /* normal in-memory inode          */
 };
 
 static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
-- 
cgit v1.2.3


From 22ddc556380cf5645c52292b6d980766646eb864 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 19 Jan 2010 19:24:45 +0200
Subject: exofs: Recover in the case of read-passed-end-of-file

In check_io, implement the case of reading passed end of
file, by clearing the pages and recover with no error. In
a raid arrangement this can become a legitimate situation
in case of holes in the file.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/ios.c | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 3cc0dd3f0eb2..439c5d097b27 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -173,6 +173,21 @@ static int exofs_io_execute(struct exofs_io_state *ios)
 	return ret;
 }
 
+static void _clear_bio(struct bio *bio)
+{
+	struct bio_vec *bv;
+	unsigned i;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		unsigned this_count = bv->bv_len;
+
+		if (likely(PAGE_SIZE == this_count))
+			clear_highpage(bv->bv_page);
+		else
+			zero_user(bv->bv_page, bv->bv_offset, this_count);
+	}
+}
+
 int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 {
 	enum osd_err_priority acumulated_osd_err = 0;
@@ -181,16 +196,25 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 
 	for (i = 0; i < ios->numdevs; i++) {
 		struct osd_sense_info osi;
-		int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
+		struct osd_request *or = ios->per_dev[i].or;
+		int ret;
+
+		if (unlikely(!or))
+			continue;
 
+		ret = osd_req_decode_sense(or, &osi);
 		if (likely(!ret))
 			continue;
 
-		if (unlikely(ret == -EFAULT)) {
-			EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
-			/*FIXME: All the pages in this device range should:
-			 *	clear_highpage(page);
-			 */
+		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+			/* start read offset passed endof file */
+			_clear_bio(ios->per_dev[i].bio);
+			EXOFS_DBGMSG("start read offset passed end of file "
+				"offset=0x%llx, length=0x%llx\n",
+				_LLU(ios->offset),
+				_LLU(ios->length));
+
+			continue; /* we recovered */
 		}
 
 		if (osi.osd_err_pri >= acumulated_osd_err) {
-- 
cgit v1.2.3


From 45d3abcb1a7388b2b97582e13bf9dd21784dcaa5 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 28 Jan 2010 11:46:16 +0200
Subject: exofs: Move layout related members to a layout structure

* Abstract away those members in exofs_sb_info that are related/needed
  by a layout into a new exofs_layout structure. Embed it in exofs_sb_info.

* At exofs_io_state receive/keep a pointer to an exofs_layout. No need for
  an exofs_sb_info pointer, all we need is at exofs_layout.

* Change any usage of above exofs_sb_info members to their new name.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h | 24 ++++++++++++++++++------
 fs/exofs/inode.c | 14 +++++++-------
 fs/exofs/ios.c   | 36 +++++++++++++++++++-----------------
 fs/exofs/super.c | 51 +++++++++++++++++++++++++++------------------------
 4 files changed, 71 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 13663da2b119..33c68568b338 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -55,12 +55,18 @@
 /* u64 has problems with printk this will cast it to unsigned long long */
 #define _LLU(x) (unsigned long long)(x)
 
+struct exofs_layout {
+	osd_id		s_pid;			/* partition ID of file system*/
+
+	unsigned	s_numdevs;		/* Num of devices in array    */
+	struct osd_dev	*s_ods[0];		/* Variable length            */
+};
+
 /*
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
 	struct exofs_fscb s_fscb;		/* Written often, pre-allocate*/
-	osd_id		s_pid;			/* partition ID of file system*/
 	int		s_timeout;		/* timeout for OSD operations */
 	uint64_t	s_nextid;		/* highest object ID used     */
 	uint32_t	s_numfiles;		/* number of files on fs      */
@@ -69,9 +75,14 @@ struct exofs_sb_info {
 	atomic_t	s_curr_pending;		/* number of pending commands */
 	uint8_t		s_cred[OSD_CAP_LEN];	/* credential for the fscb    */
 
-	struct pnfs_osd_data_map data_map;	/* Default raid to use        */
-	unsigned	s_numdevs;		/* Num of devices in array    */
-	struct osd_dev	*s_ods[1];		/* Variable length, minimum 1 */
+	struct pnfs_osd_data_map data_map;	/* Default raid to use
+						 * FIXME: Needed ?
+						 */
+/*	struct exofs_layout	dir_layout;*/	/* Default dir layout */
+	struct exofs_layout	layout;		/* Default files layout,
+						 * contains the variable osd_dev
+						 * array. Keep last */
+	struct osd_dev	*_min_one_dev[1];	/* Place holder for one dev   */
 };
 
 /*
@@ -101,7 +112,7 @@ struct exofs_io_state {
 	void			*private;
 	exofs_io_done_fn	done;
 
-	struct exofs_sb_info	*sbi;
+	struct exofs_layout	*layout;
 	struct osd_obj_id	obj;
 	u8			*cred;
 
@@ -189,7 +200,8 @@ void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
 int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
 		    u64 offset, void *p, unsigned length);
 
-int  exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
+int  exofs_get_io_state(struct exofs_layout *layout,
+			struct exofs_io_state **ios);
 void exofs_put_io_state(struct exofs_io_state *ios);
 
 int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index c88a0c5250cb..03189a958b33 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -65,7 +65,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 	/* Create master bios on first Q, later on cloning, each clone will be
 	 * allocated on it's destination Q
 	 */
-	pcol->req_q = osd_request_queue(sbi->s_ods[0]);
+	pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]);
 	pcol->inode = inode;
 	pcol->expected_pages = expected_pages;
 
@@ -99,7 +99,7 @@ static int pcol_try_alloc(struct page_collect *pcol)
 			  BIO_MAX_PAGES_KMALLOC);
 
 	if (!pcol->ios) { /* First time allocate io_state */
-		int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
+		int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
 
 		if (ret)
 			return ret;
@@ -872,7 +872,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	int ret;
 
 	*obj_size = ~0;
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		return ret;
@@ -1043,7 +1043,7 @@ static void create_done(struct exofs_io_state *ios, void *p)
 
 	if (unlikely(ret)) {
 		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
-			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
+			  _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid));
 		/*TODO: When FS is corrupted creation can fail, object already
 		 * exist. Get rid of this asynchronous creation, if exist
 		 * increment the obj counter and try the next object. Until we
@@ -1104,7 +1104,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	mark_inode_dirty(inode);
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
 		return ERR_PTR(ret);
@@ -1202,7 +1202,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
 	} else
 		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 		goto free_args;
@@ -1286,7 +1286,7 @@ void exofs_delete_inode(struct inode *inode)
 
 	clear_inode(inode);
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
 		return;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 439c5d097b27..83e54a77b992 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -67,23 +67,24 @@ out:
 	return ret;
 }
 
-int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
+int exofs_get_io_state(struct exofs_layout *layout,
+		       struct exofs_io_state **pios)
 {
 	struct exofs_io_state *ios;
 
 	/*TODO: Maybe use kmem_cach per sbi of size
-	 * exofs_io_state_size(sbi->s_numdevs)
+	 * exofs_io_state_size(layout->s_numdevs)
 	 */
-	ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
+	ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL);
 	if (unlikely(!ios)) {
 		EXOFS_DBGMSG("Faild kzalloc bytes=%d\n",
-			     exofs_io_state_size(sbi->s_numdevs));
+			     exofs_io_state_size(layout->s_numdevs));
 		*pios = NULL;
 		return -ENOMEM;
 	}
 
-	ios->sbi = sbi;
-	ios->obj.partition = sbi->s_pid;
+	ios->layout = layout;
+	ios->obj.partition = layout->s_pid;
 	*pios = ios;
 	return 0;
 }
@@ -238,10 +239,10 @@ int exofs_sbi_create(struct exofs_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -262,10 +263,10 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -286,10 +287,10 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 {
 	int i, ret;
 
-	for (i = 0; i < ios->sbi->s_numdevs; i++) {
+	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -361,8 +362,9 @@ int exofs_sbi_read(struct exofs_io_state *ios)
 		struct osd_request *or;
 		unsigned first_dev = (unsigned)ios->obj.id;
 
-		first_dev %= ios->sbi->s_numdevs;
-		or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
+		first_dev %= ios->layout->s_numdevs;
+		or = osd_start_request(ios->layout->s_ods[first_dev],
+				       GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -438,7 +440,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 	__be64 newsize;
 	int i, ret;
 
-	if (exofs_get_io_state(sbi, &ios))
+	if (exofs_get_io_state(&sbi->layout, &ios))
 		return -ENOMEM;
 
 	ios->obj.id = exofs_oi_objno(oi);
@@ -448,10 +450,10 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 	attr = g_attr_logical_length;
 	attr.val_ptr = &newsize;
 
-	for (i = 0; i < sbi->s_numdevs; i++) {
+	for (i = 0; i < sbi->layout.s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index a1d1e77b12eb..fc8875186ae8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -210,7 +210,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	sbi = sb->s_fs_info;
 	fscb = &sbi->s_fscb;
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (ret)
 		goto out;
 
@@ -264,12 +264,12 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
 
 void exofs_free_sbi(struct exofs_sb_info *sbi)
 {
-	while (sbi->s_numdevs) {
-		int i = --sbi->s_numdevs;
-		struct osd_dev *od = sbi->s_ods[i];
+	while (sbi->layout.s_numdevs) {
+		int i = --sbi->layout.s_numdevs;
+		struct osd_dev *od = sbi->layout.s_ods[i];
 
 		if (od) {
-			sbi->s_ods[i] = NULL;
+			sbi->layout.s_ods[i] = NULL;
 			osduld_put_device(od);
 		}
 	}
@@ -298,7 +298,8 @@ static void exofs_put_super(struct super_block *sb)
 				  msecs_to_jiffies(100));
 	}
 
-	_exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
+	_exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
 
 	exofs_free_sbi(sbi);
 	sb->s_fs_info = NULL;
@@ -361,7 +362,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 {
 	struct exofs_sb_info *sbi = *psbi;
 	struct osd_dev *fscb_od;
-	struct osd_obj_id obj = {.partition = sbi->s_pid,
+	struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
 				 .id = EXOFS_DEVTABLE_ID};
 	struct exofs_device_table *dt;
 	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
@@ -376,9 +377,9 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		return -ENOMEM;
 	}
 
-	fscb_od = sbi->s_ods[0];
-	sbi->s_ods[0] = NULL;
-	sbi->s_numdevs = 0;
+	fscb_od = sbi->layout.s_ods[0];
+	sbi->layout.s_ods[0] = NULL;
+	sbi->layout.s_numdevs = 0;
 	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
 	if (unlikely(ret)) {
 		EXOFS_ERR("ERROR: reading device table\n");
@@ -397,14 +398,15 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		goto out;
 
 	if (likely(numdevs > 1)) {
-		unsigned size = numdevs * sizeof(sbi->s_ods[0]);
+		unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
 
 		sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
 		if (unlikely(!sbi)) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
+		memset(&sbi->layout.s_ods[1], 0,
+		       size - sizeof(sbi->layout.s_ods[0]));
 		*psbi = sbi;
 	}
 
@@ -427,8 +429,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		 * line. We always keep them in device-table order.
 		 */
 		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
-			sbi->s_ods[i] = fscb_od;
-			++sbi->s_numdevs;
+			sbi->layout.s_ods[i] = fscb_od;
+			++sbi->layout.s_numdevs;
 			fscb_od = NULL;
 			continue;
 		}
@@ -441,8 +443,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 			goto out;
 		}
 
-		sbi->s_ods[i] = od;
-		++sbi->s_numdevs;
+		sbi->layout.s_ods[i] = od;
+		++sbi->layout.s_numdevs;
 
 		/* Read the fscb of the other devices to make sure the FS
 		 * partition is there.
@@ -499,9 +501,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	sbi->s_ods[0] = od;
-	sbi->s_numdevs = 1;
-	sbi->s_pid = opts->pid;
+	/* Default layout in case we do not have a device-table */
+	sbi->layout.s_ods[0] = od;
+	sbi->layout.s_numdevs = 1;
+	sbi->layout.s_pid = opts->pid;
 	sbi->s_timeout = opts->timeout;
 
 	/* fill in some other data by hand */
@@ -514,7 +517,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_bdev = NULL;
 	sb->s_dev = 0;
 
-	obj.partition = sbi->s_pid;
+	obj.partition = sbi->layout.s_pid;
 	obj.id = EXOFS_SUPER_ID;
 	exofs_make_credential(sbi->s_cred, &obj);
 
@@ -578,13 +581,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	_exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
-			    sbi->s_pid);
+	_exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
+			    sbi->layout.s_pid);
 	return 0;
 
 free_sbi:
 	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
-		  opts->dev_name, sbi->s_pid, ret);
+		  opts->dev_name, sbi->layout.s_pid, ret);
 	exofs_free_sbi(sbi);
 	return ret;
 }
@@ -627,7 +630,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	uint8_t cred_a[OSD_CAP_LEN];
 	int ret;
 
-	ret = exofs_get_io_state(sbi, &ios);
+	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (ret) {
 		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
 		return ret;
-- 
cgit v1.2.3


From 46f4d973f6874c06b7a41a3bf8f4c1717d90f97a Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 1 Feb 2010 11:37:30 +0200
Subject: exofs: unindent exofs_sbi_read

The original idea was that a mirror read can be sub-divided
to multiple devices. But this has very little gain and only
at very large IOes so it's not going to be implemented soon.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/ios.c | 87 +++++++++++++++++++++++++---------------------------------
 1 file changed, 38 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 83e54a77b992..4f679317ca54 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -356,59 +356,48 @@ out:
 
 int exofs_sbi_read(struct exofs_io_state *ios)
 {
-	int i, ret;
-
-	for (i = 0; i < 1; i++) {
-		struct osd_request *or;
-		unsigned first_dev = (unsigned)ios->obj.id;
-
-		first_dev %= ios->layout->s_numdevs;
-		or = osd_start_request(ios->layout->s_ods[first_dev],
-				       GFP_KERNEL);
-		if (unlikely(!or)) {
-			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+	struct osd_request *or;
+	struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+	unsigned first_dev = (unsigned)ios->obj.id;
 
-		if (ios->bio) {
-			osd_req_read(or, &ios->obj, ios->offset, ios->bio,
-				     ios->length);
-			EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
-				      " dev=%d\n", _LLU(ios->obj.id),
-				     _LLU(ios->offset),
-				     _LLU(ios->length),
-				     first_dev);
-		} else if (ios->kern_buff) {
-			osd_req_read_kern(or, &ios->obj, ios->offset,
-					   ios->kern_buff, ios->length);
-			EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
-				      "length=0x%llx dev=%d\n",
-				     _LLU(ios->obj.id),
-				     _LLU(ios->offset),
-				     _LLU(ios->length),
-				     first_dev);
-		} else {
-			osd_req_get_attributes(or, &ios->obj);
-			EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
-				     _LLU(ios->obj.id), ios->in_attr_len,
-				     first_dev);
-		}
+	first_dev %= ios->layout->s_numdevs;
+	or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+		return -ENOMEM;
+	}
+	per_dev->or = or;
+	ios->numdevs++;
+
+	if (ios->bio) {
+		osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length);
+		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
+			     " dev=%d\n", _LLU(ios->obj.id),
+			     _LLU(ios->offset), _LLU(ios->length),
+			     first_dev);
+	} else if (ios->kern_buff) {
+		int ret = osd_req_read_kern(or, &ios->obj, ios->offset,
+					    ios->kern_buff, ios->length);
+
+		EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
+			      "length=0x%llx dev=%d ret=>%d\n",
+			      _LLU(ios->obj.id), _LLU(ios->offset),
+			      _LLU(ios->length), first_dev, ret);
+		if (unlikely(ret))
+			return ret;
+	} else {
+		osd_req_get_attributes(or, &ios->obj);
+		EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
+			      _LLU(ios->obj.id), ios->in_attr_len, first_dev);
+	}
 
-		if (ios->out_attr)
-			osd_req_add_set_attr_list(or, ios->out_attr,
-						  ios->out_attr_len);
+	if (ios->out_attr)
+		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
 
-		if (ios->in_attr)
-			osd_req_add_get_attr_list(or, ios->in_attr,
-						  ios->in_attr_len);
-	}
-	ret = exofs_io_execute(ios);
+	if (ios->in_attr)
+		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
 
-out:
-	return ret;
+	return exofs_io_execute(ios);
 }
 
 int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
-- 
cgit v1.2.3


From d9c740d2253e75db8cef8f87a3125c450f3ebd82 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 28 Jan 2010 11:58:08 +0200
Subject: exofs: Define on-disk per-inode optional layout attribute

* Layouts describe the way a file is spread on multiple devices.
  The layout information is stored in the objects attribute introduced
  in this patch.

* There can be multiple generating function for the layout.
  Currently defined:
    - No attribute present - use below moving-window on global
      device table, all devices.
      (This is the only one currently used in exofs)
    - an obj_id generated moving window - the obj_id is a randomizing
      factor in the otherwise global map layout.
    - An explicit layout stored, including a data_map and a device
      index list.
    - More might be defined in future ...

* There are two attributes defined of the same structure:
  A-data-files-layout - This layout is used by data-files. If present
                        at a directory, all files of that directory will
                        be created with this layout.
  A-meta-data-layout - This layout is used by a directory and other
                       meta-data information. Also inherited at creation
                       of subdirectories.

* At creation time inodes are created with the layout specified above.
  A usermode utility may change the creation layout on a give directory
  or file. Which in the case of directories, will also apply to newly
  created files/subdirectories, children of that directory.
  In the simple unaltered case of a newly created exofs, no layout
  attributes are present, and all layouts adhere to the layout specified
  at the device-table.

* In case of a future file system loaded in an old exofs-driver.
  At iget(), the generating_function is inspected and if not supported
  will return an IO error to the application and the inode will not
  be loaded. So not to damage any data.
  Note: After this patch we do not yet support any type of layout
        only the RAID0 patch that enables striping at the super-block
        level will add support for RAID0 layouts above. This way we
        are past and future compatible and fully bisectable.

* Access to the device table is done by an accessor since
  it will change according to above information.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/common.h | 39 ++++++++++++++++++++++++++++++++++++++
 fs/exofs/exofs.h  |  6 ++++++
 fs/exofs/inode.c  | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/exofs/ios.c    | 23 ++++++++++++++++++-----
 4 files changed, 114 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1b178e61718..f0d520312d8b 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -55,6 +55,8 @@
 /* exofs Application specific page/attribute */
 # define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
 # define EXOFS_ATTR_INODE_DATA	1
+# define EXOFS_ATTR_INODE_FILE_LAYOUT	2
+# define EXOFS_ATTR_INODE_DIR_LAYOUT	3
 
 /*
  * The maximum number of files we can have is limited by the size of the
@@ -206,4 +208,41 @@ enum {
 	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
 	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
 
+/*
+ * The on-disk (optional) layout structure.
+ * sits in an EXOFS_ATTR_INODE_FILE_LAYOUT or EXOFS_ATTR_INODE_DIR_LAYOUT
+ * attribute, attached to any inode, usually to a directory.
+ */
+
+enum exofs_inode_layout_gen_functions {
+	LAYOUT_MOVING_WINDOW = 0,
+	LAYOUT_IMPLICT = 1,
+};
+
+struct exofs_on_disk_inode_layout {
+	__le16 gen_func; /* One of enum exofs_inode_layout_gen_functions */
+	__le16 pad;
+	union {
+		/* gen_func == LAYOUT_MOVING_WINDOW (default) */
+		struct exofs_layout_sliding_window {
+			__le32 num_devices; /* first n devices in global-table*/
+		} sliding_window __packed;
+
+		/* gen_func == LAYOUT_IMPLICT */
+		struct exofs_layout_implict_list {
+			struct exofs_dt_data_map data_map;
+			/* Variable array of size data_map.cb_num_comps. These
+			 * are device indexes of the devices in the global table
+			 */
+			__le32 dev_indexes[];
+		} implict __packed;
+	};
+} __packed;
+
+static inline size_t exofs_on_disk_inode_layout_size(unsigned max_devs)
+{
+	return sizeof(struct exofs_on_disk_inode_layout) +
+		max_devs * sizeof(__le32);
+}
+
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 33c68568b338..09e331935514 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -185,6 +185,12 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 	return container_of(inode, struct exofs_i_info, vfs_inode);
 }
 
+/*
+ * Given a layout, object_number and stripe_index return the associated global
+ * dev_index
+ */
+unsigned exofs_layout_od_id(struct exofs_layout *layout,
+			    osd_id obj_no, unsigned layout_index);
 /*
  * Maximum count of links to a file
  */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 03189a958b33..0163546ba05a 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -859,6 +859,15 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 	return error;
 }
 
+static const struct osd_attr g_attr_inode_file_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_FILE_LAYOUT,
+	0);
+static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_DIR_LAYOUT,
+	0);
+
 /*
  * Read an inode from the OSD, and return it as is.  We also return the size
  * attribute in the 'obj_size' argument.
@@ -867,11 +876,16 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 		    struct exofs_fcb *inode, uint64_t *obj_size)
 {
 	struct exofs_sb_info *sbi = sb->s_fs_info;
-	struct osd_attr attrs[2];
+	struct osd_attr attrs[] = {
+		[0] = g_attr_inode_data,
+		[1] = g_attr_inode_file_layout,
+		[2] = g_attr_inode_dir_layout,
+		[3] = g_attr_logical_length,
+	};
 	struct exofs_io_state *ios;
+	struct exofs_on_disk_inode_layout *layout;
 	int ret;
 
-	*obj_size = ~0;
 	ret = exofs_get_io_state(&sbi->layout, &ios);
 	if (unlikely(ret)) {
 		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
@@ -882,8 +896,9 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	exofs_make_credential(oi->i_cred, &ios->obj);
 	ios->cred = oi->i_cred;
 
-	attrs[0] = g_attr_inode_data;
-	attrs[1] = g_attr_logical_length;
+	attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+	attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
+
 	ios->in_attr = attrs;
 	ios->in_attr_len = ARRAY_SIZE(attrs);
 
@@ -900,12 +915,43 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
 
 	ret = extract_attr_from_ios(ios, &attrs[1]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		goto out;
+	}
+	if (attrs[1].len) {
+		layout = attrs[1].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported files layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[2]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+		goto out;
+	}
+	if (attrs[2].len) {
+		layout = attrs[2].val_ptr;
+		if (layout->gen_func != cpu_to_le16(LAYOUT_MOVING_WINDOW)) {
+			EXOFS_ERR("%s: unsupported meta-data layout %d\n",
+				__func__, layout->gen_func);
+			ret = -ENOTSUPP;
+			goto out;
+		}
+	}
+
+	*obj_size = ~0;
+	ret = extract_attr_from_ios(ios, &attrs[3]);
 	if (ret) {
 		EXOFS_ERR("%s: extract_attr of logical_length failed\n",
 			  __func__);
 		goto out;
 	}
-	*obj_size = get_unaligned_be64(attrs[1].val_ptr);
+	*obj_size = get_unaligned_be64(attrs[3].val_ptr);
 
 out:
 	exofs_put_io_state(ios);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 4f679317ca54..2b81f99fd62c 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -107,6 +107,19 @@ void exofs_put_io_state(struct exofs_io_state *ios)
 	}
 }
 
+unsigned exofs_layout_od_id(struct exofs_layout *layout,
+			    osd_id obj_no, unsigned layout_index)
+{
+	return layout_index;
+}
+
+static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
+					   unsigned layout_index)
+{
+	return ios->layout->s_ods[
+		exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
+}
+
 static void _sync_done(struct exofs_io_state *ios, void *p)
 {
 	struct completion *waiting = p;
@@ -242,7 +255,7 @@ int exofs_sbi_create(struct exofs_io_state *ios)
 	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -266,7 +279,7 @@ int exofs_sbi_remove(struct exofs_io_state *ios)
 	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -290,7 +303,7 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 	for (i = 0; i < ios->layout->s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(ios->layout->s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
@@ -361,7 +374,7 @@ int exofs_sbi_read(struct exofs_io_state *ios)
 	unsigned first_dev = (unsigned)ios->obj.id;
 
 	first_dev %= ios->layout->s_numdevs;
-	or = osd_start_request(ios->layout->s_ods[first_dev], GFP_KERNEL);
+	or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
 	if (unlikely(!or)) {
 		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 		return -ENOMEM;
@@ -442,7 +455,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 	for (i = 0; i < sbi->layout.s_numdevs; i++) {
 		struct osd_request *or;
 
-		or = osd_start_request(sbi->layout.s_ods[i], GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
-- 
cgit v1.2.3


From 5d952b8391692553c31e620a92d6e09262a9a307 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 1 Feb 2010 13:35:51 +0200
Subject: exofs: RAID0 support

We now support striping over mirror devices. Including variable sized
stripe_unit.

Some limits:
* stripe_unit must be a multiple of PAGE_SIZE
* stripe_unit * stripe_count is maximum upto 32-bit (4Gb)

Tested RAID0 over mirrors, RAID0 only, mirrors only. All check.

Design notes:
* I'm not using a vectored raid-engine mechanism yet. Following the
  pnfs-objects-layout data-map structure, "Mirror" is just a private
  case of "group_width" == 1, and RAID0 is a private case of
  "Mirrors" == 1. The performance lose of the general case over the
  particular special case optimization is totally negligible, also
  considering the extra code size.

* In general I added a prepare_stripes() stage that divides the
  to-be-io pages to the participating devices, the previous
  exofs_ios_write/read, now becomes _write/read_mirrors and a new
  write/read upper layer loops on all devices calling
  _write/read_mirrors. Effectively the prepare_stripes stage is the all
  secret.
  Also truncate need fixing to accommodate for striping.

* In a RAID0 arrangement, in a regular usage scenario, if all inode
  layouts will start at the same device, the small files fill up the
  first device and the later devices stay empty, the farther the device
  the emptier it is.

  To fix that, each inode will start at a different stripe_unit,
  according to it's obj_id modulus number-of-stripe-units. And
  will then span all stripe-units in the same incrementing order
  wrapping back to the beginning of the device table. We call it
  a stripe-units moving window.

  Special consideration was taken to keep all devices in a mirror
  arrangement identical. So a broken osd-device could just be cloned
  from one of the mirrors and no FS scrubbing is needed. (We do that
  by rotating stripe-unit at a time and not a single device at a time.)

TODO:
 We no longer verify object_length == inode->i_size in exofs_iget.
 (since i_size is stripped on multiple objects now).
 I should introduce a multiple-device attribute reading, and use
 it in exofs_iget.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |  11 ++
 fs/exofs/inode.c |  26 +----
 fs/exofs/ios.c   | 327 ++++++++++++++++++++++++++++++++++++++++++++++---------
 fs/exofs/super.c |  52 +++++++--
 4 files changed, 333 insertions(+), 83 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 09e331935514..0d8a34b21ae1 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -58,6 +58,14 @@
 struct exofs_layout {
 	osd_id		s_pid;			/* partition ID of file system*/
 
+	/* Our way of looking at the data_map */
+	unsigned stripe_unit;
+	unsigned mirrors_p1;
+
+	unsigned group_width;
+
+	enum exofs_inode_layout_gen_functions lay_func;
+
 	unsigned	s_numdevs;		/* Num of devices in array    */
 	struct osd_dev	*s_ods[0];		/* Variable length            */
 };
@@ -133,6 +141,9 @@ struct exofs_io_state {
 	struct exofs_per_dev_state {
 		struct osd_request *or;
 		struct bio *bio;
+		loff_t offset;
+		unsigned length;
+		unsigned dev;
 	} per_dev[];
 };
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 0163546ba05a..2b3163ea56eb 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -869,18 +869,17 @@ static const struct osd_attr g_attr_inode_dir_layout = ATTR_DEF(
 	0);
 
 /*
- * Read an inode from the OSD, and return it as is.  We also return the size
- * attribute in the 'obj_size' argument.
+ * Read the Linux inode info from the OSD, and return it as is. In exofs the
+ * inode info is in an application specific page/attribute of the osd-object.
  */
 static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
-		    struct exofs_fcb *inode, uint64_t *obj_size)
+		    struct exofs_fcb *inode)
 {
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 	struct osd_attr attrs[] = {
 		[0] = g_attr_inode_data,
 		[1] = g_attr_inode_file_layout,
 		[2] = g_attr_inode_dir_layout,
-		[3] = g_attr_logical_length,
 	};
 	struct exofs_io_state *ios;
 	struct exofs_on_disk_inode_layout *layout;
@@ -944,15 +943,6 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 		}
 	}
 
-	*obj_size = ~0;
-	ret = extract_attr_from_ios(ios, &attrs[3]);
-	if (ret) {
-		EXOFS_ERR("%s: extract_attr of logical_length failed\n",
-			  __func__);
-		goto out;
-	}
-	*obj_size = get_unaligned_be64(attrs[3].val_ptr);
-
 out:
 	exofs_put_io_state(ios);
 	return ret;
@@ -971,7 +961,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	struct exofs_i_info *oi;
 	struct exofs_fcb fcb;
 	struct inode *inode;
-	uint64_t obj_size;
 	int ret;
 
 	inode = iget_locked(sb, ino);
@@ -983,7 +972,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	__oi_init(oi);
 
 	/* read the inode from the osd */
-	ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
+	ret = exofs_get_inode(sb, oi, &fcb);
 	if (ret)
 		goto bad_inode;
 
@@ -1004,13 +993,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_blkbits = EXOFS_BLKSHIFT;
 	inode->i_generation = le32_to_cpu(fcb.i_generation);
 
-	if ((inode->i_size != obj_size) &&
-		(!exofs_inode_is_fast_symlink(inode))) {
-		EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
-			  inode->i_size, _LLU(obj_size));
-		/* FIXME: call exofs_inode_recovery() */
-	}
-
 	oi->i_dir_start_lookup = 0;
 
 	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 2b81f99fd62c..6e446b2670b9 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -23,6 +23,7 @@
  */
 
 #include <scsi/scsi_device.h>
+#include <asm/div64.h>
 
 #include "exofs.h"
 
@@ -110,7 +111,17 @@ void exofs_put_io_state(struct exofs_io_state *ios)
 unsigned exofs_layout_od_id(struct exofs_layout *layout,
 			    osd_id obj_no, unsigned layout_index)
 {
-	return layout_index;
+/*	switch (layout->lay_func) {
+	case LAYOUT_MOVING_WINDOW:
+	{*/
+		unsigned dev_mod = obj_no;
+
+		return (layout_index + dev_mod * layout->mirrors_p1) %
+							      layout->s_numdevs;
+/*	}
+	case LAYOUT_FUNC_IMPLICT:
+		return layout->devs[layout_index];
+	}*/
 }
 
 static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
@@ -225,8 +236,8 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 			_clear_bio(ios->per_dev[i].bio);
 			EXOFS_DBGMSG("start read offset passed end of file "
 				"offset=0x%llx, length=0x%llx\n",
-				_LLU(ios->offset),
-				_LLU(ios->length));
+				_LLU(ios->per_dev[i].offset),
+				_LLU(ios->per_dev[i].length));
 
 			continue; /* we recovered */
 		}
@@ -248,6 +259,127 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 	return acumulated_lin_err;
 }
 
+/* REMOVEME: After review
+   Some quoteing from the standard
+
+   L = logical offset into the file
+   W = number of data components in a stripe
+   S = W * stripe_unit (S is Stripe length)
+   N = L / S (N is the stripe Number)
+   C = (L-(N*S)) / stripe_unit (C is the component)
+   O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset)
+*/
+
+static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset,
+			u64 *obj_offset, unsigned *dev, unsigned *unit_off)
+{
+	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned stripe_length = stripe_unit * ios->layout->group_width;
+	u64 stripe_no = file_offset;
+	unsigned stripe_mod = do_div(stripe_no, stripe_length);
+
+	*unit_off = stripe_mod % stripe_unit;
+	*obj_offset = stripe_no * stripe_unit + *unit_off;
+	*dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1;
+}
+
+static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
+		     struct exofs_per_dev_state *per_dev, int cur_len)
+{
+	unsigned bv = *cur_bvec;
+	struct request_queue *q =
+			osd_request_queue(exofs_ios_od(ios, per_dev->dev));
+
+	per_dev->length += cur_len;
+
+	if (per_dev->bio == NULL) {
+		unsigned pages_in_stripe = ios->layout->group_width *
+					(ios->layout->stripe_unit / PAGE_SIZE);
+		unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) /
+						ios->layout->group_width;
+
+		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+		if (unlikely(!per_dev->bio)) {
+			EXOFS_DBGMSG("Faild to allocate BIO size=%u\n",
+				     bio_size);
+			return -ENOMEM;
+		}
+	}
+
+	while (cur_len > 0) {
+		int added_len;
+		struct bio_vec *bvec = &ios->bio->bi_io_vec[bv];
+
+		BUG_ON(ios->bio->bi_vcnt <= bv);
+		cur_len -= bvec->bv_len;
+
+		added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page,
+					    bvec->bv_len, bvec->bv_offset);
+		if (unlikely(bvec->bv_len != added_len))
+			return -ENOMEM;
+		++bv;
+	}
+	BUG_ON(cur_len);
+
+	*cur_bvec = bv;
+	return 0;
+}
+
+static int _prepare_for_striping(struct exofs_io_state *ios)
+{
+	u64 length = ios->length;
+	u64 offset = ios->offset;
+	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned comp = 0;
+	unsigned stripes = 0;
+	unsigned cur_bvec = 0;
+	int ret;
+
+	if (!ios->bio) {
+		if (ios->kern_buff) {
+			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+			unsigned unit_off;
+
+			_offset_dev_unit_off(ios, offset, &per_dev->offset,
+					     &per_dev->dev, &unit_off);
+			/* no cross device without page array */
+			BUG_ON((ios->layout->group_width > 1) &&
+			       (unit_off + length > stripe_unit));
+		}
+		ios->numdevs = ios->layout->mirrors_p1;
+		return 0;
+	}
+
+	while (length) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
+		unsigned cur_len;
+
+		if (!per_dev->length) {
+			unsigned unit_off;
+
+			_offset_dev_unit_off(ios, offset, &per_dev->offset,
+					     &per_dev->dev, &unit_off);
+			stripes++;
+			cur_len = min_t(u64, stripe_unit - unit_off, length);
+			offset += cur_len;
+		} else {
+			cur_len = min_t(u64, stripe_unit, length);
+		}
+
+		ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len);
+		if (unlikely(ret))
+			goto out;
+
+		comp += ios->layout->mirrors_p1;
+		comp %= ios->layout->s_numdevs;
+
+		length -= cur_len;
+	}
+out:
+	ios->numdevs = stripes * ios->layout->mirrors_p1;
+	return ret;
+}
+
 int exofs_sbi_create(struct exofs_io_state *ios)
 {
 	int i, ret;
@@ -296,61 +428,71 @@ out:
 	return ret;
 }
 
-int exofs_sbi_write(struct exofs_io_state *ios)
+static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 {
-	int i, ret;
+	struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp];
+	unsigned dev = ios->per_dev[cur_comp].dev;
+	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
+	int ret = 0;
 
-	for (i = 0; i < ios->layout->s_numdevs; i++) {
+	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 		struct osd_request *or;
 
-		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
+		or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL);
 		if (unlikely(!or)) {
 			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 			ret = -ENOMEM;
 			goto out;
 		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+		per_dev->or = or;
+		per_dev->offset = master_dev->offset;
 
 		if (ios->bio) {
 			struct bio *bio;
 
-			if (i != 0) {
+			if (per_dev != master_dev) {
 				bio = bio_kmalloc(GFP_KERNEL,
-						  ios->bio->bi_max_vecs);
+						  master_dev->bio->bi_max_vecs);
 				if (unlikely(!bio)) {
 					EXOFS_DBGMSG(
 					      "Faild to allocate BIO size=%u\n",
-					      ios->bio->bi_max_vecs);
+					      master_dev->bio->bi_max_vecs);
 					ret = -ENOMEM;
 					goto out;
 				}
 
-				__bio_clone(bio, ios->bio);
+				__bio_clone(bio, master_dev->bio);
 				bio->bi_bdev = NULL;
 				bio->bi_next = NULL;
-				ios->per_dev[i].bio =  bio;
+				per_dev->length = master_dev->length;
+				per_dev->bio =  bio;
+				per_dev->dev = dev;
 			} else {
-				bio = ios->bio;
+				bio = master_dev->bio;
+				/* FIXME: bio_set_dir() */
+				bio->bi_rw |= (1 << BIO_RW);
 			}
 
-			osd_req_write(or, &ios->obj, ios->offset, bio,
-				      ios->length);
+			osd_req_write(or, &ios->obj, per_dev->offset, bio,
+				      per_dev->length);
 			EXOFS_DBGMSG("write(0x%llx) offset=0x%llx "
 				      "length=0x%llx dev=%d\n",
-				     _LLU(ios->obj.id), _LLU(ios->offset),
-				     _LLU(ios->length), i);
+				     _LLU(ios->obj.id), _LLU(per_dev->offset),
+				     _LLU(per_dev->length), dev);
 		} else if (ios->kern_buff) {
-			osd_req_write_kern(or, &ios->obj, ios->offset,
+			ret = osd_req_write_kern(or, &ios->obj, per_dev->offset,
 					   ios->kern_buff, ios->length);
+			if (unlikely(ret))
+				goto out;
 			EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
 				      "length=0x%llx dev=%d\n",
-				     _LLU(ios->obj.id), _LLU(ios->offset),
-				     _LLU(ios->length), i);
+				     _LLU(ios->obj.id), _LLU(per_dev->offset),
+				     _LLU(ios->length), dev);
 		} else {
 			osd_req_set_attributes(or, &ios->obj);
 			EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
-				     _LLU(ios->obj.id), ios->out_attr_len, i);
+				     _LLU(ios->obj.id), ios->out_attr_len, dev);
 		}
 
 		if (ios->out_attr)
@@ -361,40 +503,57 @@ int exofs_sbi_write(struct exofs_io_state *ios)
 			osd_req_add_get_attr_list(or, ios->in_attr,
 						  ios->in_attr_len);
 	}
-	ret = exofs_io_execute(ios);
 
 out:
 	return ret;
 }
 
-int exofs_sbi_read(struct exofs_io_state *ios)
+int exofs_sbi_write(struct exofs_io_state *ios)
+{
+	int i;
+	int ret;
+
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _sbi_write_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = exofs_io_execute(ios);
+	return ret;
+}
+
+static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
 {
 	struct osd_request *or;
-	struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+	struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 	unsigned first_dev = (unsigned)ios->obj.id;
 
-	first_dev %= ios->layout->s_numdevs;
+	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
 	or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
 	if (unlikely(!or)) {
 		EXOFS_ERR("%s: osd_start_request failed\n", __func__);
 		return -ENOMEM;
 	}
 	per_dev->or = or;
-	ios->numdevs++;
 
 	if (ios->bio) {
-		osd_req_read(or, &ios->obj, ios->offset, ios->bio, ios->length);
+		osd_req_read(or, &ios->obj, per_dev->offset,
+				per_dev->bio, per_dev->length);
 		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
 			     " dev=%d\n", _LLU(ios->obj.id),
-			     _LLU(ios->offset), _LLU(ios->length),
+			     _LLU(per_dev->offset), _LLU(per_dev->length),
 			     first_dev);
 	} else if (ios->kern_buff) {
-		int ret = osd_req_read_kern(or, &ios->obj, ios->offset,
+		int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset,
 					    ios->kern_buff, ios->length);
-
 		EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
 			      "length=0x%llx dev=%d ret=>%d\n",
-			      _LLU(ios->obj.id), _LLU(ios->offset),
+			      _LLU(ios->obj.id), _LLU(per_dev->offset),
 			      _LLU(ios->length), first_dev, ret);
 		if (unlikely(ret))
 			return ret;
@@ -403,14 +562,32 @@ int exofs_sbi_read(struct exofs_io_state *ios)
 		EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
 			      _LLU(ios->obj.id), ios->in_attr_len, first_dev);
 	}
-
 	if (ios->out_attr)
 		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
 
 	if (ios->in_attr)
 		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);
 
-	return exofs_io_execute(ios);
+	return 0;
+}
+
+int exofs_sbi_read(struct exofs_io_state *ios)
+{
+	int i;
+	int ret;
+
+	ret = _prepare_for_striping(ios);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+		ret = _sbi_read_mirror(ios, i);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	ret = exofs_io_execute(ios);
+	return ret;
 }
 
 int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
@@ -434,42 +611,84 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
 	return -EIO;
 }
 
+static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp,
+			     struct osd_attr *attr)
+{
+	int last_comp = cur_comp + ios->layout->mirrors_p1;
+
+	for (; cur_comp < last_comp; ++cur_comp) {
+		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
+		struct osd_request *or;
+
+		or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL);
+		if (unlikely(!or)) {
+			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
+			return -ENOMEM;
+		}
+		per_dev->or = or;
+
+		osd_req_set_attributes(or, &ios->obj);
+		osd_req_add_set_attr_list(or, attr, 1);
+	}
+
+	return 0;
+}
+
 int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 {
 	struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
 	struct exofs_io_state *ios;
-	struct osd_attr attr;
-	__be64 newsize;
+	struct exofs_trunc_attr {
+		struct osd_attr attr;
+		__be64 newsize;
+	} *size_attrs;
+	u64 this_obj_size;
+	unsigned dev;
+	unsigned unit_off;
 	int i, ret;
 
-	if (exofs_get_io_state(&sbi->layout, &ios))
-		return -ENOMEM;
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret))
+		return ret;
+
+	size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs),
+			     GFP_KERNEL);
+	if (unlikely(!size_attrs)) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	ios->obj.id = exofs_oi_objno(oi);
 	ios->cred = oi->i_cred;
 
-	newsize = cpu_to_be64(size);
-	attr = g_attr_logical_length;
-	attr.val_ptr = &newsize;
+	ios->numdevs = ios->layout->s_numdevs;
+	_offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off);
 
-	for (i = 0; i < sbi->layout.s_numdevs; i++) {
-		struct osd_request *or;
+	for (i = 0; i < ios->layout->group_width; ++i) {
+		struct exofs_trunc_attr *size_attr = &size_attrs[i];
+		u64 obj_size;
 
-		or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL);
-		if (unlikely(!or)) {
-			EXOFS_ERR("%s: osd_start_request failed\n", __func__);
-			ret = -ENOMEM;
-			goto out;
-		}
-		ios->per_dev[i].or = or;
-		ios->numdevs++;
+		if (i < dev)
+			obj_size = this_obj_size +
+					ios->layout->stripe_unit - unit_off;
+		else if (i == dev)
+			obj_size = this_obj_size;
+		else /* i > dev */
+			obj_size = this_obj_size - unit_off;
 
-		osd_req_set_attributes(or, &ios->obj);
-		osd_req_add_set_attr_list(or, &attr, 1);
+		size_attr->newsize = cpu_to_be64(obj_size);
+		size_attr->attr = g_attr_logical_length;
+		size_attr->attr.val_ptr = &size_attr->newsize;
+
+		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
+					&size_attr->attr);
+		if (unlikely(ret))
+			goto out;
 	}
 	ret = exofs_io_execute(ios);
 
 out:
+	kfree(size_attrs);
 	exofs_put_io_state(ios);
 	return ret;
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index fc8875186ae8..8f4e4b37a578 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -308,6 +308,8 @@ static void exofs_put_super(struct super_block *sb)
 static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 				    struct exofs_device_table *dt)
 {
+	u64 stripe_length;
+
 	sbi->data_map.odm_num_comps   =
 				le32_to_cpu(dt->dt_data_map.cb_num_comps);
 	sbi->data_map.odm_stripe_unit =
@@ -321,14 +323,47 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 	sbi->data_map.odm_raid_algorithm  =
 				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
 
-/* FIXME: Hard coded mirror only for now. if not so do not mount */
-	if ((sbi->data_map.odm_num_comps != numdevs) ||
-	    (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
-	    (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
-	    (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
+/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */
+	if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) {
+		EXOFS_ERR("Group width/depth not supported\n");
 		return -EINVAL;
-	else
-		return 0;
+	}
+	if (sbi->data_map.odm_num_comps != numdevs) {
+		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
+			  sbi->data_map.odm_num_comps, numdevs);
+		return -EINVAL;
+	}
+	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		EXOFS_ERR("Only RAID_0 for now\n");
+		return -EINVAL;
+	}
+	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
+		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
+			  numdevs, sbi->data_map.odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	stripe_length = sbi->data_map.odm_stripe_unit *
+			(numdevs / (sbi->data_map.odm_mirror_cnt + 1));
+	if (stripe_length >= (1ULL << 32)) {
+		EXOFS_ERR("Total Stripe length(0x%llx)"
+			  " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -EINVAL;
+	}
+
+	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
+		EXOFS_ERR("Stripe Unit(0x%llx)"
+			  " must be Multples of PAGE_SIZE(0x%lx)\n",
+			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
+	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
+	sbi->layout.group_width = sbi->data_map.odm_num_comps /
+							sbi->layout.mirrors_p1;
+
+	return 0;
 }
 
 /* @odi is valid only as long as @fscb_dev is valid */
@@ -502,6 +537,9 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* Default layout in case we do not have a device-table */
+	sbi->layout.stripe_unit = PAGE_SIZE;
+	sbi->layout.mirrors_p1 = 1;
+	sbi->layout.group_width = 1;
 	sbi->layout.s_ods[0] = od;
 	sbi->layout.s_numdevs = 1;
 	sbi->layout.s_pid = opts->pid;
-- 
cgit v1.2.3


From 86093aaff5be5b214613eb60553e236bdb389c84 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 28 Jan 2010 18:24:06 +0200
Subject: exofs: convert io_state to use pages array instead of bio at input

* inode.c operations are full-pages based, and not actually
  true scatter-gather
* Lets us use more pages at once upto 512 (from 249) in 64 bit
* Brings us much much closer to be able to use exofs's io_state engine
  from objlayout driver. (Once I decide where to put the common code)

After RAID0 patch the outer (input) bio was never used as a bio, but
was simply a page carrier into the raid engine. Even in the simple
mirror/single-dev arrangement pages info was copied into a second bio.
It is now easer to just pass a pages array into the io_state and prepare
bio(s) once.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |  5 +++-
 fs/exofs/inode.c | 81 ++++++++++++++++++++++++++++----------------------------
 fs/exofs/ios.c   | 46 ++++++++++++++++++--------------
 3 files changed, 71 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 0d8a34b21ae1..acfebd36de83 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -128,7 +128,10 @@ struct exofs_io_state {
 	loff_t			offset;
 	unsigned long		length;
 	void			*kern_buff;
-	struct bio		*bio;
+
+	struct page		**pages;
+	unsigned		nr_pages;
+	unsigned		pgbase;
 
 	/* Attributes */
 	unsigned		in_attr_len;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2b3163ea56eb..6ca0b0117f04 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -41,16 +41,18 @@
 
 enum { BIO_MAX_PAGES_KMALLOC =
 		(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+	MAX_PAGES_KMALLOC =
+		PAGE_SIZE / sizeof(struct page *),
 };
 
 struct page_collect {
 	struct exofs_sb_info *sbi;
-	struct request_queue *req_q;
 	struct inode *inode;
 	unsigned expected_pages;
 	struct exofs_io_state *ios;
 
-	struct bio *bio;
+	struct page **pages;
+	unsigned alloc_pages;
 	unsigned nr_pages;
 	unsigned long length;
 	loff_t pg_first; /* keep 64bit also in 32-arches */
@@ -62,15 +64,12 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
 	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
 
 	pcol->sbi = sbi;
-	/* Create master bios on first Q, later on cloning, each clone will be
-	 * allocated on it's destination Q
-	 */
-	pcol->req_q = osd_request_queue(sbi->layout.s_ods[0]);
 	pcol->inode = inode;
 	pcol->expected_pages = expected_pages;
 
 	pcol->ios = NULL;
-	pcol->bio = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
@@ -80,7 +79,8 @@ static void _pcol_reset(struct page_collect *pcol)
 {
 	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
 
-	pcol->bio = NULL;
+	pcol->pages = NULL;
+	pcol->alloc_pages = 0;
 	pcol->nr_pages = 0;
 	pcol->length = 0;
 	pcol->pg_first = -1;
@@ -90,13 +90,13 @@ static void _pcol_reset(struct page_collect *pcol)
 	 * it might not end here. don't be left with nothing
 	 */
 	if (!pcol->expected_pages)
-		pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
+		pcol->expected_pages = MAX_PAGES_KMALLOC;
 }
 
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-	int pages = min_t(unsigned, pcol->expected_pages,
-			  BIO_MAX_PAGES_KMALLOC);
+	unsigned pages = min_t(unsigned, pcol->expected_pages,
+			  MAX_PAGES_KMALLOC);
 
 	if (!pcol->ios) { /* First time allocate io_state */
 		int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
@@ -105,23 +105,28 @@ static int pcol_try_alloc(struct page_collect *pcol)
 			return ret;
 	}
 
+	/* TODO: easily support bio chaining */
+	pages =  min_t(unsigned, pages,
+		       pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
+
 	for (; pages; pages >>= 1) {
-		pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
-		if (likely(pcol->bio))
+		pcol->pages = kmalloc(pages * sizeof(struct page *),
+				      GFP_KERNEL);
+		if (likely(pcol->pages)) {
+			pcol->alloc_pages = pages;
 			return 0;
+		}
 	}
 
-	EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
+	EXOFS_ERR("Failed to kmalloc expected_pages=%u\n",
 		  pcol->expected_pages);
 	return -ENOMEM;
 }
 
 static void pcol_free(struct page_collect *pcol)
 {
-	if (pcol->bio) {
-		bio_put(pcol->bio);
-		pcol->bio = NULL;
-	}
+	kfree(pcol->pages);
+	pcol->pages = NULL;
 
 	if (pcol->ios) {
 		exofs_put_io_state(pcol->ios);
@@ -132,11 +137,10 @@ static void pcol_free(struct page_collect *pcol)
 static int pcol_add_page(struct page_collect *pcol, struct page *page,
 			 unsigned len)
 {
-	int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
-	if (unlikely(len != added_len))
+	if (unlikely(pcol->nr_pages >= pcol->alloc_pages))
 		return -ENOMEM;
 
-	++pcol->nr_pages;
+	pcol->pages[pcol->nr_pages++] = page;
 	pcol->length += len;
 	return 0;
 }
@@ -181,7 +185,6 @@ static void update_write_page(struct page *page, int ret)
  */
 static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 {
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64 good_bytes;
@@ -198,8 +201,8 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;
 
@@ -218,7 +221,7 @@ static int __readpages_done(struct page_collect *pcol, bool do_unlock)
 		ret = update_read_page(page, page_stat);
 		if (do_unlock)
 			unlock_page(page);
-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}
 
 	pcol_free(pcol);
@@ -238,11 +241,10 @@ static void readpages_done(struct exofs_io_state *ios, void *p)
 
 static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
 {
-	struct bio_vec *bvec;
 	int i;
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 
 		if (rw == READ)
 			update_read_page(page, ret);
@@ -260,13 +262,14 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
 	struct page_collect *pcol_copy = NULL;
 	int ret;
 
-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;
 
 	/* see comment in _readpage() about sync reads */
 	WARN_ON(is_sync && (pcol->nr_pages != 1));
 
-	ios->bio = pcol->bio;
+	ios->pages = pcol->pages;
+	ios->nr_pages = pcol->nr_pages;
 	ios->length = pcol->length;
 	ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
 
@@ -366,7 +369,7 @@ try_again:
 		goto try_again;
 	}
 
-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
@@ -448,7 +451,6 @@ static int exofs_readpage(struct file *file, struct page *page)
 static void writepages_done(struct exofs_io_state *ios, void *p)
 {
 	struct page_collect *pcol = p;
-	struct bio_vec *bvec;
 	int i;
 	u64 resid;
 	u64  good_bytes;
@@ -467,8 +469,8 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
 		     pcol->nr_pages);
 
-	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
-		struct page *page = bvec->bv_page;
+	for (i = 0; i < pcol->nr_pages; i++) {
+		struct page *page = pcol->pages[i];
 		struct inode *inode = page->mapping->host;
 		int page_stat;
 
@@ -485,7 +487,7 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
 		EXOFS_DBGMSG2("    writepages_done(0x%lx, 0x%lx) status=%d\n",
 			     inode->i_ino, page->index, page_stat);
 
-		length += bvec->bv_len;
+		length += PAGE_SIZE;
 	}
 
 	pcol_free(pcol);
@@ -500,7 +502,7 @@ static int write_exec(struct page_collect *pcol)
 	struct page_collect *pcol_copy = NULL;
 	int ret;
 
-	if (!pcol->bio)
+	if (!pcol->pages)
 		return 0;
 
 	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -512,9 +514,8 @@ static int write_exec(struct page_collect *pcol)
 
 	*pcol_copy = *pcol;
 
-	pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
-
-	ios->bio = pcol_copy->bio;
+	ios->pages = pcol_copy->pages;
+	ios->nr_pages = pcol_copy->nr_pages;
 	ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
 	ios->length = pcol_copy->length;
 	ios->done = writepages_done;
@@ -605,7 +606,7 @@ try_again:
 		goto try_again;
 	}
 
-	if (!pcol->bio) {
+	if (!pcol->pages) {
 		ret = pcol_try_alloc(pcol);
 		if (unlikely(ret))
 			goto fail;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 6e446b2670b9..263052c77f41 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -283,10 +283,11 @@ static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset,
 	*dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1;
 }
 
-static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
-		     struct exofs_per_dev_state *per_dev, int cur_len)
+static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
+		unsigned pgbase, struct exofs_per_dev_state *per_dev,
+		int cur_len)
 {
-	unsigned bv = *cur_bvec;
+	unsigned pg = *cur_pg;
 	struct request_queue *q =
 			osd_request_queue(exofs_ios_od(ios, per_dev->dev));
 
@@ -295,7 +296,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
 	if (per_dev->bio == NULL) {
 		unsigned pages_in_stripe = ios->layout->group_width *
 					(ios->layout->stripe_unit / PAGE_SIZE);
-		unsigned bio_size = (ios->bio->bi_vcnt + pages_in_stripe) /
+		unsigned bio_size = (ios->nr_pages + pages_in_stripe) /
 						ios->layout->group_width;
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
@@ -307,21 +308,22 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_bvec,
 	}
 
 	while (cur_len > 0) {
-		int added_len;
-		struct bio_vec *bvec = &ios->bio->bi_io_vec[bv];
+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+		unsigned added_len;
 
-		BUG_ON(ios->bio->bi_vcnt <= bv);
-		cur_len -= bvec->bv_len;
+		BUG_ON(ios->nr_pages <= pg);
+		cur_len -= pglen;
 
-		added_len = bio_add_pc_page(q, per_dev->bio, bvec->bv_page,
-					    bvec->bv_len, bvec->bv_offset);
-		if (unlikely(bvec->bv_len != added_len))
+		added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg],
+					    pglen, pgbase);
+		if (unlikely(pglen != added_len))
 			return -ENOMEM;
-		++bv;
+		pgbase = 0;
+		++pg;
 	}
 	BUG_ON(cur_len);
 
-	*cur_bvec = bv;
+	*cur_pg = pg;
 	return 0;
 }
 
@@ -332,10 +334,10 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
 	unsigned stripe_unit = ios->layout->stripe_unit;
 	unsigned comp = 0;
 	unsigned stripes = 0;
-	unsigned cur_bvec = 0;
-	int ret;
+	unsigned cur_pg = 0;
+	int ret = 0;
 
-	if (!ios->bio) {
+	if (!ios->pages) {
 		if (ios->kern_buff) {
 			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
 			unsigned unit_off;
@@ -352,7 +354,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
 
 	while (length) {
 		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
-		unsigned cur_len;
+		unsigned cur_len, page_off;
 
 		if (!per_dev->length) {
 			unsigned unit_off;
@@ -362,11 +364,15 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
 			stripes++;
 			cur_len = min_t(u64, stripe_unit - unit_off, length);
 			offset += cur_len;
+			page_off = unit_off & ~PAGE_MASK;
+			BUG_ON(page_off != ios->pgbase);
 		} else {
 			cur_len = min_t(u64, stripe_unit, length);
+			page_off = 0;
 		}
 
-		ret = _add_stripe_unit(ios, &cur_bvec, per_dev, cur_len);
+		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+				       cur_len);
 		if (unlikely(ret))
 			goto out;
 
@@ -448,7 +454,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 		per_dev->or = or;
 		per_dev->offset = master_dev->offset;
 
-		if (ios->bio) {
+		if (ios->pages) {
 			struct bio *bio;
 
 			if (per_dev != master_dev) {
@@ -541,7 +547,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
 	}
 	per_dev->or = or;
 
-	if (ios->bio) {
+	if (ios->pages) {
 		osd_req_read(or, &ios->obj, per_dev->offset,
 				per_dev->bio, per_dev->length);
 		EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
-- 
cgit v1.2.3


From 96391e2bae0f8882b6f44809202a68be66e91dce Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 9 Feb 2010 11:43:21 +0200
Subject: exofs: Error recovery if object is missing from storage

If an object is referenced by a directory but does not
exist on a target, it is a very serious corruption that
means:
1. Either a power failure with very slim chance of it
  happening. Because the directory update is always submitted
  much after object creation, but if a directory is written
  to one device and the object creation to another it might
  theoretically happen.
2. It only ever happened to me while developing with BUGs
  causing file corruption. Crashes could also cause it but
  they are more like case 1.

In any way the object does not exist, so data is surely lost.
If there is a mix-up in the obj-id or data-map, then lost objects
can be salvaged by off-line fsck. The only recoverable information
is the directory name. By letting it appear as a regular empty file,
with date==0 (1970 Jan 1st) ownership to root, we enable recovery
of the only useful information. And also enable deletion or over-write.
I can see how this can hurt.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/inode.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6ca0b0117f04..5514f3c2c2f4 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -903,8 +903,18 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
 	ios->in_attr_len = ARRAY_SIZE(attrs);
 
 	ret = exofs_sbi_read(ios);
-	if (ret)
+	if (unlikely(ret)) {
+		EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
+			  _LLU(ios->obj.id), ret);
+		memset(inode, 0, sizeof(*inode));
+		inode->i_mode = 0040000 | (0777 & ~022);
+		/* If object is lost on target we might as well enable it's
+		 * delete.
+		 */
+		if ((ret == -ENOENT) || (ret == -EINVAL))
+			ret = 0;
 		goto out;
+	}
 
 	ret = extract_attr_from_ios(ios, &attrs[0]);
 	if (ret) {
-- 
cgit v1.2.3


From b367e78bd1c7af4c018ce98b1f6d3e001aba895a Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 7 Feb 2010 19:18:58 +0200
Subject: exofs: Prepare for groups

* Rename _offset_dev_unit_off() to _calc_stripe_info()
  and recieve a struct for the output params

* In _prepare_for_striping we only need to call
  _calc_stripe_info() once. The other componets
  are easy to calculate from that. This code
  was inspired by what's done in truncate.

* Some code shifts that make sense now but will make
  more sense when group support is added.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/ios.c | 159 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 99 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 263052c77f41..d28febdf54ab 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -259,28 +259,46 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 	return acumulated_lin_err;
 }
 
-/* REMOVEME: After review
-   Some quoteing from the standard
-
-   L = logical offset into the file
-   W = number of data components in a stripe
-   S = W * stripe_unit (S is Stripe length)
-   N = L / S (N is the stripe Number)
-   C = (L-(N*S)) / stripe_unit (C is the component)
-   O = (N*stripe_unit)+(L%stripe_unit) (O is the object's offset)
-*/
-
-static void _offset_dev_unit_off(struct exofs_io_state *ios, u64 file_offset,
-			u64 *obj_offset, unsigned *dev, unsigned *unit_off)
+/*
+ * L - logical offset into the file
+ *
+ * U - The number of bytes in a full stripe
+ *
+ *	U = stripe_unit * group_width
+ *
+ * N - The stripe number
+ *
+ *	N = L / U
+ *
+ * C - The component index coresponding to L
+ *
+ *	C = (L - (N*U)) / stripe_unit
+ *
+ * O - The component offset coresponding to L
+ *
+ *	(N*stripe_unit)+(L%stripe_unit)
+ */
+
+struct _striping_info {
+	u64 obj_offset;
+	unsigned dev;
+	unsigned unit_off;
+};
+
+static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
+			      struct _striping_info *si)
 {
-	unsigned stripe_unit = ios->layout->stripe_unit;
-	unsigned stripe_length = stripe_unit * ios->layout->group_width;
-	u64 stripe_no = file_offset;
-	unsigned stripe_mod = do_div(stripe_no, stripe_length);
+	u32	stripe_unit = ios->layout->stripe_unit;
+	u32	group_width = ios->layout->group_width;
+	u32	U = stripe_unit * group_width;
+
+	u32	LmodU;
+	u64 	N = div_u64_rem(file_offset, U, &LmodU);
 
-	*unit_off = stripe_mod % stripe_unit;
-	*obj_offset = stripe_no * stripe_unit + *unit_off;
-	*dev = stripe_mod / stripe_unit * ios->layout->mirrors_p1;
+	si->unit_off = LmodU % stripe_unit;
+	si->obj_offset = N * stripe_unit + si->unit_off;
+	si->dev = LmodU / stripe_unit;
+	si->dev *= ios->layout->mirrors_p1;
 }
 
 static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
@@ -327,65 +345,88 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
 	return 0;
 }
 
-static int _prepare_for_striping(struct exofs_io_state *ios)
+static int _prepare_pages(struct exofs_io_state *ios,
+			  struct _striping_info *si)
 {
 	u64 length = ios->length;
-	u64 offset = ios->offset;
 	unsigned stripe_unit = ios->layout->stripe_unit;
+	unsigned mirrors_p1 = ios->layout->mirrors_p1;
+	unsigned dev = si->dev;
 	unsigned comp = 0;
 	unsigned stripes = 0;
 	unsigned cur_pg = 0;
 	int ret = 0;
 
-	if (!ios->pages) {
-		if (ios->kern_buff) {
-			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
-			unsigned unit_off;
-
-			_offset_dev_unit_off(ios, offset, &per_dev->offset,
-					     &per_dev->dev, &unit_off);
-			/* no cross device without page array */
-			BUG_ON((ios->layout->group_width > 1) &&
-			       (unit_off + length > stripe_unit));
-		}
-		ios->numdevs = ios->layout->mirrors_p1;
-		return 0;
-	}
-
 	while (length) {
 		struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
-		unsigned cur_len, page_off;
+		unsigned cur_len, page_off = 0;
 
 		if (!per_dev->length) {
-			unsigned unit_off;
+			per_dev->dev = dev;
+			if (dev < si->dev) {
+				per_dev->offset = si->obj_offset + stripe_unit -
+								   si->unit_off;
+				cur_len = stripe_unit;
+			} else if (dev == si->dev) {
+				per_dev->offset = si->obj_offset;
+				cur_len = stripe_unit - si->unit_off;
+				page_off = si->unit_off & ~PAGE_MASK;
+				BUG_ON(page_off && (page_off != ios->pgbase));
+			} else { /* dev > si->dev */
+				per_dev->offset = si->obj_offset - si->unit_off;
+				cur_len = stripe_unit;
+			}
 
-			_offset_dev_unit_off(ios, offset, &per_dev->offset,
-					     &per_dev->dev, &unit_off);
 			stripes++;
-			cur_len = min_t(u64, stripe_unit - unit_off, length);
-			offset += cur_len;
-			page_off = unit_off & ~PAGE_MASK;
-			BUG_ON(page_off != ios->pgbase);
+
+			dev += mirrors_p1;
+			dev %= ios->layout->s_numdevs;
 		} else {
-			cur_len = min_t(u64, stripe_unit, length);
-			page_off = 0;
+			cur_len = stripe_unit;
 		}
+		if (cur_len >= length)
+			cur_len = length;
 
 		ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
 				       cur_len);
 		if (unlikely(ret))
 			goto out;
 
-		comp += ios->layout->mirrors_p1;
+		comp += mirrors_p1;
 		comp %= ios->layout->s_numdevs;
 
 		length -= cur_len;
 	}
 out:
-	ios->numdevs = stripes * ios->layout->mirrors_p1;
+	ios->numdevs = stripes * mirrors_p1;
 	return ret;
 }
 
+static int _prepare_for_striping(struct exofs_io_state *ios)
+{
+	struct _striping_info si;
+
+	_calc_stripe_info(ios, ios->offset, &si);
+
+	if (!ios->pages) {
+		if (ios->kern_buff) {
+			struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+
+			per_dev->offset = si.obj_offset;
+			per_dev->dev = si.dev;
+
+			/* no cross device without page array */
+			BUG_ON((ios->layout->group_width > 1) &&
+			       (si.unit_off + ios->length >
+				ios->layout->stripe_unit));
+		}
+		ios->numdevs = ios->layout->mirrors_p1;
+		return 0;
+	}
+
+	return _prepare_pages(ios, &si);
+}
+
 int exofs_sbi_create(struct exofs_io_state *ios)
 {
 	int i, ret;
@@ -648,9 +689,7 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 		struct osd_attr attr;
 		__be64 newsize;
 	} *size_attrs;
-	u64 this_obj_size;
-	unsigned dev;
-	unsigned unit_off;
+	struct _striping_info si;
 	int i, ret;
 
 	ret = exofs_get_io_state(&sbi->layout, &ios);
@@ -668,19 +707,19 @@ int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
 	ios->cred = oi->i_cred;
 
 	ios->numdevs = ios->layout->s_numdevs;
-	_offset_dev_unit_off(ios, size, &this_obj_size, &dev, &unit_off);
+	_calc_stripe_info(ios, size, &si);
 
 	for (i = 0; i < ios->layout->group_width; ++i) {
 		struct exofs_trunc_attr *size_attr = &size_attrs[i];
 		u64 obj_size;
 
-		if (i < dev)
-			obj_size = this_obj_size +
-					ios->layout->stripe_unit - unit_off;
-		else if (i == dev)
-			obj_size = this_obj_size;
-		else /* i > dev */
-			obj_size = this_obj_size - unit_off;
+		if (i < si.dev)
+			obj_size = si.obj_offset +
+					ios->layout->stripe_unit - si.unit_off;
+		else if (i == si.dev)
+			obj_size = si.obj_offset;
+		else /* i > si.dev */
+			obj_size = si.obj_offset - si.unit_off;
 
 		size_attr->newsize = cpu_to_be64(obj_size);
 		size_attr->attr = g_attr_logical_length;
-- 
cgit v1.2.3


From 50a76fd3c352ed2740eba01512efcfceee0703be Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 11 Feb 2010 13:01:39 +0200
Subject: exofs: groups support

* _calc_stripe_info() changes to accommodate for grouping
  calculations. Returns additional information

* old _prepare_pages() becomes _prepare_one_group()
  which stores pages belonging to one device group.

* New _prepare_for_striping iterates on all groups calling
  _prepare_one_group().

* Enable mounting of groups data_maps (group_width != 0)

[QUESTION]
what is faster A or B;
A.	x += stride;
	x = x % width + first_x;

B	x += stride
	if (x < last_x)
		x = first_x;

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |   3 ++
 fs/exofs/ios.c   | 129 +++++++++++++++++++++++++++++++++++++++++++++----------
 fs/exofs/super.c |  46 ++++++++++++++------
 3 files changed, 141 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index acfebd36de83..59b8bf2825c7 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -63,6 +63,8 @@ struct exofs_layout {
 	unsigned mirrors_p1;
 
 	unsigned group_width;
+	u64	 group_depth;
+	unsigned group_count;
 
 	enum exofs_inode_layout_gen_functions lay_func;
 
@@ -132,6 +134,7 @@ struct exofs_io_state {
 	struct page		**pages;
 	unsigned		nr_pages;
 	unsigned		pgbase;
+	unsigned		pages_consumed;
 
 	/* Attributes */
 	unsigned		in_attr_len;
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index d28febdf54ab..5293bc411d17 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -262,25 +262,50 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 /*
  * L - logical offset into the file
  *
- * U - The number of bytes in a full stripe
+ * U - The number of bytes in a stripe within a group
  *
  *	U = stripe_unit * group_width
  *
- * N - The stripe number
+ * T - The number of bytes striped within a group of component objects
+ *     (before advancing to the next group)
  *
- *	N = L / U
+ *	T = stripe_unit * group_width * group_depth
+ *
+ * S - The number of bytes striped across all component objects
+ *     before the pattern repeats
+ *
+ *	S = stripe_unit * group_width * group_depth * group_count
+ *
+ * M - The "major" (i.e., across all components) stripe number
+ *
+ *	M = L / S
+ *
+ * G - Counts the groups from the beginning of the major stripe
+ *
+ *	G = (L - (M * S)) / T	[or (L % S) / T]
+ *
+ * H - The byte offset within the group
+ *
+ *	H = (L - (M * S)) % T	[or (L % S) % T]
+ *
+ * N - The "minor" (i.e., across the group) stripe number
+ *
+ *	N = H / U
  *
  * C - The component index coresponding to L
  *
- *	C = (L - (N*U)) / stripe_unit
+ *	C = (H - (N * U)) / stripe_unit + G * group_width
+ *	[or (L % U) / stripe_unit + G * group_width]
  *
  * O - The component offset coresponding to L
  *
- *	(N*stripe_unit)+(L%stripe_unit)
+ *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
  */
-
 struct _striping_info {
 	u64 obj_offset;
+	u64 group_length;
+	u64 total_group_length;
+	u64 Major;
 	unsigned dev;
 	unsigned unit_off;
 };
@@ -290,15 +315,35 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
 {
 	u32	stripe_unit = ios->layout->stripe_unit;
 	u32	group_width = ios->layout->group_width;
+	u64	group_depth = ios->layout->group_depth;
+
 	u32	U = stripe_unit * group_width;
+	u64	T = U * group_depth;
+	u64	S = T * ios->layout->group_count;
+	u64	M = div64_u64(file_offset, S);
+
+	/*
+	G = (L - (M * S)) / T
+	H = (L - (M * S)) % T
+	*/
+	u64	LmodS = file_offset - M * S;
+	u32	G = div64_u64(LmodS, T);
+	u64	H = LmodS - G * T;
+
+	u32	N = div_u64(H, U);
+
+	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
+	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+	si->dev *= ios->layout->mirrors_p1;
 
-	u32	LmodU;
-	u64 	N = div_u64_rem(file_offset, U, &LmodU);
+	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
 
-	si->unit_off = LmodU % stripe_unit;
-	si->obj_offset = N * stripe_unit + si->unit_off;
-	si->dev = LmodU / stripe_unit;
-	si->dev *= ios->layout->mirrors_p1;
+	si->obj_offset = si->unit_off + (N * stripe_unit) +
+				  (M * group_depth * stripe_unit);
+
+	si->group_length = T - H;
+	si->total_group_length = T;
+	si->Major = M;
 }
 
 static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
@@ -345,16 +390,17 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
 	return 0;
 }
 
-static int _prepare_pages(struct exofs_io_state *ios,
-			  struct _striping_info *si)
+static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
+			      struct _striping_info *si, unsigned first_comp)
 {
-	u64 length = ios->length;
 	unsigned stripe_unit = ios->layout->stripe_unit;
 	unsigned mirrors_p1 = ios->layout->mirrors_p1;
+	unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
 	unsigned dev = si->dev;
-	unsigned comp = 0;
-	unsigned stripes = 0;
-	unsigned cur_pg = 0;
+	unsigned first_dev = dev - (dev % devs_in_group);
+	unsigned comp = first_comp + (dev - first_dev);
+	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+	unsigned cur_pg = ios->pages_consumed;
 	int ret = 0;
 
 	while (length) {
@@ -377,10 +423,11 @@ static int _prepare_pages(struct exofs_io_state *ios,
 				cur_len = stripe_unit;
 			}
 
-			stripes++;
+			if (max_comp < comp)
+				max_comp = comp;
 
 			dev += mirrors_p1;
-			dev %= ios->layout->s_numdevs;
+			dev = (dev % devs_in_group) + first_dev;
 		} else {
 			cur_len = stripe_unit;
 		}
@@ -393,18 +440,24 @@ static int _prepare_pages(struct exofs_io_state *ios,
 			goto out;
 
 		comp += mirrors_p1;
-		comp %= ios->layout->s_numdevs;
+		comp = (comp % devs_in_group) + first_comp;
 
 		length -= cur_len;
 	}
 out:
-	ios->numdevs = stripes * mirrors_p1;
+	ios->numdevs = max_comp + mirrors_p1;
+	ios->pages_consumed = cur_pg;
 	return ret;
 }
 
 static int _prepare_for_striping(struct exofs_io_state *ios)
 {
+	u64 length = ios->length;
 	struct _striping_info si;
+	unsigned devs_in_group = ios->layout->group_width *
+				 ios->layout->mirrors_p1;
+	unsigned first_comp = 0;
+	int ret = 0;
 
 	_calc_stripe_info(ios, ios->offset, &si);
 
@@ -424,7 +477,31 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
 		return 0;
 	}
 
-	return _prepare_pages(ios, &si);
+	while (length) {
+		if (length < si.group_length)
+			si.group_length = length;
+
+		ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
+		if (unlikely(ret))
+			goto out;
+
+		length -= si.group_length;
+
+		si.group_length = si.total_group_length;
+		si.unit_off = 0;
+		++si.Major;
+		si.obj_offset = si.Major * ios->layout->stripe_unit *
+						ios->layout->group_depth;
+
+		si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
+		si.dev %= ios->layout->s_numdevs;
+
+		first_comp += devs_in_group;
+		first_comp %= ios->layout->s_numdevs;
+	}
+
+out:
+	return ret;
 }
 
 int exofs_sbi_create(struct exofs_io_state *ios)
@@ -482,6 +559,9 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
 	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
 	int ret = 0;
 
+	if (ios->pages && !master_dev->length)
+		return 0; /* Just an empty slot */
+
 	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
 		struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 		struct osd_request *or;
@@ -580,6 +660,9 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
 	struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp];
 	unsigned first_dev = (unsigned)ios->obj.id;
 
+	if (ios->pages && !per_dev->length)
+		return 0; /* Just an empty slot */
+
 	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
 	or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL);
 	if (unlikely(!or)) {
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8f4e4b37a578..6cf5e4e84d61 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -323,11 +323,7 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 	sbi->data_map.odm_raid_algorithm  =
 				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
 
-/* FIXME: Only raid0 !group_width/depth for now. if not so, do not mount */
-	if (sbi->data_map.odm_group_width || sbi->data_map.odm_group_depth) {
-		EXOFS_ERR("Group width/depth not supported\n");
-		return -EINVAL;
-	}
+/* FIXME: Only raid0 for now. if not so, do not mount */
 	if (sbi->data_map.odm_num_comps != numdevs) {
 		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
 			  sbi->data_map.odm_num_comps, numdevs);
@@ -343,14 +339,6 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 		return -EINVAL;
 	}
 
-	stripe_length = sbi->data_map.odm_stripe_unit *
-			(numdevs / (sbi->data_map.odm_mirror_cnt + 1));
-	if (stripe_length >= (1ULL << 32)) {
-		EXOFS_ERR("Total Stripe length(0x%llx)"
-			  " >= 32bit is not supported\n", _LLU(stripe_length));
-		return -EINVAL;
-	}
-
 	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
 		EXOFS_ERR("Stripe Unit(0x%llx)"
 			  " must be Multples of PAGE_SIZE(0x%lx)\n",
@@ -360,8 +348,36 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 
 	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
 	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
-	sbi->layout.group_width = sbi->data_map.odm_num_comps /
+
+	if (sbi->data_map.odm_group_width) {
+		sbi->layout.group_width = sbi->data_map.odm_group_width;
+		sbi->layout.group_depth = sbi->data_map.odm_group_depth;
+		if (!sbi->layout.group_depth) {
+			EXOFS_ERR("group_depth == 0 && group_width != 0\n");
+			return -EINVAL;
+		}
+		sbi->layout.group_count = sbi->data_map.odm_num_comps /
+						sbi->layout.mirrors_p1 /
+						sbi->data_map.odm_group_width;
+	} else {
+		if (sbi->data_map.odm_group_depth) {
+			printk(KERN_NOTICE "Warning: group_depth ignored "
+				"group_width == 0 && group_depth == %d\n",
+				sbi->data_map.odm_group_depth);
+			sbi->data_map.odm_group_depth = 0;
+		}
+		sbi->layout.group_width = sbi->data_map.odm_num_comps /
 							sbi->layout.mirrors_p1;
+		sbi->layout.group_depth = -1;
+		sbi->layout.group_count = 1;
+	}
+
+	stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
+	if (stripe_length >= (1ULL << 32)) {
+		EXOFS_ERR("Total Stripe length(0x%llx)"
+			  " >= 32bit is not supported\n", _LLU(stripe_length));
+		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -540,6 +556,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->layout.stripe_unit = PAGE_SIZE;
 	sbi->layout.mirrors_p1 = 1;
 	sbi->layout.group_width = 1;
+	sbi->layout.group_depth = -1;
+	sbi->layout.group_count = 1;
 	sbi->layout.s_ods[0] = od;
 	sbi->layout.s_numdevs = 1;
 	sbi->layout.s_pid = opts->pid;
-- 
cgit v1.2.3


From 9f7cdbc33f36d28e57eaba0093f68f0d14c38c5b Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sat, 27 Feb 2010 20:35:12 +0300
Subject: blkdev: fix merge_bvec_fn return value checks

merge_bvec_fn() returns bvec->bv_len on success. So we have to check
against this value. But in case of fs_optimization merge we compare
with wrong value. This patch must be included in
 b428cd6da7e6559aca69aa2e3a526037d3f20403
But accidentally i've forgot to add this in the initial patch.
To make things straight let's replace all such checks.
In fact this makes code easy to understand.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index dc17afd672e3..0bda289f86fc 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -555,7 +555,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 					.bi_rw = bio->bi_rw,
 				};
 
-				if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+				if (q->merge_bvec_fn(q, &bvm, prev) != prev->bv_len) {
 					prev->bv_len -= len;
 					return 0;
 				}
@@ -608,7 +608,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
+		if (q->merge_bvec_fn(q, &bvm, bvec) != bvec->bv_len) {
 			bvec->bv_page = NULL;
 			bvec->bv_len = 0;
 			bvec->bv_offset = 0;
-- 
cgit v1.2.3


From 009d851837ab26cab18adda6169a813f70b0b21b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 8 Dec 2009 12:12:13 +0000
Subject: GFS2: Metadata address space clean up

Since the start of GFS2, an "extra" inode has been used to store
the metadata belonging to each inode. The only reason for using
this inode was to have an extra address space, the other fields
were unused. This means that the memory usage was rather inefficient.

The reason for keeping each inode's metadata in a separate address
space is that when glocks are requested on remote nodes, we need to
be able to efficiently locate the data and metadata which relating
to that glock (inode) in order to sync or sync and invalidate it
(depending on the remotely requested lock mode).

This patch adds a new type of glock, which has in addition to
its normal fields, has an address space. This applies to all
inode and rgrp glocks (but to no other glock types which remain
as before). As a result, we no longer need to have the second
inode.

This results in three major improvements:
 1. A saving of approx 25% of memory used in caching inodes
 2. A removal of the circular dependency between inodes and glocks
 3. No confusion between "normal" and "metadata" inodes in super.c

Although the first of these is the more immediately apparent, the
second is just as important as it now enables a number of clean
ups at umount time. Those will be the subject of future patches.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c     |  4 ++--
 fs/gfs2/glock.c    | 40 +++++++++++++++++++++-------------------
 fs/gfs2/glock.h    |  7 +++++++
 fs/gfs2/glops.c    | 16 +++++++++-------
 fs/gfs2/incore.h   |  4 ++--
 fs/gfs2/inode.c    |  6 ++----
 fs/gfs2/lock_dlm.c |  5 ++++-
 fs/gfs2/main.c     | 28 ++++++++++++++++++++++++++++
 fs/gfs2/meta_io.c  | 46 ++++++----------------------------------------
 fs/gfs2/meta_io.h  | 12 ++++++++++--
 fs/gfs2/super.c    | 26 ++++++++------------------
 fs/gfs2/util.c     |  1 +
 fs/gfs2/util.h     |  1 +
 13 files changed, 101 insertions(+), 95 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7b8da9415267..0c1d0b82dcf1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1061,8 +1061,8 @@ out:
 
 int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 {
-	struct inode *aspace = page->mapping->host;
-	struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
+	struct address_space *mapping = page->mapping;
+	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
 	struct buffer_head *bh, *head;
 	struct gfs2_bufdata *bd;
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f42663325931..dfb10a4d467e 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -154,12 +154,14 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
 static void glock_free(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct inode *aspace = gl->gl_aspace;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
+	struct kmem_cache *cachep = gfs2_glock_cachep;
 
-	if (aspace)
-		gfs2_aspace_put(aspace);
+	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
 	trace_gfs2_glock_put(gl);
-	sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
+	if (mapping)
+		cachep = gfs2_glock_aspace_cachep;
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
 }
 
 /**
@@ -750,10 +752,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		   const struct gfs2_glock_operations *glops, int create,
 		   struct gfs2_glock **glp)
 {
+	struct super_block *s = sdp->sd_vfs;
 	struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
 	struct gfs2_glock *gl, *tmp;
 	unsigned int hash = gl_hash(sdp, &name);
-	int error;
+	struct address_space *mapping;
 
 	read_lock(gl_lock_addr(hash));
 	gl = search_bucket(hash, sdp, &name);
@@ -765,7 +768,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	if (!create)
 		return -ENOENT;
 
-	gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+	if (glops->go_flags & GLOF_ASPACE)
+		gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
+	else
+		gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
 	if (!gl)
 		return -ENOMEM;
 
@@ -784,18 +790,18 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
-	gl->gl_aspace = NULL;
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
 	INIT_WORK(&gl->gl_delete, delete_work_func);
 
-	/* If this glock protects actual on-disk data or metadata blocks,
-	   create a VFS inode to manage the pages/buffers holding them. */
-	if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
-		gl->gl_aspace = gfs2_aspace_get(sdp);
-		if (!gl->gl_aspace) {
-			error = -ENOMEM;
-			goto fail;
-		}
+	mapping = gfs2_glock2aspace(gl);
+	if (mapping) {
+                mapping->a_ops = &gfs2_meta_aops;
+		mapping->host = s->s_bdev->bd_inode;
+		mapping->flags = 0;
+		mapping_set_gfp_mask(mapping, GFP_NOFS);
+		mapping->assoc_mapping = NULL;
+		mapping->backing_dev_info = s->s_bdi;
+		mapping->writeback_index = 0;
 	}
 
 	write_lock(gl_lock_addr(hash));
@@ -812,10 +818,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	*glp = gl;
 
 	return 0;
-
-fail:
-	kmem_cache_free(gfs2_glock_cachep, gl);
-	return error;
 }
 
 /**
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c0262faf4725..2bda1911b156 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -180,6 +180,13 @@ static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
 	return gl->gl_state == LM_ST_SHARED;
 }
 
+static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
+{
+	if (gl->gl_ops->go_flags & GLOF_ASPACE)
+		return (struct address_space *)(gl + 1);
+	return NULL;
+}
+
 int gfs2_glock_get(struct gfs2_sbd *sdp,
 		   u64 number, const struct gfs2_glock_operations *glops,
 		   int create, struct gfs2_glock **glp);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 78554acc0605..38e3749d476c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -87,7 +87,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 
 static void rgrp_go_sync(struct gfs2_glock *gl)
 {
-	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	struct address_space *metamapping = gfs2_glock2aspace(gl);
 	int error;
 
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
@@ -113,7 +113,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 
 static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 
 	BUG_ON(!(flags & DIO_METADATA));
 	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
@@ -134,7 +134,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 static void inode_go_sync(struct gfs2_glock *gl)
 {
 	struct gfs2_inode *ip = gl->gl_object;
-	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	struct address_space *metamapping = gfs2_glock2aspace(gl);
 	int error;
 
 	if (ip && !S_ISREG(ip->i_inode.i_mode))
@@ -183,7 +183,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
 
 	if (flags & DIO_METADATA) {
-		struct address_space *mapping = gl->gl_aspace->i_mapping;
+		struct address_space *mapping = gfs2_glock2aspace(gl);
 		truncate_inode_pages(mapping, 0);
 		if (ip) {
 			set_bit(GIF_INVALID, &ip->i_flags);
@@ -282,7 +282,8 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 
 static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
 {
-	return !gl->gl_aspace->i_mapping->nrpages;
+	const struct address_space *mapping = (const struct address_space *)(gl + 1);
+	return !mapping->nrpages;
 }
 
 /**
@@ -387,8 +388,7 @@ static void iopen_go_callback(struct gfs2_glock *gl)
 	struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
 
 	if (gl->gl_demote_state == LM_ST_UNLOCKED &&
-	    gl->gl_state == LM_ST_SHARED &&
-	    ip && test_bit(GIF_USER, &ip->i_flags)) {
+	    gl->gl_state == LM_ST_SHARED && ip) {
 		gfs2_glock_hold(gl);
 		if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
 			gfs2_glock_put_nolock(gl);
@@ -407,6 +407,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_dump = inode_go_dump,
 	.go_type = LM_TYPE_INODE,
 	.go_min_hold_time = HZ / 5,
+	.go_flags = GLOF_ASPACE,
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -418,6 +419,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
 	.go_min_hold_time = HZ / 5,
+	.go_flags = GLOF_ASPACE,
 };
 
 const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index bc0ad158e6b4..1de7e1b7ce83 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -162,6 +162,8 @@ struct gfs2_glock_operations {
 	void (*go_callback) (struct gfs2_glock *gl);
 	const int go_type;
 	const unsigned long go_min_hold_time;
+	const unsigned long go_flags;
+#define GLOF_ASPACE 1
 };
 
 enum {
@@ -225,7 +227,6 @@ struct gfs2_glock {
 
 	struct gfs2_sbd *gl_sbd;
 
-	struct inode *gl_aspace;
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
 	struct delayed_work gl_work;
@@ -258,7 +259,6 @@ enum {
 	GIF_INVALID		= 0,
 	GIF_QD_LOCKED		= 1,
 	GIF_SW_PAGED		= 3,
-	GIF_USER                = 4, /* user inode, not metadata addr space */
 };
 
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6e220f4eee7d..b1bf2694fb2b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -45,7 +45,7 @@ static int iget_test(struct inode *inode, void *opaque)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	u64 *no_addr = opaque;
 
-	if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags))
+	if (ip->i_no_addr == *no_addr)
 		return 1;
 
 	return 0;
@@ -58,7 +58,6 @@ static int iget_set(struct inode *inode, void *opaque)
 
 	inode->i_ino = (unsigned long)*no_addr;
 	ip->i_no_addr = *no_addr;
-	set_bit(GIF_USER, &ip->i_flags);
 	return 0;
 }
 
@@ -84,7 +83,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_skip_data *data = opaque;
 
-	if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){
+	if (ip->i_no_addr == data->no_addr) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
 			data->skipped = 1;
 			return 0;
@@ -103,7 +102,6 @@ static int iget_skip_set(struct inode *inode, void *opaque)
 		return 1;
 	inode->i_ino = (unsigned long)(data->no_addr);
 	ip->i_no_addr = data->no_addr;
-	set_bit(GIF_USER, &ip->i_flags);
 	return 0;
 }
 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e5e0e7022e5..569b46240f61 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -30,7 +30,10 @@ static void gdlm_ast(void *arg)
 
 	switch (gl->gl_lksb.sb_status) {
 	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
-		kmem_cache_free(gfs2_glock_cachep, gl);
+		if (gl->gl_ops->go_flags & GLOF_ASPACE)
+			kmem_cache_free(gfs2_glock_aspace_cachep, gl);
+		else
+			kmem_cache_free(gfs2_glock_cachep, gl);
 		if (atomic_dec_and_test(&sdp->sd_glock_disposal))
 			wake_up(&sdp->sd_glock_wait);
 		return;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 5b31f7741a8f..a88fadc704bb 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -52,6 +52,22 @@ static void gfs2_init_glock_once(void *foo)
 	atomic_set(&gl->gl_ail_count, 0);
 }
 
+static void gfs2_init_gl_aspace_once(void *foo)
+{
+	struct gfs2_glock *gl = foo;
+	struct address_space *mapping = (struct address_space *)(gl + 1);
+
+	gfs2_init_glock_once(gl);
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+}
+
 /**
  * init_gfs2_fs - Register GFS2 as a filesystem
  *
@@ -78,6 +94,14 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_glock_cachep)
 		goto fail;
 
+	gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock (aspace)",
+					sizeof(struct gfs2_glock) +
+					sizeof(struct address_space),
+					0, 0, gfs2_init_gl_aspace_once);
+
+	if (!gfs2_glock_aspace_cachep)
+		goto fail;
+
 	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
 					      sizeof(struct gfs2_inode),
 					      0,  SLAB_RECLAIM_ACCOUNT|
@@ -144,6 +168,9 @@ fail:
 	if (gfs2_inode_cachep)
 		kmem_cache_destroy(gfs2_inode_cachep);
 
+	if (gfs2_glock_aspace_cachep)
+		kmem_cache_destroy(gfs2_glock_aspace_cachep);
+
 	if (gfs2_glock_cachep)
 		kmem_cache_destroy(gfs2_glock_cachep);
 
@@ -169,6 +196,7 @@ static void __exit exit_gfs2_fs(void)
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
 	kmem_cache_destroy(gfs2_bufdata_cachep);
 	kmem_cache_destroy(gfs2_inode_cachep);
+	kmem_cache_destroy(gfs2_glock_aspace_cachep);
 	kmem_cache_destroy(gfs2_glock_cachep);
 
 	gfs2_sys_uninit();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 6f68a5f18eb8..0bb12c80937a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -93,48 +93,12 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	return err;
 }
 
-static const struct address_space_operations aspace_aops = {
+const struct address_space_operations gfs2_meta_aops = {
 	.writepage = gfs2_aspace_writepage,
 	.releasepage = gfs2_releasepage,
 	.sync_page = block_sync_page,
 };
 
-/**
- * gfs2_aspace_get - Create and initialize a struct inode structure
- * @sdp: the filesystem the aspace is in
- *
- * Right now a struct inode is just a struct inode.  Maybe Linux
- * will supply a more lightweight address space construct (that works)
- * in the future.
- *
- * Make sure pages/buffers in this aspace aren't in high memory.
- *
- * Returns: the aspace
- */
-
-struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
-{
-	struct inode *aspace;
-	struct gfs2_inode *ip;
-
-	aspace = new_inode(sdp->sd_vfs);
-	if (aspace) {
-		mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
-		aspace->i_mapping->a_ops = &aspace_aops;
-		aspace->i_size = MAX_LFS_FILESIZE;
-		ip = GFS2_I(aspace);
-		clear_bit(GIF_USER, &ip->i_flags);
-		insert_inode_hash(aspace);
-	}
-	return aspace;
-}
-
-void gfs2_aspace_put(struct inode *aspace)
-{
-	remove_inode_hash(aspace);
-	iput(aspace);
-}
-
 /**
  * gfs2_meta_sync - Sync all buffers associated with a glock
  * @gl: The glock
@@ -143,7 +107,7 @@ void gfs2_aspace_put(struct inode *aspace)
 
 void gfs2_meta_sync(struct gfs2_glock *gl)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 	int error;
 
 	filemap_fdatawrite(mapping);
@@ -164,7 +128,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
 
 struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 {
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct page *page;
 	struct buffer_head *bh;
@@ -344,8 +308,10 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 
 void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host);
+	struct address_space *mapping = bh->b_page->mapping;
+	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
 	struct gfs2_bufdata *bd = bh->b_private;
+
 	if (test_clear_buffer_pinned(bh)) {
 		list_del_init(&bd->bd_le.le_list);
 		if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index de270c2f9b63..6a1d9ba16411 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -37,8 +37,16 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
 	       0, from_head - to_head);
 }
 
-struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
-void gfs2_aspace_put(struct inode *aspace);
+extern const struct address_space_operations gfs2_meta_aops;
+
+static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	if (mapping->a_ops == &gfs2_meta_aops)
+		return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+	else
+		return inode->i_sb->s_fs_info;
+}
 
 void gfs2_meta_sync(struct gfs2_glock *gl);
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b9dd3da22c0a..ad7bc2d25ac2 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -722,8 +722,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
 	int ret = 0;
 
 	/* Check this is a "normal" inode, etc */
-	if (!test_bit(GIF_USER, &ip->i_flags) ||
-	    (current->flags & PF_MEMALLOC))
+	if (current->flags & PF_MEMALLOC)
 		return 0;
 	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (ret)
@@ -1194,7 +1193,7 @@ static void gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
+	if (inode->i_nlink) {
 		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
 		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
 			clear_nlink(inode);
@@ -1212,18 +1211,12 @@ static void gfs2_clear_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	/* This tells us its a "real" inode and not one which only
-	 * serves to contain an address space (see rgrp.c, meta_io.c)
-	 * which therefore doesn't have its own glocks.
-	 */
-	if (test_bit(GIF_USER, &ip->i_flags)) {
-		ip->i_gl->gl_object = NULL;
-		gfs2_glock_put(ip->i_gl);
-		ip->i_gl = NULL;
-		if (ip->i_iopen_gh.gh_gl) {
-			ip->i_iopen_gh.gh_gl->gl_object = NULL;
-			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-		}
+	ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+	ip->i_gl = NULL;
+	if (ip->i_iopen_gh.gh_gl) {
+		ip->i_iopen_gh.gh_gl->gl_object = NULL;
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 	}
 }
 
@@ -1358,9 +1351,6 @@ static void gfs2_delete_inode(struct inode *inode)
 	struct gfs2_holder gh;
 	int error;
 
-	if (!test_bit(GIF_USER, &ip->i_flags))
-		goto out;
-
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	if (unlikely(error)) {
 		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f6a7efa34eb9..226f2bfbf16a 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -21,6 +21,7 @@
 #include "util.h"
 
 struct kmem_cache *gfs2_glock_cachep __read_mostly;
+struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly;
 struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
 struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 33e96b0ce9ab..b432e04600de 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -145,6 +145,7 @@ gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__);
 
 
 extern struct kmem_cache *gfs2_glock_cachep;
+extern struct kmem_cache *gfs2_glock_aspace_cachep;
 extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
 extern struct kmem_cache *gfs2_rgrpd_cachep;
-- 
cgit v1.2.3


From c1184f8ab7ea26681f3cab18284a870aad678b0f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 8 Jan 2010 16:14:29 +0000
Subject: GFS2: Remove loopy umount code

As a consequence of the previous patch, we can now remove the
loop which used to be required due to the circular dependency
between the inodes and glocks. Instead we can just invalidate
the inodes, and then clear up any glocks which are left.

Also we no longer need the rwsem since there is no longer any
danger of the inode invalidation calling back into the glock
code (and from there back into the inode code).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c      | 33 ++-------------------------------
 fs/gfs2/incore.h     |  1 -
 fs/gfs2/ops_fstype.c |  4 +---
 fs/gfs2/super.c      |  1 +
 fs/gfs2/sys.c        |  2 --
 5 files changed, 4 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index dfb10a4d467e..4773f9098a41 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -19,7 +19,6 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/module.h>
-#include <linux/rwsem.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
@@ -60,7 +59,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
 #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
 
-static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
 struct workqueue_struct *gfs2_delete_workqueue;
@@ -714,7 +712,6 @@ static void glock_work_func(struct work_struct *work)
 		finish_xmote(gl, gl->gl_reply);
 		drop_ref = 1;
 	}
-	down_read(&gfs2_umount_flush_sem);
 	spin_lock(&gl->gl_spin);
 	if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_state != LM_ST_UNLOCKED &&
@@ -727,7 +724,6 @@ static void glock_work_func(struct work_struct *work)
 	}
 	run_queue(gl, 0);
 	spin_unlock(&gl->gl_spin);
-	up_read(&gfs2_umount_flush_sem);
 	if (!delay ||
 	    queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
@@ -1512,35 +1508,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
-	unsigned long t;
 	unsigned int x;
-	int cont;
 
-	t = jiffies;
-
-	for (;;) {
-		cont = 0;
-		for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
-			if (examine_bucket(clear_glock, sdp, x))
-				cont = 1;
-		}
-
-		if (!cont)
-			break;
-
-		if (time_after_eq(jiffies,
-				  t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
-			fs_warn(sdp, "Unmount seems to be stalled. "
-				     "Dumping lock state...\n");
-			gfs2_dump_lockstate(sdp);
-			t = jiffies;
-		}
-
-		down_write(&gfs2_umount_flush_sem);
-		invalidate_inodes(sdp->sd_vfs);
-		up_write(&gfs2_umount_flush_sem);
-		msleep(10);
-	}
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+		examine_bucket(clear_glock, sdp, x);
 	flush_workqueue(glock_workqueue);
 	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
 	gfs2_dump_lockstate(sdp);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1de7e1b7ce83..b8025e51cabf 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -451,7 +451,6 @@ struct gfs2_tune {
 	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
 	unsigned int gt_new_files_jdata;
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
-	unsigned int gt_stall_secs; /* Detects trouble! */
 	unsigned int gt_complain_secs;
 	unsigned int gt_statfs_quantum;
 	unsigned int gt_statfs_slow;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a86ed6381566..a054b526dc08 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -65,7 +65,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_quota_scale_den = 1;
 	gt->gt_new_files_jdata = 0;
 	gt->gt_max_readahead = 1 << 18;
-	gt->gt_stall_secs = 600;
 	gt->gt_complain_secs = 10;
 }
 
@@ -1241,10 +1240,9 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
+	invalidate_inodes(sb);
 	gfs2_gl_hash_clear(sdp);
 	gfs2_lm_unmount(sdp);
-	while (invalidate_inodes(sb))
-		yield();
 fail_sys:
 	gfs2_sys_fs_del(sdp);
 fail:
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ad7bc2d25ac2..e5e22629da67 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -859,6 +859,7 @@ restart:
 	gfs2_clear_rgrpd(sdp);
 	gfs2_jindex_free(sdp);
 	/*  Take apart glock structures and buffer lists  */
+	invalidate_inodes(sdp->sd_vfs);
 	gfs2_gl_hash_clear(sdp);
 	/*  Unmount the locking protocol  */
 	gfs2_lm_unmount(sdp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 0dc34621f6a6..a0db1c94317d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -478,7 +478,6 @@ TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
-TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
@@ -491,7 +490,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
 	&tune_attr_quota_simul_sync.attr,
-	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
-- 
cgit v1.2.3


From e5884636da3a128617032747654284ae7badc7ff Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 5 Feb 2010 16:45:25 +1100
Subject: GFS2: ordered writes are backwards

When we queue data buffers for ordered write, the buffers are added
to the head of the ordered write list. When the log needs to push
these buffers to disk, it also walks the list from the head. The
result is that the the ordered buffers are submitted to disk in
reverse order.

For large writes, this means that whenever the log flushes large
streams of reverse sequential order buffers are pushed down into the
block layers. The elevators don't handle this particularly well, so
IO rates tend to be significantly lower than if the IO was issued in
ascending block order.

Queue new ordered buffers to the tail of the ordered buffer list to
ensure that IO is dispatched in the order it was submitted. This
should significantly improve large sequential write speeds. On a
disk capable of 85MB/s, speeds increase from 50MB/s to 65MB/s for
noop and from 38MB/s to 50MB/s for cfq.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index de97632ba32f..adc260fbea90 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -528,9 +528,9 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
 		sdp->sd_log_num_databuf++;
-		list_add(&le->le_list, &sdp->sd_log_le_databuf);
+		list_add_tail(&le->le_list, &sdp->sd_log_le_databuf);
 	} else {
-		list_add(&le->le_list, &sdp->sd_log_le_ordered);
+		list_add_tail(&le->le_list, &sdp->sd_log_le_ordered);
 	}
 out:
 	gfs2_log_unlock(sdp);
-- 
cgit v1.2.3


From 4818972efb105730f007e5efc05e203a065fc318 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 23 Feb 2010 12:20:00 -0500
Subject: GFS2: print glock numbers in hex

This patch changes glock numbers from printing in decimal to hex.
Since DLM prints corresponding resource IDs in hex, it makes debugging
easier.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4773f9098a41..454d4b4eb36b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1658,7 +1658,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n",
+	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
 		  state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
-- 
cgit v1.2.3


From cc483f102c3f703e853c96f95a654f0106fb2603 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 1 Mar 2010 19:06:35 -0500
Subject: ext4: Fix fencepost error in chosing choosing group vs file
 preallocation.

The ext4 multiblock allocator decides whether to use group or file
preallocation based on the file size.  When the file size reaches
s_mb_stream_request (default is 16 blocks), it changes to use a
file-specific preallocation. This is cool, but it has a tiny problem.

See a simple script:
mkfs.ext4 -b 1024 /dev/sda8 1000000
mount -t ext4 -o nodelalloc /dev/sda8 /mnt/ext4
for((i=0;i<5;i++))
do
cat /mnt/4096>>/mnt/ext4/a	#4096 is a file with 4096 characters.
cat /mnt/4096>>/mnt/ext4/b
done
debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1

And you get
BLOCKS:
(0-14):8705-8719, (15):2356, (16-19):8465-8468

So there are 3 extents, a bit strange for the lonely 15th logical
block.  As we write to the 16 blocks, we choose file preallocation in
ext4_mb_group_or_file, but in ext4_mb_normalize_request, we meet with
the 16*1024 range, so no preallocation will be carried. file b then
reserves the space after '2356', so when when write 16, we start from
another part.

This patch just change the check in ext4_mb_group_or_file, so
that for the lonely 15 we will still use group preallocation.
After the patch, we will get:
debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1
BLOCKS:
(0-15):8705-8720, (16-19):8465-8468

Looks more sane. Thanks.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 415e11f1e9ee..72d5ceb18523 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3935,7 +3935,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 
 	/* don't use group allocation for large files */
 	size = max(size, isize);
-	if (size >= sbi->s_mb_stream_request) {
+	if (size > sbi->s_mb_stream_request) {
 		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
 		return;
 	}
-- 
cgit v1.2.3


From 437ca0fda3b442dff9e591581b5e1ffdfec24660 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 1 Mar 2010 22:29:21 -0500
Subject: ext4: deprecate obsoleted mount options

Declare following list of mount options as deprecated:
 - bsddf, miniddf
 - grpid, bsdgroups, nogrpid, sysvgroups

Declare following list of default mount options as deprecated:
 - bsdgroups

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 79937e921988..c832508d5515 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1205,6 +1205,8 @@ static ext4_fsblk_t get_sb_block(void **data)
 }
 
 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
+	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
 
 #ifdef CONFIG_QUOTA
 static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
@@ -1294,16 +1296,23 @@ static int parse_options(char *options, struct super_block *sb,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			clear_opt(sbi->s_mount_opt, MINIX_DF);
 			break;
 		case Opt_minix_df:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			set_opt(sbi->s_mount_opt, MINIX_DF);
+
 			break;
 		case Opt_grpid:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			set_opt(sbi->s_mount_opt, GRPID);
+
 			break;
 		case Opt_nogrpid:
+			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
 			clear_opt(sbi->s_mount_opt, GRPID);
+
 			break;
 		case Opt_resuid:
 			if (match_int(&args[0], &option))
@@ -2449,8 +2458,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
 	if (def_mount_opts & EXT4_DEFM_DEBUG)
 		set_opt(sbi->s_mount_opt, DEBUG);
-	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
+		ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
+			"2.6.38");
 		set_opt(sbi->s_mount_opt, GRPID);
+	}
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sbi->s_mount_opt, NO_UID32);
 #ifdef CONFIG_EXT4_FS_XATTR
-- 
cgit v1.2.3


From f39490bcd1691d65dc33689222a12e1fc13dd824 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 1 Mar 2010 23:14:36 -0500
Subject: ext4: fix error handling in migrate

Set i_nlink to zero for temporary inode from very beginning.
otherwise we may fail to start new journal handle and this
inode will be unreferenced but with i_nlink == 1
Since we hold inode reference it can not be pruned.

Also add missed journal_start retval check.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/migrate.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 46a4101e0aec..8b87bd0eac95 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode)
 	}
 	i_size_write(tmp_inode, i_size_read(inode));
 	/*
-	 * We don't want the inode to be reclaimed
-	 * if we got interrupted in between. We have
-	 * this tmp inode carrying reference to the
-	 * data blocks of the original file. We set
-	 * the i_nlink to zero at the last stage after
-	 * switching the original file to extent format
+	 * Set the i_nlink to zero so it will be deleted later
+	 * when we drop inode reference.
 	 */
-	tmp_inode->i_nlink = 1;
+	tmp_inode->i_nlink = 0;
 
 	ext4_ext_tree_init(handle, tmp_inode);
 	ext4_orphan_add(handle, tmp_inode);
@@ -537,6 +533,16 @@ int ext4_ext_migrate(struct inode *inode)
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle)) {
+		/*
+		 * It is impossible to update on-disk structures without
+		 * a handle, so just rollback in-core changes and live other
+		 * work to orphan_list_cleanup()
+		 */
+		ext4_orphan_del(NULL, tmp_inode);
+		retval = PTR_ERR(handle);
+		goto out;
+	}
 
 	ei = EXT4_I(inode);
 	i_data = ei->i_data;
@@ -618,15 +624,8 @@ err_out:
 
 	/* Reset the extent details */
 	ext4_ext_tree_init(handle, tmp_inode);
-
-	/*
-	 * Set the i_nlink to zero so that
-	 * generic_drop_inode really deletes the
-	 * inode
-	 */
-	tmp_inode->i_nlink = 0;
-
 	ext4_journal_stop(handle);
+out:
 	unlock_new_inode(tmp_inode);
 	iput(tmp_inode);
 
-- 
cgit v1.2.3


From da1dafca84413145f5ac59998b4cdd06fb89f721 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 1 Mar 2010 23:15:02 -0500
Subject: ext4: explicitly remove inode from orphan list after failed direct io

Otherwise non-empty orphan list will be triggered on umount.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index edb7edc99f71..427f4690ad6d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3438,6 +3438,9 @@ retry:
 			 * but cannot extend i_size. Bail out and pretend
 			 * the write failed... */
 			ret = PTR_ERR(handle);
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+
 			goto out;
 		}
 		if (inode->i_nlink)
-- 
cgit v1.2.3


From 6e3617e579e070d3655a93ee9ed7149113e795e0 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 1 Mar 2010 23:29:39 -0500
Subject: ext4: Handle non empty on-disk orphan link

In case of truncate errors we explicitly remove inode from in-core
orphan list via orphan_del(NULL, inode) without modifying the on-disk list.

But later on, the same inode may be inserted in the orphan list again
which will result the on-disk linked list getting corrupted.  If inode
i_dtime contains valid value, then skip on-disk list modification.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a13948f8242f..608d21f873ec 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2013,6 +2013,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (err)
 		goto out_unlock;
+	/*
+	 * Due to previous errors inode may be already a part of on-disk
+	 * orphan list. If so skip on-disk list modification.
+	 */
+	if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
+		(le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+			goto mem_insert;
 
 	/* Insert this inode at the head of the on-disk orphan list... */
 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
@@ -2030,6 +2037,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	 *
 	 * This is safe: on error we're going to ignore the orphan list
 	 * anyway on the next recovery. */
+mem_insert:
 	if (!err)
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 
-- 
cgit v1.2.3


From b8b8afe236e97b6359d46d3a3f8c46455e192271 Mon Sep 17 00:00:00 2001
From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Date: Tue, 2 Mar 2010 00:21:35 -0500
Subject: ext4: make "offset" consistent in ext4_check_dir_entry()

The callers of ext4_check_dir_entry() usually pass in the "file
offset" (ext4_readdir, htree_dirblock_to_tree, search_dirblock,
ext4_dx_find_entry, empty_dir), but a few callers (add_dirent_to_buf,
ext4_delete_entry) only pass in the buffer offset.

To accomodate those last two (which would be hard to fix otherwise),
this patch changes ext4_check_dir_entry() to print the physical block
number and the relative offset as well as the passed-in offset.

Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/dir.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 0e0bef3ba91e..29857ddd9e26 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -84,9 +84,11 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 
 	if (error_msg != NULL)
 		__ext4_error(dir->i_sb, function,
-			"bad entry in directory #%lu: %s - "
-			"offset=%u, inode=%u, rec_len=%d, name_len=%d",
-			dir->i_ino, error_msg, offset,
+			"bad entry in directory #%lu: %s - block=%llu"
+			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+			dir->i_ino, error_msg, 
+			(unsigned long long) bh->b_blocknr,     
+			(unsigned) (offset%bh->b_size), offset,
 			le32_to_cpu(de->inode),
 			rlen, de->name_len);
 	return error_msg == NULL ? 1 : 0;
-- 
cgit v1.2.3


From c7064ef13b2181a489836349f9baf87df0dab28f Mon Sep 17 00:00:00 2001
From: Jiaying Zhang <jiayingz@google.com>
Date: Tue, 2 Mar 2010 13:28:44 -0500
Subject: ext4: mechanical rename some of the direct I/O get_block's
 identifiers

This commit renames some of the direct I/O's block allocation flags,
variables, and functions introduced in Mingming's "Direct IO for holes
and fallocate" patches so that they can be used by ext4's buffered
write path as well.  Also changed the related function comments
accordingly to cover both direct write and buffered write cases.

Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 18 ++++++------
 fs/ext4/extents.c | 24 ++++++++--------
 fs/ext4/fsync.c   |  2 +-
 fs/ext4/inode.c   | 84 +++++++++++++++++++++++++------------------------------
 fs/ext4/super.c   |  2 +-
 5 files changed, 61 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 74664ca19e22..c831a580bd76 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -133,7 +133,7 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-#define	DIO_AIO_UNWRITTEN	0x1
+#define	EXT4_IO_UNWRITTEN	0x1
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished AIO list */
 	struct inode		*inode;		/* file being written to */
@@ -355,13 +355,13 @@ struct ext4_new_group_data {
 	/* caller is from the direct IO path, request to creation of an
 	unitialized extents if not allocated, split the uninitialized
 	extent if blocks has been preallocated already*/
-#define EXT4_GET_BLOCKS_DIO			0x0008
+#define EXT4_GET_BLOCKS_PRE_IO			0x0008
 #define EXT4_GET_BLOCKS_CONVERT			0x0010
-#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO|\
+#define EXT4_GET_BLOCKS_IO_CREATE_EXT		(EXT4_GET_BLOCKS_PRE_IO|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
-	/* Convert extent to initialized after direct IO complete */
-#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
-					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+	/* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
+					 EXT4_GET_BLOCKS_IO_CREATE_EXT)
 
 /*
  * Flags used by ext4_free_blocks
@@ -700,8 +700,8 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed async DIOs that might need unwritten extents handling */
-	struct list_head i_aio_dio_complete_list;
+	/* completed IOs that might need unwritten extents handling */
+	struct list_head i_completed_io_list;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
 
@@ -1461,7 +1461,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
-extern int flush_aio_dio_completed_IO(struct inode *inode);
+extern int flush_completed_IO(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 /* ioctl.c */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a2c21aa09e2b..90ba8d9df697 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	BUG_ON(path[depth].p_hdr == NULL);
 
 	/* try to insert block into found extent and return */
-	if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+	if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
 				ext4_ext_is_uninitialized(newext),
@@ -1740,7 +1740,7 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+	if (flag != EXT4_GET_BLOCKS_PRE_IO)
 		ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
@@ -2984,7 +2984,7 @@ fix_extent_len:
 	ext4_ext_dirty(handle, inode, path + depth);
 	return err;
 }
-static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 					      struct inode *inode,
 					      struct ext4_ext_path *path)
 {
@@ -3064,8 +3064,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
-	/* DIO get_block() before submit the IO, split the extent */
-	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+	/* get_block() before submit the IO, split the extent */
+	if (flags == EXT4_GET_BLOCKS_PRE_IO) {
 		ret = ext4_split_unwritten_extents(handle,
 						inode, path, iblock,
 						max_blocks, flags);
@@ -3075,14 +3075,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * completed
 		 */
 		if (io)
-			io->flag = DIO_AIO_UNWRITTEN;
+			io->flag = EXT4_IO_UNWRITTEN;
 		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		goto out;
 	}
-	/* async DIO end_io complete, convert the filled extent to written */
-	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
-		ret = ext4_convert_unwritten_extents_dio(handle, inode,
+	/* IO end_io complete, convert the filled extent to written */
+	if (flags == EXT4_GET_BLOCKS_CONVERT) {
+		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
 		if (ret >= 0)
 			ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3359,9 +3359,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 * For non asycn direct IO case, flag the inode state
 		 * that we need to perform convertion when IO is done.
 		 */
-		if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+		if (flags == EXT4_GET_BLOCKS_PRE_IO) {
 			if (io)
-				io->flag = DIO_AIO_UNWRITTEN;
+				io->flag = EXT4_IO_UNWRITTEN;
 			else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
@@ -3656,7 +3656,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 		map_bh.b_state = 0;
 		ret = ext4_get_blocks(handle, inode, block,
 				      max_blocks, &map_bh,
-				      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
 			printk(KERN_ERR "%s: ext4_ext_get_blocks "
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 98bd140aad01..0d0c3239c1cd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -63,7 +63,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	if (inode->i_sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = flush_aio_dio_completed_IO(inode);
+	ret = flush_completed_IO(inode);
 	if (ret < 0)
 		return ret;
 	
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 427f4690ad6d..28f116bdc405 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3468,7 +3468,7 @@ out:
 	return ret;
 }
 
-static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
 	handle_t *handle = NULL;
@@ -3476,28 +3476,14 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	int dio_credits;
 
-	ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
 	/*
-	 * DIO VFS code passes create = 0 flag for write to
-	 * the middle of file. It does this to avoid block
-	 * allocation for holes, to prevent expose stale data
-	 * out when there is parallel buffered read (which does
-	 * not hold the i_mutex lock) while direct IO write has
-	 * not completed. DIO request on holes finally falls back
-	 * to buffered IO for this reason.
-	 *
-	 * For ext4 extent based file, since we support fallocate,
-	 * new allocated extent as uninitialized, for holes, we
-	 * could fallocate blocks for holes, thus parallel
-	 * buffered IO read will zero out the page when read on
-	 * a hole while parallel DIO write to the hole has not completed.
-	 *
-	 * when we come here, we know it's a direct IO write to
-	 * to the middle of file (<i_size)
-	 * so it's safe to override the create flag from VFS.
+	 * ext4_get_block in prepare for a DIO write or buffer write.
+	 * We allocate an uinitialized extent if blocks haven't been allocated.
+	 * The extent will be converted to initialized after IO complete.
 	 */
-	create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+	create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
 
 	if (max_blocks > DIO_MAX_BLOCKS)
 		max_blocks = DIO_MAX_BLOCKS;
@@ -3524,19 +3510,20 @@ static void ext4_free_io_end(ext4_io_end_t *io)
 	iput(io->inode);
 	kfree(io);
 }
-static void dump_aio_dio_list(struct inode * inode)
+
+static void dump_completed_IO(struct inode * inode)
 {
 #ifdef	EXT4_DEBUG
 	struct list_head *cur, *before, *after;
 	ext4_io_end_t *io, *io0, *io1;
 
-	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
-		ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
 		return;
 	}
 
-	ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
-	list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
 		cur = &io->list;
 		before = cur->prev;
 		io0 = container_of(before, ext4_io_end_t, list);
@@ -3552,21 +3539,21 @@ static void dump_aio_dio_list(struct inode * inode)
 /*
  * check a range of space and convert unwritten extents to written.
  */
-static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+static int ext4_end_io_nolock(ext4_io_end_t *io)
 {
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
 	ssize_t size = io->size;
 	int ret = 0;
 
-	ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
 		   "list->prev 0x%p\n",
 	           io, inode->i_ino, io->list.next, io->list.prev);
 
 	if (list_empty(&io->list))
 		return ret;
 
-	if (io->flag != DIO_AIO_UNWRITTEN)
+	if (io->flag != EXT4_IO_UNWRITTEN)
 		return ret;
 
 	if (offset + size <= i_size_read(inode))
@@ -3584,17 +3571,18 @@ static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
 	io->flag = 0;
 	return ret;
 }
+
 /*
  * work on completed aio dio IO, to convert unwritten extents to extents
  */
-static void ext4_end_aio_dio_work(struct work_struct *work)
+static void ext4_end_io_work(struct work_struct *work)
 {
 	ext4_io_end_t *io  = container_of(work, ext4_io_end_t, work);
 	struct inode *inode = io->inode;
 	int ret = 0;
 
 	mutex_lock(&inode->i_mutex);
-	ret = ext4_end_aio_dio_nolock(io);
+	ret = ext4_end_io_nolock(io);
 	if (ret >= 0) {
 		if (!list_empty(&io->list))
 			list_del_init(&io->list);
@@ -3602,32 +3590,35 @@ static void ext4_end_aio_dio_work(struct work_struct *work)
 	}
 	mutex_unlock(&inode->i_mutex);
 }
+
 /*
  * This function is called from ext4_sync_file().
  *
- * When AIO DIO IO is completed, the work to convert unwritten
- * extents to written is queued on workqueue but may not get immediately
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
  * scheduled. When fsync is called, we need to ensure the
  * conversion is complete before fsync returns.
- * The inode keeps track of a list of completed AIO from DIO path
- * that might needs to do the conversion. This function walks through
- * the list and convert the related unwritten extents to written.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
  */
-int flush_aio_dio_completed_IO(struct inode *inode)
+int flush_completed_IO(struct inode *inode)
 {
 	ext4_io_end_t *io;
 	int ret = 0;
 	int ret2 = 0;
 
-	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+	if (list_empty(&EXT4_I(inode)->i_completed_io_list))
 		return ret;
 
-	dump_aio_dio_list(inode);
-	while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
-		io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+	dump_completed_IO(inode);
+	while (!list_empty(&EXT4_I(inode)->i_completed_io_list)){
+		io = list_entry(EXT4_I(inode)->i_completed_io_list.next,
 				ext4_io_end_t, list);
 		/*
-		 * Calling ext4_end_aio_dio_nolock() to convert completed
+		 * Calling ext4_end_io_nolock() to convert completed
 		 * IO to written.
 		 *
 		 * When ext4_sync_file() is called, run_queue() may already
@@ -3640,7 +3631,7 @@ int flush_aio_dio_completed_IO(struct inode *inode)
 		 * avoid double converting from both fsync and background work
 		 * queue work.
 		 */
-		ret = ext4_end_aio_dio_nolock(io);
+		ret = ext4_end_io_nolock(io);
 		if (ret < 0)
 			ret2 = ret;
 		else
@@ -3662,7 +3653,7 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
 		io->offset = 0;
 		io->size = 0;
 		io->error = 0;
-		INIT_WORK(&io->work, ext4_end_aio_dio_work);
+		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
 	}
 
@@ -3685,7 +3676,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 		  size);
 
 	/* if not aio dio with unwritten extents, just free io and return */
-	if (io_end->flag != DIO_AIO_UNWRITTEN){
+	if (io_end->flag != EXT4_IO_UNWRITTEN){
 		ext4_free_io_end(io_end);
 		iocb->private = NULL;
 		return;
@@ -3700,9 +3691,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	/* Add the io_end to per-inode completed aio dio list*/
 	list_add_tail(&io_end->list,
-		 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+		 &EXT4_I(io_end->inode)->i_completed_io_list);
 	iocb->private = NULL;
 }
+
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3772,7 +3764,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		ret = blockdev_direct_IO(rw, iocb, inode,
 					 inode->i_sb->s_bdev, iov,
 					 offset, nr_segs,
-					 ext4_get_block_dio_write,
+					 ext4_get_block_write,
 					 ext4_end_io_dio);
 		if (iocb->private)
 			EXT4_I(inode)->cur_aio_dio = NULL;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c832508d5515..dc7a97e79e3b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -708,7 +708,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 #ifdef CONFIG_QUOTA
 	ei->i_reserved_quota = 0;
 #endif
-	INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+	INIT_LIST_HEAD(&ei->i_completed_io_list);
 	ei->cur_aio_dio = NULL;
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
-- 
cgit v1.2.3


From 744692dc059845b2a3022119871846e74d4f6e11 Mon Sep 17 00:00:00 2001
From: Jiaying Zhang <jiayingz@google.com>
Date: Thu, 4 Mar 2010 16:14:02 -0500
Subject: ext4: use ext4_get_block_write in buffer write

Allocate uninitialized extent before ext4 buffer write and
convert the extent to initialized after io completes.
The purpose is to make sure an extent can only be marked
initialized after it has been written with new data so
we can safely drop the i_mutex lock in ext4 DIO read without
exposing stale data. This helps to improve multi-thread DIO
read performance on high-speed disks.

Skip the nobh and data=journal mount cases to make things simple for now.

Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h      |  15 +++-
 fs/ext4/ext4_jbd2.h |  24 ++++++
 fs/ext4/extents.c   |  22 +++---
 fs/ext4/inode.c     | 213 ++++++++++++++++++++++++++++++++++++++++++----------
 fs/ext4/super.c     |  37 ++++++++-
 5 files changed, 256 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c831a580bd76..dee45800dc95 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -138,7 +138,7 @@ typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished AIO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	int			error;		/* I/O error code */
+	struct page		*page;		/* page struct for buffer write */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
@@ -361,7 +361,7 @@ struct ext4_new_group_data {
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
-					 EXT4_GET_BLOCKS_IO_CREATE_EXT)
+					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
 
 /*
  * Flags used by ext4_free_blocks
@@ -702,6 +702,7 @@ struct ext4_inode_info {
 
 	/* completed IOs that might need unwritten extents handling */
 	struct list_head i_completed_io_list;
+	spinlock_t i_completed_io_lock;
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
 
@@ -752,6 +753,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
@@ -1781,6 +1783,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 len, __u64 *moved_len);
 
 
+/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+enum ext4_state_bits {
+	BH_Uninit	/* blocks are allocated but uninitialized on disk */
+	  = BH_JBDPrivateStart,
+};
+
+BUFFER_FNS(Uninit, uninit)
+TAS_BUFFER_FNS(Uninit, uninit)
+
 /*
  * Add new method to test wether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca817d704..b79ad5126468 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -304,4 +304,28 @@ static inline int ext4_should_writeback_data(struct inode *inode)
 	return 0;
 }
 
+/*
+ * This function controls whether or not we should try to go down the
+ * dioread_nolock code paths, which makes it safe to avoid taking
+ * i_mutex for direct I/O reads.  This only works for extent-based
+ * files, and it doesn't work for nobh or if data journaling is
+ * enabled, since the dioread_nolock code uses b_private to pass
+ * information back to the I/O completion handler, and this conflicts
+ * with the jbd's use of b_private.
+ */
+static inline int ext4_should_dioread_nolock(struct inode *inode)
+{
+	if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+		return 0;
+	if (test_opt(inode->i_sb, NOBH))
+		return 0;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return 0;
+	if (ext4_should_journal_data(inode))
+		return 0;
+	return 1;
+}
+
 #endif	/* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 90ba8d9df697..c7f166ab50eb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1619,7 +1619,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	BUG_ON(path[depth].p_hdr == NULL);
 
 	/* try to insert block into found extent and return */
-	if (ex && (flag != EXT4_GET_BLOCKS_PRE_IO)
+	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
 				ext4_ext_is_uninitialized(newext),
@@ -1740,7 +1740,7 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	if (flag != EXT4_GET_BLOCKS_PRE_IO)
+	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
 		ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
@@ -3065,7 +3065,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	ext4_ext_show_leaf(inode, path);
 
 	/* get_block() before submit the IO, split the extent */
-	if (flags == EXT4_GET_BLOCKS_PRE_IO) {
+	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 		ret = ext4_split_unwritten_extents(handle,
 						inode, path, iblock,
 						max_blocks, flags);
@@ -3078,10 +3078,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 			io->flag = EXT4_IO_UNWRITTEN;
 		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+		if (ext4_should_dioread_nolock(inode))
+			set_buffer_uninit(bh_result);
 		goto out;
 	}
 	/* IO end_io complete, convert the filled extent to written */
-	if (flags == EXT4_GET_BLOCKS_CONVERT) {
+	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
 		ret = ext4_convert_unwritten_extents_endio(handle, inode,
 							path);
 		if (ret >= 0)
@@ -3351,21 +3353,21 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
 		ext4_ext_mark_uninitialized(&newex);
 		/*
-		 * io_end structure was created for every async
-		 * direct IO write to the middle of the file.
-		 * To avoid unecessary convertion for every aio dio rewrite
-		 * to the mid of file, here we flag the IO that is really
-		 * need the convertion.
+		 * io_end structure was created for every IO write to an
+		 * uninitialized extent. To avoid unecessary conversion,
+		 * here we flag the IO that really needs the conversion.
 		 * For non asycn direct IO case, flag the inode state
 		 * that we need to perform convertion when IO is done.
 		 */
-		if (flags == EXT4_GET_BLOCKS_PRE_IO) {
+		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
 			if (io)
 				io->flag = EXT4_IO_UNWRITTEN;
 			else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
 		}
+		if (ext4_should_dioread_nolock(inode))
+			set_buffer_uninit(bh_result);
 	}
 
 	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28f116bdc405..d291310aef6b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/kernel.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -1534,6 +1535,8 @@ static void ext4_truncate_failed_write(struct inode *inode)
 	ext4_truncate(inode);
 }
 
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create);
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 			    loff_t pos, unsigned len, unsigned flags,
 			    struct page **pagep, void **fsdata)
@@ -1575,8 +1578,12 @@ retry:
 	}
 	*pagep = page;
 
-	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				ext4_get_block);
+	if (ext4_should_dioread_nolock(inode))
+		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+				fsdata, ext4_get_block_write);
+	else
+		ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+				fsdata, ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
@@ -2092,6 +2099,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 				} else if (buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
 
+				if (buffer_uninit(exbh))
+					set_buffer_uninit(bh);
 				cur_logical++;
 				pblock++;
 			} while ((bh = bh->b_this_page) != head);
@@ -2221,6 +2230,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 */
 	new.b_state = 0;
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+	if (ext4_should_dioread_nolock(mpd->inode))
+		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
 	if (mpd->b_state & (1 << BH_Delay))
 		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
@@ -2636,6 +2647,9 @@ out:
 	return ret;
 }
 
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2683,7 +2697,7 @@ static int ext4_writepage(struct page *page,
 	int ret = 0;
 	loff_t size;
 	unsigned int len;
-	struct buffer_head *page_bufs;
+	struct buffer_head *page_bufs = NULL;
 	struct inode *inode = page->mapping->host;
 
 	trace_ext4_writepage(inode, page);
@@ -2759,7 +2773,11 @@ static int ext4_writepage(struct page *page,
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
 		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
-	else
+	else if (page_bufs && buffer_uninit(page_bufs)) {
+		ext4_set_bh_endio(page_bufs, inode);
+		ret = block_write_full_page_endio(page, noalloc_get_block_write,
+					    wbc, ext4_end_io_buffer_write);
+	} else
 		ret = block_write_full_page(page, noalloc_get_block_write,
 					    wbc);
 
@@ -3347,10 +3365,44 @@ ext4_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+	BUG_ON(!io);
+	if (io->page)
+		put_page(io->page);
+	iput(io->inode);
+	kfree(io);
+}
+
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+
+	if (!page_has_buffers(page))
+		return;
+	head = bh = page_buffers(page);
+	do {
+		if (offset <= curr_off && test_clear_buffer_uninit(bh)
+					&& bh->b_private) {
+			ext4_free_io_end(bh->b_private);
+			bh->b_private = NULL;
+			bh->b_end_io = NULL;
+		}
+		curr_off = curr_off + bh->b_size;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
 static void ext4_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
+	/*
+	 * free any io_end structure allocated for buffers to be discarded
+	 */
+	if (ext4_should_dioread_nolock(page->mapping->host))
+		ext4_invalidatepage_free_endio(page, offset);
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
@@ -3471,10 +3523,11 @@ out:
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
-	handle_t *handle = NULL;
+	handle_t *handle = ext4_journal_current_handle();
 	int ret = 0;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	int dio_credits;
+	int started = 0;
 
 	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
@@ -3485,37 +3538,36 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 	 */
 	create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
 
-	if (max_blocks > DIO_MAX_BLOCKS)
-		max_blocks = DIO_MAX_BLOCKS;
-	dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-	handle = ext4_journal_start(inode, dio_credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out;
+	if (!handle) {
+		if (max_blocks > DIO_MAX_BLOCKS)
+			max_blocks = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+		handle = ext4_journal_start(inode, dio_credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		started = 1;
 	}
+
 	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
 			      create);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
 	}
-	ext4_journal_stop(handle);
+	if (started)
+		ext4_journal_stop(handle);
 out:
 	return ret;
 }
 
-static void ext4_free_io_end(ext4_io_end_t *io)
-{
-	BUG_ON(!io);
-	iput(io->inode);
-	kfree(io);
-}
-
 static void dump_completed_IO(struct inode * inode)
 {
 #ifdef	EXT4_DEBUG
 	struct list_head *cur, *before, *after;
 	ext4_io_end_t *io, *io0, *io1;
+	unsigned long flags;
 
 	if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
 		ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
@@ -3523,6 +3575,7 @@ static void dump_completed_IO(struct inode * inode)
 	}
 
 	ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
 	list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
 		cur = &io->list;
 		before = cur->prev;
@@ -3533,6 +3586,7 @@ static void dump_completed_IO(struct inode * inode)
 		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
 			    io, inode->i_ino, io0, io1);
 	}
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
 #endif
 }
 
@@ -3556,9 +3610,7 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
 	if (io->flag != EXT4_IO_UNWRITTEN)
 		return ret;
 
-	if (offset + size <= i_size_read(inode))
-		ret = ext4_convert_unwritten_extents(inode, offset, size);
-
+	ret = ext4_convert_unwritten_extents(inode, offset, size);
 	if (ret < 0) {
 		printk(KERN_EMERG "%s: failed to convert unwritten"
 			"extents to written extents, error is %d"
@@ -3577,18 +3629,25 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
  */
 static void ext4_end_io_work(struct work_struct *work)
 {
-	ext4_io_end_t *io  = container_of(work, ext4_io_end_t, work);
-	struct inode *inode = io->inode;
-	int ret = 0;
+	ext4_io_end_t		*io = container_of(work, ext4_io_end_t, work);
+	struct inode		*inode = io->inode;
+	struct ext4_inode_info	*ei = EXT4_I(inode);
+	unsigned long		flags;
+	int			ret;
 
 	mutex_lock(&inode->i_mutex);
 	ret = ext4_end_io_nolock(io);
-	if (ret >= 0) {
-		if (!list_empty(&io->list))
-			list_del_init(&io->list);
-		ext4_free_io_end(io);
+	if (ret < 0) {
+		mutex_unlock(&inode->i_mutex);
+		return;
 	}
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (!list_empty(&io->list))
+		list_del_init(&io->list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	mutex_unlock(&inode->i_mutex);
+	ext4_free_io_end(io);
 }
 
 /*
@@ -3607,15 +3666,18 @@ static void ext4_end_io_work(struct work_struct *work)
 int flush_completed_IO(struct inode *inode)
 {
 	ext4_io_end_t *io;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned long flags;
 	int ret = 0;
 	int ret2 = 0;
 
-	if (list_empty(&EXT4_I(inode)->i_completed_io_list))
+	if (list_empty(&ei->i_completed_io_list))
 		return ret;
 
 	dump_completed_IO(inode);
-	while (!list_empty(&EXT4_I(inode)->i_completed_io_list)){
-		io = list_entry(EXT4_I(inode)->i_completed_io_list.next,
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	while (!list_empty(&ei->i_completed_io_list)){
+		io = list_entry(ei->i_completed_io_list.next,
 				ext4_io_end_t, list);
 		/*
 		 * Calling ext4_end_io_nolock() to convert completed
@@ -3631,20 +3693,23 @@ int flush_completed_IO(struct inode *inode)
 		 * avoid double converting from both fsync and background work
 		 * queue work.
 		 */
+		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 		ret = ext4_end_io_nolock(io);
+		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
 		if (ret < 0)
 			ret2 = ret;
 		else
 			list_del_init(&io->list);
 	}
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	return (ret2 < 0) ? ret2 : 0;
 }
 
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
 {
 	ext4_io_end_t *io = NULL;
 
-	io = kmalloc(sizeof(*io), GFP_NOFS);
+	io = kmalloc(sizeof(*io), flags);
 
 	if (io) {
 		igrab(inode);
@@ -3652,7 +3717,7 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
 		io->flag = 0;
 		io->offset = 0;
 		io->size = 0;
-		io->error = 0;
+		io->page = NULL;
 		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
 	}
@@ -3665,6 +3730,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 {
         ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
+	unsigned long flags;
+	struct ext4_inode_info *ei;
 
 	/* if not async direct IO or dio with 0 bytes write, just return */
 	if (!io_end || !size)
@@ -3684,17 +3751,85 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 
 	io_end->offset = offset;
 	io_end->size = size;
+	io_end->flag = EXT4_IO_UNWRITTEN;
 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
 	/* queue the work to convert unwritten extents to written */
 	queue_work(wq, &io_end->work);
 
 	/* Add the io_end to per-inode completed aio dio list*/
-	list_add_tail(&io_end->list,
-		 &EXT4_I(io_end->inode)->i_completed_io_list);
+	ei = EXT4_I(io_end->inode);
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &ei->i_completed_io_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	iocb->private = NULL;
 }
 
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+	ext4_io_end_t *io_end = bh->b_private;
+	struct workqueue_struct *wq;
+	struct inode *inode;
+	unsigned long flags;
+
+	if (!test_clear_buffer_uninit(bh) || !io_end)
+		goto out;
+
+	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+		printk("sb umounted, discard end_io request for inode %lu\n",
+			io_end->inode->i_ino);
+		ext4_free_io_end(io_end);
+		goto out;
+	}
+
+	io_end->flag = EXT4_IO_UNWRITTEN;
+	inode = io_end->inode;
+
+	/* Add the io_end to per-inode completed io list*/
+	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+	list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+	spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+	wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+	/* queue the work to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+out:
+	bh->b_private = NULL;
+	bh->b_end_io = NULL;
+	clear_buffer_uninit(bh);
+	end_buffer_async_write(bh, uptodate);
+}
+
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+	ext4_io_end_t *io_end;
+	struct page *page = bh->b_page;
+	loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+	size_t size = bh->b_size;
+
+retry:
+	io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+	if (!io_end) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "%s: allocation fail\n", __func__);
+		schedule();
+		goto retry;
+	}
+	io_end->offset = offset;
+	io_end->size = size;
+	/*
+	 * We need to hold a reference to the page to make sure it
+	 * doesn't get evicted before ext4_end_io_work() has a chance
+	 * to convert the extent from written to unwritten.
+	 */
+	io_end->page = page;
+	get_page(io_end->page);
+
+	bh->b_private = io_end;
+	bh->b_end_io = ext4_end_io_buffer_write;
+	return 0;
+}
+
 /*
  * For ext4 extent files, ext4 will do direct-io write to holes,
  * preallocated extents, and those write extend the file, no need to
@@ -3748,7 +3883,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		iocb->private = NULL;
 		EXT4_I(inode)->cur_aio_dio = NULL;
 		if (!is_sync_kiocb(iocb)) {
-			iocb->private = ext4_init_io_end(inode);
+			iocb->private = ext4_init_io_end(inode, GFP_NOFS);
 			if (!iocb->private)
 				return -ENOMEM;
 			/*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dc7a97e79e3b..5e8f9077b0fc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -709,6 +709,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_reserved_quota = 0;
 #endif
 	INIT_LIST_HEAD(&ei->i_completed_io_list);
+	spin_lock_init(&ei->i_completed_io_lock);
 	ei->cur_aio_dio = NULL;
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
@@ -926,6 +927,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (test_opt(sb, NOLOAD))
 		seq_puts(seq, ",norecovery");
 
+	if (test_opt(sb, DIOREAD_NOLOCK))
+		seq_puts(seq, ",dioread_nolock");
+
 	ext4_show_quota_options(seq, sb);
 
 	return 0;
@@ -1109,6 +1113,7 @@ enum {
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
+	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard,
 };
 
@@ -1176,6 +1181,8 @@ static const match_table_t tokens = {
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc"},
 	{Opt_noauto_da_alloc, "noauto_da_alloc"},
+	{Opt_dioread_nolock, "dioread_nolock"},
+	{Opt_dioread_lock, "dioread_lock"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL},
@@ -1640,6 +1647,12 @@ set_qf_format:
 		case Opt_nodiscard:
 			clear_opt(sbi->s_mount_opt, DISCARD);
 			break;
+		case Opt_dioread_nolock:
+			set_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+			break;
+		case Opt_dioread_lock:
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+			break;
 		default:
 			ext4_msg(sb, KERN_ERR,
 			       "Unrecognized mount option \"%s\" "
@@ -2795,7 +2808,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		ext4_msg(sb, KERN_ERR, "required journal recovery "
 		       "suppressed and not mounted read-only");
-		goto failed_mount4;
+		goto failed_mount_wq;
 	} else {
 		clear_opt(sbi->s_mount_opt, DATA_FLAGS);
 		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
@@ -2808,7 +2821,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
 		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
-		goto failed_mount4;
+		goto failed_mount_wq;
 	}
 
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
@@ -2847,7 +2860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
 			ext4_msg(sb, KERN_ERR, "Journal does not support "
 			       "requested data journaling mode");
-			goto failed_mount4;
+			goto failed_mount_wq;
 		}
 	default:
 		break;
@@ -2855,13 +2868,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
-
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
 				"its supported only with writeback mode");
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
+		if (test_opt(sb, DIOREAD_NOLOCK)) {
+			ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
+				"not supported with nobh mode");
+			goto failed_mount_wq;
+		}
 	}
 	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
@@ -2926,6 +2943,18 @@ no_journal:
 			 "requested data journaling mode");
 		clear_opt(sbi->s_mount_opt, DELALLOC);
 	}
+	if (test_opt(sb, DIOREAD_NOLOCK)) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+				"option - requested data journaling mode");
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		}
+		if (sb->s_blocksize < PAGE_SIZE) {
+			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
+				"option - block size is too small");
+			clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
+		}
+	}
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {
-- 
cgit v1.2.3


From b7adc1f363e72e9131a582cc2cb00eaf83f51a39 Mon Sep 17 00:00:00 2001
From: Jiaying Zhang <jiayingz@google.com>
Date: Tue, 2 Mar 2010 13:26:36 -0500
Subject: ext4: Use direct_IO_no_locking in ext4 dio read

Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d291310aef6b..92214d4e5afa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3474,7 +3474,14 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 	}
 
 retry:
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+	if (rw == READ && ext4_should_dioread_nolock(inode))
+		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+				 inode->i_sb->s_bdev, iov,
+				 offset, nr_segs,
+				 ext4_get_block, NULL);
+	else
+		ret = blockdev_direct_IO(rw, iocb, inode,
+				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-- 
cgit v1.2.3


From 273df556b6ee2065bfe96edab5888d3dc9b108d8 Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Tue, 2 Mar 2010 11:46:09 -0500
Subject: ext4: Convert BUG_ON checks to use ext4_error() instead

Convert a bunch of BUG_ONs to emit a ext4_error() message and return
EIO.  This is a first pass and most notably does _not_ cover
mballoc.c, which is a morass of void functions.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  10 +++
 fs/ext4/extents.c | 189 ++++++++++++++++++++++++++++++++++++++++++------------
 fs/ext4/inode.c   |  18 +++++-
 fs/ext4/super.c   |  36 +++++++++++
 4 files changed, 209 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index dee45800dc95..3d85bbb09f1b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,6 +53,12 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+	ext4_error_inode(__func__, (inode), (fmt), ## a);
+
+#define EXT4_ERROR_FILE(file, fmt, a...)	\
+	ext4_error_file(__func__, (file), (fmt), ## a);
+
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
 
@@ -1492,6 +1498,10 @@ extern int ext4_group_extend(struct super_block *sb,
 extern void __ext4_error(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 #define ext4_error(sb, message...)	__ext4_error(sb, __func__, ## message)
+extern void ext4_error_inode(const char *, struct inode *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
+extern void ext4_error_file(const char *, struct file *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
 extern void __ext4_std_error(struct super_block *, const char *, int);
 extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c7f166ab50eb..4bb69206f175 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -703,7 +703,12 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 		}
 		eh = ext_block_hdr(bh);
 		ppos++;
-		BUG_ON(ppos > depth);
+		if (unlikely(ppos > depth)) {
+			put_bh(bh);
+			EXT4_ERROR_INODE(inode,
+					 "ppos %d > depth %d", ppos, depth);
+			goto err;
+		}
 		path[ppos].p_bh = bh;
 		path[ppos].p_hdr = eh;
 		i--;
@@ -749,7 +754,12 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	if (err)
 		return err;
 
-	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
+	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d == ei_block %d!",
+				 logical, le32_to_cpu(curp->p_idx->ei_block));
+		return -EIO;
+	}
 	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 		/* insert after */
@@ -779,9 +789,17 @@ int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	ext4_idx_store_pblock(ix, ptr);
 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 
-	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
-			     > le16_to_cpu(curp->p_hdr->eh_max));
-	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
+	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+			     > le16_to_cpu(curp->p_hdr->eh_max))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d == ei_block %d!",
+				 logical, le32_to_cpu(curp->p_idx->ei_block));
+		return -EIO;
+	}
+	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
+		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
+		return -EIO;
+	}
 
 	err = ext4_ext_dirty(handle, inode, curp);
 	ext4_std_error(inode->i_sb, err);
@@ -819,7 +837,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* if current leaf will be split, then we should use
 	 * border from split point */
-	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
+	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
+		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
+		return -EIO;
+	}
 	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
 		border = path[depth].p_ext[1].ee_block;
 		ext_debug("leaf will be split."
@@ -860,7 +881,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* initialize new leaf */
 	newblock = ablocks[--a];
-	BUG_ON(newblock == 0);
+	if (unlikely(newblock == 0)) {
+		EXT4_ERROR_INODE(inode, "newblock == 0!");
+		err = -EIO;
+		goto cleanup;
+	}
 	bh = sb_getblk(inode->i_sb, newblock);
 	if (!bh) {
 		err = -EIO;
@@ -880,7 +905,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	ex = EXT_FIRST_EXTENT(neh);
 
 	/* move remainder of path[depth] to the new leaf */
-	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
+	if (unlikely(path[depth].p_hdr->eh_entries !=
+		     path[depth].p_hdr->eh_max)) {
+		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
+				 path[depth].p_hdr->eh_entries,
+				 path[depth].p_hdr->eh_max);
+		err = -EIO;
+		goto cleanup;
+	}
 	/* start copy from next extent */
 	/* TODO: we could do it by single memmove */
 	m = 0;
@@ -927,7 +959,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 	/* create intermediate indexes */
 	k = depth - at - 1;
-	BUG_ON(k < 0);
+	if (unlikely(k < 0)) {
+		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
+		err = -EIO;
+		goto cleanup;
+	}
 	if (k)
 		ext_debug("create %d intermediate indices\n", k);
 	/* insert new index into current index block */
@@ -964,8 +1000,14 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 
 		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
 				EXT_MAX_INDEX(path[i].p_hdr));
-		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
-				EXT_LAST_INDEX(path[i].p_hdr));
+		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
+					EXT_LAST_INDEX(path[i].p_hdr))) {
+			EXT4_ERROR_INODE(inode,
+					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
+					 le32_to_cpu(path[i].p_ext->ee_block));
+			err = -EIO;
+			goto cleanup;
+		}
 		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
 			ext_debug("%d: move %d:%llu in new index %llu\n", i,
 					le32_to_cpu(path[i].p_idx->ei_block),
@@ -1203,7 +1245,10 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	struct ext4_extent *ex;
 	int depth, ee_len;
 
-	BUG_ON(path == NULL);
+	if (unlikely(path == NULL)) {
+		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+		return -EIO;
+	}
 	depth = path->p_depth;
 	*phys = 0;
 
@@ -1217,15 +1262,33 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
 	ex = path[depth].p_ext;
 	ee_len = ext4_ext_get_actual_len(ex);
 	if (*logical < le32_to_cpu(ex->ee_block)) {
-		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+			EXT4_ERROR_INODE(inode,
+					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
+					 *logical, le32_to_cpu(ex->ee_block));
+			return -EIO;
+		}
 		while (--depth >= 0) {
 			ix = path[depth].p_idx;
-			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+				EXT4_ERROR_INODE(inode,
+				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
+				  ix != NULL ? ix->ei_block : 0,
+				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
+				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
+				  depth);
+				return -EIO;
+			}
 		}
 		return 0;
 	}
 
-	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d < ee_block %d + ee_len %d!",
+				 *logical, le32_to_cpu(ex->ee_block), ee_len);
+		return -EIO;
+	}
 
 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
 	*phys = ext_pblock(ex) + ee_len - 1;
@@ -1251,7 +1314,10 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
 	int ee_len;
 
-	BUG_ON(path == NULL);
+	if (unlikely(path == NULL)) {
+		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
+		return -EIO;
+	}
 	depth = path->p_depth;
 	*phys = 0;
 
@@ -1265,17 +1331,32 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	ex = path[depth].p_ext;
 	ee_len = ext4_ext_get_actual_len(ex);
 	if (*logical < le32_to_cpu(ex->ee_block)) {
-		BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex);
+		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
+			EXT4_ERROR_INODE(inode,
+					 "first_extent(path[%d].p_hdr) != ex",
+					 depth);
+			return -EIO;
+		}
 		while (--depth >= 0) {
 			ix = path[depth].p_idx;
-			BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr));
+			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
+				EXT4_ERROR_INODE(inode,
+						 "ix != EXT_FIRST_INDEX *logical %d!",
+						 *logical);
+				return -EIO;
+			}
 		}
 		*logical = le32_to_cpu(ex->ee_block);
 		*phys = ext_pblock(ex);
 		return 0;
 	}
 
-	BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len));
+	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
+		EXT4_ERROR_INODE(inode,
+				 "logical %d < ee_block %d + ee_len %d!",
+				 *logical, le32_to_cpu(ex->ee_block), ee_len);
+		return -EIO;
+	}
 
 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
 		/* next allocated block in this leaf */
@@ -1414,8 +1495,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
-	BUG_ON(ex == NULL);
-	BUG_ON(eh == NULL);
+
+	if (unlikely(ex == NULL || eh == NULL)) {
+		EXT4_ERROR_INODE(inode,
+				 "ex %p == NULL or eh %p == NULL", ex, eh);
+		return -EIO;
+	}
 
 	if (depth == 0) {
 		/* there is no tree at all */
@@ -1613,10 +1698,16 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	ext4_lblk_t next;
 	unsigned uninitialized = 0;
 
-	BUG_ON(ext4_ext_get_actual_len(newext) == 0);
+	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
+		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
+		return -EIO;
+	}
 	depth = ext_depth(inode);
 	ex = path[depth].p_ext;
-	BUG_ON(path[depth].p_hdr == NULL);
+	if (unlikely(path[depth].p_hdr == NULL)) {
+		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+		return -EIO;
+	}
 
 	/* try to insert block into found extent and return */
 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
@@ -1788,7 +1879,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 		}
 
 		depth = ext_depth(inode);
-		BUG_ON(path[depth].p_hdr == NULL);
+		if (unlikely(path[depth].p_hdr == NULL)) {
+			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+			err = -EIO;
+			break;
+		}
 		ex = path[depth].p_ext;
 		next = ext4_ext_next_allocated_block(path);
 
@@ -1839,7 +1934,11 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
 			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
 		}
 
-		BUG_ON(cbex.ec_len == 0);
+		if (unlikely(cbex.ec_len == 0)) {
+			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
+			err = -EIO;
+			break;
+		}
 		err = func(inode, path, &cbex, ex, cbdata);
 		ext4_ext_drop_refs(path);
 
@@ -1982,7 +2081,10 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	/* free index block */
 	path--;
 	leaf = idx_pblock(path->p_idx);
-	BUG_ON(path->p_hdr->eh_entries == 0);
+	if (unlikely(path->p_hdr->eh_entries == 0)) {
+		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
+		return -EIO;
+	}
 	err = ext4_ext_get_access(handle, inode, path);
 	if (err)
 		return err;
@@ -2120,8 +2222,10 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	if (!path[depth].p_hdr)
 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
 	eh = path[depth].p_hdr;
-	BUG_ON(eh == NULL);
-
+	if (unlikely(path[depth].p_hdr == NULL)) {
+		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
+		return -EIO;
+	}
 	/* find where to start removing */
 	ex = EXT_LAST_EXTENT(eh);
 
@@ -3240,10 +3344,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * this situation is possible, though, _during_ tree modification;
 	 * this is why assert can't be put in ext4_ext_find_extent()
 	 */
-	if (path[depth].p_ext == NULL && depth != 0) {
-		ext4_error(inode->i_sb, "bad extent address "
-			   "inode: %lu, iblock: %d, depth: %d",
-			   inode->i_ino, iblock, depth);
+	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+		EXT4_ERROR_INODE(inode, "bad extent address "
+				 "iblock: %d, depth: %d pblock %lld",
+				 iblock, depth, path[depth].p_block);
 		err = -EIO;
 		goto out2;
 	}
@@ -3371,16 +3475,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
-		if (eh->eh_entries) {
-			last_ex = EXT_LAST_EXTENT(eh);
-			if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
-					    + ext4_ext_get_actual_len(last_ex))
-				EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
-		} else {
-			WARN_ON(eh->eh_entries == 0);
-			ext4_error(inode->i_sb, __func__,
-				"inode#%lu, eh->eh_entries = 0!", inode->i_ino);
-			}
+		if (unlikely(!eh->eh_entries)) {
+			EXT4_ERROR_INODE(inode,
+					 "eh->eh_entries == 0 ee_block %d",
+					 ex->ee_block);
+			err = -EIO;
+			goto out2;
+		}
+		last_ex = EXT_LAST_EXTENT(eh);
+		if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+		    + ext4_ext_get_actual_len(last_ex))
+			EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
 	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 92214d4e5afa..c717a74f2178 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -607,7 +607,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		if (*err)
 			goto failed_out;
 
-		BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+		if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
+			EXT4_ERROR_INODE(inode,
+					 "current_block %llu + count %lu > %d!",
+					 current_block, count,
+					 EXT4_MAX_BLOCK_FILE_PHYS);
+			*err = -EIO;
+			goto failed_out;
+		}
 
 		target -= count;
 		/* allocate blocks for indirect blocks */
@@ -643,7 +650,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		ar.flags = EXT4_MB_HINT_DATA;
 
 	current_block = ext4_mb_new_blocks(handle, &ar, err);
-	BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
+	if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
+		EXT4_ERROR_INODE(inode,
+				 "current_block %llu + ar.len %d > %d!",
+				 current_block, ar.len,
+				 EXT4_MAX_BLOCK_FILE_PHYS);
+		*err = -EIO;
+		goto failed_out;
+	}
 
 	if (*err && (target == blks)) {
 		/*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5e8f9077b0fc..5a18e9ec7cf9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -347,6 +347,42 @@ void __ext4_error(struct super_block *sb, const char *function,
 	ext4_handle_error(sb);
 }
 
+void ext4_error_inode(const char *function, struct inode *inode,
+		      const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ",
+	       inode->i_sb->s_id, function, inode->i_ino, current->comm);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	ext4_handle_error(inode->i_sb);
+}
+
+void ext4_error_file(const char *function, struct file *file,
+		     const char *fmt, ...)
+{
+	va_list args;
+	struct inode *inode = file->f_dentry->d_inode;
+	char pathname[80], *path;
+
+	va_start(args, fmt);
+	path = d_path(&(file->f_path), pathname, sizeof(pathname));
+	if (!path)
+		path = "(unknown)";
+	printk(KERN_CRIT
+	       "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ",
+	       inode->i_sb->s_id, function, inode->i_ino, current->comm, path);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	ext4_handle_error(inode->i_sb);
+}
+
 static const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16])
 {
-- 
cgit v1.2.3


From 67eeb5685d2a211c0252ac7884142e503c759500 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Tue, 2 Mar 2010 08:08:51 -0500
Subject: ext4: Fix ext4_quota_write cross block boundary behaviour

We always assume what dquot update result in changes in one data block
But ext4_quota_write() function may handle cross block boundary writes
In fact if this ever happen it will result in incorrect journal
credits reservation, and later a BUG_ON.  As soon this never happen
the boundary cross loop is NOOP.  In order to make things straight
let's remove this loop and assert cross boundary condition.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 69 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5a18e9ec7cf9..ad1ee5f21bab 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4010,9 +4010,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
 	int err = 0;
 	int offset = off & (sb->s_blocksize - 1);
-	int tocopy;
 	int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
-	size_t towrite = len;
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
@@ -4022,52 +4020,53 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
+	/*
+	 * Since we account only one data block in transaction credits,
+	 * then it is impossible to cross a block boundary.
+	 */
+	if (sb->s_blocksize - offset < len) {
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because not block aligned",
+			(unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+
 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-	while (towrite > 0) {
-		tocopy = sb->s_blocksize - offset < towrite ?
-				sb->s_blocksize - offset : towrite;
-		bh = ext4_bread(handle, inode, blk, 1, &err);
-		if (!bh)
+	bh = ext4_bread(handle, inode, blk, 1, &err);
+	if (!bh)
+		goto out;
+	if (journal_quota) {
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err) {
+			brelse(bh);
 			goto out;
-		if (journal_quota) {
-			err = ext4_journal_get_write_access(handle, bh);
-			if (err) {
-				brelse(bh);
-				goto out;
-			}
 		}
-		lock_buffer(bh);
-		memcpy(bh->b_data+offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		unlock_buffer(bh);
-		if (journal_quota)
-			err = ext4_handle_dirty_metadata(handle, NULL, bh);
-		else {
-			/* Always do at least ordered writes for quotas */
-			err = ext4_jbd2_file_inode(handle, inode);
-			mark_buffer_dirty(bh);
-		}
-		brelse(bh);
-		if (err)
-			goto out;
-		offset = 0;
-		towrite -= tocopy;
-		data += tocopy;
-		blk++;
 	}
+	lock_buffer(bh);
+	memcpy(bh->b_data+offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	if (journal_quota)
+		err = ext4_handle_dirty_metadata(handle, NULL, bh);
+	else {
+		/* Always do at least ordered writes for quotas */
+		err = ext4_jbd2_file_inode(handle, inode);
+		mark_buffer_dirty(bh);
+	}
+	brelse(bh);
 out:
-	if (len == towrite) {
+	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		return err;
 	}
-	if (inode->i_size < off+len-towrite) {
-		i_size_write(inode, off+len-towrite);
+	if (inode->i_size < off + len) {
+		i_size_write(inode, off + len);
 		EXT4_I(inode)->i_disksize = inode->i_size;
 	}
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	ext4_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
-	return len - towrite;
+	return len;
 }
 
 #endif
-- 
cgit v1.2.3


From 9599945bac93b344519ea97f502cf537124b5a6e Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 2 Mar 2010 19:17:34 +0100
Subject: Revert "blkdev: fix merge_bvec_fn return value checks"

This reverts commit 9f7cdbc33f36d28e57eaba0093f68f0d14c38c5b.

It's causing oopses om dm setups, so revert it until we investigate.

Reported-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 0bda289f86fc..dc17afd672e3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -555,7 +555,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 					.bi_rw = bio->bi_rw,
 				};
 
-				if (q->merge_bvec_fn(q, &bvm, prev) != prev->bv_len) {
+				if (q->merge_bvec_fn(q, &bvm, prev) < len) {
 					prev->bv_len -= len;
 					return 0;
 				}
@@ -608,7 +608,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) != bvec->bv_len) {
+		if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
 			bvec->bv_page = NULL;
 			bvec->bv_len = 0;
 			bvec->bv_offset = 0;
-- 
cgit v1.2.3


From 9df5778ecee8b301b447fc05706792d5f447ace5 Mon Sep 17 00:00:00 2001
From: Tristan Ye <tristan.ye@oracle.com>
Date: Tue, 2 Mar 2010 13:59:42 +0800
Subject: Ocfs2: Move ocfs2 ioctl definitions from ocfs2_fs.h to newly added
 ocfs2_ioctl.h

Currently we were adding ioctl cmds/structures for ocfs2 into ocfs2_fs.h
which was used for define ocfs2 on-disk layout. That sounds a little bit
confusing, and it may be quickly polluted espcially when growing the
ocfs2_info_request ioctls afterwards(it will grow i bet).

As a result, such OCFS2 IOCs do need to be placed somewhere other than
ocfs2_fs.h, a separated ocfs2_ioctl.h will be added to store such ioctl
structures and definitions which could also be used from userspace to
invoke ioctls call.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/ioctl.h       |  6 ++--
 fs/ocfs2/ocfs2.h       |  1 +
 fs/ocfs2/ocfs2_fs.h    | 57 ------------------------------------
 fs/ocfs2/ocfs2_ioctl.h | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 60 deletions(-)
 create mode 100644 fs/ocfs2/ocfs2_ioctl.h

(limited to 'fs')

diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index cf9a5ee30fef..0cd5323bd3f0 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -7,10 +7,10 @@
  *
  */
 
-#ifndef OCFS2_IOCTL_H
-#define OCFS2_IOCTL_H
+#ifndef OCFS2_IOCTL_PROTO_H
+#define OCFS2_IOCTL_PROTO_H
 
 long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
-#endif /* OCFS2_IOCTL_H */
+#endif /* OCFS2_IOCTL_PROTO_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index b27fe2489e0c..1238b491db90 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -42,6 +42,7 @@
 
 #include "ocfs2_fs.h"
 #include "ocfs2_lockid.h"
+#include "ocfs2_ioctl.h"
 
 /* For struct ocfs2_blockcheck_stats */
 #include "blockcheck.h"
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7638a38c32bc..bb37218a7978 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -253,63 +253,6 @@
 						 * counted in an associated
 						 * refcount tree */
 
-/*
- * ioctl commands
- */
-#define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
-#define OCFS2_IOC_SETFLAGS	_IOW('f', 2, long)
-#define OCFS2_IOC32_GETFLAGS	_IOR('f', 1, int)
-#define OCFS2_IOC32_SETFLAGS	_IOW('f', 2, int)
-
-/*
- * Space reservation / allocation / free ioctls and argument structure
- * are designed to be compatible with XFS.
- *
- * ALLOCSP* and FREESP* are not and will never be supported, but are
- * included here for completeness.
- */
-struct ocfs2_space_resv {
-	__s16		l_type;
-	__s16		l_whence;
-	__s64		l_start;
-	__s64		l_len;		/* len == 0 means until end of file */
-	__s32		l_sysid;
-	__u32		l_pid;
-	__s32		l_pad[4];	/* reserve area			    */
-};
-
-#define OCFS2_IOC_ALLOCSP		_IOW ('X', 10, struct ocfs2_space_resv)
-#define OCFS2_IOC_FREESP		_IOW ('X', 11, struct ocfs2_space_resv)
-#define OCFS2_IOC_RESVSP		_IOW ('X', 40, struct ocfs2_space_resv)
-#define OCFS2_IOC_UNRESVSP	_IOW ('X', 41, struct ocfs2_space_resv)
-#define OCFS2_IOC_ALLOCSP64	_IOW ('X', 36, struct ocfs2_space_resv)
-#define OCFS2_IOC_FREESP64	_IOW ('X', 37, struct ocfs2_space_resv)
-#define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
-#define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
-
-/* Used to pass group descriptor data when online resize is done */
-struct ocfs2_new_group_input {
-	__u64 group;		/* Group descriptor's blkno. */
-	__u32 clusters;		/* Total number of clusters in this group */
-	__u32 frees;		/* Total free clusters in this group */
-	__u16 chain;		/* Chain for this group */
-	__u16 reserved1;
-	__u32 reserved2;
-};
-
-#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)
-#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2,struct ocfs2_new_group_input)
-#define OCFS2_IOC_GROUP_ADD64	_IOW('o', 3,struct ocfs2_new_group_input)
-
-/* Used to pass 2 file names to reflink. */
-struct reflink_arguments {
-	__u64 old_path;
-	__u64 new_path;
-	__u64 preserve;
-};
-#define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
-
-
 /*
  * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
  */
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
new file mode 100644
index 000000000000..2d3420af1a83
--- /dev/null
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -0,0 +1,79 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_ioctl.h
+ *
+ * Defines OCFS2 ioctls.
+ *
+ * Copyright (C) 2010 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_IOCTL_H
+#define OCFS2_IOCTL_H
+
+/*
+ * ioctl commands
+ */
+#define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
+#define OCFS2_IOC_SETFLAGS	_IOW('f', 2, long)
+#define OCFS2_IOC32_GETFLAGS	_IOR('f', 1, int)
+#define OCFS2_IOC32_SETFLAGS	_IOW('f', 2, int)
+
+/*
+ * Space reservation / allocation / free ioctls and argument structure
+ * are designed to be compatible with XFS.
+ *
+ * ALLOCSP* and FREESP* are not and will never be supported, but are
+ * included here for completeness.
+ */
+struct ocfs2_space_resv {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start;
+	__s64		l_len;		/* len == 0 means until end of file */
+	__s32		l_sysid;
+	__u32		l_pid;
+	__s32		l_pad[4];	/* reserve area			    */
+};
+
+#define OCFS2_IOC_ALLOCSP		_IOW ('X', 10, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP		_IOW ('X', 11, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP		_IOW ('X', 40, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP	_IOW ('X', 41, struct ocfs2_space_resv)
+#define OCFS2_IOC_ALLOCSP64	_IOW ('X', 36, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP64	_IOW ('X', 37, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP64	_IOW ('X', 42, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP64	_IOW ('X', 43, struct ocfs2_space_resv)
+
+/* Used to pass group descriptor data when online resize is done */
+struct ocfs2_new_group_input {
+	__u64 group;		/* Group descriptor's blkno. */
+	__u32 clusters;		/* Total number of clusters in this group */
+	__u32 frees;		/* Total free clusters in this group */
+	__u16 chain;		/* Chain for this group */
+	__u16 reserved1;
+	__u32 reserved2;
+};
+
+#define OCFS2_IOC_GROUP_EXTEND	_IOW('o', 1, int)
+#define OCFS2_IOC_GROUP_ADD	_IOW('o', 2,struct ocfs2_new_group_input)
+#define OCFS2_IOC_GROUP_ADD64	_IOW('o', 3,struct ocfs2_new_group_input)
+
+/* Used to pass 2 file names to reflink. */
+struct reflink_arguments {
+	__u64 old_path;
+	__u64 new_path;
+	__u64 preserve;
+};
+#define OCFS2_IOC_REFLINK	_IOW('o', 4, struct reflink_arguments)
+
+#endif /* OCFS2_IOCTL_H */
-- 
cgit v1.2.3


From 4b1ae27a96d9860e6c4348673e8fb6a0322511fe Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 3 Mar 2010 12:58:31 -0500
Subject: Revert "autofs4: always use lookup for lookup"

This reverts commit 213614d583748d00967a91cacd656f417efb36ce.

Alas, ->d_revalidate() can't rely on ->lookup() finishing what
it's started; if d_alloc() in do_lookup() fails, we are not going
to call ->lookup() at all.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |   7 -
 fs/autofs4/expire.c   |   6 +-
 fs/autofs4/inode.c    |   1 -
 fs/autofs4/root.c     | 474 +++++++++++++++++---------------------------------
 4 files changed, 158 insertions(+), 330 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 0118d67221b2..3d283abf67d7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -60,11 +60,6 @@ do {							\
 		current->pid, __func__, ##args);	\
 } while (0)
 
-struct rehash_entry {
-	struct task_struct *task;
-	struct list_head list;
-};
-
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
    structure.  It holds a reference to the dentry, so dentries are never
@@ -81,7 +76,6 @@ struct autofs_info {
 
 	struct list_head active;
 	int active_count;
-	struct list_head rehash_list;
 
 	struct list_head expiring;
 
@@ -104,7 +98,6 @@ struct autofs_info {
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
 #define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
-#define AUTOFS_INF_REHASH	(1<<3) /* dentry in transit to ->lookup() */
 
 struct autofs_wait_queue {
 	wait_queue_head_t queue;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 74bc9aa6df31..a796c9417fb1 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -279,7 +279,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 			root->d_mounted--;
 		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
-		autofs4_add_expiring(root);
 		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
@@ -407,7 +406,6 @@ found:
 		expired, (int)expired->d_name.len, expired->d_name.name);
 	ino = autofs4_dentry_ino(expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
-	autofs4_add_expiring(expired);
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	spin_lock(&dcache_lock);
@@ -435,7 +433,7 @@ int autofs4_expire_wait(struct dentry *dentry)
 
 		DPRINTK("expire done status=%d", status);
 
-		if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode))
+		if (d_unhashed(dentry))
 			return -EAGAIN;
 
 		return status;
@@ -475,7 +473,6 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	autofs4_del_expiring(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -506,7 +503,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
 		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
-		autofs4_del_expiring(dentry);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d0a3de247458..4670a7818eac 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,7 +49,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 		ino->dentry = NULL;
 		ino->size = 0;
 		INIT_LIST_HEAD(&ino->active);
-		INIT_LIST_HEAD(&ino->rehash_list);
 		ino->active_count = 0;
 		INIT_LIST_HEAD(&ino->expiring);
 		atomic_set(&ino->count, 0);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 30cc9ddf4b70..a015b49891df 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -104,99 +104,6 @@ static void autofs4_del_active(struct dentry *dentry)
 	return;
 }
 
-static void autofs4_add_rehash_entry(struct autofs_info *ino,
-				     struct rehash_entry *entry)
-{
-	entry->task = current;
-	INIT_LIST_HEAD(&entry->list);
-	list_add(&entry->list, &ino->rehash_list);
-	return;
-}
-
-static void autofs4_remove_rehash_entry(struct autofs_info *ino)
-{
-	struct list_head *head = &ino->rehash_list;
-	struct rehash_entry *entry;
-	list_for_each_entry(entry, head, list) {
-		if (entry->task == current) {
-			list_del(&entry->list);
-			kfree(entry);
-			break;
-		}
-	}
-	return;
-}
-
-static void autofs4_remove_rehash_entrys(struct autofs_info *ino)
-{
-	struct autofs_sb_info *sbi = ino->sbi;
-	struct rehash_entry *entry, *next;
-	struct list_head *head;
-
-	spin_lock(&sbi->fs_lock);
-	spin_lock(&sbi->lookup_lock);
-	if (!(ino->flags & AUTOFS_INF_REHASH)) {
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&sbi->fs_lock);
-		return;
-	}
-	ino->flags &= ~AUTOFS_INF_REHASH;
-	head = &ino->rehash_list;
-	list_for_each_entry_safe(entry, next, head, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&sbi->fs_lock);
-	dput(ino->dentry);
-
-	return;
-}
-
-static void autofs4_revalidate_drop(struct dentry *dentry,
-				    struct rehash_entry *entry)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	/*
-	 * Add to the active list so we can pick this up in
-	 * ->lookup(). Also add an entry to a rehash list so
-	 * we know when there are no dentrys in flight so we
-	 * know when we can rehash the dentry.
-	 */
-	spin_lock(&sbi->lookup_lock);
-	if (list_empty(&ino->active))
-		list_add(&ino->active, &sbi->active_list);
-	autofs4_add_rehash_entry(ino, entry);
-	spin_unlock(&sbi->lookup_lock);
-	if (!(ino->flags & AUTOFS_INF_REHASH)) {
-		ino->flags |= AUTOFS_INF_REHASH;
-		dget(dentry);
-		spin_lock(&dentry->d_lock);
-		__d_drop(dentry);
-		spin_unlock(&dentry->d_lock);
-	}
-	return;
-}
-
-static void autofs4_revalidate_rehash(struct dentry *dentry)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	if (ino->flags & AUTOFS_INF_REHASH) {
-		spin_lock(&sbi->lookup_lock);
-		autofs4_remove_rehash_entry(ino);
-		if (list_empty(&ino->rehash_list)) {
-			spin_unlock(&sbi->lookup_lock);
-			ino->flags &= ~AUTOFS_INF_REHASH;
-			d_rehash(dentry);
-			dput(ino->dentry);
-		} else
-			spin_unlock(&sbi->lookup_lock);
-	}
-	return;
-}
-
 static unsigned int autofs4_need_mount(unsigned int flags)
 {
 	unsigned int res = 0;
@@ -236,7 +143,7 @@ out:
 	return dcache_dir_open(inode, file);
 }
 
-static int try_to_fill_dentry(struct dentry *dentry)
+static int try_to_fill_dentry(struct dentry *dentry, int flags)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -249,17 +156,55 @@ static int try_to_fill_dentry(struct dentry *dentry)
 	 * Wait for a pending mount, triggering one if there
 	 * isn't one already
 	 */
-	DPRINTK("waiting for mount name=%.*s",
-		 dentry->d_name.len, dentry->d_name.name);
+	if (dentry->d_inode == NULL) {
+		DPRINTK("waiting for mount name=%.*s",
+			 dentry->d_name.len, dentry->d_name.name);
 
-	status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 
-	DPRINTK("mount done status=%d", status);
+		DPRINTK("mount done status=%d", status);
 
-	/* Update expiry counter */
-	ino->last_used = jiffies;
+		/* Turn this into a real negative dentry? */
+		if (status == -ENOENT) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
+			return status;
+		} else if (status) {
+			/* Return a negative dentry, but leave it "pending" */
+			return status;
+		}
+	/* Trigger mount for path component or follow link */
+	} else if (ino->flags & AUTOFS_INF_PENDING ||
+			autofs4_need_mount(flags) ||
+			current->link_count) {
+		DPRINTK("waiting for mount name=%.*s",
+			dentry->d_name.len, dentry->d_name.name);
 
-	return status;
+		spin_lock(&sbi->fs_lock);
+		ino->flags |= AUTOFS_INF_PENDING;
+		spin_unlock(&sbi->fs_lock);
+		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+
+		DPRINTK("mount done status=%d", status);
+
+		if (status) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
+			return status;
+		}
+	}
+
+	/* Initialize expiry counter after successful mount */
+	if (ino)
+		ino->last_used = jiffies;
+
+	spin_lock(&sbi->fs_lock);
+	ino->flags &= ~AUTOFS_INF_PENDING;
+	spin_unlock(&sbi->fs_lock);
+
+	return 0;
 }
 
 /* For autofs direct mounts the follow link triggers the mount */
@@ -313,16 +258,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	 */
 	if (ino->flags & AUTOFS_INF_PENDING ||
 	    (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&dcache_lock);
 		spin_unlock(&sbi->fs_lock);
 
-		status = try_to_fill_dentry(dentry);
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-
+		status = try_to_fill_dentry(dentry, 0);
 		if (status)
 			goto out_error;
 
@@ -361,47 +300,18 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	struct rehash_entry *entry;
+	int oz_mode = autofs4_oz_mode(sbi);
 	int flags = nd ? nd->flags : 0;
-	unsigned int mutex_aquired;
+	int status = 1;
 
-	DPRINTK("name = %.*s oz_mode = %d",
-		dentry->d_name.len, dentry->d_name.name, oz_mode);
-
-	/* Daemon never causes a mount to trigger */
-	if (autofs4_oz_mode(sbi))
-		return 1;
-
-	entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL);
-	if (!entry)
-		return -ENOMEM;
-
-	mutex_aquired = mutex_trylock(&dir->i_mutex);
-
-	spin_lock(&sbi->fs_lock);
-	spin_lock(&dcache_lock);
 	/* Pending dentry */
+	spin_lock(&sbi->fs_lock);
 	if (autofs4_ispending(dentry)) {
-		int status;
-
-		/*
-		 * We can only unhash and send this to ->lookup() if
-		 * the directory mutex is held over d_revalidate() and
-		 * ->lookup(). This prevents the VFS from incorrectly
-		 * seeing the dentry as non-existent.
-		 */
-		ino->flags |= AUTOFS_INF_PENDING;
-		if (!mutex_aquired) {
-			autofs4_revalidate_drop(dentry, entry);
-			spin_unlock(&dcache_lock);
-			spin_unlock(&sbi->fs_lock);
-			return 0;
-		}
-		spin_unlock(&dcache_lock);
+		/* The daemon never causes a mount to trigger */
 		spin_unlock(&sbi->fs_lock);
-		mutex_unlock(&dir->i_mutex);
-		kfree(entry);
+
+		if (oz_mode)
+			return 1;
 
 		/*
 		 * If the directory has gone away due to an expire
@@ -415,82 +325,45 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		 * A zero status is success otherwise we have a
 		 * negative error code.
 		 */
-		status = try_to_fill_dentry(dentry);
-
-		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
-		spin_unlock(&sbi->fs_lock);
-
+		status = try_to_fill_dentry(dentry, flags);
 		if (status == 0)
 			return 1;
 
 		return status;
 	}
+	spin_unlock(&sbi->fs_lock);
+
+	/* Negative dentry.. invalidate if "old" */
+	if (dentry->d_inode == NULL)
+		return 0;
 
 	/* Check for a non-mountpoint directory with no contents */
+	spin_lock(&dcache_lock);
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 	    !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		DPRINTK("dentry=%p %.*s, emptydir",
 			 dentry, dentry->d_name.len, dentry->d_name.name);
+		spin_unlock(&dcache_lock);
 
-		if (autofs4_need_mount(flags) || current->link_count) {
-			int status;
-
-			/*
-			 * We can only unhash and send this to ->lookup() if
-			 * the directory mutex is held over d_revalidate() and
-			 * ->lookup(). This prevents the VFS from incorrectly
-			 * seeing the dentry as non-existent.
-			 */
-			ino->flags |= AUTOFS_INF_PENDING;
-			if (!mutex_aquired) {
-				autofs4_revalidate_drop(dentry, entry);
-				spin_unlock(&dcache_lock);
-				spin_unlock(&sbi->fs_lock);
-				return 0;
-			}
-			spin_unlock(&dcache_lock);
-			spin_unlock(&sbi->fs_lock);
-			mutex_unlock(&dir->i_mutex);
-			kfree(entry);
-
-			/*
-			 * A zero status is success otherwise we have a
-			 * negative error code.
-			 */
-			status = try_to_fill_dentry(dentry);
-
-			spin_lock(&sbi->fs_lock);
-			ino->flags &= ~AUTOFS_INF_PENDING;
-			spin_unlock(&sbi->fs_lock);
+		/* The daemon never causes a mount to trigger */
+		if (oz_mode)
+			return 1;
 
-			if (status == 0)
-				return 1;
+		/*
+		 * A zero status is success otherwise we have a
+		 * negative error code.
+		 */
+		status = try_to_fill_dentry(dentry, flags);
+		if (status == 0)
+			return 1;
 
-			return status;
-		}
+		return status;
 	}
 	spin_unlock(&dcache_lock);
-	spin_unlock(&sbi->fs_lock);
-
-	if (mutex_aquired)
-		mutex_unlock(&dir->i_mutex);
-
-	kfree(entry);
 
 	return 1;
 }
 
-static void autofs4_free_rehash_entrys(struct autofs_info *inf)
-{
-	struct list_head *head = &inf->rehash_list;
-	struct rehash_entry *entry, *next;
-	list_for_each_entry_safe(entry, next, head, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-}
-
 void autofs4_dentry_release(struct dentry *de)
 {
 	struct autofs_info *inf;
@@ -509,8 +382,6 @@ void autofs4_dentry_release(struct dentry *de)
 				list_del(&inf->active);
 			if (!list_empty(&inf->expiring))
 				list_del(&inf->expiring);
-			if (!list_empty(&inf->rehash_list))
-				autofs4_free_rehash_entrys(inf);
 			spin_unlock(&sbi->lookup_lock);
 		}
 
@@ -543,7 +414,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-restart:
 	spin_lock(&dcache_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
@@ -561,19 +431,6 @@ restart:
 		if (atomic_read(&active->d_count) == 0)
 			goto next;
 
-		if (active->d_inode && IS_DEADDIR(active->d_inode)) {
-			if (!list_empty(&ino->rehash_list)) {
-				dget(active);
-				spin_unlock(&active->d_lock);
-				spin_unlock(&sbi->lookup_lock);
-				spin_unlock(&dcache_lock);
-				autofs4_remove_rehash_entrys(ino);
-				dput(active);
-				goto restart;
-			}
-			goto next;
-		}
-
 		qstr = &active->d_name;
 
 		if (active->d_name.hash != hash)
@@ -586,11 +443,13 @@ restart:
 		if (memcmp(qstr->name, str, len))
 			goto next;
 
-		dget(active);
-		spin_unlock(&active->d_lock);
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&dcache_lock);
-		return active;
+		if (d_unhashed(active)) {
+			dget(active);
+			spin_unlock(&active->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			spin_unlock(&dcache_lock);
+			return active;
+		}
 next:
 		spin_unlock(&active->d_lock);
 	}
@@ -639,11 +498,13 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 		if (memcmp(qstr->name, str, len))
 			goto next;
 
-		dget(expiring);
-		spin_unlock(&expiring->d_lock);
-		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&dcache_lock);
-		return expiring;
+		if (d_unhashed(expiring)) {
+			dget(expiring);
+			spin_unlock(&expiring->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			spin_unlock(&dcache_lock);
+			return expiring;
+		}
 next:
 		spin_unlock(&expiring->d_lock);
 	}
@@ -653,48 +514,6 @@ next:
 	return NULL;
 }
 
-static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi,
-					   struct dentry *dentry, int oz_mode)
-{
-	struct autofs_info *ino;
-
-	/*
-	 * Mark the dentry incomplete but don't hash it. We do this
-	 * to serialize our inode creation operations (symlink and
-	 * mkdir) which prevents deadlock during the callback to
-	 * the daemon. Subsequent user space lookups for the same
-	 * dentry are placed on the wait queue while the daemon
-	 * itself is allowed passage unresticted so the create
-	 * operation itself can then hash the dentry. Finally,
-	 * we check for the hashed dentry and return the newly
-	 * hashed dentry.
-	 */
-	dentry->d_op = &autofs4_root_dentry_operations;
-
-	/*
-	 * And we need to ensure that the same dentry is used for
-	 * all following lookup calls until it is hashed so that
-	 * the dentry flags are persistent throughout the request.
-	 */
-	ino = autofs4_init_ino(NULL, sbi, 0555);
-	if (!ino)
-		return ERR_PTR(-ENOMEM);
-
-	dentry->d_fsdata = ino;
-	ino->dentry = dentry;
-
-	/*
-	 * Only set the mount pending flag for new dentrys not created
-	 * by the daemon.
-	 */
-	if (!oz_mode)
-		ino->flags |= AUTOFS_INF_PENDING;
-
-	d_instantiate(dentry, NULL);
-
-	return ino;
-}
-
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
@@ -702,7 +521,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	struct autofs_info *ino;
 	struct dentry *expiring, *active;
 	int oz_mode;
-	int status = 0;
 
 	DPRINTK("name = %.*s",
 		dentry->d_name.len, dentry->d_name.name);
@@ -717,26 +535,44 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	spin_lock(&sbi->fs_lock);
 	active = autofs4_lookup_active(dentry);
 	if (active) {
 		dentry = active;
 		ino = autofs4_dentry_ino(dentry);
-		/* If this came from revalidate, rehash it */
-		autofs4_revalidate_rehash(dentry);
-		spin_unlock(&sbi->fs_lock);
 	} else {
-		spin_unlock(&sbi->fs_lock);
-		ino = init_new_dentry(sbi, dentry, oz_mode);
-		if (IS_ERR(ino))
-			return (struct dentry *) ino;
-	}
+		/*
+		 * Mark the dentry incomplete but don't hash it. We do this
+		 * to serialize our inode creation operations (symlink and
+		 * mkdir) which prevents deadlock during the callback to
+		 * the daemon. Subsequent user space lookups for the same
+		 * dentry are placed on the wait queue while the daemon
+		 * itself is allowed passage unresticted so the create
+		 * operation itself can then hash the dentry. Finally,
+		 * we check for the hashed dentry and return the newly
+		 * hashed dentry.
+		 */
+		dentry->d_op = &autofs4_root_dentry_operations;
+
+		/*
+		 * And we need to ensure that the same dentry is used for
+		 * all following lookup calls until it is hashed so that
+		 * the dentry flags are persistent throughout the request.
+		 */
+		ino = autofs4_init_ino(NULL, sbi, 0555);
+		if (!ino)
+			return ERR_PTR(-ENOMEM);
 
-	autofs4_add_active(dentry);
+		dentry->d_fsdata = ino;
+		ino->dentry = dentry;
+
+		autofs4_add_active(dentry);
+
+		d_instantiate(dentry, NULL);
+	}
 
 	if (!oz_mode) {
-		expiring = autofs4_lookup_expiring(dentry);
 		mutex_unlock(&dir->i_mutex);
+		expiring = autofs4_lookup_expiring(dentry);
 		if (expiring) {
 			/*
 			 * If we are racing with expire the request might not
@@ -744,22 +580,23 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 			 * so it must have been successful, so just wait for it.
 			 */
 			autofs4_expire_wait(expiring);
+			autofs4_del_expiring(expiring);
 			dput(expiring);
 		}
-		status = try_to_fill_dentry(dentry);
-		mutex_lock(&dir->i_mutex);
+
 		spin_lock(&sbi->fs_lock);
-		ino->flags &= ~AUTOFS_INF_PENDING;
+		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&sbi->fs_lock);
+		if (dentry->d_op && dentry->d_op->d_revalidate)
+			(dentry->d_op->d_revalidate)(dentry, nd);
+		mutex_lock(&dir->i_mutex);
 	}
 
-	autofs4_del_active(dentry);
-
 	/*
-	 * If we had a mount fail, check if we had to handle
+	 * If we are still pending, check if we had to handle
 	 * a signal. If so we can force a restart..
 	 */
-	if (status) {
+	if (ino->flags & AUTOFS_INF_PENDING) {
 		/* See if we were interrupted */
 		if (signal_pending(current)) {
 			sigset_t *sigset = &current->pending.signal;
@@ -771,46 +608,43 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 			    return ERR_PTR(-ERESTARTNOINTR);
 			}
 		}
-	}
-
-	/*
-	 * User space can (and has done in the past) remove and re-create
-	 * this directory during the callback. This can leave us with an
-	 * unhashed dentry, but a successful mount!  So we need to
-	 * perform another cached lookup in case the dentry now exists.
-	 */
-	if (!oz_mode && !have_submounts(dentry)) {
-		struct dentry *new;
-		new = d_lookup(dentry->d_parent, &dentry->d_name);
-		if (new) {
-			if (active)
-				dput(active);
-			return new;
-		} else {
-			if (!status)
-				status = -ENOENT;
+		if (!oz_mode) {
+			spin_lock(&sbi->fs_lock);
+			ino->flags &= ~AUTOFS_INF_PENDING;
+			spin_unlock(&sbi->fs_lock);
 		}
 	}
 
 	/*
-	 * If we had a mount failure, return status to user space.
-	 * If the mount succeeded and we used a dentry from the active queue
-	 * return it.
+	 * If this dentry is unhashed, then we shouldn't honour this
+	 * lookup.  Returning ENOENT here doesn't do the right thing
+	 * for all system calls, but it should be OK for the operations
+	 * we permit from an autofs.
 	 */
-	if (status) {
-		dentry = ERR_PTR(status);
-		if (active)
-			dput(active);
-		return dentry;
-	} else {
+	if (!oz_mode && d_unhashed(dentry)) {
 		/*
-		 * Valid successful mount, return active dentry or NULL
-		 * for a new dentry.
+		 * A user space application can (and has done in the past)
+		 * remove and re-create this directory during the callback.
+		 * This can leave us with an unhashed dentry, but a
+		 * successful mount!  So we need to perform another
+		 * cached lookup in case the dentry now exists.
 		 */
+		struct dentry *parent = dentry->d_parent;
+		struct dentry *new = d_lookup(parent, &dentry->d_name);
+		if (new != NULL)
+			dentry = new;
+		else
+			dentry = ERR_PTR(-ENOENT);
+
 		if (active)
-			return active;
+			dput(active);
+
+		return dentry;
 	}
 
+	if (active)
+		return active;
+
 	return NULL;
 }
 
@@ -834,6 +668,8 @@ static int autofs4_dir_symlink(struct inode *dir,
 	if (!ino)
 		return -ENOMEM;
 
+	autofs4_del_active(dentry);
+
 	ino->size = strlen(symname);
 	cp = kmalloc(ino->size + 1, GFP_KERNEL);
 	if (!cp) {
@@ -910,6 +746,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	dir->i_mtime = CURRENT_TIME;
 
 	spin_lock(&dcache_lock);
+	autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -935,6 +772,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
+	autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -972,6 +810,8 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (!ino)
 		return -ENOMEM;
 
+	autofs4_del_active(dentry);
+
 	inode = autofs4_get_inode(dir->i_sb, ino);
 	if (!inode) {
 		if (!dentry->d_fsdata)
-- 
cgit v1.2.3


From ad2a722f196d2b014f49e6c37e072df71eb3695f Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 12 Jan 2010 15:13:47 +0200
Subject: libfs: Open code simple_commit_write into only user

* simple_commit_write was only called by simple_write_end.
  Open coding it makes it tiny bit less heavy on the arithmetic and
  much more readable.

* While at it use zero_user() for clearing a partial page.
* While at it add a docbook comment for simple_write_end.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c | 59 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index 6e8d17e1dc4c..cd88abdcb436 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -370,40 +370,51 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
 	return simple_prepare_write(file, page, from, from+len);
 }
 
-static int simple_commit_write(struct file *file, struct page *page,
-			       unsigned from, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
-	if (!PageUptodate(page))
-		SetPageUptodate(page);
-	/*
-	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold the i_mutex.
-	 */
-	if (pos > inode->i_size)
-		i_size_write(inode, pos);
-	set_page_dirty(page);
-	return 0;
-}
-
+/**
+ * simple_write_end - .write_end helper for non-block-device FSes
+ * @available: See .write_end of address_space_operations
+ * @file: 		"
+ * @mapping: 		"
+ * @pos: 		"
+ * @len: 		"
+ * @copied: 		"
+ * @page: 		"
+ * @fsdata: 		"
+ *
+ * simple_write_end does the minimum needed for updating a page after writing is
+ * done. It has the same API signature as the .write_end of
+ * address_space_operations vector. So it can just be set onto .write_end for
+ * FSes that don't need any other processing. i_mutex is assumed to be held.
+ * Block based filesystems should use generic_write_end().
+ * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
+ * is not called, so a filesystem that actually does store data in .write_inode
+ * should extend on what's done here with a call to mark_inode_dirty() in the
+ * case that i_size has changed.
+ */
 int simple_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	struct inode *inode = page->mapping->host;
+	loff_t last_pos = pos + copied;
 
 	/* zero the stale part of the page if we did a short copy */
 	if (copied < len) {
-		void *kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + from + copied, 0, len - copied);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+		zero_user(page, from + copied, len - copied);
 	}
 
-	simple_commit_write(file, page, from, from+copied);
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold the i_mutex.
+	 */
+	if (last_pos > inode->i_size)
+		i_size_write(inode, last_pos);
 
+	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
 
-- 
cgit v1.2.3


From 193cf4b99113a4550598ba9e8343e591fc062e23 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 12 Jan 2010 16:18:08 +0200
Subject: libfs: Unexport and kill simple_prepare_write

Remove the EXPORT_UNUSED_SYMBOL of simple_prepare_write

Collapse simple_prepare_write into it's only caller, though
making it simpler and clearer to understand.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c         | 22 ++++++----------------
 include/linux/fs.h |  2 --
 2 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index cd88abdcb436..9e50bcf55857 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -338,28 +338,14 @@ int simple_readpage(struct file *file, struct page *page)
 	return 0;
 }
 
-int simple_prepare_write(struct file *file, struct page *page,
-			unsigned from, unsigned to)
-{
-	if (!PageUptodate(page)) {
-		if (to - from != PAGE_CACHE_SIZE)
-			zero_user_segments(page,
-				0, from,
-				to, PAGE_CACHE_SIZE);
-	}
-	return 0;
-}
-
 int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
 	struct page *page;
 	pgoff_t index;
-	unsigned from;
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	from = pos & (PAGE_CACHE_SIZE - 1);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
@@ -367,7 +353,12 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
 
 	*pagep = page;
 
-	return simple_prepare_write(file, page, from, from+len);
+	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+		zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
+	}
+	return 0;
 }
 
 /**
@@ -864,7 +855,6 @@ EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
 EXPORT_SYMBOL(simple_lookup);
 EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_UNUSED_SYMBOL(simple_prepare_write);
 EXPORT_SYMBOL(simple_readpage);
 EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ebb1cd5bc241..2b124c825e38 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2340,8 +2340,6 @@ extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct
 extern int simple_sync_file(struct file *, struct dentry *, int);
 extern int simple_empty(struct dentry *);
 extern int simple_readpage(struct file *file, struct page *page);
-extern int simple_prepare_write(struct file *file, struct page *page,
-			unsigned offset, unsigned to);
 extern int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata);
-- 
cgit v1.2.3


From 587d4a17d837ac0f17edb26f1b6c80c0abda6343 Mon Sep 17 00:00:00 2001
From: "Helight.Xu" <helight.xu@gmail.com>
Date: Wed, 30 Dec 2009 13:24:41 +0800
Subject: some clean up in fs/proc

EXPORT_SYMBOL(proc_symlink);
EXPORT_SYMBOL(proc_mkdir);
EXPORT_SYMBOL(create_proc_entry);
EXPORT_SYMBOL(proc_create_data);
EXPORT_SYMBOL(remove_proc_entry);

Those EXPORT_SYMBOL shouldn't be in fs/proc/root.c,
should be in fs/proc/generic.c.

Signed-off-by: Helight.Xu <helight.xu@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/generic.c | 5 +++++
 fs/proc/root.c    | 6 ------
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 480cb1065eec..9580abeadeb3 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -662,6 +662,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
 	}
 	return ent;
 }
+EXPORT_SYMBOL(proc_symlink);
 
 struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
 		struct proc_dir_entry *parent)
@@ -700,6 +701,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
 {
 	return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
 }
+EXPORT_SYMBOL(proc_mkdir);
 
 struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
 					 struct proc_dir_entry *parent)
@@ -728,6 +730,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
 	}
 	return ent;
 }
+EXPORT_SYMBOL(create_proc_entry);
 
 struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
 					struct proc_dir_entry *parent,
@@ -762,6 +765,7 @@ out_free:
 out:
 	return NULL;
 }
+EXPORT_SYMBOL(proc_create_data);
 
 static void free_proc_entry(struct proc_dir_entry *de)
 {
@@ -853,3 +857,4 @@ continue_removing:
 			de->parent->name, de->name, de->subdir->name);
 	pde_put(de);
 }
+EXPORT_SYMBOL(remove_proc_entry);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b791d9e3..757c069f2a65 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -220,9 +220,3 @@ void pid_ns_release_proc(struct pid_namespace *ns)
 {
 	mntput(ns->proc_mnt);
 }
-
-EXPORT_SYMBOL(proc_symlink);
-EXPORT_SYMBOL(proc_mkdir);
-EXPORT_SYMBOL(create_proc_entry);
-EXPORT_SYMBOL(proc_create_data);
-EXPORT_SYMBOL(remove_proc_entry);
-- 
cgit v1.2.3


From ec4f860597af41c6b71f4de86d8e86f710bfab54 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Tue, 5 Jan 2010 13:45:18 -0700
Subject: fs/dcache.c: CodingStyle cleanup

Cleanup EXPORT* macros according to Documantation/CodingStyle.

Move EXPORT* macros to the line immediately after the closing
function brace.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 45 ++++++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 953173a293a9..4365998b8df4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -257,6 +257,7 @@ kill_it:
 	if (dentry)
 		goto repeat;
 }
+EXPORT_SYMBOL(dput);
 
 /**
  * d_invalidate - invalidate a dentry
@@ -314,6 +315,7 @@ int d_invalidate(struct dentry * dentry)
 	spin_unlock(&dcache_lock);
 	return 0;
 }
+EXPORT_SYMBOL(d_invalidate);
 
 /* This should be called _only_ with dcache_lock held */
 
@@ -328,6 +330,7 @@ struct dentry * dget_locked(struct dentry *dentry)
 {
 	return __dget_locked(dentry);
 }
+EXPORT_SYMBOL(dget_locked);
 
 /**
  * d_find_alias - grab a hashed alias of inode
@@ -384,6 +387,7 @@ struct dentry * d_find_alias(struct inode *inode)
 	}
 	return de;
 }
+EXPORT_SYMBOL(d_find_alias);
 
 /*
  *	Try to kill dentries associated with this inode.
@@ -408,6 +412,7 @@ restart:
 	}
 	spin_unlock(&dcache_lock);
 }
+EXPORT_SYMBOL(d_prune_aliases);
 
 /*
  * Throw away a dentry - free the inode, dput the parent.  This requires that
@@ -610,6 +615,7 @@ void shrink_dcache_sb(struct super_block * sb)
 {
 	__shrink_dcache_sb(sb, NULL, 0);
 }
+EXPORT_SYMBOL(shrink_dcache_sb);
 
 /*
  * destroy a single subtree of dentries for unmount
@@ -792,6 +798,7 @@ positive:
 	spin_unlock(&dcache_lock);
 	return 1;
 }
+EXPORT_SYMBOL(have_submounts);
 
 /*
  * Search the dentry child list for the specified parent,
@@ -876,6 +883,7 @@ void shrink_dcache_parent(struct dentry * parent)
 	while ((found = select_parent(parent)) != 0)
 		__shrink_dcache_sb(sb, &found, 0);
 }
+EXPORT_SYMBOL(shrink_dcache_parent);
 
 /*
  * Scan `nr' dentries and return the number which remain.
@@ -968,6 +976,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 
 	return dentry;
 }
+EXPORT_SYMBOL(d_alloc);
 
 struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 {
@@ -1012,6 +1021,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 }
+EXPORT_SYMBOL(d_instantiate);
 
 /**
  * d_instantiate_unique - instantiate a non-aliased dentry
@@ -1108,6 +1118,7 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 	}
 	return res;
 }
+EXPORT_SYMBOL(d_alloc_root);
 
 static inline struct hlist_head *d_hash(struct dentry *parent,
 					unsigned long hash)
@@ -1225,6 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		d_add(dentry, inode);
 	return new;
 }
+EXPORT_SYMBOL(d_splice_alias);
 
 /**
  * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -1314,6 +1326,7 @@ err_out:
 	iput(inode);
 	return ERR_PTR(error);
 }
+EXPORT_SYMBOL(d_add_ci);
 
 /**
  * d_lookup - search for a dentry
@@ -1357,6 +1370,7 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 	} while (read_seqretry(&rename_lock, seq));
 	return dentry;
 }
+EXPORT_SYMBOL(d_lookup);
 
 struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 {
@@ -1483,6 +1497,7 @@ int d_validate(struct dentry *dentry, struct dentry *dparent)
 out:
 	return 0;
 }
+EXPORT_SYMBOL(d_validate);
 
 /*
  * When a file is deleted, we have two options:
@@ -1528,6 +1543,7 @@ void d_delete(struct dentry * dentry)
 
 	fsnotify_nameremove(dentry, isdir);
 }
+EXPORT_SYMBOL(d_delete);
 
 static void __d_rehash(struct dentry * entry, struct hlist_head *list)
 {
@@ -1556,6 +1572,7 @@ void d_rehash(struct dentry * entry)
 	spin_unlock(&entry->d_lock);
 	spin_unlock(&dcache_lock);
 }
+EXPORT_SYMBOL(d_rehash);
 
 /*
  * When switching names, the actual string doesn't strictly have to
@@ -1702,6 +1719,7 @@ void d_move(struct dentry * dentry, struct dentry * target)
 	d_move_locked(dentry, target);
 	spin_unlock(&dcache_lock);
 }
+EXPORT_SYMBOL(d_move);
 
 /**
  * d_ancestor - search for an ancestor
@@ -1868,6 +1886,7 @@ shouldnt_be_hashed:
 	spin_unlock(&dcache_lock);
 	BUG();
 }
+EXPORT_SYMBOL_GPL(d_materialise_unique);
 
 static int prepend(char **buffer, int *buflen, const char *str, int namelen)
 {
@@ -2005,6 +2024,7 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	path_put(&root);
 	return res;
 }
+EXPORT_SYMBOL(d_path);
 
 /*
  * Helper function for dentry_operations.d_dname() members
@@ -2228,6 +2248,7 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name)
 	}
 	return ino;
 }
+EXPORT_SYMBOL(find_inode_number);
 
 static __initdata unsigned long dhash_entries;
 static int __init set_dhash_entries(char *str)
@@ -2297,6 +2318,7 @@ static void __init dcache_init(void)
 
 /* SLAB cache for __getname() consumers */
 struct kmem_cache *names_cachep __read_mostly;
+EXPORT_SYMBOL(names_cachep);
 
 EXPORT_SYMBOL(d_genocide);
 
@@ -2326,26 +2348,3 @@ void __init vfs_caches_init(unsigned long mempages)
 	bdev_cache_init();
 	chrdev_init();
 }
-
-EXPORT_SYMBOL(d_alloc);
-EXPORT_SYMBOL(d_alloc_root);
-EXPORT_SYMBOL(d_delete);
-EXPORT_SYMBOL(d_find_alias);
-EXPORT_SYMBOL(d_instantiate);
-EXPORT_SYMBOL(d_invalidate);
-EXPORT_SYMBOL(d_lookup);
-EXPORT_SYMBOL(d_move);
-EXPORT_SYMBOL_GPL(d_materialise_unique);
-EXPORT_SYMBOL(d_path);
-EXPORT_SYMBOL(d_prune_aliases);
-EXPORT_SYMBOL(d_rehash);
-EXPORT_SYMBOL(d_splice_alias);
-EXPORT_SYMBOL(d_add_ci);
-EXPORT_SYMBOL(d_validate);
-EXPORT_SYMBOL(dget_locked);
-EXPORT_SYMBOL(dput);
-EXPORT_SYMBOL(find_inode_number);
-EXPORT_SYMBOL(have_submounts);
-EXPORT_SYMBOL(names_cachep);
-EXPORT_SYMBOL(shrink_dcache_parent);
-EXPORT_SYMBOL(shrink_dcache_sb);
-- 
cgit v1.2.3


From d208bbdda991b8808d9c033ce4d31cb1bd87dcfc Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 21 Dec 2009 16:28:53 -0800
Subject: fs: improve remount,ro vs buffercache coherency

Invalidate sb->s_bdev on remount,ro.

Fixes a problem reported by Jorge Boncompte who is seeing corruption
trying to snapshot a minix filesystem image.  Some filesystems modify
their metadata via a path other than the bdev buffer cache (eg.  they may
use a private linear mapping for their metadata, or implement directories
in pagecache, etc).  Also, file data modifications usually go to the bdev
via their own mappings.

These updates are not coherent with buffercache IO (eg.  via /dev/bdev)
and never have been.  However there could be a reasonable expectation that
after a mount -oremount,ro operation then the buffercache should
subsequently be coherent with previous filesystem modifications.

So invalidate the bdev mappings on a remount,ro operation to provide a
coherency point.

The problem was exposed when we switched the old rd to brd because old rd
didn't really function like a normal block device and updates to rd via
mappings other than the buffercache would still end up going into its
buffercache.  But the same problem has always affected other "normal"
block devices, including loop.

[akpm@linux-foundation.org: repair comment layout]
Reported-by: "Jorge Boncompte [DTI2]" <jorge@dti2.net>
Tested-by: "Jorge Boncompte [DTI2]" <jorge@dti2.net>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index aff046b0fe78..903896ec7c73 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -568,7 +568,7 @@ out:
 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
-	int remount_rw;
+	int remount_rw, remount_ro;
 
 	if (sb->s_frozen != SB_UNFROZEN)
 		return -EBUSY;
@@ -583,9 +583,12 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
+	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
+
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
-	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
+	if (remount_ro) {
 		if (force)
 			mark_files_ro(sb);
 		else if (!fs_may_remount_ro(sb))
@@ -594,7 +597,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 		if (retval < 0 && retval != -ENOSYS)
 			return -EBUSY;
 	}
-	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
@@ -604,6 +606,16 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 	if (remount_rw)
 		vfs_dq_quota_on_remount(sb);
+	/*
+	 * Some filesystems modify their metadata via some other path than the
+	 * bdev buffer cache (eg. use a private mapping, or directories in
+	 * pagecache, etc). Also file data modifications go via their own
+	 * mappings. So If we try to mount readonly then copy the filesystem
+	 * from bdev, we could get stale data, so invalidate it to give a best
+	 * effort at coherency.
+	 */
+	if (remount_ro && sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 8737c9305bd5602b11f7eb4655d5695d4a42a0c6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 06:47:55 -0500
Subject: Switch may_open() and break_lease() to passing O_...

... instead of mixing FMODE_ and O_

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/drivers/mconsole_kern.c |  2 +-
 fs/cifs/file.c                  |  4 ++--
 fs/locks.c                      |  5 +++--
 fs/namei.c                      | 10 +++++-----
 fs/nfsctl.c                     |  5 ++---
 fs/nfsd/vfs.c                   |  4 ++--
 fs/open.c                       |  2 +-
 kernel/sysctl_binary.c          |  7 ++-----
 8 files changed, 18 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 3b3c36601a7b..de317d0c3294 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -140,7 +140,7 @@ void mconsole_proc(struct mc_request *req)
 		goto out;
 	}
 
-	err = may_open(&nd.path, MAY_READ, FMODE_READ);
+	err = may_open(&nd.path, MAY_READ, O_RDONLY);
 	if (result) {
 		mconsole_reply(req, "Failed to open file", 1, 0);
 		path_put(&nd.path);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 057e1dae12ab..3d8f8a96f5a3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2289,9 +2289,9 @@ cifs_oplock_break(struct slow_work *work)
 	if (inode && S_ISREG(inode->i_mode)) {
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 		if (cinode->clientCanCacheAll == 0)
-			break_lease(inode, FMODE_READ);
+			break_lease(inode, O_RDONLY);
 		else if (cinode->clientCanCacheRead == 0)
-			break_lease(inode, FMODE_WRITE);
+			break_lease(inode, O_WRONLY);
 #endif
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
diff --git a/fs/locks.c b/fs/locks.c
index a8794f233bc9..ae9ded026b7c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1182,8 +1182,9 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	struct file_lock *fl;
 	unsigned long break_time;
 	int i_have_this_lease = 0;
+	int want_write = (mode & O_ACCMODE) != O_RDONLY;
 
-	new_fl = lease_alloc(NULL, mode & FMODE_WRITE ? F_WRLCK : F_RDLCK);
+	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 
 	lock_kernel();
 
@@ -1197,7 +1198,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 		if (fl->fl_owner == current->files)
 			i_have_this_lease = 1;
 
-	if (mode & FMODE_WRITE) {
+	if (want_write) {
 		/* If we want write access, we have to revoke any lease. */
 		future = F_UNLCK | F_INPROGRESS;
 	} else if (flock->fl_type & F_INPROGRESS) {
diff --git a/fs/namei.c b/fs/namei.c
index a4855af776a8..b20f83d1e065 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1503,7 +1503,7 @@ int may_open(struct path *path, int acc_mode, int flag)
 	 * An append-only file must be opened in append mode for writing.
 	 */
 	if (IS_APPEND(inode)) {
-		if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
+		if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
 			return -EPERM;
 		if (flag & O_TRUNC)
 			return -EPERM;
@@ -1547,7 +1547,7 @@ static int handle_truncate(struct path *path)
  * what get passed to sys_open().
  */
 static int __open_namei_create(struct nameidata *nd, struct path *path,
-				int flag, int mode)
+				int open_flag, int mode)
 {
 	int error;
 	struct dentry *dir = nd->path.dentry;
@@ -1565,7 +1565,7 @@ out_unlock:
 	if (error)
 		return error;
 	/* Don't check for write permission, don't truncate */
-	return may_open(&nd->path, 0, flag & ~O_TRUNC);
+	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
 }
 
 /*
@@ -1736,7 +1736,7 @@ do_last:
 		error = mnt_want_write(nd.path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(&nd, &path, flag, mode);
+		error = __open_namei_create(&nd, &path, open_flag, mode);
 		if (error) {
 			mnt_drop_write(nd.path.mnt);
 			goto exit;
@@ -1798,7 +1798,7 @@ ok:
 		if (error)
 			goto exit;
 	}
-	error = may_open(&nd.path, acc_mode, flag);
+	error = may_open(&nd.path, acc_mode, open_flag);
 	if (error) {
 		if (will_truncate)
 			mnt_drop_write(nd.path.mnt);
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index d3854d94b7cf..bf9cbd242ddd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -36,10 +36,9 @@ static struct file *do_open(char *name, int flags)
 		return ERR_PTR(error);
 
 	if (flags == O_RDWR)
-		error = may_open(&nd.path, MAY_READ|MAY_WRITE,
-					   FMODE_READ|FMODE_WRITE);
+		error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
 	else
-		error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
+		error = may_open(&nd.path, MAY_WRITE, flags);
 
 	if (!error)
 		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8715d194561a..15dc2deaac5f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -361,7 +361,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		 * If we are changing the size of the file, then
 		 * we need to break all leases.
 		 */
-		host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
+		host_err = break_lease(inode, O_WRONLY | O_NONBLOCK);
 		if (host_err == -EWOULDBLOCK)
 			host_err = -ETIMEDOUT;
 		if (host_err) /* ENOMEM or EWOULDBLOCK */
@@ -734,7 +734,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * Check to see if there are any leases on this file.
 	 * This may block while leases are broken.
 	 */
-	host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0));
+	host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0));
 	if (host_err == -EWOULDBLOCK)
 		host_err = -ETIMEDOUT;
 	if (host_err) /* NOMEM or WOULDBLOCK */
diff --git a/fs/open.c b/fs/open.c
index 040cef72bc00..e0b2d88b0380 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -271,7 +271,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 	 * Make sure that there are no leases.  get_write_access() protects
 	 * against the truncate racing with a lease-granting setlease().
 	 */
-	error = break_lease(inode, FMODE_WRITE);
+	error = break_lease(inode, O_WRONLY);
 	if (error)
 		goto put_write_and_out;
 
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8f5d16e0707a..8cd50d8f9bde 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 	ssize_t result;
 	char *pathname;
 	int flags;
-	int acc_mode, fmode;
+	int acc_mode;
 
 	pathname = sysctl_getname(name, nlen, &table);
 	result = PTR_ERR(pathname);
@@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 	if (oldval && oldlen && newval && newlen) {
 		flags = O_RDWR;
 		acc_mode = MAY_READ | MAY_WRITE;
-		fmode = FMODE_READ | FMODE_WRITE;
 	} else if (newval && newlen) {
 		flags = O_WRONLY;
 		acc_mode = MAY_WRITE;
-		fmode = FMODE_WRITE;
 	} else if (oldval && oldlen) {
 		flags = O_RDONLY;
 		acc_mode = MAY_READ;
-		fmode = FMODE_READ;
 	} else {
 		result = 0;
 		goto out_putname;
@@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 	if (result)
 		goto out_putname;
 
-	result = may_open(&nd.path, acc_mode, fmode);
+	result = may_open(&nd.path, acc_mode, flags);
 	if (result)
 		goto out_putpath;
 
-- 
cgit v1.2.3


From c177c2ac8c5aa83ed181db44543c3b38fd1f17a6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 14 Jan 2010 00:59:16 -0500
Subject: Switch gfs2 to nd_set_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/ops_inode.c | 113 +++++++++++++---------------------------------------
 1 file changed, 27 insertions(+), 86 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 84350e1be66d..4e64352d49de 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -976,122 +976,62 @@ out:
 }
 
 /**
- * gfs2_readlinki - return the contents of a symlink
- * @ip: the symlink's inode
- * @buf: a pointer to the buffer to be filled
- * @len: a pointer to the length of @buf
+ * gfs2_follow_link - Follow a symbolic link
+ * @dentry: The dentry of the link
+ * @nd: Data that we pass to vfs_follow_link()
  *
- * If @buf is too small, a piece of memory is kmalloc()ed and needs
- * to be freed by the caller.
+ * This can handle symlinks of any size.
  *
- * Returns: errno
+ * Returns: 0 on success or error code
  */
 
-static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
+static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
 	struct gfs2_holder i_gh;
 	struct buffer_head *dibh;
 	unsigned int x;
+	char *buf;
 	int error;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
 	error = gfs2_glock_nq(&i_gh);
 	if (error) {
 		gfs2_holder_uninit(&i_gh);
-		return error;
+		nd_set_link(nd, ERR_PTR(error));
+		return NULL;
 	}
 
 	if (!ip->i_disksize) {
 		gfs2_consist_inode(ip);
-		error = -EIO;
+		buf = ERR_PTR(-EIO);
 		goto out;
 	}
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
+	if (error) {
+		buf = ERR_PTR(error);
 		goto out;
-
-	x = ip->i_disksize + 1;
-	if (x > *len) {
-		*buf = kmalloc(x, GFP_NOFS);
-		if (!*buf) {
-			error = -ENOMEM;
-			goto out_brelse;
-		}
 	}
 
-	memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
-	*len = x;
-
-out_brelse:
+	x = ip->i_disksize + 1;
+	buf = kmalloc(x, GFP_NOFS);
+	if (!buf)
+		buf = ERR_PTR(-ENOMEM);
+	else
+		memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
 	brelse(dibh);
 out:
 	gfs2_glock_dq_uninit(&i_gh);
-	return error;
-}
-
-/**
- * gfs2_readlink - Read the value of a symlink
- * @dentry: the symlink
- * @buf: the buffer to read the symlink data into
- * @size: the size of the buffer
- *
- * Returns: errno
- */
-
-static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
-			 int user_size)
-{
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	char array[GFS2_FAST_NAME_SIZE], *buf = array;
-	unsigned int len = GFS2_FAST_NAME_SIZE;
-	int error;
-
-	error = gfs2_readlinki(ip, &buf, &len);
-	if (error)
-		return error;
-
-	if (user_size > len - 1)
-		user_size = len - 1;
-
-	if (copy_to_user(user_buf, buf, user_size))
-		error = -EFAULT;
-	else
-		error = user_size;
-
-	if (buf != array)
-		kfree(buf);
-
-	return error;
+	nd_set_link(nd, buf);
+	return NULL;
 }
 
-/**
- * gfs2_follow_link - Follow a symbolic link
- * @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
- *
- * This can handle symlinks of any size. It is optimised for symlinks
- * under GFS2_FAST_NAME_SIZE.
- *
- * Returns: 0 on success or error code
- */
-
-static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
-	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
-	char array[GFS2_FAST_NAME_SIZE], *buf = array;
-	unsigned int len = GFS2_FAST_NAME_SIZE;
-	int error;
-
-	error = gfs2_readlinki(ip, &buf, &len);
-	if (!error) {
-		error = vfs_follow_link(nd, buf);
-		if (buf != array)
-			kfree(buf);
-	} else
-		path_put(&nd->path);
-
-	return ERR_PTR(error);
+	char *s = nd_get_link(nd);
+	if (!IS_ERR(s))
+		kfree(s);
 }
 
 /**
@@ -1426,8 +1366,9 @@ const struct inode_operations gfs2_dir_iops = {
 };
 
 const struct inode_operations gfs2_symlink_iops = {
-	.readlink = gfs2_readlink,
+	.readlink = generic_readlink,
 	.follow_link = gfs2_follow_link,
+	.put_link = gfs2_put_link,
 	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
 	.getattr = gfs2_getattr,
-- 
cgit v1.2.3


From 796a6b521d0eadb338adf8cf7e482351c3a8a7b4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 16 Jan 2010 13:28:47 -0500
Subject: Kill CL_PROPAGATION, sanitize fs/pnode.c:get_source()

First of all, get_source() never results in CL_PROPAGATION
alone.  We either get CL_MAKE_SHARED (for the continuation
of peer group) or CL_SLAVE (slave that is not shared) or both
(beginning of peer group among slaves).  Massage the code to
make that explicit, kill CL_PROPAGATION test in clone_mnt()
(nothing sets CL_MAKE_SHARED without CL_PROPAGATION and in
clone_mnt() we are checking CL_PROPAGATION after we'd found
that there's no CL_SLAVE, so the check for CL_MAKE_SHARED
would do just as well).

Fix comments, while we are at it...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c |  2 +-
 fs/pnode.c     | 28 +++++++++++++++++-----------
 fs/pnode.h     |  3 +--
 3 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index c768f733c8d6..25c1dcf9e9eb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -573,7 +573,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 			mnt->mnt_master = old;
 			CLEAR_MNT_SHARED(mnt);
 		} else if (!(flag & CL_PRIVATE)) {
-			if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+			if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
 				list_add(&mnt->mnt_share, &old->mnt_share);
 			if (IS_MNT_SLAVE(old))
 				list_add(&mnt->mnt_slave, &old->mnt_slave);
diff --git a/fs/pnode.c b/fs/pnode.c
index 8d5f392ec3d3..5cc564a83149 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -86,7 +86,7 @@ static int do_make_slave(struct vfsmount *mnt)
 
 	/*
 	 * slave 'mnt' to a peer mount that has the
-	 * same root dentry. If none is available than
+	 * same root dentry. If none is available then
 	 * slave it to anything that is available.
 	 */
 	while ((peer_mnt = next_peer(peer_mnt)) != mnt &&
@@ -147,6 +147,11 @@ void change_mnt_propagation(struct vfsmount *mnt, int type)
  * get the next mount in the propagation tree.
  * @m: the mount seen last
  * @origin: the original mount from where the tree walk initiated
+ *
+ * Note that peer groups form contiguous segments of slave lists.
+ * We rely on that in get_source() to be able to find out if
+ * vfsmount found while iterating with propagation_next() is
+ * a peer of one we'd found earlier.
  */
 static struct vfsmount *propagation_next(struct vfsmount *m,
 					 struct vfsmount *origin)
@@ -186,10 +191,6 @@ static struct vfsmount *get_source(struct vfsmount *dest,
 {
 	struct vfsmount *p_last_src = NULL;
 	struct vfsmount *p_last_dest = NULL;
-	*type = CL_PROPAGATION;
-
-	if (IS_MNT_SHARED(dest))
-		*type |= CL_MAKE_SHARED;
 
 	while (last_dest != dest->mnt_master) {
 		p_last_dest = last_dest;
@@ -202,13 +203,18 @@ static struct vfsmount *get_source(struct vfsmount *dest,
 		do {
 			p_last_dest = next_peer(p_last_dest);
 		} while (IS_MNT_NEW(p_last_dest));
+		/* is that a peer of the earlier? */
+		if (dest == p_last_dest) {
+			*type = CL_MAKE_SHARED;
+			return p_last_src;
+		}
 	}
-
-	if (dest != p_last_dest) {
-		*type |= CL_SLAVE;
-		return last_src;
-	} else
-		return p_last_src;
+	/* slave of the earlier, then */
+	*type = CL_SLAVE;
+	/* beginning of peer group among the slaves? */
+	if (IS_MNT_SHARED(dest))
+		*type |= CL_MAKE_SHARED;
+	return last_src;
 }
 
 /*
diff --git a/fs/pnode.h b/fs/pnode.h
index 958665d662af..6c7ef3252a26 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -21,8 +21,7 @@
 #define CL_SLAVE     		0x02
 #define CL_COPY_ALL 		0x04
 #define CL_MAKE_SHARED 		0x08
-#define CL_PROPAGATION 		0x10
-#define CL_PRIVATE 		0x20
+#define CL_PRIVATE 		0x10
 
 static inline void set_mnt_shared(struct vfsmount *mnt)
 {
-- 
cgit v1.2.3


From f598f9f1252b33410ffc52f51e117645ac5116c4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 23 Jan 2010 20:08:53 -0500
Subject: Sanitize autofs_dev_ioctl_ismountpoint()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/dev-ioctl.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 00bf8fcb245f..c8a80dffb455 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -544,10 +544,9 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 			goto out;
 		devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
 		err = 0;
-		if (path.dentry->d_inode &&
-		    path.mnt->mnt_root == path.dentry) {
+		if (path.mnt->mnt_root == path.dentry) {
 			err = 1;
-			magic = path.dentry->d_inode->i_sb->s_magic;
+			magic = path.mnt->mnt_sb->s_magic;
 		}
 	} else {
 		dev_t dev = sbi->sb->s_dev;
@@ -560,10 +559,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 		err = have_submounts(path.dentry);
 
-		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
-			if (follow_down(&path))
-				magic = path.mnt->mnt_sb->s_magic;
-		}
+		if (follow_down(&path))
+			magic = path.mnt->mnt_sb->s_magic;
 	}
 
 	param->ismountpoint.out.devid = devid;
-- 
cgit v1.2.3


From 3899167dbd6832a3d8d7171b425257ad46b6c40c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 23 Jan 2010 20:10:29 -0500
Subject: Get rid of mnt_mountpoint abuses in ext4

path to mnt/mnt->mnt_root is no worse than that to
mnt->mnt_parent/mnt->mnt_mountpoint *and* needs no
pinning the sucker down (mnt is not going away and
mnt->mnt_root won't change)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/file.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583cef28..56eee3d796c2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -116,11 +116,9 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		 * devices or filesystem images.
 		 */
 		memset(buf, 0, sizeof(buf));
-		path.mnt = mnt->mnt_parent;
-		path.dentry = mnt->mnt_mountpoint;
-		path_get(&path);
+		path.mnt = mnt;
+		path.dentry = mnt->mnt_root;
 		cp = d_path(&path, buf, sizeof(buf));
-		path_put(&path);
 		if (!IS_ERR(cp)) {
 			memcpy(sbi->s_es->s_last_mounted, cp,
 			       sizeof(sbi->s_es->s_last_mounted));
-- 
cgit v1.2.3


From 5b7e934d887c67fe093b61f1308bc2d9c49381ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Jan 2010 00:28:52 -0500
Subject: Use kill_litter_super() in autofs4 ->kill_sb()

... and get rid of open-coding its guts (i.e. RIP autofs4_force_release())

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/inode.c | 62 +-----------------------------------------------------
 1 file changed, 1 insertion(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 4670a7818eac..821b2b955dac 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -96,63 +96,6 @@ void autofs4_free_ino(struct autofs_info *ino)
 	kfree(ino);
 }
 
-/*
- * Deal with the infamous "Busy inodes after umount ..." message.
- *
- * Clean up the dentry tree. This happens with autofs if the user
- * space program goes away due to a SIGKILL, SIGSEGV etc.
- */
-static void autofs4_force_release(struct autofs_sb_info *sbi)
-{
-	struct dentry *this_parent = sbi->sb->s_root;
-	struct list_head *next;
-
-	if (!sbi->sb->s_root)
-		return;
-
-	spin_lock(&dcache_lock);
-repeat:
-	next = this_parent->d_subdirs.next;
-resume:
-	while (next != &this_parent->d_subdirs) {
-		struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child);
-
-		/* Negative dentry - don`t care */
-		if (!simple_positive(dentry)) {
-			next = next->next;
-			continue;
-		}
-
-		if (!list_empty(&dentry->d_subdirs)) {
-			this_parent = dentry;
-			goto repeat;
-		}
-
-		next = next->next;
-		spin_unlock(&dcache_lock);
-
-		DPRINTK("dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
-		dput(dentry);
-		spin_lock(&dcache_lock);
-	}
-
-	if (this_parent != sbi->sb->s_root) {
-		struct dentry *dentry = this_parent;
-
-		next = this_parent->d_u.d_child.next;
-		this_parent = this_parent->d_parent;
-		spin_unlock(&dcache_lock);
-		DPRINTK("parent dentry %p %.*s",
-			dentry, (int)dentry->d_name.len, dentry->d_name.name);
-		dput(dentry);
-		spin_lock(&dcache_lock);
-		goto resume;
-	}
-	spin_unlock(&dcache_lock);
-}
-
 void autofs4_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(sb);
@@ -169,15 +112,12 @@ void autofs4_kill_sb(struct super_block *sb)
 	/* Free wait queues, close pipe */
 	autofs4_catatonic_mode(sbi);
 
-	/* Clean up and release dangling references */
-	autofs4_force_release(sbi);
-
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 
 out_kill_sb:
 	DPRINTK("shutting down");
-	kill_anon_super(sb);
+	kill_litter_super(sb);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
-- 
cgit v1.2.3


From 495d6c9c6595ec7b37910dfd42634839431d21fd Mon Sep 17 00:00:00 2001
From: Valerie Aurora <vaurora@redhat.com>
Date: Tue, 26 Jan 2010 14:20:47 -0500
Subject: VFS: Clean up shared mount flag propagation

The handling of mount flags in set_mnt_shared() got a little tangled
up during previous cleanups, with the following problems:

* MNT_PNODE_MASK is defined as a literal constant when it should be a
bitwise xor of other MNT_* flags
* set_mnt_shared() clears and then sets MNT_SHARED (part of MNT_PNODE_MASK)
* MNT_PNODE_MASK could use a comment in mount.h
* MNT_PNODE_MASK is a terrible name, change to MNT_SHARED_MASK

This patch fixes these problems.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        |  2 +-
 fs/pnode.h            |  2 +-
 include/linux/mount.h | 11 ++++++++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 25c1dcf9e9eb..d25d4602ab50 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1538,7 +1538,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
 		spin_lock(&vfsmount_lock);
-		mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK;
+		mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
 		path->mnt->mnt_flags = mnt_flags;
 		spin_unlock(&vfsmount_lock);
 	}
diff --git a/fs/pnode.h b/fs/pnode.h
index 6c7ef3252a26..1ea4ae1efcd3 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -25,7 +25,7 @@
 
 static inline void set_mnt_shared(struct vfsmount *mnt)
 {
-	mnt->mnt_flags &= ~MNT_PNODE_MASK;
+	mnt->mnt_flags &= ~MNT_SHARED_MASK;
 	mnt->mnt_flags |= MNT_SHARED;
 }
 
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5d5275364867..375d43a5d802 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -34,7 +34,16 @@ struct mnt_namespace;
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
-#define MNT_PNODE_MASK	0x3000	/* propagation flag mask */
+/*
+ * MNT_SHARED_MASK is the set of flags that should be cleared when a
+ * mount becomes shared.  Currently, this is only the flag that says a
+ * mount cannot be bind mounted, since this is how we create a mount
+ * that shares events with another mount.  If you add a new MNT_*
+ * flag, consider how it interacts with shared mounts.
+ */
+#define MNT_SHARED_MASK	(MNT_UNBINDABLE)
+#define MNT_PROPAGATION_MASK	(MNT_SHARED | MNT_UNBINDABLE)
+
 
 struct vfsmount {
 	struct list_head mnt_hash;
-- 
cgit v1.2.3


From 2096f759abcb42200a81d776f597362fd9265024 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 30 Jan 2010 13:16:21 -0500
Subject: New helper: path_is_under(path1, path2)

Analog of is_subdir for vfsmount,dentry pairs, moved from audit_tree.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c         | 24 ++++++++++++++++++++++++
 include/linux/fs.h  |  1 +
 kernel/audit_tree.c | 51 ++++++++++++---------------------------------------
 3 files changed, 37 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 4365998b8df4..74da947b160b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2191,6 +2191,30 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 	return result;
 }
 
+int path_is_under(struct path *path1, struct path *path2)
+{
+	struct vfsmount *mnt = path1->mnt;
+	struct dentry *dentry = path1->dentry;
+	int res;
+	spin_lock(&vfsmount_lock);
+	if (mnt != path2->mnt) {
+		for (;;) {
+			if (mnt->mnt_parent == mnt) {
+				spin_unlock(&vfsmount_lock);
+				return 0;
+			}
+			if (mnt->mnt_parent == path2->mnt)
+				break;
+			mnt = mnt->mnt_parent;
+		}
+		dentry = mnt->mnt_mountpoint;
+	}
+	res = is_subdir(dentry, path2->dentry);
+	spin_unlock(&vfsmount_lock);
+	return res;
+}
+EXPORT_SYMBOL(path_is_under);
+
 void d_genocide(struct dentry *root)
 {
 	struct dentry *this_parent = root;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d443c9dd3caa..8d53bc17f93f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2126,6 +2126,7 @@ extern struct file * open_exec(const char *);
  
 /* fs/dcache.c -- generic fs support functions */
 extern int is_subdir(struct dentry *, struct dentry *);
+extern int path_is_under(struct path *, struct path *);
 extern ino_t find_inode_number(struct dentry *, struct qstr *);
 
 #include <linux/err.h>
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..f09b42d9c32d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -603,22 +603,6 @@ skip_it:
 	mutex_unlock(&audit_filter_mutex);
 }
 
-static int is_under(struct vfsmount *mnt, struct dentry *dentry,
-		    struct path *path)
-{
-	if (mnt != path->mnt) {
-		for (;;) {
-			if (mnt->mnt_parent == mnt)
-				return 0;
-			if (mnt->mnt_parent == path->mnt)
-					break;
-			mnt = mnt->mnt_parent;
-		}
-		dentry = mnt->mnt_mountpoint;
-	}
-	return is_subdir(dentry, path->dentry);
-}
-
 int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
 {
 
@@ -714,29 +698,24 @@ int audit_tag_tree(char *old, char *new)
 {
 	struct list_head cursor, barrier;
 	int failed = 0;
-	struct path path;
+	struct path path1, path2;
 	struct vfsmount *tagged;
 	struct list_head list;
-	struct vfsmount *mnt;
-	struct dentry *dentry;
 	int err;
 
-	err = kern_path(new, 0, &path);
+	err = kern_path(new, 0, &path2);
 	if (err)
 		return err;
-	tagged = collect_mounts(&path);
-	path_put(&path);
+	tagged = collect_mounts(&path2);
+	path_put(&path2);
 	if (!tagged)
 		return -ENOMEM;
 
-	err = kern_path(old, 0, &path);
+	err = kern_path(old, 0, &path1);
 	if (err) {
 		drop_collected_mounts(tagged);
 		return err;
 	}
-	mnt = mntget(path.mnt);
-	dentry = dget(path.dentry);
-	path_put(&path);
 
 	list_add_tail(&list, &tagged->mnt_list);
 
@@ -747,6 +726,7 @@ int audit_tag_tree(char *old, char *new)
 	while (cursor.next != &tree_list) {
 		struct audit_tree *tree;
 		struct vfsmount *p;
+		int good_one = 0;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
 		get_tree(tree);
@@ -754,23 +734,17 @@ int audit_tag_tree(char *old, char *new)
 		list_add(&cursor, &tree->list);
 		mutex_unlock(&audit_filter_mutex);
 
-		err = kern_path(tree->pathname, 0, &path);
-		if (err) {
-			put_tree(tree);
-			mutex_lock(&audit_filter_mutex);
-			continue;
+		err = kern_path(tree->pathname, 0, &path2);
+		if (!err) {
+			good_one = path_is_under(&path1, &path2);
+			path_put(&path2);
 		}
 
-		spin_lock(&vfsmount_lock);
-		if (!is_under(mnt, dentry, &path)) {
-			spin_unlock(&vfsmount_lock);
-			path_put(&path);
+		if (!good_one) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
 			continue;
 		}
-		spin_unlock(&vfsmount_lock);
-		path_put(&path);
 
 		list_for_each_entry(p, &list, mnt_list) {
 			failed = tag_chunk(p->mnt_root->d_inode, tree);
@@ -820,8 +794,7 @@ int audit_tag_tree(char *old, char *new)
 	list_del(&cursor);
 	list_del(&list);
 	mutex_unlock(&audit_filter_mutex);
-	dput(dentry);
-	mntput(mnt);
+	path_put(&path1);
 	drop_collected_mounts(tagged);
 	return failed;
 }
-- 
cgit v1.2.3


From 6eae7974d0490a9dbc3091f702ea1650871652a9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 30 Jan 2010 13:44:07 -0500
Subject: Switch alloc_nfs_open_context() to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f141bde7756a..7570573bdb30 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -574,14 +574,14 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
 	nfs_revalidate_inode(server, inode);
 }
 
-static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
+static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cred *cred)
 {
 	struct nfs_open_context *ctx;
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
-		ctx->path.dentry = dget(dentry);
-		ctx->path.mnt = mntget(mnt);
+		ctx->path = *path;
+		path_get(&ctx->path);
 		ctx->cred = get_rpccred(cred);
 		ctx->state = NULL;
 		ctx->lockowner = current->files;
@@ -686,7 +686,7 @@ int nfs_open(struct inode *inode, struct file *filp)
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred);
+	ctx = alloc_nfs_open_context(&filp->f_path, cred);
 	put_rpccred(cred);
 	if (ctx == NULL)
 		return -ENOMEM;
-- 
cgit v1.2.3


From f694869709cc39a5fbde21aa40f22999ddad0e6e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 30 Jan 2010 13:51:04 -0500
Subject: a couple of mntget+dget -> path_get in nfs4proc

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/nfs4proc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 375f0fae2c6a..84d83be25a98 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -724,8 +724,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid);
 	if (p->o_arg.seqid == NULL)
 		goto err_free;
-	p->path.mnt = mntget(path->mnt);
-	p->path.dentry = dget(path->dentry);
+	path_get(path);
+	p->path = *path;
 	p->dir = parent;
 	p->owner = sp;
 	atomic_inc(&sp->so_count);
@@ -1944,8 +1944,8 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->res.seqid = calldata->arg.seqid;
 	calldata->res.server = server;
 	calldata->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
-	calldata->path.mnt = mntget(path->mnt);
-	calldata->path.dentry = dget(path->dentry);
+	path_get(path);
+	calldata->path = *path;
 
 	msg.rpc_argp = &calldata->arg,
 	msg.rpc_resp = &calldata->res,
-- 
cgit v1.2.3


From 3088dd7080d1ecc6d18c27ef9e617cbbd2a2e51e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 30 Jan 2010 15:47:29 -0500
Subject: Clean follow_dotdot() up a bit

No need to open-code follow_up() in it and locking can be lighter.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b20f83d1e065..3df2ed50ab57 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -689,33 +689,20 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 	set_root(nd);
 
 	while(1) {
-		struct vfsmount *parent;
 		struct dentry *old = nd->path.dentry;
 
 		if (nd->path.dentry == nd->root.dentry &&
 		    nd->path.mnt == nd->root.mnt) {
 			break;
 		}
-		spin_lock(&dcache_lock);
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			nd->path.dentry = dget(nd->path.dentry->d_parent);
-			spin_unlock(&dcache_lock);
+			/* rare case of legitimate dget_parent()... */
+			nd->path.dentry = dget_parent(nd->path.dentry);
 			dput(old);
 			break;
 		}
-		spin_unlock(&dcache_lock);
-		spin_lock(&vfsmount_lock);
-		parent = nd->path.mnt->mnt_parent;
-		if (parent == nd->path.mnt) {
-			spin_unlock(&vfsmount_lock);
+		if (!follow_up(&nd->path))
 			break;
-		}
-		mntget(parent);
-		nd->path.dentry = dget(nd->path.mnt->mnt_mountpoint);
-		spin_unlock(&vfsmount_lock);
-		dput(old);
-		mntput(nd->path.mnt);
-		nd->path.mnt = parent;
 	}
 	follow_mount(&nd->path);
 }
-- 
cgit v1.2.3


From 462d60577a997aa87c935ae4521bd303733a9f2b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 30 Jan 2010 16:11:21 -0500
Subject: fix NFS4 handling of mountpoint stat

RFC says we need to follow the chain of mounts if there's more
than one stacked on that point.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs4xdr.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a8587e90fd5a..bbf72d8f9fc0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2121,9 +2121,15 @@ out_acl:
 		 * and this is the root of a cross-mounted filesystem.
 		 */
 		if (ignore_crossmnt == 0 &&
-		    exp->ex_path.mnt->mnt_root->d_inode == dentry->d_inode) {
-			err = vfs_getattr(exp->ex_path.mnt->mnt_parent,
-				exp->ex_path.mnt->mnt_mountpoint, &stat);
+		    dentry == exp->ex_path.mnt->mnt_root) {
+			struct path path = exp->ex_path;
+			path_get(&path);
+			while (follow_up(&path)) {
+				if (path.dentry != path.mnt->mnt_root)
+					break;
+			}
+			err = vfs_getattr(path.mnt, path.dentry, &stat);
+			path_put(&path);
 			if (err)
 				goto out_nfserr;
 		}
-- 
cgit v1.2.3


From 1f707137b55764740981d022d29c622832a61880 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 30 Jan 2010 22:51:25 -0500
Subject: new helper: iterate_mounts()

apply function to vfsmounts in set returned by collect_mounts(),
stop if it returns non-zero.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c      | 15 +++++++++++++++
 include/linux/fs.h  |  3 ++-
 kernel/audit_tree.c | 49 ++++++++++++++++---------------------------------
 3 files changed, 33 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index d25d4602ab50..d5906c19e08e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1246,6 +1246,21 @@ void drop_collected_mounts(struct vfsmount *mnt)
 	release_mounts(&umount_list);
 }
 
+int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
+		   struct vfsmount *root)
+{
+	struct vfsmount *mnt;
+	int res = f(root, arg);
+	if (res)
+		return res;
+	list_for_each_entry(mnt, &root->mnt_list, mnt_list) {
+		res = f(mnt, arg);
+		if (res)
+			return res;
+	}
+	return 0;
+}
+
 static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
 {
 	struct vfsmount *p;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8d53bc17f93f..e764f247d0ab 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1794,7 +1794,8 @@ extern int may_umount(struct vfsmount *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
 extern struct vfsmount *collect_mounts(struct path *);
 extern void drop_collected_mounts(struct vfsmount *);
-
+extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
+			  struct vfsmount *);
 extern int vfs_statfs(struct dentry *, struct kstatfs *);
 
 extern int current_umask(void);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f09b42d9c32d..028e85663f27 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
 	return 0;
 }
 
+static int compare_root(struct vfsmount *mnt, void *arg)
+{
+	return mnt->mnt_root->d_inode == arg;
+}
+
 void audit_trim_trees(void)
 {
 	struct list_head cursor;
@@ -559,7 +564,6 @@ void audit_trim_trees(void)
 		struct path path;
 		struct vfsmount *root_mnt;
 		struct node *node;
-		struct list_head list;
 		int err;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +581,16 @@ void audit_trim_trees(void)
 		if (!root_mnt)
 			goto skip_it;
 
-		list_add_tail(&list, &root_mnt->mnt_list);
 		spin_lock(&hash_lock);
 		list_for_each_entry(node, &tree->chunks, list) {
-			struct audit_chunk *chunk = find_chunk(node);
-			struct inode *inode = chunk->watch.inode;
-			struct vfsmount *mnt;
+			struct inode *inode = find_chunk(node)->watch.inode;
 			node->index |= 1U<<31;
-			list_for_each_entry(mnt, &list, mnt_list) {
-				if (mnt->mnt_root->d_inode == inode) {
-					node->index &= ~(1U<<31);
-					break;
-				}
-			}
+			if (iterate_mounts(compare_root, inode, root_mnt))
+				node->index &= ~(1U<<31);
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
 		put_tree(tree);
-		list_del_init(&list);
 		drop_collected_mounts(root_mnt);
 skip_it:
 		mutex_lock(&audit_filter_mutex);
@@ -622,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree)
 	put_tree(tree);
 }
 
+static int tag_mount(struct vfsmount *mnt, void *arg)
+{
+	return tag_chunk(mnt->mnt_root->d_inode, arg);
+}
+
 /* called with audit_filter_mutex */
 int audit_add_tree_rule(struct audit_krule *rule)
 {
 	struct audit_tree *seed = rule->tree, *tree;
 	struct path path;
-	struct vfsmount *mnt, *p;
-	struct list_head list;
+	struct vfsmount *mnt;
 	int err;
 
 	list_for_each_entry(tree, &tree_list, list) {
@@ -654,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
 		err = -ENOMEM;
 		goto Err;
 	}
-	list_add_tail(&list, &mnt->mnt_list);
 
 	get_tree(tree);
-	list_for_each_entry(p, &list, mnt_list) {
-		err = tag_chunk(p->mnt_root->d_inode, tree);
-		if (err)
-			break;
-	}
-
-	list_del(&list);
+	err = iterate_mounts(tag_mount, tree, mnt);
 	drop_collected_mounts(mnt);
 
 	if (!err) {
@@ -700,7 +693,6 @@ int audit_tag_tree(char *old, char *new)
 	int failed = 0;
 	struct path path1, path2;
 	struct vfsmount *tagged;
-	struct list_head list;
 	int err;
 
 	err = kern_path(new, 0, &path2);
@@ -717,15 +709,12 @@ int audit_tag_tree(char *old, char *new)
 		return err;
 	}
 
-	list_add_tail(&list, &tagged->mnt_list);
-
 	mutex_lock(&audit_filter_mutex);
 	list_add(&barrier, &tree_list);
 	list_add(&cursor, &barrier);
 
 	while (cursor.next != &tree_list) {
 		struct audit_tree *tree;
-		struct vfsmount *p;
 		int good_one = 0;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
@@ -746,12 +735,7 @@ int audit_tag_tree(char *old, char *new)
 			continue;
 		}
 
-		list_for_each_entry(p, &list, mnt_list) {
-			failed = tag_chunk(p->mnt_root->d_inode, tree);
-			if (failed)
-				break;
-		}
-
+		failed = iterate_mounts(tag_mount, tree, tagged);
 		if (failed) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
@@ -792,7 +776,6 @@ int audit_tag_tree(char *old, char *new)
 	}
 	list_del(&barrier);
 	list_del(&cursor);
-	list_del(&list);
 	mutex_unlock(&audit_filter_mutex);
 	path_put(&path1);
 	drop_collected_mounts(tagged);
-- 
cgit v1.2.3


From 7e7742ee005c887b86fd1fd38d5b48419329dfa0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 31 Jan 2010 17:09:29 -0500
Subject: sanitize signedness/const for pointers to char in hpfs a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hpfs/anode.c   |  2 +-
 fs/hpfs/dentry.c  | 14 +++++++------
 fs/hpfs/dir.c     | 14 ++++++-------
 fs/hpfs/dnode.c   | 21 +++++++++++--------
 fs/hpfs/ea.c      |  7 ++++---
 fs/hpfs/hpfs_fn.h | 30 +++++++++++++++++-----------
 fs/hpfs/inode.c   |  4 ++--
 fs/hpfs/map.c     |  6 +++---
 fs/hpfs/name.c    | 21 +++++++++----------
 fs/hpfs/namei.c   | 60 +++++++++++++++++++++++++++----------------------------
 10 files changed, 97 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c
index 1aa88c4e0964..6a2f04bf3df0 100644
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -353,7 +353,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos,
 }
 
 int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos,
-	     unsigned len, char *buf)
+	     unsigned len, const char *buf)
 {
 	struct buffer_head *bh;
 	char *data;
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 940d6d150bee..67d9d36b3d5f 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -20,8 +20,8 @@ static int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr)
 
 	if (l == 1) if (qstr->name[0]=='.') goto x;
 	if (l == 2) if (qstr->name[0]=='.' || qstr->name[1]=='.') goto x;
-	hpfs_adjust_length((char *)qstr->name, &l);
-	/*if (hpfs_chk_name((char *)qstr->name,&l))*/
+	hpfs_adjust_length(qstr->name, &l);
+	/*if (hpfs_chk_name(qstr->name,&l))*/
 		/*return -ENAMETOOLONG;*/
 		/*return -ENOENT;*/
 	x:
@@ -38,14 +38,16 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
 {
 	unsigned al=a->len;
 	unsigned bl=b->len;
-	hpfs_adjust_length((char *)a->name, &al);
-	/*hpfs_adjust_length((char *)b->name, &bl);*/
+	hpfs_adjust_length(a->name, &al);
+	/*hpfs_adjust_length(b->name, &bl);*/
 	/* 'a' is the qstr of an already existing dentry, so the name
 	 * must be valid. 'b' must be validated first.
 	 */
 
-	if (hpfs_chk_name((char *)b->name, &bl)) return 1;
-	if (hpfs_compare_names(dentry->d_sb, (char *)a->name, al, (char *)b->name, bl, 0)) return 1;
+	if (hpfs_chk_name(b->name, &bl))
+		return 1;
+	if (hpfs_compare_names(dentry->d_sb, a->name, al, b->name, bl, 0))
+		return 1;
 	return 0;
 }
 
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 8865c94f55f6..26e3964a4b8c 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -59,7 +59,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct hpfs_dirent *de;
 	int lc;
 	long old_pos;
-	char *tempname;
+	unsigned char *tempname;
 	int c1, c2 = 0;
 	int ret = 0;
 
@@ -158,11 +158,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
 		if (filldir(dirent, tempname, de->namelen, old_pos, de->fnode, DT_UNKNOWN) < 0) {
 			filp->f_pos = old_pos;
-			if (tempname != (char *)de->name) kfree(tempname);
+			if (tempname != de->name) kfree(tempname);
 			hpfs_brelse4(&qbh);
 			goto out;
 		}
-		if (tempname != (char *)de->name) kfree(tempname);
+		if (tempname != de->name) kfree(tempname);
 		hpfs_brelse4(&qbh);
 	}
 out:
@@ -187,7 +187,7 @@ out:
 
 struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -197,7 +197,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	struct hpfs_inode_info *hpfs_result;
 
 	lock_kernel();
-	if ((err = hpfs_chk_name((char *)name, &len))) {
+	if ((err = hpfs_chk_name(name, &len))) {
 		if (err == -ENAMETOOLONG) {
 			unlock_kernel();
 			return ERR_PTR(-ENAMETOOLONG);
@@ -209,7 +209,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	 * '.' and '..' will never be passed here.
 	 */
 
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *) name, len, NULL, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, NULL, &qbh);
 
 	/*
 	 * This is not really a bailout, just means file not found.
@@ -250,7 +250,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	hpfs_result = hpfs_i(result);
 	if (!de->directory) hpfs_result->i_parent_dir = dir->i_ino;
 
-	hpfs_decide_conv(result, (char *)name, len);
+	hpfs_decide_conv(result, name, len);
 
 	if (de->has_acl || de->has_xtd_perm) if (!(dir->i_sb->s_flags & MS_RDONLY)) {
 		hpfs_error(result->i_sb, "ACLs or XPERM found. This is probably HPFS386. This driver doesn't support it now. Send me some info on these structures");
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index fe83c2b7d2d8..9b2ffadfc8c4 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -158,7 +158,8 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
 
 /* Add an entry to dnode and don't care if it grows over 2048 bytes */
 
-struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d, unsigned char *name,
+struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
+				const unsigned char *name,
 				unsigned namelen, secno down_ptr)
 {
 	struct hpfs_dirent *de;
@@ -223,7 +224,7 @@ static void fix_up_ptrs(struct super_block *s, struct dnode *d)
 /* Add an entry to dnode and do dnode splitting if required */
 
 static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
-			     unsigned char *name, unsigned namelen,
+			     const unsigned char *name, unsigned namelen,
 			     struct hpfs_dirent *new_de, dnode_secno down_ptr)
 {
 	struct quad_buffer_head qbh, qbh1, qbh2;
@@ -231,7 +232,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
 	dnode_secno adno, rdno;
 	struct hpfs_dirent *de;
 	struct hpfs_dirent nde;
-	char *nname;
+	unsigned char *nname;
 	int h;
 	int pos;
 	struct buffer_head *bh;
@@ -305,7 +306,9 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
 		pos++;
 	}
 	copy_de(new_de = &nde, de);
-	memcpy(name = nname, de->name, namelen = de->namelen);
+	memcpy(nname, de->name, de->namelen);
+	name = nname;
+	namelen = de->namelen;
 	for_all_poss(i, hpfs_pos_subst, ((loff_t)dno << 4) | pos, 4);
 	down_ptr = adno;
 	set_last_pointer(i->i_sb, ad, de->down ? de_down_pointer(de) : 0);
@@ -368,7 +371,8 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
  * I hope, now it's finally bug-free.
  */
 
-int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen,
+int hpfs_add_dirent(struct inode *i,
+		    const unsigned char *name, unsigned namelen,
 		    struct hpfs_dirent *new_de, int cdepth)
 {
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
@@ -897,7 +901,8 @@ struct hpfs_dirent *map_pos_dirent(struct inode *inode, loff_t *posp,
 
 /* Find a dirent in tree */
 
-struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, unsigned len,
+struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno,
+			       const unsigned char *name, unsigned len,
 			       dnode_secno *dd, struct quad_buffer_head *qbh)
 {
 	struct dnode *dnode;
@@ -988,8 +993,8 @@ void hpfs_remove_dtree(struct super_block *s, dnode_secno dno)
 struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
 				     struct fnode *f, struct quad_buffer_head *qbh)
 {
-	char *name1;
-	char *name2;
+	unsigned char *name1;
+	unsigned char *name2;
 	int name1len, name2len;
 	struct dnode *d;
 	dnode_secno dno, downd;
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index 547a8384571f..45e53d972b42 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -62,8 +62,8 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
 	return ret;
 }
 
-static void set_indirect_ea(struct super_block *s, int ano, secno a, char *data,
-			    int size)
+static void set_indirect_ea(struct super_block *s, int ano, secno a,
+			    const char *data, int size)
 {
 	hpfs_ea_write(s, a, ano, 0, size, data);
 }
@@ -186,7 +186,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
  * This driver can't change sizes of eas ('cause I just don't need it).
  */
 
-void hpfs_set_ea(struct inode *inode, struct fnode *fnode, char *key, char *data, int size)
+void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
+		 const char *data, int size)
 {
 	fnode_secno fno = inode->i_ino;
 	struct super_block *s = inode->i_sb;
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 701ca54c0867..97bf738cd5d6 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -215,7 +215,7 @@ secno hpfs_bplus_lookup(struct super_block *, struct inode *, struct bplus_heade
 secno hpfs_add_sector_to_btree(struct super_block *, secno, int, unsigned);
 void hpfs_remove_btree(struct super_block *, struct bplus_header *);
 int hpfs_ea_read(struct super_block *, secno, int, unsigned, unsigned, char *);
-int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, char *);
+int hpfs_ea_write(struct super_block *, secno, int, unsigned, unsigned, const char *);
 void hpfs_ea_remove(struct super_block *, secno, int, unsigned);
 void hpfs_truncate_btree(struct super_block *, secno, int, unsigned);
 void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
@@ -244,13 +244,17 @@ extern const struct file_operations hpfs_dir_ops;
 
 void hpfs_add_pos(struct inode *, loff_t *);
 void hpfs_del_pos(struct inode *, loff_t *);
-struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *, unsigned char *, unsigned, secno);
-int hpfs_add_dirent(struct inode *, unsigned char *, unsigned, struct hpfs_dirent *, int);
+struct hpfs_dirent *hpfs_add_de(struct super_block *, struct dnode *,
+				const unsigned char *, unsigned, secno);
+int hpfs_add_dirent(struct inode *, const unsigned char *, unsigned,
+		    struct hpfs_dirent *, int);
 int hpfs_remove_dirent(struct inode *, dnode_secno, struct hpfs_dirent *, struct quad_buffer_head *, int);
 void hpfs_count_dnodes(struct super_block *, dnode_secno, int *, int *, int *);
 dnode_secno hpfs_de_as_down_as_possible(struct super_block *, dnode_secno dno);
 struct hpfs_dirent *map_pos_dirent(struct inode *, loff_t *, struct quad_buffer_head *);
-struct hpfs_dirent *map_dirent(struct inode *, dnode_secno, char *, unsigned, dnode_secno *, struct quad_buffer_head *);
+struct hpfs_dirent *map_dirent(struct inode *, dnode_secno,
+			       const unsigned char *, unsigned, dnode_secno *,
+			       struct quad_buffer_head *);
 void hpfs_remove_dtree(struct super_block *, dnode_secno);
 struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct fnode *, struct quad_buffer_head *);
 
@@ -259,7 +263,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *, fnode_secno, struct f
 void hpfs_ea_ext_remove(struct super_block *, secno, int, unsigned);
 int hpfs_read_ea(struct super_block *, struct fnode *, char *, char *, int);
 char *hpfs_get_ea(struct super_block *, struct fnode *, char *, int *);
-void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int);
+void hpfs_set_ea(struct inode *, struct fnode *, const char *,
+		 const char *, int);
 
 /* file.c */
 
@@ -282,7 +287,7 @@ void hpfs_delete_inode(struct inode *);
 
 unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
 unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
-char *hpfs_load_code_page(struct super_block *, secno);
+unsigned char *hpfs_load_code_page(struct super_block *, secno);
 secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
 struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
 struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
@@ -292,12 +297,13 @@ dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino);
 /* name.c */
 
 unsigned char hpfs_upcase(unsigned char *, unsigned char);
-int hpfs_chk_name(unsigned char *, unsigned *);
-char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
-int hpfs_compare_names(struct super_block *, unsigned char *, unsigned, unsigned char *, unsigned, int);
-int hpfs_is_name_long(unsigned char *, unsigned);
-void hpfs_adjust_length(unsigned char *, unsigned *);
-void hpfs_decide_conv(struct inode *, unsigned char *, unsigned);
+int hpfs_chk_name(const unsigned char *, unsigned *);
+unsigned char *hpfs_translate_name(struct super_block *, unsigned char *, unsigned, int, int);
+int hpfs_compare_names(struct super_block *, const unsigned char *, unsigned,
+		       const unsigned char *, unsigned, int);
+int hpfs_is_name_long(const unsigned char *, unsigned);
+void hpfs_adjust_length(const unsigned char *, unsigned *);
+void hpfs_decide_conv(struct inode *, const unsigned char *, unsigned);
 
 /* namei.c */
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index fe703ae46bc7..ff90affb94e1 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -46,7 +46,7 @@ void hpfs_read_inode(struct inode *i)
 	struct fnode *fnode;
 	struct super_block *sb = i->i_sb;
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
-	unsigned char *ea;
+	void *ea;
 	int ea_size;
 
 	if (!(fnode = hpfs_map_fnode(sb, i->i_ino, &bh))) {
@@ -112,7 +112,7 @@ void hpfs_read_inode(struct inode *i)
 		}
 	}
 	if (fnode->dirflag) {
-		unsigned n_dnodes, n_subdirs;
+		int n_dnodes, n_subdirs;
 		i->i_mode |= S_IFDIR;
 		i->i_op = &hpfs_dir_iops;
 		i->i_fop = &hpfs_dir_ops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index c4724589b2eb..840d033ecee8 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -35,7 +35,7 @@ unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
  * lowercasing table
  */
 
-char *hpfs_load_code_page(struct super_block *s, secno cps)
+unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
 {
 	struct buffer_head *bh;
 	secno cpds;
@@ -71,7 +71,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps)
 		brelse(bh);
 		return NULL;
 	}
-	ptr = (char *)cpd + cpd->offs[cpi] + 6;
+	ptr = (unsigned char *)cpd + cpd->offs[cpi] + 6;
 	if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
 		printk("HPFS: out of memory for code page table\n");
 		brelse(bh);
@@ -217,7 +217,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
 	if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD)))
 		if (hpfs_sb(s)->sb_chk) {
 			unsigned p, pp = 0;
-			unsigned char *d = (char *)dnode;
+			unsigned char *d = (unsigned char *)dnode;
 			int b = 0;
 			if (dnode->magic != DNODE_MAGIC) {
 				hpfs_error(s, "bad magic on dnode %08x", secno);
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index 1f4a964384eb..f24736d7a439 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -8,16 +8,16 @@
 
 #include "hpfs_fn.h"
 
-static char *text_postfix[]={
+static const char *text_postfix[]={
 ".ASM", ".BAS", ".BAT", ".C", ".CC", ".CFG", ".CMD", ".CON", ".CPP", ".DEF",
 ".DOC", ".DPR", ".ERX", ".H", ".HPP", ".HTM", ".HTML", ".JAVA", ".LOG", ".PAS",
 ".RC", ".TEX", ".TXT", ".Y", ""};
 
-static char *text_prefix[]={
+static const char *text_prefix[]={
 "AUTOEXEC.", "CHANGES", "COPYING", "CONFIG.", "CREDITS", "FAQ", "FILE_ID.DIZ",
 "MAKEFILE", "READ.ME", "README", "TERMCAP", ""};
 
-void hpfs_decide_conv(struct inode *inode, unsigned char *name, unsigned len)
+void hpfs_decide_conv(struct inode *inode, const unsigned char *name, unsigned len)
 {
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
 	int i;
@@ -71,7 +71,7 @@ static inline unsigned char locase(unsigned char *dir, unsigned char a)
 	return dir[a];
 }
 
-int hpfs_chk_name(unsigned char *name, unsigned *len)
+int hpfs_chk_name(const unsigned char *name, unsigned *len)
 {
 	int i;
 	if (*len > 254) return -ENAMETOOLONG;
@@ -83,10 +83,10 @@ int hpfs_chk_name(unsigned char *name, unsigned *len)
 	return 0;
 }
 
-char *hpfs_translate_name(struct super_block *s, unsigned char *from,
+unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
 			  unsigned len, int lc, int lng)
 {
-	char *to;
+	unsigned char *to;
 	int i;
 	if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
 		printk("HPFS: Long name flag mismatch - name ");
@@ -103,8 +103,9 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from,
 	return to;
 }
 
-int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
-		       unsigned char *n2, unsigned l2, int last)
+int hpfs_compare_names(struct super_block *s,
+		       const unsigned char *n1, unsigned l1,
+		       const unsigned char *n2, unsigned l2, int last)
 {
 	unsigned l = l1 < l2 ? l1 : l2;
 	unsigned i;
@@ -120,7 +121,7 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1,
 	return 0;
 }
 
-int hpfs_is_name_long(unsigned char *name, unsigned len)
+int hpfs_is_name_long(const unsigned char *name, unsigned len)
 {
 	int i,j;
 	for (i = 0; i < len && name[i] != '.'; i++)
@@ -134,7 +135,7 @@ int hpfs_is_name_long(unsigned char *name, unsigned len)
 
 /* OS/2 clears dots and spaces at the end of file name, so we have to */
 
-void hpfs_adjust_length(unsigned char *name, unsigned *len)
+void hpfs_adjust_length(const unsigned char *name, unsigned *len)
 {
 	if (!*len) return;
 	if (*len == 1 && name[0] == '.') return;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 82b9c4ba9ed0..15fd2c06f4a7 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -11,7 +11,7 @@
 
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh0;
 	struct buffer_head *bh;
@@ -24,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	int r;
 	struct hpfs_dirent dee;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	err = -ENOSPC;
 	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -62,7 +62,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		result->i_mode &= ~0222;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail3;
 	if (r == -1) {
@@ -121,7 +121,7 @@ bail:
 
 static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct inode *result = NULL;
 	struct buffer_head *bh;
@@ -130,7 +130,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	int r;
 	struct hpfs_dirent dee;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len)))
+	if ((err = hpfs_chk_name(name, &len)))
 		return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	err = -ENOSPC;
@@ -155,7 +155,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	result->i_op = &hpfs_file_iops;
 	result->i_fop = &hpfs_file_ops;
 	result->i_nlink = 1;
-	hpfs_decide_conv(result, (char *)name, len);
+	hpfs_decide_conv(result, name, len);
 	hpfs_i(result)->i_parent_dir = dir->i_ino;
 	result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, dee.creation_date);
 	result->i_ctime.tv_nsec = 0;
@@ -170,7 +170,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	hpfs_i(result)->mmu_private = 0;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -211,7 +211,7 @@ bail:
 
 static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct buffer_head *bh;
 	struct fnode *fnode;
@@ -220,7 +220,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	struct hpfs_dirent dee;
 	struct inode *result = NULL;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -256,7 +256,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	init_special_inode(result, mode, rdev);
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -289,7 +289,7 @@ bail:
 
 static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct buffer_head *bh;
 	struct fnode *fnode;
@@ -298,7 +298,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	struct hpfs_dirent dee;
 	struct inode *result;
 	int err;
-	if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err;
+	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
 	lock_kernel();
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
 		unlock_kernel();
@@ -335,7 +335,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	result->i_data.a_ops = &hpfs_symlink_aops;
 
 	mutex_lock(&hpfs_i(dir)->i_mutex);
-	r = hpfs_add_dirent(dir, (char *)name, len, &dee, 0);
+	r = hpfs_add_dirent(dir, name, len, &dee, 0);
 	if (r == 1)
 		goto bail2;
 	if (r == -1) {
@@ -345,7 +345,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	fnode->len = len;
 	memcpy(fnode->name, name, len > 15 ? 15 : len);
 	fnode->up = dir->i_ino;
-	hpfs_set_ea(result, fnode, "SYMLINK", (char *)symlink, strlen(symlink));
+	hpfs_set_ea(result, fnode, "SYMLINK", symlink, strlen(symlink));
 	mark_buffer_dirty(bh);
 	brelse(bh);
 
@@ -369,7 +369,7 @@ bail:
 
 static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -381,12 +381,12 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 	int err;
 
 	lock_kernel();
-	hpfs_adjust_length((char *)name, &len);
+	hpfs_adjust_length(name, &len);
 again:
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
 	if (!de)
 		goto out;
 
@@ -451,7 +451,7 @@ out:
 
 static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	unsigned len = dentry->d_name.len;
 	struct quad_buffer_head qbh;
 	struct hpfs_dirent *de;
@@ -462,12 +462,12 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 	int r;
 
-	hpfs_adjust_length((char *)name, &len);
+	hpfs_adjust_length(name, &len);
 	lock_kernel();
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
-	de = map_dirent(dir, hpfs_i(dir)->i_dno, (char *)name, len, &dno, &qbh);
+	de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
 	if (!de)
 		goto out;
 
@@ -546,10 +546,10 @@ const struct address_space_operations hpfs_symlink_aops = {
 static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
 {
-	char *old_name = (char *)old_dentry->d_name.name;
-	int old_len = old_dentry->d_name.len;
-	char *new_name = (char *)new_dentry->d_name.name;
-	int new_len = new_dentry->d_name.len;
+	const unsigned char *old_name = old_dentry->d_name.name;
+	unsigned old_len = old_dentry->d_name.len;
+	const unsigned char *new_name = new_dentry->d_name.name;
+	unsigned new_len = new_dentry->d_name.len;
 	struct inode *i = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct quad_buffer_head qbh, qbh1;
@@ -560,9 +560,9 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh;
 	struct fnode *fnode;
 	int err;
-	if ((err = hpfs_chk_name((char *)new_name, &new_len))) return err;
+	if ((err = hpfs_chk_name(new_name, &new_len))) return err;
 	err = 0;
-	hpfs_adjust_length((char *)old_name, &old_len);
+	hpfs_adjust_length(old_name, &old_len);
 
 	lock_kernel();
 	/* order doesn't matter, due to VFS exclusion */
@@ -579,7 +579,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto end1;
 	}
 
-	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) {
+	if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
 		hpfs_error(i->i_sb, "lookup succeeded but map dirent failed");
 		err = -ENOENT;
 		goto end1;
@@ -590,7 +590,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_inode) {
 		int r;
 		if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
-			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) {
+			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, new_name, new_len, NULL, &qbh1))) {
 				clear_nlink(new_inode);
 				copy_de(nde, &de);
 				memcpy(nde->name, new_name, new_len);
@@ -618,7 +618,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 	
 	if (new_dir == old_dir)
-		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, (char *)old_name, old_len, &dno, &qbh))) {
+		if (!(dep = map_dirent(old_dir, hpfs_i(old_dir)->i_dno, old_name, old_len, &dno, &qbh))) {
 			hpfs_unlock_creation(i->i_sb);
 			hpfs_error(i->i_sb, "lookup succeeded but map dirent failed at #2");
 			err = -ENOENT;
@@ -648,7 +648,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		brelse(bh);
 	}
 	hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv;
-	hpfs_decide_conv(i, (char *)new_name, new_len);
+	hpfs_decide_conv(i, new_name, new_len);
 end1:
 	if (old_dir != new_dir)
 		mutex_unlock(&hpfs_i(new_dir)->i_mutex);
-- 
cgit v1.2.3


From 89031bc79782a93fc65adabd0e123c89645bee6e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 31 Jan 2010 20:49:54 -0500
Subject: sanitize const/signedness of ufs a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ufs/dir.c | 10 +++++-----
 fs/ufs/ufs.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 22af68f8b682..317a0d444f6b 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -31,7 +31,7 @@
  * len <= UFS_MAXNAMLEN and de != NULL are guaranteed by caller.
  */
 static inline int ufs_match(struct super_block *sb, int len,
-		const char * const name, struct ufs_dir_entry * de)
+		const unsigned char *name, struct ufs_dir_entry *de)
 {
 	if (len != ufs_get_de_namlen(sb, de))
 		return 0;
@@ -70,7 +70,7 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
 	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
 }
 
-ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr)
+ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 {
 	ino_t res = 0;
 	struct ufs_dir_entry *de;
@@ -249,11 +249,11 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
  * (as a parameter - res_dir). Page is returned mapped and unlocked.
  * Entry is guaranteed to be valid.
  */
-struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr,
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
 				     struct page **res_page)
 {
 	struct super_block *sb = dir->i_sb;
-	const char *name = qstr->name;
+	const unsigned char *name = qstr->name;
 	int namelen = qstr->len;
 	unsigned reclen = UFS_DIR_REC_LEN(namelen);
 	unsigned long start, n;
@@ -313,7 +313,7 @@ found:
 int ufs_add_link(struct dentry *dentry, struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
 	struct super_block *sb = dir->i_sb;
 	unsigned reclen = UFS_DIR_REC_LEN(namelen);
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 0b4c39bc0d9e..01d0e2a3b230 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
 /* dir.c */
 extern const struct inode_operations ufs_dir_inode_operations;
 extern int ufs_add_link (struct dentry *, struct inode *);
-extern ino_t ufs_inode_by_name(struct inode *, struct qstr *);
+extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *);
 extern int ufs_make_empty(struct inode *, struct inode *);
-extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **);
+extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **);
 extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
 extern int ufs_empty_dir (struct inode *);
 extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
-- 
cgit v1.2.3


From 0319003d0d229735770c185ddf132c666e9cd01a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 31 Jan 2010 21:02:09 -0500
Subject: nilfs really shouldn't slap struct dentry on stack...

... especially when it only needs (and initializes) .d_name of it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/dir.c   | 10 +++++-----
 fs/nilfs2/namei.c | 13 +++++--------
 fs/nilfs2/nilfs.h |  4 ++--
 3 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 76d803e060a9..26402b9b305e 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -349,11 +349,11 @@ done:
  * Entry is guaranteed to be valid.
  */
 struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *dir, struct dentry *dentry,
+nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
 		 struct page **res_page)
 {
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const unsigned char *name = qstr->name;
+	int namelen = qstr->len;
 	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
 	unsigned long start, n;
 	unsigned long npages = dir_pages(dir);
@@ -424,13 +424,13 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
 	return de;
 }
 
-ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
+ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 {
 	ino_t res = 0;
 	struct nilfs_dir_entry *de;
 	struct page *page;
 
-	de = nilfs_find_entry(dir, dentry, &page);
+	de = nilfs_find_entry(dir, qstr, &page);
 	if (de) {
 		res = le64_to_cpu(de->inode);
 		kunmap(page);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 07ba838ef089..ad6ed2cf19b4 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -67,7 +67,7 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	if (dentry->d_name.len > NILFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	ino = nilfs_inode_by_name(dir, dentry);
+	ino = nilfs_inode_by_name(dir, &dentry->d_name);
 	inode = NULL;
 	if (ino) {
 		inode = nilfs_iget(dir->i_sb, ino);
@@ -81,10 +81,7 @@ struct dentry *nilfs_get_parent(struct dentry *child)
 {
 	unsigned long ino;
 	struct inode *inode;
-	struct dentry dotdot;
-
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
+	struct qstr dotdot = {.name = "..", .len = 2};
 
 	ino = nilfs_inode_by_name(child->d_inode, &dotdot);
 	if (!ino)
@@ -296,7 +293,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
 	int err;
 
 	err = -ENOENT;
-	de = nilfs_find_entry(dir, dentry, &page);
+	de = nilfs_find_entry(dir, &dentry->d_name, &page);
 	if (!de)
 		goto out;
 
@@ -389,7 +386,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		return err;
 
 	err = -ENOENT;
-	old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
+	old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
 
@@ -409,7 +406,7 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out_dir;
 
 		err = -ENOENT;
-		new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
+		new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
 		inc_nlink(old_inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 4da6f67e9a91..8723e5bfd071 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -217,10 +217,10 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
 
 /* dir.c */
 extern int nilfs_add_link(struct dentry *, struct inode *);
-extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
+extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
 extern int nilfs_make_empty(struct inode *, struct inode *);
 extern struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *, struct dentry *, struct page **);
+nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
 extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
 extern int nilfs_empty_dir(struct inode *);
 extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
-- 
cgit v1.2.3


From 072f98b4637eddcbdf2178fc84f382e2ee522f08 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 31 Jan 2010 21:03:58 -0500
Subject: nilfs: sanitize const/signedness in dealing with ->d_name.name

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 26402b9b305e..0092840492ee 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -224,7 +224,7 @@ fail:
  * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
  */
 static int
-nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
+nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
 {
 	if (len != de->name_len)
 		return 0;
@@ -465,7 +465,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	const char *name = dentry->d_name.name;
+	const unsigned char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
 	unsigned chunk_size = nilfs_chunk_size(dir);
 	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
-- 
cgit v1.2.3


From 391e8bbd38474b9f85b1f3933394a79ea66fe1e2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 31 Jan 2010 21:28:48 -0500
Subject: sanitize const/signedness for udf

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/udf/balloc.c  |  2 +-
 fs/udf/dir.c     |  4 ++--
 fs/udf/inode.c   |  2 +-
 fs/udf/namei.c   | 20 ++++++++++----------
 fs/udf/symlink.c | 10 +++++-----
 5 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 82372e332f08..b2d96f45c12b 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -547,7 +547,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 		}
 
 		if (epos.offset + (2 * adsize) > sb->s_blocksize) {
-			char *sptr, *dptr;
+			unsigned char *sptr, *dptr;
 			int loffset;
 
 			brelse(oepos.bh);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 61d9a76a3a69..f0f2a436251e 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,8 +45,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
 	int block, iblock;
 	loff_t nf_pos = (filp->f_pos - 1) << 2;
 	int flen;
-	char *fname = NULL;
-	char *nameptr;
+	unsigned char *fname = NULL;
+	unsigned char *nameptr;
 	uint16_t liu;
 	uint8_t lfi;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index f90231eb2916..378a7592257c 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1672,7 +1672,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		return -1;
 
 	if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
-		char *sptr, *dptr;
+		unsigned char *sptr, *dptr;
 		struct buffer_head *nbh;
 		int err, loffset;
 		struct kernel_lb_addr obloc = epos->block;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index cd2115060fdc..7c56ff00cd53 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -34,8 +34,8 @@
 #include <linux/crc-itu-t.h>
 #include <linux/exportfs.h>
 
-static inline int udf_match(int len1, const char *name1, int len2,
-			    const char *name2)
+static inline int udf_match(int len1, const unsigned char *name1, int len2,
+			    const unsigned char *name2)
 {
 	if (len1 != len2)
 		return 0;
@@ -142,15 +142,15 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 }
 
 static struct fileIdentDesc *udf_find_entry(struct inode *dir,
-					    struct qstr *child,
+					    const struct qstr *child,
 					    struct udf_fileident_bh *fibh,
 					    struct fileIdentDesc *cfi)
 {
 	struct fileIdentDesc *fi = NULL;
 	loff_t f_pos;
 	int block, flen;
-	char *fname = NULL;
-	char *nameptr;
+	unsigned char *fname = NULL;
+	unsigned char *nameptr;
 	uint8_t lfi;
 	uint16_t liu;
 	loff_t size;
@@ -308,7 +308,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
 {
 	struct super_block *sb = dir->i_sb;
 	struct fileIdentDesc *fi = NULL;
-	char *name = NULL;
+	unsigned char *name = NULL;
 	int namelen;
 	loff_t f_pos;
 	loff_t size = udf_ext0_offset(dir) + dir->i_size;
@@ -885,16 +885,16 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 {
 	struct inode *inode;
 	struct pathComponent *pc;
-	char *compstart;
+	const char *compstart;
 	struct udf_fileident_bh fibh;
 	struct extent_position epos = {};
 	int eoffset, elen = 0;
 	struct fileIdentDesc *fi;
 	struct fileIdentDesc cfi;
-	char *ea;
+	uint8_t *ea;
 	int err;
 	int block;
-	char *name = NULL;
+	unsigned char *name = NULL;
 	int namelen;
 	struct buffer_head *bh;
 	struct udf_inode_info *iinfo;
@@ -970,7 +970,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 
 		pc = (struct pathComponent *)(ea + elen);
 
-		compstart = (char *)symname;
+		compstart = symname;
 
 		do {
 			symname++;
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index c3265e1385d4..852e91845688 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -32,12 +32,12 @@
 #include <linux/buffer_head.h>
 #include "udf_i.h"
 
-static void udf_pc_to_char(struct super_block *sb, char *from, int fromlen,
-			   char *to)
+static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
+			   int fromlen, unsigned char *to)
 {
 	struct pathComponent *pc;
 	int elen = 0;
-	char *p = to;
+	unsigned char *p = to;
 
 	while (elen < fromlen) {
 		pc = (struct pathComponent *)(from + elen);
@@ -75,9 +75,9 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *bh = NULL;
-	char *symlink;
+	unsigned char *symlink;
 	int err = -EIO;
-	char *p = kmap(page);
+	unsigned char *p = kmap(page);
 	struct udf_inode_info *iinfo;
 
 	lock_kernel();
-- 
cgit v1.2.3


From e21e7095a78867364d7aa9223d833ccb966f93f3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 1 Feb 2010 11:05:16 -0500
Subject: Don't mess with generic_permission() under ->d_lock in hpfs

Just use dentry_unhash() there

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hpfs/namei.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 15fd2c06f4a7..11c2b4080f65 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -413,22 +413,25 @@ again:
 
 		mutex_unlock(&hpfs_i(dir)->i_mutex);
 		mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
-		d_drop(dentry);
-		spin_lock(&dentry->d_lock);
-		if (atomic_read(&dentry->d_count) > 1 ||
-		    generic_permission(inode, MAY_WRITE, NULL) ||
+		dentry_unhash(dentry);
+		if (!d_unhashed(dentry)) {
+			dput(dentry);
+			unlock_kernel();
+			return -ENOSPC;
+		}
+		if (generic_permission(inode, MAY_WRITE, NULL) ||
 		    !S_ISREG(inode->i_mode) ||
 		    get_write_access(inode)) {
-			spin_unlock(&dentry->d_lock);
 			d_rehash(dentry);
+			dput(dentry);
 		} else {
 			struct iattr newattrs;
-			spin_unlock(&dentry->d_lock);
 			/*printk("HPFS: truncating file before delete.\n");*/
 			newattrs.ia_size = 0;
 			newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 			err = notify_change(dentry, &newattrs);
 			put_write_access(inode);
+			dput(dentry);
 			if (!err)
 				goto again;
 		}
-- 
cgit v1.2.3


From 9f5596af44514f99e3a654a4f7cb813354b9e516 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Feb 2010 00:40:25 -0500
Subject: take check for new events in namespace (guts of mounts_poll()) to
 namespace.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c                | 15 +++++++++++++++
 fs/proc/base.c                | 10 ++--------
 include/linux/mnt_namespace.h |  1 +
 3 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index d5906c19e08e..970fe79d7867 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -737,6 +737,21 @@ static void m_stop(struct seq_file *m, void *v)
 	up_read(&namespace_sem);
 }
 
+int mnt_had_events(struct proc_mounts *p)
+{
+	struct mnt_namespace *ns = p->ns;
+	int res = 0;
+
+	spin_lock(&vfsmount_lock);
+	if (p->event != ns->event) {
+		p->event = ns->event;
+		res = 1;
+	}
+	spin_unlock(&vfsmount_lock);
+
+	return res;
+}
+
 struct proc_fs_info {
 	int flag;
 	const char *str;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 58324c299165..746895ddfda1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -647,17 +647,11 @@ static int mounts_release(struct inode *inode, struct file *file)
 static unsigned mounts_poll(struct file *file, poll_table *wait)
 {
 	struct proc_mounts *p = file->private_data;
-	struct mnt_namespace *ns = p->ns;
 	unsigned res = POLLIN | POLLRDNORM;
 
-	poll_wait(file, &ns->poll, wait);
-
-	spin_lock(&vfsmount_lock);
-	if (p->event != ns->event) {
-		p->event = ns->event;
+	poll_wait(file, &p->ns->poll, wait);
+	if (mnt_had_events(p))
 		res |= POLLERR | POLLPRI;
-	}
-	spin_unlock(&vfsmount_lock);
 
 	return res;
 }
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index d74785c2393a..0b89efc6f215 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -35,6 +35,7 @@ static inline void get_mnt_ns(struct mnt_namespace *ns)
 extern const struct seq_operations mounts_op;
 extern const struct seq_operations mountinfo_op;
 extern const struct seq_operations mountstats_op;
+extern int mnt_had_events(struct proc_mounts *);
 
 #endif
 #endif
-- 
cgit v1.2.3


From 47cd813f2984569570021ce3d34cdf9cb20aa6a2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Feb 2010 02:01:14 -0500
Subject: Take vfsmount_lock to fs/internal.h

no more users left outside of fs/*.c (and very few outside of
fs/namespace.c, actually)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h         | 2 ++
 include/linux/mount.h | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index e96a1667d749..8a03a5447bdf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -70,6 +70,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
 
+extern spinlock_t vfsmount_lock;
+
 /*
  * fs_struct.c
  */
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 375d43a5d802..163896137ab5 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -132,7 +132,6 @@ extern int do_add_mount(struct vfsmount *newmnt, struct path *path,
 
 extern void mark_mounts_for_expiry(struct list_head *mounts);
 
-extern spinlock_t vfsmount_lock;
 extern dev_t name_to_dev_t(char *name);
 
 #endif /* _LINUX_MOUNT_H */
-- 
cgit v1.2.3


From d498b25a4f877be535246c4e5b6ee5890781a477 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Feb 2010 02:21:06 -0500
Subject: get rid of useless vfsmount_lock use in put_mnt_ns()

It hadn't been needed since we'd sanitized the logics in
mark_mounts_for_expiry() (which, in turn, used to be a
rudiment of bad old times when namespace_sem was per-ns).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 970fe79d7867..b0b15cc2117c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2344,17 +2344,13 @@ void __init mnt_init(void)
 
 void put_mnt_ns(struct mnt_namespace *ns)
 {
-	struct vfsmount *root;
 	LIST_HEAD(umount_list);
 
-	if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock))
+	if (!atomic_dec_and_test(&ns->count))
 		return;
-	root = ns->root;
-	ns->root = NULL;
-	spin_unlock(&vfsmount_lock);
 	down_write(&namespace_sem);
 	spin_lock(&vfsmount_lock);
-	umount_tree(root, 0, &umount_list);
+	umount_tree(ns->root, 0, &umount_list);
 	spin_unlock(&vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
-- 
cgit v1.2.3


From 8089352a13b785d4e0df63d87bd2b71c76bb9aee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Feb 2010 09:30:46 -0500
Subject: Mirror MS_KERNMOUNT in ->mnt_flags

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 2 +-
 fs/super.c            | 3 +++
 include/linux/mount.h | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index b0b15cc2117c..ffa3843404e0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1701,7 +1701,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 {
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD);
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
 
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
diff --git a/fs/super.c b/fs/super.c
index 903896ec7c73..f35ac6022109 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -937,6 +937,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	if (!mnt)
 		goto out;
 
+	if (flags & MS_KERNMOUNT)
+		mnt->mnt_flags = MNT_INTERNAL;
+
 	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
 		secdata = alloc_secdata();
 		if (!secdata)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 163896137ab5..ca726ebf50a3 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -45,6 +45,8 @@ struct mnt_namespace;
 #define MNT_PROPAGATION_MASK	(MNT_SHARED | MNT_UNBINDABLE)
 
 
+#define MNT_INTERNAL	0x4000
+
 struct vfsmount {
 	struct list_head mnt_hash;
 	struct vfsmount *mnt_parent;	/* fs we are mounted on */
-- 
cgit v1.2.3


From 0ceeca5a08abb1d880f0cc0ea812ad14932070e0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 5 Feb 2010 09:34:36 -0500
Subject: hppfs can use existing proc_mnt, no need for do_kern_mount() in there

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hppfs/hppfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 7239efc690d8..2e4dfa8593da 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -718,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	struct vfsmount *proc_mnt;
 	int err = -ENOENT;
 
-	proc_mnt = do_kern_mount("proc", 0, "proc", NULL);
+	proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
 	if (IS_ERR(proc_mnt))
 		goto out;
 
-- 
cgit v1.2.3


From db1f05bb85d7966b9176e293f3ceead1cb8b5d79 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 10 Feb 2010 12:15:53 +0100
Subject: vfs: add NOFOLLOW flag to umount(2)

Add a new UMOUNT_NOFOLLOW flag to umount(2).  This is needed to prevent
symlink attacks in unprivileged unmounts (fuse, samba, ncpfs).

Additionally, return -EINVAL if an unknown flag is used (and specify
an explicitly unused flag: UMOUNT_UNUSED).  This makes it possible for
the caller to determine if a flag is supported or not.

CC: Eugene Teo <eugene@redhat.com>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c     | 9 ++++++++-
 include/linux/fs.h | 2 ++
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index ffa3843404e0..8174c8ab5c70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1136,8 +1136,15 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 {
 	struct path path;
 	int retval;
+	int lookup_flags = 0;
 
-	retval = user_path(name, &path);
+	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
+		return -EINVAL;
+
+	if (!(flags & UMOUNT_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+
+	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
 	if (retval)
 		goto out;
 	retval = -EINVAL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e764f247d0ab..5b3182c7eb5f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1305,6 +1305,8 @@ extern int send_sigurg(struct fown_struct *fown);
 #define MNT_FORCE	0x00000001	/* Attempt to forcibily umount */
 #define MNT_DETACH	0x00000002	/* Just detach from the tree */
 #define MNT_EXPIRE	0x00000004	/* Mark for expiry */
+#define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
+#define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */
 
 extern struct list_head super_blocks;
 extern spinlock_t sb_lock;
-- 
cgit v1.2.3


From bec1052e5be6a70f03f6adc650f3a6e4c2f44ddf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 3 Mar 2010 14:12:08 -0500
Subject: set S_DEAD on unlink() and non-directory rename() victims

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 3df2ed50ab57..54d33df06be0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2262,8 +2262,11 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 		error = -EBUSY;
 	else {
 		error = security_inode_unlink(dir, dentry);
-		if (!error)
+		if (!error) {
 			error = dir->i_op->unlink(dir, dentry);
+			if (!error)
+				dentry->d_inode->i_flags |= S_DEAD;
+		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
@@ -2616,6 +2619,8 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 	else
 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (!error) {
+		if (target)
+			target->i_flags |= S_DEAD;
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 			d_move(old_dentry, new_dentry);
 	}
-- 
cgit v1.2.3


From 4919c5e45a91b5db5a41695fe0357fbdff0d5767 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 3 Mar 2010 14:13:08 -0500
Subject: fix race in d_splice_alias()

rehashing the negative placeholder opens a race with d_lookup();
we unhash it almost immediately (by d_move()), but the race
window is there.  Since d_move() doesn't rely on target being
hashed, we don't need that d_rehash() at all.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 74da947b160b..f1358e5c3a59 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1222,7 +1222,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(new, inode);
-			d_rehash(dentry);
 			d_move(new, dentry);
 			iput(inode);
 		} else {
-- 
cgit v1.2.3


From 9b1d0998d24f9c207d5fbdd0b8bac07284e0eda7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 3 Mar 2010 16:19:32 -0500
Subject: ext4: Release page references acquired in
 ext4_da_block_invalidatepages

We forget to release page references we acquire in
ext4_da_block_invalidatepages.  Luckily, this function gets called only if we
are not able to allocate blocks for delay-allocated data so that function
should better never be called.

Also cleanup handling of index variable.

Reported-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c717a74f2178..f55df7192b95 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2157,17 +2157,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
-			index = page->index;
-			if (index > end)
+			if (page->index > end)
 				break;
-			index++;
-
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 			block_invalidatepage(page, 0);
 			ClearPageUptodate(page);
 			unlock_page(page);
 		}
+		index = pvec.pages[nr_pages - 1]->index + 1;
+		pagevec_release(&pvec);
 	}
 	return;
 }
-- 
cgit v1.2.3


From 5661bd6861b7490394e29aaf74dca812188272e4 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 3 Mar 2010 23:53:39 -0500
Subject: ext4: cleanup to use ext4_group_first_block_no()

This is a cleanup and simplification patch which takes some open-coded
calculations to calculate the first block number of a group and
converts them to use the (already defined) ext4_group_first_block_no()
function.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger@sun.com>
---
 fs/ext4/balloc.c  |  3 +--
 fs/ext4/extents.c |  3 +--
 fs/ext4/mballoc.c | 29 +++++++++++------------------
 fs/ext4/mballoc.h |  7 +------
 4 files changed, 14 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 720061a0ee53..fadbc4d3a202 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -130,8 +130,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * to make sure we calculate the right free blocks
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
-			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
+			ext4_group_first_block_no(sb, ngroups - 1);
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4bb69206f175..3c0bae11a097 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -195,8 +195,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 		if (S_ISREG(inode->i_mode))
 			block_group++;
 	}
-	bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
-		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
 
 	/*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 72d5ceb18523..11568697b192 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -441,10 +441,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 	for (i = 0; i < count; i++) {
 		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 			ext4_fsblk_t blocknr;
-			blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+
+			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += first + i;
-			blocknr +=
-			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 			ext4_grp_locked_error(sb, e4b->bd_group,
 				   __func__, "double-free of inode"
 				   " %lu's block %llu(bit %u in group %u)",
@@ -1255,10 +1254,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 
 		if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
 			ext4_fsblk_t blocknr;
-			blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
+
+			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 			blocknr += block;
-			blocknr +=
-			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 			ext4_grp_locked_error(sb, e4b->bd_group,
 				   __func__, "double-free of inode"
 				   " %lu's block %llu(bit %u in group %u)",
@@ -1631,7 +1629,6 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 	int max;
 	int err;
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
-	struct ext4_super_block *es = sbi->s_es;
 	struct ext4_free_extent ex;
 
 	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
@@ -1648,8 +1645,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
 		ext4_fsblk_t start;
 
-		start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) +
-			ex.fe_start + le32_to_cpu(es->s_first_data_block);
+		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
+			ex.fe_start;
 		/* use do_div to get remainder (would be 64-bit modulo) */
 		if (do_div(start, sbi->s_stripe) == 0) {
 			ac->ac_found++;
@@ -1803,8 +1800,8 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	BUG_ON(sbi->s_stripe == 0);
 
 	/* find first stripe-aligned block in group */
-	first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb)
-		+ le32_to_cpu(sbi->s_es->s_first_data_block);
+	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
+
 	a = first_group_block + sbi->s_stripe - 1;
 	do_div(a, sbi->s_stripe);
 	i = (a * sbi->s_stripe) - first_group_block;
@@ -2560,12 +2557,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		ext4_unlock_group(sb, entry->group);
 		if (test_opt(sb, DISCARD)) {
 			ext4_fsblk_t discard_block;
-			struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 
-			discard_block = (ext4_fsblk_t)entry->group *
-						EXT4_BLOCKS_PER_GROUP(sb)
-					+ entry->start_blk
-					+ le32_to_cpu(es->s_first_data_block);
+			discard_block = entry->start_blk +
+				ext4_group_first_block_no(sb, entry->group);
 			trace_ext4_discard_blocks(sb,
 					(unsigned long long)discard_block,
 					entry->count);
@@ -3525,8 +3519,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 		if (bit >= end)
 			break;
 		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
-		start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
-				le32_to_cpu(sbi->s_es->s_first_data_block);
+		start = ext4_group_first_block_no(sb, group) + bit;
 		mb_debug(1, "    free preallocated %u/%u in group %u\n",
 				(unsigned) start, (unsigned) next - bit,
 				(unsigned) group);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 436521cae456..9b2deed652b7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -225,11 +225,6 @@ struct ext4_buddy {
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
-	ext4_fsblk_t block;
-
-	block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb)
-			+ fex->fe_start
-			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-	return block;
+	return ext4_group_first_block_no(sb, fex->fe_group) + fex->fe_start;
 }
 #endif
-- 
cgit v1.2.3


From bda00de7e8569b1fcde27b68fa59e74e14c5f93a Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 3 Mar 2010 23:53:25 -0500
Subject: ext4: cleanup to use ext4_grp_offs_to_block()

More cleanup to convert open-coded calculations of the first block
number of a free extent to use ext4_grp_offs_to_block() instead.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger@sun.com>
---
 fs/ext4/mballoc.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 11568697b192..37d2438e0cb4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2697,9 +2697,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (err)
 		goto out_err;
 
-	block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb)
-		+ ac->ac_b_ex.fe_start
-		+ le32_to_cpu(es->s_first_data_block);
+	block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 
 	len = ac->ac_b_ex.fe_len;
 	if (!ext4_data_block_valid(sbi, block, len)) {
@@ -3154,9 +3152,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 		/* The max size of hash table is PREALLOC_TB_SIZE */
 		order = PREALLOC_TB_SIZE - 1;
 
-	goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
-		     ac->ac_g_ex.fe_start +
-		     le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
+	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
 	/*
 	 * search for the prealloc space that is having
 	 * minimal distance from the goal block.
-- 
cgit v1.2.3


From 731eb1a03a8445cde2cb23ecfb3580c6fa7bb690 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 3 Mar 2010 23:55:01 -0500
Subject: ext4: consolidate in_range() definitions

There are duplicate macro definitions of in_range() in mballoc.h and
balloc.c.  This consolidates these two definitions into ext4.h, and
changes extents.c to use in_range() as well.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger@sun.com>
---
 fs/ext4/balloc.c  | 3 ---
 fs/ext4/ext4.h    | 2 ++
 fs/ext4/extents.c | 4 ++--
 fs/ext4/mballoc.h | 2 --
 4 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fadbc4d3a202..d2f37a5516c7 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -188,9 +188,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * when a file system is mounted (see ext4_fill_super).
  */
 
-
-#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
-
 /**
  * ext4_get_group_desc() -- load group descriptor from disk
  * @sb:			super block
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3d85bbb09f1b..9b179163f1de 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1819,6 +1819,8 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 	set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
 }
 
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3c0bae11a097..94c8ee81f5e1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2051,7 +2051,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 
 	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
 			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
-	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+	if (in_range(block, cex->ec_block, cex->ec_len)) {
 		ex->ee_block = cpu_to_le32(cex->ec_block);
 		ext4_ext_store_pblock(ex, cex->ec_start);
 		ex->ee_len = cpu_to_le16(cex->ec_len);
@@ -3364,7 +3364,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 */
 		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
-		if (iblock >= ee_block && iblock < ee_block + ee_len) {
+		if (in_range(iblock, ee_block, ee_len)) {
 			newblock = iblock - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (iblock - ee_block);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 9b2deed652b7..b619322c76f0 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -220,8 +220,6 @@ struct ext4_buddy {
 #define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
 
-#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
-
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
-- 
cgit v1.2.3


From 5fd5249aa36fad98c9fd5edced352939e54f9324 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Thu, 4 Mar 2010 00:31:06 -0500
Subject: ext4: Fix insertion point of extent in mext_insert_across_blocks()

If the leaf node has 2 extent space or fewer and EXT4_IOC_MOVE_EXT
ioctl is called with the file offset where after the 2nd extent
covers, mext_insert_across_blocks() always tries to insert extent into
the first extent.  As a result, the file gets corrupted because of
wrong extent order.  The patch fixes this problem.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 1654eb862d74..9eca1c0ec546 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 		}
 
 		o_start->ee_len = start_ext->ee_len;
+		eblock = le32_to_cpu(start_ext->ee_block);
 		new_flag = 1;
 
 	} else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 		 * orig  |------------------------------|
 		 */
 		o_start->ee_len = start_ext->ee_len;
+		eblock = le32_to_cpu(start_ext->ee_block);
 		new_flag = 1;
 
 	} else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -502,6 +504,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 		le32_to_cpu(oext->ee_block) + oext_alen) {
 		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
 					       le32_to_cpu(oext->ee_block));
+		start_ext.ee_block = oext->ee_block;
 		copy_extent_status(oext, &start_ext);
 	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
 		prev_ext = oext - 1;
@@ -515,6 +518,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 			start_ext.ee_len = cpu_to_le16(
 				ext4_ext_get_actual_len(prev_ext) +
 				new_ext_alen);
+			start_ext.ee_block = oext->ee_block;
 			copy_extent_status(prev_ext, &start_ext);
 			new_ext.ee_len = 0;
 		}
-- 
cgit v1.2.3


From 7247c0caa23d94a1cb6b307edba9dc45fb0798d4 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Thu, 4 Mar 2010 00:34:58 -0500
Subject: ext4: Fix the NULL reference in double_down_write_data_sem()

If EXT4_IOC_MOVE_EXT ioctl is called with NULL donor_fd, fget() in
ext4_ioctl() gets inappropriate file structure for donor; so we need
to do this check earlier, before calling double_down_write_data_sem().

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 9eca1c0ec546..7e99f4e72bf5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -953,14 +953,6 @@ mext_check_arguments(struct inode *orig_inode,
 	unsigned int blkbits = orig_inode->i_blkbits;
 	unsigned int blocksize = 1 << blkbits;
 
-	/* Regular file check */
-	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
-		ext4_debug("ext4 move extent: The argument files should be "
-			"regular file [ino:orig %lu, donor %lu]\n",
-			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
-	}
-
 	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
 		ext4_debug("ext4 move extent: suid or sgid is set"
 			   " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1208,6 +1200,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		return -EINVAL;
 	}
 
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+		ext4_debug("ext4 move extent: The argument files should be "
+			"regular file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
 	/* Protect orig and donor inodes against a truncate */
 	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
 	if (ret1 < 0)
-- 
cgit v1.2.3


From c437b2733520599a2c6e0dbcdeae611319f84707 Mon Sep 17 00:00:00 2001
From: Akira Fujita <a-fujita@rs.jp.nec.com>
Date: Thu, 4 Mar 2010 00:39:24 -0500
Subject: ext4: Code cleanup for EXT4_IOC_MOVE_EXT ioctl

a) Fix sparse warning in ext4_ioctl()
b) Remove unneeded variable in mext_leaf_block()
c) Fix spelling typo in mext_check_arguments()

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ioctl.c       | 3 ++-
 fs/ext4/move_extent.c | 4 +---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2220feb2dcc1..016d0249294f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,8 @@ setversion_out:
 		if (me.moved_len > 0)
 			file_remove_suid(donor_filp);
 
-		if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+		if (copy_to_user((struct move_extent __user *)arg, 
+				 &me, sizeof(me)))
 			err = -EFAULT;
 mext_out:
 		fput(donor_filp);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7e99f4e72bf5..aa5fe28d180f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -477,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
 	struct ext4_extent new_ext, start_ext, end_ext;
 	ext4_lblk_t new_ext_end;
-	ext4_fsblk_t new_phys_end;
 	int oext_alen, new_ext_alen, end_ext_alen;
 	int depth = ext_depth(orig_inode);
 	int ret;
@@ -491,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	new_ext.ee_len = dext->ee_len;
 	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
 	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
-	new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
 
 	/*
 	 * Case: original extent is first
@@ -932,7 +930,7 @@ out2:
 }
 
 /**
- * mext_check_argumants - Check whether move extent can be done
+ * mext_check_arguments - Check whether move extent can be done
  *
  * @orig_inode:		original inode
  * @donor_inode:	donor inode
-- 
cgit v1.2.3


From 64e290ec69be39f1887fa0b403c1e417b6b038e7 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Thu, 4 Mar 2010 22:25:21 -0500
Subject: ext4: fix up rb_root initializations to use RB_ROOT

ext4 uses rb_node = NULL; to zero rb_root at few places.  Using
RB_ROOT as the initializer is more portable in case the underlying
implementation of rbtrees changes in the future.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Eric Paris <eparis@redhat.com>
---
 fs/ext4/block_validity.c | 4 ++--
 fs/ext4/dir.c            | 2 +-
 fs/ext4/mballoc.c        | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index a60ab9aad57d..983f0e127493 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -205,14 +205,14 @@ void ext4_release_system_zone(struct super_block *sb)
 		entry = rb_entry(n, struct ext4_system_zone, node);
 		kmem_cache_free(ext4_system_zone_cachep, entry);
 		if (!parent)
-			EXT4_SB(sb)->system_blks.rb_node = NULL;
+			EXT4_SB(sb)->system_blks = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
 			parent->rb_right = NULL;
 		n = parent;
 	}
-	EXT4_SB(sb)->system_blks.rb_node = NULL;
+	EXT4_SB(sb)->system_blks = RB_ROOT;
 }
 
 /*
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 29857ddd9e26..86cb6d86a048 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -305,7 +305,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 			kfree(old);
 		}
 		if (!parent)
-			root->rb_node = NULL;
+			*root = RB_ROOT;
 		else if (parent->rb_left == n)
 			parent->rb_left = NULL;
 		else if (parent->rb_right == n)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 37d2438e0cb4..abb11e328b65 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2253,7 +2253,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
-	meta_group_info[i]->bb_free_root.rb_node = NULL;
+	meta_group_info[i]->bb_free_root = RB_ROOT;
 
 #ifdef DOUBLE_CHECK
 	{
-- 
cgit v1.2.3


From 648fa8611de3d4d43bbd64af3226679d2d0eb609 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 01:26:48 -0500
Subject: beginning to untangle do_filp_open()

That's going to be a long and painful series.  The first step:
take the stuff reachable from 'ok' label in do_filp_open() into
a new helper (finish_open()).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 106 ++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 56 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0741c69b3319..60b74b3946a1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1590,6 +1590,61 @@ static int open_will_truncate(int flag, struct inode *inode)
 	return (flag & O_TRUNC);
 }
 
+static struct file *finish_open(struct nameidata *nd,
+				int open_flag, int flag, int acc_mode)
+{
+	struct file *filp;
+	int will_truncate;
+	int error;
+
+	will_truncate = open_will_truncate(flag, nd->path.dentry->d_inode);
+	if (will_truncate) {
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto exit;
+	}
+	error = may_open(&nd->path, acc_mode, open_flag);
+	if (error) {
+		if (will_truncate)
+			mnt_drop_write(nd->path.mnt);
+		goto exit;
+	}
+	filp = nameidata_to_filp(nd);
+	if (!IS_ERR(filp)) {
+		error = ima_file_check(filp, acc_mode);
+		if (error) {
+			fput(filp);
+			filp = ERR_PTR(error);
+		}
+	}
+	if (!IS_ERR(filp)) {
+		if (acc_mode & MAY_WRITE)
+			vfs_dq_init(nd->path.dentry->d_inode);
+
+		if (will_truncate) {
+			error = handle_truncate(&nd->path);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
+	}
+	/*
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
+	 */
+	if (will_truncate)
+		mnt_drop_write(nd->path.mnt);
+	return filp;
+
+exit:
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
+	path_put(&nd->path);
+	return ERR_PTR(error);
+}
+
 /*
  * Note that the low bits of the passed in "open_flag"
  * are not the same as in the local variable "flag". See
@@ -1604,7 +1659,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	struct path path;
 	struct dentry *dir;
 	int count = 0;
-	int will_truncate;
 	int flag = open_to_namei_flags(open_flag);
 	int force_reval = 0;
 
@@ -1769,55 +1823,7 @@ do_last:
 	if (S_ISDIR(path.dentry->d_inode->i_mode))
 		goto exit;
 ok:
-	/*
-	 * Consider:
-	 * 1. may_open() truncates a file
-	 * 2. a rw->ro mount transition occurs
-	 * 3. nameidata_to_filp() fails due to
-	 *    the ro mount.
-	 * That would be inconsistent, and should
-	 * be avoided. Taking this mnt write here
-	 * ensures that (2) can not occur.
-	 */
-	will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode);
-	if (will_truncate) {
-		error = mnt_want_write(nd.path.mnt);
-		if (error)
-			goto exit;
-	}
-	error = may_open(&nd.path, acc_mode, open_flag);
-	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd.path.mnt);
-		goto exit;
-	}
-	filp = nameidata_to_filp(&nd);
-	if (!IS_ERR(filp)) {
-		error = ima_file_check(filp, acc_mode);
-		if (error) {
-			fput(filp);
-			filp = ERR_PTR(error);
-		}
-	}
-	if (!IS_ERR(filp)) {
-		if (acc_mode & MAY_WRITE)
-			vfs_dq_init(nd.path.dentry->d_inode);
-
-		if (will_truncate) {
-			error = handle_truncate(&nd.path);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-	}
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_truncate)
-		mnt_drop_write(nd.path.mnt);
+	filp = finish_open(&nd, open_flag, flag, acc_mode);
 	if (nd.root.mnt)
 		path_put(&nd.root);
 	return filp;
-- 
cgit v1.2.3


From fb1cc555d533869910e20de4b8d5147570afdfad Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 01:58:28 -0500
Subject: gut do_filp_open() a bit more (do_last separation)

Brute-force separation of stuff reachable from do_last: with
the exception of do_link:; just take all that crap to a helper
function as-is and have it tell the caller if it has to go
to do_link.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 171 +++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 103 insertions(+), 68 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 60b74b3946a1..3c39fa1608c5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1645,6 +1645,104 @@ exit:
 	return ERR_PTR(error);
 }
 
+static struct file *do_last(struct nameidata *nd, struct path *path,
+			    int open_flag, int flag, int acc_mode,
+			    int mode, const char *pathname,
+			    struct dentry *dir, int *is_link)
+{
+	struct file *filp;
+	int error;
+
+	*is_link = 0;
+
+	error = PTR_ERR(path->dentry);
+	if (IS_ERR(path->dentry)) {
+		mutex_unlock(&dir->d_inode->i_mutex);
+		goto exit;
+	}
+
+	if (IS_ERR(nd->intent.open.file)) {
+		error = PTR_ERR(nd->intent.open.file);
+		goto exit_mutex_unlock;
+	}
+
+	/* Negative dentry, just create the file */
+	if (!path->dentry->d_inode) {
+		/*
+		 * This write is needed to ensure that a
+		 * ro->rw transition does not occur between
+		 * the time when the file is created and when
+		 * a permanent write count is taken through
+		 * the 'struct file' in nameidata_to_filp().
+		 */
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto exit_mutex_unlock;
+		error = __open_namei_create(nd, path, open_flag, mode);
+		if (error) {
+			mnt_drop_write(nd->path.mnt);
+			goto exit;
+		}
+		filp = nameidata_to_filp(nd);
+		mnt_drop_write(nd->path.mnt);
+		if (nd->root.mnt)
+			path_put(&nd->root);
+		if (!IS_ERR(filp)) {
+			error = ima_file_check(filp, acc_mode);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
+		return filp;
+	}
+
+	/*
+	 * It already exists.
+	 */
+	mutex_unlock(&dir->d_inode->i_mutex);
+	audit_inode(pathname, path->dentry);
+
+	error = -EEXIST;
+	if (flag & O_EXCL)
+		goto exit_dput;
+
+	if (__follow_mount(path)) {
+		error = -ELOOP;
+		if (flag & O_NOFOLLOW)
+			goto exit_dput;
+	}
+
+	error = -ENOENT;
+	if (!path->dentry->d_inode)
+		goto exit_dput;
+	if (path->dentry->d_inode->i_op->follow_link) {
+		*is_link = 1;
+		return NULL;
+	}
+
+	path_to_nameidata(path, nd);
+	error = -EISDIR;
+	if (S_ISDIR(path->dentry->d_inode->i_mode))
+		goto exit;
+	filp = finish_open(nd, open_flag, flag, acc_mode);
+	if (nd->root.mnt)
+		path_put(&nd->root);
+	return filp;
+
+exit_mutex_unlock:
+	mutex_unlock(&dir->d_inode->i_mutex);
+exit_dput:
+	path_put_conditional(path, nd);
+exit:
+	if (!IS_ERR(nd->intent.open.file))
+		release_open_intent(nd);
+	if (nd->root.mnt)
+		path_put(&nd->root);
+	path_put(&nd->path);
+	return ERR_PTR(error);
+}
+
 /*
  * Note that the low bits of the passed in "open_flag"
  * are not the same as in the local variable "flag". See
@@ -1661,6 +1759,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
 	int force_reval = 0;
+	int is_link;
 
 	/*
 	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
@@ -1754,82 +1853,18 @@ reval:
 	path.mnt = nd.path.mnt;
 
 do_last:
-	error = PTR_ERR(path.dentry);
-	if (IS_ERR(path.dentry)) {
-		mutex_unlock(&dir->d_inode->i_mutex);
-		goto exit;
-	}
-
-	if (IS_ERR(nd.intent.open.file)) {
-		error = PTR_ERR(nd.intent.open.file);
-		goto exit_mutex_unlock;
-	}
-
-	/* Negative dentry, just create the file */
-	if (!path.dentry->d_inode) {
-		/*
-		 * This write is needed to ensure that a
-		 * ro->rw transition does not occur between
-		 * the time when the file is created and when
-		 * a permanent write count is taken through
-		 * the 'struct file' in nameidata_to_filp().
-		 */
-		error = mnt_want_write(nd.path.mnt);
-		if (error)
-			goto exit_mutex_unlock;
-		error = __open_namei_create(&nd, &path, open_flag, mode);
-		if (error) {
-			mnt_drop_write(nd.path.mnt);
-			goto exit;
-		}
-		filp = nameidata_to_filp(&nd);
-		mnt_drop_write(nd.path.mnt);
-		if (nd.root.mnt)
-			path_put(&nd.root);
-		if (!IS_ERR(filp)) {
-			error = ima_file_check(filp, acc_mode);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-		return filp;
-	}
-
-	/*
-	 * It already exists.
-	 */
-	mutex_unlock(&dir->d_inode->i_mutex);
-	audit_inode(pathname, path.dentry);
-
-	error = -EEXIST;
-	if (flag & O_EXCL)
-		goto exit_dput;
-
-	if (__follow_mount(&path)) {
-		error = -ELOOP;
-		if (flag & O_NOFOLLOW)
-			goto exit_dput;
-	}
-
-	error = -ENOENT;
-	if (!path.dentry->d_inode)
-		goto exit_dput;
-	if (path.dentry->d_inode->i_op->follow_link)
+	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
+		       pathname, dir, &is_link);
+	if (is_link)
 		goto do_link;
+	return filp;
 
-	path_to_nameidata(&path, &nd);
-	error = -EISDIR;
-	if (S_ISDIR(path.dentry->d_inode->i_mode))
-		goto exit;
 ok:
 	filp = finish_open(&nd, open_flag, flag, acc_mode);
 	if (nd.root.mnt)
 		path_put(&nd.root);
 	return filp;
 
-exit_mutex_unlock:
-	mutex_unlock(&dir->d_inode->i_mutex);
 exit_dput:
 	path_put_conditional(&path, &nd);
 exit:
-- 
cgit v1.2.3


From 3343eb8209cc69f0d2059f8c484ad7a3e1834c0b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 02:02:38 -0500
Subject: Shift releasing nd->root from do_last() to its caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 3c39fa1608c5..bff27c08134c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1685,8 +1685,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		}
 		filp = nameidata_to_filp(nd);
 		mnt_drop_write(nd->path.mnt);
-		if (nd->root.mnt)
-			path_put(&nd->root);
 		if (!IS_ERR(filp)) {
 			error = ima_file_check(filp, acc_mode);
 			if (error) {
@@ -1726,8 +1724,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(path->dentry->d_inode->i_mode))
 		goto exit;
 	filp = finish_open(nd, open_flag, flag, acc_mode);
-	if (nd->root.mnt)
-		path_put(&nd->root);
 	return filp;
 
 exit_mutex_unlock:
@@ -1737,8 +1733,6 @@ exit_dput:
 exit:
 	if (!IS_ERR(nd->intent.open.file))
 		release_open_intent(nd);
-	if (nd->root.mnt)
-		path_put(&nd->root);
 	path_put(&nd->path);
 	return ERR_PTR(error);
 }
@@ -1857,6 +1851,8 @@ do_last:
 		       pathname, dir, &is_link);
 	if (is_link)
 		goto do_link;
+	if (nd.root.mnt)
+		path_put(&nd.root);
 	return filp;
 
 ok:
-- 
cgit v1.2.3


From 27bff34300482632caf52ff589a4e7d755b32539 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 02:05:43 -0500
Subject: unroll do_last: loop in do_filp_open()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index bff27c08134c..fc6bed7215c9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1845,8 +1845,6 @@ reval:
 	mutex_lock(&dir->d_inode->i_mutex);
 	path.dentry = lookup_hash(&nd);
 	path.mnt = nd.path.mnt;
-
-do_last:
 	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
 		       pathname, dir, &is_link);
 	if (is_link)
@@ -1926,7 +1924,13 @@ do_link:
 	path.dentry = lookup_hash(&nd);
 	path.mnt = nd.path.mnt;
 	__putname(nd.last.name);
-	goto do_last;
+	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
+		       pathname, dir, &is_link);
+	if (is_link)
+		goto do_link;
+	if (nd.root.mnt)
+		path_put(&nd.root);
+	return filp;
 }
 
 /**
-- 
cgit v1.2.3


From c41c14056210e4a328659c82b1edaccb0910d18c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 02:08:19 -0500
Subject: postpone __putname() until after do_last()

Since do_last() doesn't mangle nd->last_name, we can safely postpone
__putname() done in handling of trailing symlinks until after the
call of do_last()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index fc6bed7215c9..30ba3f3a25e2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1923,9 +1923,9 @@ do_link:
 	mutex_lock(&dir->d_inode->i_mutex);
 	path.dentry = lookup_hash(&nd);
 	path.mnt = nd.path.mnt;
-	__putname(nd.last.name);
 	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
 		       pathname, dir, &is_link);
+	__putname(nd.last.name);
 	if (is_link)
 		goto do_link;
 	if (nd.root.mnt)
-- 
cgit v1.2.3


From a1e28038df98e186807ff55a49c1c26d33d530a5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 02:12:06 -0500
Subject: pull the common predecessors into do_last()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 30ba3f3a25e2..976fc323272e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1648,13 +1648,19 @@ exit:
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    int open_flag, int flag, int acc_mode,
 			    int mode, const char *pathname,
-			    struct dentry *dir, int *is_link)
+			    int *is_link)
 {
+	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
 	int error;
 
 	*is_link = 0;
 
+	mutex_lock(&dir->d_inode->i_mutex);
+
+	path->dentry = lookup_hash(nd);
+	path->mnt = nd->path.mnt;
+
 	error = PTR_ERR(path->dentry);
 	if (IS_ERR(path->dentry)) {
 		mutex_unlock(&dir->d_inode->i_mutex);
@@ -1749,7 +1755,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	struct nameidata nd;
 	int error;
 	struct path path;
-	struct dentry *dir;
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
 	int force_reval = 0;
@@ -1837,16 +1842,12 @@ reval:
 	filp->f_flags = open_flag;
 	nd.intent.open.flags = flag;
 	nd.intent.open.create_mode = mode;
-	dir = nd.path.dentry;
 	nd.flags &= ~LOOKUP_PARENT;
 	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
 	if (flag & O_EXCL)
 		nd.flags |= LOOKUP_EXCL;
-	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(&nd);
-	path.mnt = nd.path.mnt;
 	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
-		       pathname, dir, &is_link);
+		       pathname, &is_link);
 	if (is_link)
 		goto do_link;
 	if (nd.root.mnt)
@@ -1919,12 +1920,8 @@ do_link:
 		__putname(nd.last.name);
 		goto exit;
 	}
-	dir = nd.path.dentry;
-	mutex_lock(&dir->d_inode->i_mutex);
-	path.dentry = lookup_hash(&nd);
-	path.mnt = nd.path.mnt;
 	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
-		       pathname, dir, &is_link);
+		       pathname, &is_link);
 	__putname(nd.last.name);
 	if (is_link)
 		goto do_link;
-- 
cgit v1.2.3


From c99658fe970f442199733bcace1a00b087336a0d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 02:27:30 -0500
Subject: bail out with ELOOP earlier in do_link loop

If we'd passed through 32 trailing symlinks already, there's
no sense following the 33rd - we'll bail out anyway.  Better
bugger off earlier.

It *does* change behaviour, after a fashion - if the 33rd happens
to be a procfs-style symlink, original code *would* allow it.
This one will not.  Cry me a river if that hurts you.  Please, do.
And post a video of that, while you are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 976fc323272e..84f1ec3b4a5d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1873,7 +1873,7 @@ exit_parent:
 
 do_link:
 	error = -ELOOP;
-	if (flag & O_NOFOLLOW)
+	if ((flag & O_NOFOLLOW) || count++ == 32)
 		goto exit_dput;
 	/*
 	 * This is subtle. Instead of calling do_follow_link() we do the
@@ -1915,11 +1915,6 @@ do_link:
 		__putname(nd.last.name);
 		goto exit;
 	}
-	error = -ELOOP;
-	if (count++==32) {
-		__putname(nd.last.name);
-		goto exit;
-	}
 	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
 		       pathname, &is_link);
 	__putname(nd.last.name);
-- 
cgit v1.2.3


From a2c36b450ee68470836cb858c58a6ba3a52c5ec5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 03:39:50 -0500
Subject: pull more into do_last()

Handling of LAST_DOT/LAST_ROOT/LAST_DOTDOT/terminating slash
can be pulled in as well

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 84f1ec3b4a5d..52517e0bbdde 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1656,6 +1656,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	*is_link = 0;
 
+	error = -EISDIR;
+	if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
+		goto exit;
+
 	mutex_lock(&dir->d_inode->i_mutex);
 
 	path->dentry = lookup_hash(nd);
@@ -1826,13 +1830,8 @@ reval:
 		audit_inode(pathname, nd.path.dentry);
 
 	/*
-	 * We have the parent and last component. First of all, check
-	 * that we are not asked to creat(2) an obvious directory - that
-	 * will not do.
+	 * We have the parent and last component.
 	 */
-	error = -EISDIR;
-	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
-		goto exit_parent;
 
 	error = -ENFILE;
 	filp = get_empty_filp();
@@ -1908,16 +1907,10 @@ do_link:
 	nd.flags &= ~LOOKUP_PARENT;
 	if (nd.last_type == LAST_BIND)
 		goto ok;
-	error = -EISDIR;
-	if (nd.last_type != LAST_NORM)
-		goto exit;
-	if (nd.last.name[nd.last.len]) {
-		__putname(nd.last.name);
-		goto exit;
-	}
 	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
 		       pathname, &is_link);
-	__putname(nd.last.name);
+	if (nd.last_type == LAST_NORM)
+		__putname(nd.last.name);
 	if (is_link)
 		goto do_link;
 	if (nd.root.mnt)
-- 
cgit v1.2.3


From 9a66179e13504c676f891908a1e94912ec5cdefb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 06:49:47 -0500
Subject: Don't pass mangled open_flag to finish_open()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 52517e0bbdde..5b9016006913 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1591,13 +1591,13 @@ static int open_will_truncate(int flag, struct inode *inode)
 }
 
 static struct file *finish_open(struct nameidata *nd,
-				int open_flag, int flag, int acc_mode)
+				int open_flag, int acc_mode)
 {
 	struct file *filp;
 	int will_truncate;
 	int error;
 
-	will_truncate = open_will_truncate(flag, nd->path.dentry->d_inode);
+	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
@@ -1733,7 +1733,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	error = -EISDIR;
 	if (S_ISDIR(path->dentry->d_inode->i_mode))
 		goto exit;
-	filp = finish_open(nd, open_flag, flag, acc_mode);
+	filp = finish_open(nd, open_flag, acc_mode);
 	return filp;
 
 exit_mutex_unlock:
@@ -1854,7 +1854,7 @@ reval:
 	return filp;
 
 ok:
-	filp = finish_open(&nd, open_flag, flag, acc_mode);
+	filp = finish_open(&nd, open_flag, acc_mode);
 	if (nd.root.mnt)
 		path_put(&nd.root);
 	return filp;
-- 
cgit v1.2.3


From 5b369df8263fe7ab4dac2bb08b8f423dc5e33752 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 06:51:13 -0500
Subject: Get rid of passing mangled flag to do_last()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 5b9016006913..5ea7330c184b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1646,7 +1646,7 @@ exit:
 }
 
 static struct file *do_last(struct nameidata *nd, struct path *path,
-			    int open_flag, int flag, int acc_mode,
+			    int open_flag, int acc_mode,
 			    int mode, const char *pathname,
 			    int *is_link)
 {
@@ -1712,12 +1712,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		goto exit_dput;
 
 	if (__follow_mount(path)) {
 		error = -ELOOP;
-		if (flag & O_NOFOLLOW)
+		if (open_flag & O_NOFOLLOW)
 			goto exit_dput;
 	}
 
@@ -1845,7 +1845,7 @@ reval:
 	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
 	if (flag & O_EXCL)
 		nd.flags |= LOOKUP_EXCL;
-	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode,
 		       pathname, &is_link);
 	if (is_link)
 		goto do_link;
@@ -1907,7 +1907,7 @@ do_link:
 	nd.flags &= ~LOOKUP_PARENT;
 	if (nd.last_type == LAST_BIND)
 		goto ok;
-	filp = do_last(&nd, &path, open_flag, flag, acc_mode, mode,
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode,
 		       pathname, &is_link);
 	if (nd.last_type == LAST_NORM)
 		__putname(nd.last.name);
-- 
cgit v1.2.3


From 4296e2cbf2138b5831b83f03e81de916ce1a967d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Dec 2009 07:15:41 -0500
Subject: Leave mangled flag only for setting nd.intent.open.flag

Nothing else uses it anymore

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 5ea7330c184b..f5e4397dcd7e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1777,18 +1777,18 @@ struct file *do_filp_open(int dfd, const char *pathname,
 		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
 
 	/* O_TRUNC implies we need access checks for write permissions */
-	if (flag & O_TRUNC)
+	if (open_flag & O_TRUNC)
 		acc_mode |= MAY_WRITE;
 
 	/* Allow the LSM permission hook to distinguish append 
 	   access from general write access. */
-	if (flag & O_APPEND)
+	if (open_flag & O_APPEND)
 		acc_mode |= MAY_APPEND;
 
 	/*
 	 * The simplest case - just a plain lookup.
 	 */
-	if (!(flag & O_CREAT)) {
+	if (!(open_flag & O_CREAT)) {
 		filp = get_empty_filp();
 
 		if (filp == NULL)
@@ -1798,7 +1798,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 		nd.intent.open.flags = flag;
 		nd.intent.open.create_mode = 0;
 		error = do_path_lookup(dfd, pathname,
-					lookup_flags(flag)|LOOKUP_OPEN, &nd);
+					lookup_flags(open_flag)|LOOKUP_OPEN, &nd);
 		if (IS_ERR(nd.intent.open.file)) {
 			if (error == 0) {
 				error = PTR_ERR(nd.intent.open.file);
@@ -1843,7 +1843,7 @@ reval:
 	nd.intent.open.create_mode = mode;
 	nd.flags &= ~LOOKUP_PARENT;
 	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
-	if (flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		nd.flags |= LOOKUP_EXCL;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode,
 		       pathname, &is_link);
@@ -1872,7 +1872,7 @@ exit_parent:
 
 do_link:
 	error = -ELOOP;
-	if ((flag & O_NOFOLLOW) || count++ == 32)
+	if ((open_flag & O_NOFOLLOW) || count++ == 32)
 		goto exit_dput;
 	/*
 	 * This is subtle. Instead of calling do_follow_link() we do the
-- 
cgit v1.2.3


From 67ee3ad21d0d0b2cc0b70708de8aed860fadda44 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Dec 2009 07:01:01 -0500
Subject: Pull handling of LAST_BIND into do_last(), clean up ok: part in
 do_filp_open()

Note that in case of !O_CREAT we know that nd.root has already been given up

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f5e4397dcd7e..0b4d19d47e67 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1656,6 +1656,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	*is_link = 0;
 
+	if (nd->last_type == LAST_BIND)
+		goto ok;
+
 	error = -EISDIR;
 	if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
 		goto exit;
@@ -1733,6 +1736,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	error = -EISDIR;
 	if (S_ISDIR(path->dentry->d_inode->i_mode))
 		goto exit;
+ok:
 	filp = finish_open(nd, open_flag, acc_mode);
 	return filp;
 
@@ -1808,7 +1812,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 			release_open_intent(&nd);
 		if (error)
 			return ERR_PTR(error);
-		goto ok;
+		return finish_open(&nd, open_flag, acc_mode);
 	}
 
 	/*
@@ -1853,21 +1857,14 @@ reval:
 		path_put(&nd.root);
 	return filp;
 
-ok:
-	filp = finish_open(&nd, open_flag, acc_mode);
-	if (nd.root.mnt)
-		path_put(&nd.root);
-	return filp;
-
 exit_dput:
 	path_put_conditional(&path, &nd);
-exit:
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
 exit_parent:
+	path_put(&nd.path);
 	if (nd.root.mnt)
 		path_put(&nd.root);
-	path_put(&nd.path);
 	return ERR_PTR(error);
 
 do_link:
@@ -1905,8 +1902,6 @@ do_link:
 		return ERR_PTR(error);
 	}
 	nd.flags &= ~LOOKUP_PARENT;
-	if (nd.last_type == LAST_BIND)
-		goto ok;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode,
 		       pathname, &is_link);
 	if (nd.last_type == LAST_NORM)
-- 
cgit v1.2.3


From 9e67f36169117e07daf16dc7ca314f1db9e2050a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Dec 2009 07:04:50 -0500
Subject: Kill is_link argument of do_last()

We set it to 1 iff we return NULL

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0b4d19d47e67..b0c74fe91fb0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1647,15 +1647,12 @@ exit:
 
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    int open_flag, int acc_mode,
-			    int mode, const char *pathname,
-			    int *is_link)
+			    int mode, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
 	int error;
 
-	*is_link = 0;
-
 	if (nd->last_type == LAST_BIND)
 		goto ok;
 
@@ -1727,10 +1724,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	error = -ENOENT;
 	if (!path->dentry->d_inode)
 		goto exit_dput;
-	if (path->dentry->d_inode->i_op->follow_link) {
-		*is_link = 1;
+
+	if (path->dentry->d_inode->i_op->follow_link)
 		return NULL;
-	}
 
 	path_to_nameidata(path, nd);
 	error = -EISDIR;
@@ -1766,7 +1762,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
 	int force_reval = 0;
-	int is_link;
 
 	/*
 	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
@@ -1849,9 +1844,8 @@ reval:
 	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
 	if (open_flag & O_EXCL)
 		nd.flags |= LOOKUP_EXCL;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode,
-		       pathname, &is_link);
-	if (is_link)
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	if (!filp)
 		goto do_link;
 	if (nd.root.mnt)
 		path_put(&nd.root);
@@ -1902,11 +1896,10 @@ do_link:
 		return ERR_PTR(error);
 	}
 	nd.flags &= ~LOOKUP_PARENT;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode,
-		       pathname, &is_link);
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	if (nd.last_type == LAST_NORM)
 		__putname(nd.last.name);
-	if (is_link)
+	if (!filp)
 		goto do_link;
 	if (nd.root.mnt)
 		path_put(&nd.root);
-- 
cgit v1.2.3


From 10fa8e62f2bc33c452516585911f151d88389e4c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Dec 2009 07:09:49 -0500
Subject: Unify exits in O_CREAT handling

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b0c74fe91fb0..675a712137f1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1821,9 +1821,8 @@ reval:
 		nd.flags |= LOOKUP_REVAL;
 	error = path_walk(pathname, &nd);
 	if (error) {
-		if (nd.root.mnt)
-			path_put(&nd.root);
-		return ERR_PTR(error);
+		filp = ERR_PTR(error);
+		goto out;
 	}
 	if (unlikely(!audit_dummy_context()))
 		audit_inode(pathname, nd.path.dentry);
@@ -1847,9 +1846,7 @@ reval:
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	if (!filp)
 		goto do_link;
-	if (nd.root.mnt)
-		path_put(&nd.root);
-	return filp;
+	goto out;
 
 exit_dput:
 	path_put_conditional(&path, &nd);
@@ -1857,9 +1854,15 @@ exit_dput:
 		release_open_intent(&nd);
 exit_parent:
 	path_put(&nd.path);
+	filp = ERR_PTR(error);
+out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
-	return ERR_PTR(error);
+	if (filp == ERR_PTR(-ESTALE) && !force_reval) {
+		force_reval = 1;
+		goto reval;
+	}
+	return filp;
 
 do_link:
 	error = -ELOOP;
@@ -1887,13 +1890,8 @@ do_link:
 		 * with "intent.open".
 		 */
 		release_open_intent(&nd);
-		if (nd.root.mnt)
-			path_put(&nd.root);
-		if (error == -ESTALE && !force_reval) {
-			force_reval = 1;
-			goto reval;
-		}
-		return ERR_PTR(error);
+		filp = ERR_PTR(error);
+		goto out;
 	}
 	nd.flags &= ~LOOKUP_PARENT;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
@@ -1901,9 +1899,7 @@ do_link:
 		__putname(nd.last.name);
 	if (!filp)
 		goto do_link;
-	if (nd.root.mnt)
-		path_put(&nd.root);
-	return filp;
+	goto out;
 }
 
 /**
-- 
cgit v1.2.3


From 806b681cbe588bebe8fe47dd24da62f2d1c55851 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Dec 2009 07:16:40 -0500
Subject: Turn do_link spaghetty into a normal loop

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 83 ++++++++++++++++++++++++++++----------------------------------
 1 file changed, 38 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 675a712137f1..08da937b1ee2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1844,17 +1844,38 @@ reval:
 	if (open_flag & O_EXCL)
 		nd.flags |= LOOKUP_EXCL;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-	if (!filp)
-		goto do_link;
-	goto out;
-
-exit_dput:
-	path_put_conditional(&path, &nd);
-	if (!IS_ERR(nd.intent.open.file))
-		release_open_intent(&nd);
-exit_parent:
-	path_put(&nd.path);
-	filp = ERR_PTR(error);
+	while (unlikely(!filp)) { /* trailing symlink */
+		error = -ELOOP;
+		if ((open_flag & O_NOFOLLOW) || count++ == 32)
+			goto exit_dput;
+		/*
+		 * This is subtle. Instead of calling do_follow_link() we do
+		 * the thing by hands. The reason is that this way we have zero
+		 * link_count and path_walk() (called from ->follow_link)
+		 * honoring LOOKUP_PARENT.  After that we have the parent and
+		 * last component, i.e. we are in the same situation as after
+		 * the first path_walk().  Well, almost - if the last component
+		 * is normal we get its copy stored in nd->last.name and we will
+		 * have to putname() it when we are done. Procfs-like symlinks
+		 * just set LAST_BIND.
+		 */
+		nd.flags |= LOOKUP_PARENT;
+		error = security_inode_follow_link(path.dentry, &nd);
+		if (error)
+			goto exit_dput;
+		error = __do_follow_link(&path, &nd);
+		path_put(&path);
+		if (error) {
+			/* nd.path had been dropped */
+			release_open_intent(&nd);
+			filp = ERR_PTR(error);
+			goto out;
+		}
+		nd.flags &= ~LOOKUP_PARENT;
+		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+		if (nd.last_type == LAST_NORM)
+			__putname(nd.last.name);
+	}
 out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
@@ -1864,41 +1885,13 @@ out:
 	}
 	return filp;
 
-do_link:
-	error = -ELOOP;
-	if ((open_flag & O_NOFOLLOW) || count++ == 32)
-		goto exit_dput;
-	/*
-	 * This is subtle. Instead of calling do_follow_link() we do the
-	 * thing by hands. The reason is that this way we have zero link_count
-	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
-	 * After that we have the parent and last component, i.e.
-	 * we are in the same situation as after the first path_walk().
-	 * Well, almost - if the last component is normal we get its copy
-	 * stored in nd->last.name and we will have to putname() it when we
-	 * are done. Procfs-like symlinks just set LAST_BIND.
-	 */
-	nd.flags |= LOOKUP_PARENT;
-	error = security_inode_follow_link(path.dentry, &nd);
-	if (error)
-		goto exit_dput;
-	error = __do_follow_link(&path, &nd);
-	path_put(&path);
-	if (error) {
-		/* Does someone understand code flow here? Or it is only
-		 * me so stupid? Anathema to whoever designed this non-sense
-		 * with "intent.open".
-		 */
+exit_dput:
+	path_put_conditional(&path, &nd);
+	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
-		filp = ERR_PTR(error);
-		goto out;
-	}
-	nd.flags &= ~LOOKUP_PARENT;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-	if (nd.last_type == LAST_NORM)
-		__putname(nd.last.name);
-	if (!filp)
-		goto do_link;
+exit_parent:
+	path_put(&nd.path);
+	filp = ERR_PTR(error);
 	goto out;
 }
 
-- 
cgit v1.2.3


From 3866248e5f86d74960a3d1592882490ec3021675 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Dec 2009 07:21:48 -0500
Subject: Finish pulling of -ESTALE handling to upper level in do_filp_open()

Don't bother with path_walk() (and its retry loop); link_path_walk()
will do it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 08da937b1ee2..adfbaf5c04a7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1819,7 +1819,9 @@ reval:
 		return ERR_PTR(error);
 	if (force_reval)
 		nd.flags |= LOOKUP_REVAL;
-	error = path_walk(pathname, &nd);
+
+	current->total_link_count = 0;
+	error = link_path_walk(pathname, &nd);
 	if (error) {
 		filp = ERR_PTR(error);
 		goto out;
-- 
cgit v1.2.3


From def4af30cf945a3735ffca865788ea84b30b25d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Dec 2009 08:37:05 -0500
Subject: Get rid of symlink body copying

Now that nd->last stays around until ->put_link() is called, we can
just postpone that ->put_link() in do_filp_open() a bit and don't
bother with copying.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 55 ++++++++++++++++++++++++-------------------------------
 1 file changed, 24 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index adfbaf5c04a7..1f5d86d1fbf5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -498,8 +498,6 @@ static int link_path_walk(const char *, struct nameidata *);
 
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
-	int res = 0;
-	char *name;
 	if (IS_ERR(link))
 		goto fail;
 
@@ -510,22 +508,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		path_get(&nd->root);
 	}
 
-	res = link_path_walk(link, nd);
-	if (nd->depth || res || nd->last_type!=LAST_NORM)
-		return res;
-	/*
-	 * If it is an iterative symlinks resolution in open_namei() we
-	 * have to copy the last component. And all that crap because of
-	 * bloody create() on broken symlinks. Furrfu...
-	 */
-	name = __getname();
-	if (unlikely(!name)) {
-		path_put(&nd->path);
-		return -ENOMEM;
-	}
-	strcpy(name, nd->last.name);
-	nd->last.name = name;
-	return 0;
+	return link_path_walk(link, nd);
 fail:
 	path_put(&nd->path);
 	return PTR_ERR(link);
@@ -547,10 +530,10 @@ static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
 	nd->path.dentry = path->dentry;
 }
 
-static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
+static __always_inline int
+__do_follow_link(struct path *path, struct nameidata *nd, void **p)
 {
 	int error;
-	void *cookie;
 	struct dentry *dentry = path->dentry;
 
 	touch_atime(path->mnt, dentry);
@@ -562,9 +545,9 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
 	}
 	mntget(path->mnt);
 	nd->last_type = LAST_BIND;
-	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
-	error = PTR_ERR(cookie);
-	if (!IS_ERR(cookie)) {
+	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
+	error = PTR_ERR(*p);
+	if (!IS_ERR(*p)) {
 		char *s = nd_get_link(nd);
 		error = 0;
 		if (s)
@@ -574,8 +557,6 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
 			if (error)
 				path_put(&nd->path);
 		}
-		if (dentry->d_inode->i_op->put_link)
-			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
 	}
 	return error;
 }
@@ -589,6 +570,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
  */
 static inline int do_follow_link(struct path *path, struct nameidata *nd)
 {
+	void *cookie;
 	int err = -ELOOP;
 	if (current->link_count >= MAX_NESTED_LINKS)
 		goto loop;
@@ -602,7 +584,9 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 	current->link_count++;
 	current->total_link_count++;
 	nd->depth++;
-	err = __do_follow_link(path, nd);
+	err = __do_follow_link(path, nd, &cookie);
+	if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
+		path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
 	path_put(path);
 	current->link_count--;
 	nd->depth--;
@@ -1847,6 +1831,9 @@ reval:
 		nd.flags |= LOOKUP_EXCL;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
+		struct path holder;
+		struct inode *inode;
+		void *cookie;
 		error = -ELOOP;
 		if ((open_flag & O_NOFOLLOW) || count++ == 32)
 			goto exit_dput;
@@ -1865,18 +1852,24 @@ reval:
 		error = security_inode_follow_link(path.dentry, &nd);
 		if (error)
 			goto exit_dput;
-		error = __do_follow_link(&path, &nd);
-		path_put(&path);
-		if (error) {
+		error = __do_follow_link(&path, &nd, &cookie);
+		if (unlikely(error)) {
 			/* nd.path had been dropped */
+			inode = path.dentry->d_inode;
+			if (!IS_ERR(cookie) && inode->i_op->put_link)
+				inode->i_op->put_link(path.dentry, &nd, cookie);
+			path_put(&path);
 			release_open_intent(&nd);
 			filp = ERR_PTR(error);
 			goto out;
 		}
+		holder = path;
 		nd.flags &= ~LOOKUP_PARENT;
 		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		if (nd.last_type == LAST_NORM)
-			__putname(nd.last.name);
+		inode = holder.dentry->d_inode;
+		if (inode->i_op->put_link)
+			inode->i_op->put_link(holder.dentry, &nd, cookie);
+		path_put(&holder);
 	}
 out:
 	if (nd.root.mnt)
-- 
cgit v1.2.3


From 1f36f774b22a0ceb7dd33eca626746c81a97b6a5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Dec 2009 10:56:19 -0500
Subject: Switch !O_CREAT case to use of do_last()

... and now we have all intents crap well localized

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 127 ++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 66 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 1f5d86d1fbf5..9a6456099f1e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1359,22 +1359,6 @@ static inline int may_create(struct inode *dir, struct dentry *child)
 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
-/* 
- * O_DIRECTORY translates into forcing a directory lookup.
- */
-static inline int lookup_flags(unsigned int f)
-{
-	unsigned long retval = LOOKUP_FOLLOW;
-
-	if (f & O_NOFOLLOW)
-		retval &= ~LOOKUP_FOLLOW;
-	
-	if (f & O_DIRECTORY)
-		retval |= LOOKUP_DIRECTORY;
-
-	return retval;
-}
-
 /*
  * p1 and p2 should be directories on the same fs.
  */
@@ -1631,19 +1615,60 @@ exit:
 
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    int open_flag, int acc_mode,
-			    int mode, const char *pathname)
+			    int mode, const char *pathname,
+			    int *want_dir)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
-	int error;
+	int error = -EISDIR;
 
-	if (nd->last_type == LAST_BIND)
+	switch (nd->last_type) {
+	case LAST_DOTDOT:
+		follow_dotdot(nd);
+		dir = nd->path.dentry;
+		if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
+			if (!dir->d_op->d_revalidate(dir, nd)) {
+				error = -ESTALE;
+				goto exit;
+			}
+		}
+		/* fallthrough */
+	case LAST_DOT:
+	case LAST_ROOT:
+		if (open_flag & O_CREAT)
+			goto exit;
+		/* fallthrough */
+	case LAST_BIND:
+		audit_inode(pathname, dir);
 		goto ok;
+	}
 
-	error = -EISDIR;
-	if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
-		goto exit;
+	/* trailing slashes? */
+	if (nd->last.name[nd->last.len]) {
+		if (open_flag & O_CREAT)
+			goto exit;
+		*want_dir = 1;
+	}
 
+	/* just plain open? */
+	if (!(open_flag & O_CREAT)) {
+		error = do_lookup(nd, &nd->last, path);
+		if (error)
+			goto exit;
+		error = -ENOENT;
+		if (!path->dentry->d_inode)
+			goto exit_dput;
+		if (path->dentry->d_inode->i_op->follow_link)
+			return NULL;
+		error = -ENOTDIR;
+		if (*want_dir & !path->dentry->d_inode->i_op->lookup)
+			goto exit_dput;
+		path_to_nameidata(path, nd);
+		audit_inode(pathname, nd->path.dentry);
+		goto ok;
+	}
+
+	/* OK, it's O_CREAT */
 	mutex_lock(&dir->d_inode->i_mutex);
 
 	path->dentry = lookup_hash(nd);
@@ -1746,6 +1771,10 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
 	int force_reval = 0;
+	int want_dir = open_flag & O_DIRECTORY;
+
+	if (!(open_flag & O_CREAT))
+		mode = 0;
 
 	/*
 	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
@@ -1768,35 +1797,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (open_flag & O_APPEND)
 		acc_mode |= MAY_APPEND;
 
-	/*
-	 * The simplest case - just a plain lookup.
-	 */
-	if (!(open_flag & O_CREAT)) {
-		filp = get_empty_filp();
-
-		if (filp == NULL)
-			return ERR_PTR(-ENFILE);
-		nd.intent.open.file = filp;
-		filp->f_flags = open_flag;
-		nd.intent.open.flags = flag;
-		nd.intent.open.create_mode = 0;
-		error = do_path_lookup(dfd, pathname,
-					lookup_flags(open_flag)|LOOKUP_OPEN, &nd);
-		if (IS_ERR(nd.intent.open.file)) {
-			if (error == 0) {
-				error = PTR_ERR(nd.intent.open.file);
-				path_put(&nd.path);
-			}
-		} else if (error)
-			release_open_intent(&nd);
-		if (error)
-			return ERR_PTR(error);
-		return finish_open(&nd, open_flag, acc_mode);
-	}
-
-	/*
-	 * Create - we need to know the parent.
-	 */
+	/* find the parent */
 reval:
 	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
@@ -1810,7 +1811,7 @@ reval:
 		filp = ERR_PTR(error);
 		goto out;
 	}
-	if (unlikely(!audit_dummy_context()))
+	if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
 		audit_inode(pathname, nd.path.dentry);
 
 	/*
@@ -1826,16 +1827,22 @@ reval:
 	nd.intent.open.flags = flag;
 	nd.intent.open.create_mode = mode;
 	nd.flags &= ~LOOKUP_PARENT;
-	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
-	if (open_flag & O_EXCL)
-		nd.flags |= LOOKUP_EXCL;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	nd.flags |= LOOKUP_OPEN;
+	if (open_flag & O_CREAT) {
+		nd.flags |= LOOKUP_CREATE;
+		if (open_flag & O_EXCL)
+			nd.flags |= LOOKUP_EXCL;
+	}
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path holder;
-		struct inode *inode;
+		struct inode *inode = path.dentry->d_inode;
 		void *cookie;
 		error = -ELOOP;
-		if ((open_flag & O_NOFOLLOW) || count++ == 32)
+		/* S_ISDIR part is a temporary automount kludge */
+		if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
+			goto exit_dput;
+		if (count++ == 32)
 			goto exit_dput;
 		/*
 		 * This is subtle. Instead of calling do_follow_link() we do
@@ -1855,7 +1862,6 @@ reval:
 		error = __do_follow_link(&path, &nd, &cookie);
 		if (unlikely(error)) {
 			/* nd.path had been dropped */
-			inode = path.dentry->d_inode;
 			if (!IS_ERR(cookie) && inode->i_op->put_link)
 				inode->i_op->put_link(path.dentry, &nd, cookie);
 			path_put(&path);
@@ -1865,8 +1871,7 @@ reval:
 		}
 		holder = path;
 		nd.flags &= ~LOOKUP_PARENT;
-		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		inode = holder.dentry->d_inode;
+		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
 		if (inode->i_op->put_link)
 			inode->i_op->put_link(holder.dentry, &nd, cookie);
 		path_put(&holder);
-- 
cgit v1.2.3


From ae4a3179b11a97a5b36a768ae6ac1662d0315ff0 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Thu, 25 Feb 2010 00:54:48 +0000
Subject: Squashfs: get rid of obsolete variable in struct squashfs_sb_info

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/squashfs_fs_sb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 753335085e41..2e77dc547e25 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,7 +61,6 @@ struct squashfs_sb_info {
 	int					next_meta_index;
 	__le64					*id_table;
 	__le64					*fragment_index;
-	unsigned int				*fragment_index_2;
 	struct mutex				read_data_mutex;
 	struct mutex				meta_index_mutex;
 	struct meta_index			*meta_index;
-- 
cgit v1.2.3


From 06862f884d9c2453daaf0c1d070c69cf444f10b1 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Thu, 25 Feb 2010 01:31:13 +0000
Subject: Squashfs: get rid of obsolete definition in header file

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/squashfs_fs.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 36e1604ab1c1..79024245ea00 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,8 +183,6 @@
 #define SQUASHFS_MAX_FILE_SIZE		(1LL << \
 					(SQUASHFS_MAX_FILE_SIZE_LOG - 2))
 
-#define SQUASHFS_MARKER_BYTE		0xff
-
 /* meta index cache */
 #define SQUASHFS_META_INDEXES	(SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
 #define SQUASHFS_META_ENTRIES	127
-- 
cgit v1.2.3


From 26821ed40b4230259e770c9911180f38fcaa6f59 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Mar 2010 09:21:21 +0100
Subject: make sure data is on disk before calling ->write_inode

Similar to the fsync issue fixed a while ago in commit
2daea67e966dc0c42067ebea015ddac6834cef88 we need to write for data to
actually hit the disk before writing out the metadata to guarantee
data integrity for filesystems that modify the inode in the data I/O
completion path.  Currently XFS and NFS handle this manually, and AFS
has a write_inode method that does nothing but waiting for data, while
others are possibly missing out on this.

Fortunately this change has a lot less impact than the fsync change
as none of the write_inode methods starts data writeout of any form
by itself.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/internal.h            |  1 -
 fs/afs/super.c               |  1 -
 fs/afs/write.c               | 21 ---------------------
 fs/fs-writeback.c            | 15 ++++++++++-----
 fs/nfs/inode.c               |  7 +------
 fs/xfs/linux-2.6/xfs_super.c |  4 ----
 6 files changed, 11 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6ece2a13bf71..c54dad4e6063 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -733,7 +733,6 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
 			struct page *page, void *fsdata);
 extern int afs_writepage(struct page *, struct writeback_control *);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern int afs_write_inode(struct inode *, int);
 extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
 			      unsigned long, loff_t);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index e1ea1c240b6a..14f6431598ad 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,6 @@ struct file_system_type afs_fs_type = {
 static const struct super_operations afs_super_ops = {
 	.statfs		= afs_statfs,
 	.alloc_inode	= afs_alloc_inode,
-	.write_inode	= afs_write_inode,
 	.destroy_inode	= afs_destroy_inode,
 	.clear_inode	= afs_clear_inode,
 	.put_super	= afs_put_super,
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e15a21dbf9f..3bed54a294d4 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -584,27 +584,6 @@ int afs_writepages(struct address_space *mapping,
 	return ret;
 }
 
-/*
- * write an inode back
- */
-int afs_write_inode(struct inode *inode, int sync)
-{
-	struct afs_vnode *vnode = AFS_FS_I(inode);
-	int ret;
-
-	_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
-
-	ret = 0;
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret < 0)
-			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-}
-
 /*
  * completion of write to server
  */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c64ff4..5f2721b1e4be 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -461,15 +461,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	ret = do_writepages(mapping, wbc);
 
-	/* Don't write the inode if only I_DIRTY_PAGES was set */
-	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		int err = write_inode(inode, wait);
+	/*
+	 * Make sure to wait on the data before writing out the metadata.
+	 * This is important for filesystems that modify metadata on data
+	 * I/O completion.
+	 */
+	if (wait) {
+		int err = filemap_fdatawait(mapping);
 		if (ret == 0)
 			ret = err;
 	}
 
-	if (wait) {
-		int err = filemap_fdatawait(mapping);
+	/* Don't write the inode if only I_DIRTY_PAGES was set */
+	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		int err = write_inode(inode, wait);
 		if (ret == 0)
 			ret = err;
 	}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7570573bdb30..5ecd952cae1d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -101,12 +101,7 @@ int nfs_write_inode(struct inode *inode, int sync)
 {
 	int ret;
 
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret == 0)
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-	} else
-		ret = nfs_commit_inode(inode, 0);
+	ret = nfs_commit_inode(inode, sync ? FLUSH_SYNC : 0);
 	if (ret >= 0)
 		return 0;
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 25ea2408118f..8f117db6070e 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1075,10 +1075,6 @@ xfs_fs_write_inode(
 		return XFS_ERROR(EIO);
 
 	if (sync) {
-		error = xfs_wait_on_pages(ip, 0, -1);
-		if (error)
-			goto out;
-
 		/*
 		 * Make sure the inode has hit stable storage.  By using the
 		 * log and the fsync transactions we reduce the IOs we have
-- 
cgit v1.2.3


From a9185b41a4f84971b930c519f0c63bd450c4810d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Mar 2010 09:21:37 +0100
Subject: pass writeback_control to ->write_inode

This gives the filesystem more information about the writeback that
is happening.  Trond requested this for the NFS unstable write handling,
and other filesystems might benefit from this too by beeing able to
distinguish between the different callers in more detail.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/adfs/adfs.h               |  2 +-
 fs/adfs/inode.c              |  5 +++--
 fs/affs/affs.h               |  3 ++-
 fs/affs/inode.c              |  2 +-
 fs/bfs/inode.c               |  5 +++--
 fs/btrfs/ctree.h             |  2 +-
 fs/btrfs/inode.c             |  4 ++--
 fs/exofs/exofs.h             |  2 +-
 fs/exofs/inode.c             |  4 ++--
 fs/ext2/ext2.h               |  2 +-
 fs/ext2/inode.c              | 11 +++++++++--
 fs/ext3/inode.c              |  4 ++--
 fs/ext4/ext4.h               |  2 +-
 fs/ext4/inode.c              |  6 +++---
 fs/fat/inode.c               |  9 +++++++--
 fs/fs-writeback.c            | 11 +++++------
 fs/gfs2/super.c              |  5 +++--
 fs/hfs/hfs_fs.h              |  2 +-
 fs/hfs/inode.c               |  2 +-
 fs/hfsplus/super.c           |  3 ++-
 fs/jfs/inode.c               |  5 ++++-
 fs/jfs/jfs_inode.h           |  2 +-
 fs/minix/inode.c             |  8 +++++---
 fs/nfs/inode.c               |  5 +++--
 fs/nfs/internal.h            |  2 +-
 fs/ntfs/dir.c                |  2 +-
 fs/ntfs/file.c               |  2 +-
 fs/ntfs/inode.c              |  2 +-
 fs/ntfs/inode.h              |  4 ++--
 fs/ntfs/super.c              |  8 ++++++++
 fs/omfs/inode.c              | 10 ++++++++--
 fs/reiserfs/inode.c          |  4 ++--
 fs/sysv/inode.c              | 10 ++++++++--
 fs/sysv/sysv.h               |  2 +-
 fs/ubifs/dir.c               |  2 +-
 fs/ubifs/file.c              |  8 ++++----
 fs/ubifs/super.c             |  2 +-
 fs/udf/inode.c               |  4 ++--
 fs/udf/udfdecl.h             |  2 +-
 fs/ufs/inode.c               |  5 +++--
 fs/ufs/ufs.h                 |  2 +-
 fs/xfs/linux-2.6/xfs_super.c |  4 ++--
 include/linux/ext3_fs.h      |  2 +-
 include/linux/fs.h           |  2 +-
 include/linux/reiserfs_fs.h  |  2 +-
 45 files changed, 115 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 9cc18775b832..2ff622f6f547 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -121,7 +121,7 @@ struct adfs_discmap {
 
 /* Inode stuff */
 struct inode *adfs_iget(struct super_block *sb, struct object_info *obj);
-int adfs_write_inode(struct inode *inode,int unused);
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 int adfs_notify_change(struct dentry *dentry, struct iattr *attr);
 
 /* map.c */
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3f57ce4bee5d..0f5e30978135 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -9,6 +9,7 @@
  */
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 #include "adfs.h"
 
 /*
@@ -360,7 +361,7 @@ out:
  * The adfs-specific inode data has already been updated by
  * adfs_notify_change()
  */
-int adfs_write_inode(struct inode *inode, int wait)
+int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct super_block *sb = inode->i_sb;
 	struct object_info obj;
@@ -375,7 +376,7 @@ int adfs_write_inode(struct inode *inode, int wait)
 	obj.attr	= ADFS_I(inode)->attr;
 	obj.size	= inode->i_size;
 
-	ret = adfs_dir_update(sb, &obj, wait);
+	ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
 	unlock_kernel();
 	return ret;
 }
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 0e40caaba456..861dae68ac12 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -175,7 +175,8 @@ extern void			 affs_delete_inode(struct inode *inode);
 extern void			 affs_clear_inode(struct inode *inode);
 extern struct inode		*affs_iget(struct super_block *sb,
 					unsigned long ino);
-extern int			 affs_write_inode(struct inode *inode, int);
+extern int			 affs_write_inode(struct inode *inode,
+					struct writeback_control *wbc);
 extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type);
 
 /* file.c */
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3c4ec7d864c4..c9744d771d98 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -166,7 +166,7 @@ bad_inode:
 }
 
 int
-affs_write_inode(struct inode *inode, int unused)
+affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct super_block	*sb = inode->i_sb;
 	struct buffer_head	*bh;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8f3d9fd89604..f22a7d3dc362 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include <linux/writeback.h>
 #include <asm/uaccess.h>
 #include "bfs.h"
 
@@ -98,7 +99,7 @@ error:
 	return ERR_PTR(-EIO);
 }
 
-static int bfs_write_inode(struct inode *inode, int wait)
+static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 	unsigned int ino = (u16)inode->i_ino;
@@ -147,7 +148,7 @@ static int bfs_write_inode(struct inode *inode, int wait)
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
-	if (wait) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh))
 			err = -EIO;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2aa8ec6a0981..8b5cfdd4bfc1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2326,7 +2326,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
-int btrfs_write_inode(struct inode *inode, int wait);
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4deb280f8969..c41db6d45ab6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3968,7 +3968,7 @@ err:
 	return ret;
 }
 
-int btrfs_write_inode(struct inode *inode, int wait)
+int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -3977,7 +3977,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
 	if (root->fs_info->btree_inode == inode)
 		return 0;
 
-	if (wait) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		trans = btrfs_join_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 59b8bf2825c7..8442e353309f 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -261,7 +261,7 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		struct page **pagep, void **fsdata);
 extern struct inode *exofs_iget(struct super_block *, unsigned long);
 struct inode *exofs_new_inode(struct inode *, int);
-extern int exofs_write_inode(struct inode *, int);
+extern int exofs_write_inode(struct inode *, struct writeback_control *wbc);
 extern void exofs_delete_inode(struct inode *);
 
 /* dir.c:                */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 5514f3c2c2f4..a17e4b733e35 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1280,9 +1280,9 @@ out:
 	return ret;
 }
 
-int exofs_write_inode(struct inode *inode, int wait)
+int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return exofs_update_inode(inode, wait);
+	return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 }
 
 /*
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 061914add3cf..0b038e47ad2f 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -118,7 +118,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned);
 
 /* inode.c */
 extern struct inode *ext2_iget (struct super_block *, unsigned long);
-extern int ext2_write_inode (struct inode *, int);
+extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_delete_inode (struct inode *);
 extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 71b032c65a02..36ae1cac767c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,6 +41,8 @@ MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
 MODULE_LICENSE("GPL");
 
+static int __ext2_write_inode(struct inode *inode, int do_sync);
+
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -64,7 +66,7 @@ void ext2_delete_inode (struct inode * inode)
 		goto no_delete;
 	EXT2_I(inode)->i_dtime	= get_seconds();
 	mark_inode_dirty(inode);
-	ext2_write_inode(inode, inode_needs_sync(inode));
+	__ext2_write_inode(inode, inode_needs_sync(inode));
 
 	inode->i_size = 0;
 	if (inode->i_blocks)
@@ -1335,7 +1337,7 @@ bad_inode:
 	return ERR_PTR(ret);
 }
 
-int ext2_write_inode(struct inode *inode, int do_sync)
+static int __ext2_write_inode(struct inode *inode, int do_sync)
 {
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -1440,6 +1442,11 @@ int ext2_write_inode(struct inode *inode, int do_sync)
 	return err;
 }
 
+int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int ext2_sync_inode(struct inode *inode)
 {
 	struct writeback_control wbc = {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 455e6e6e5cb9..7aca55fcc976 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3096,7 +3096,7 @@ out_brelse:
  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
  * will no longer be on the superblock's dirty inode list.
  */
-int ext3_write_inode(struct inode *inode, int wait)
+int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	if (current->flags & PF_MEMALLOC)
 		return 0;
@@ -3107,7 +3107,7 @@ int ext3_write_inode(struct inode *inode, int wait)
 		return -EIO;
 	}
 
-	if (!wait)
+	if (wbc->sync_mode != WB_SYNC_ALL)
 		return 0;
 
 	return ext3_force_commit(inode->i_sb);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4cedc91ec59d..50af1a2c65e7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1416,7 +1416,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 				struct buffer_head *bh_result, int create);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int  ext4_write_inode(struct inode *, int);
+extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 				struct kstat *stat);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e11952404e02..d01a6cdbf854 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5177,7 +5177,7 @@ out_brelse:
  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
  * will no longer be on the superblock's dirty inode list.
  */
-int ext4_write_inode(struct inode *inode, int wait)
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err;
 
@@ -5191,7 +5191,7 @@ int ext4_write_inode(struct inode *inode, int wait)
 			return -EIO;
 		}
 
-		if (!wait)
+		if (wbc->sync_mode != WB_SYNC_ALL)
 			return 0;
 
 		err = ext4_force_commit(inode->i_sb);
@@ -5201,7 +5201,7 @@ int ext4_write_inode(struct inode *inode, int wait)
 		err = ext4_get_inode_loc(inode, &iloc);
 		if (err)
 			return err;
-		if (wait)
+		if (wbc->sync_mode == WB_SYNC_ALL)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
 			ext4_error(inode->i_sb, __func__,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 14da530b05ca..fbeecdc194dc 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -577,7 +577,7 @@ static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi,
 	return i_pos;
 }
 
-static int fat_write_inode(struct inode *inode, int wait)
+static int __fat_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -634,9 +634,14 @@ retry:
 	return err;
 }
 
+static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int fat_sync_inode(struct inode *inode)
 {
-	return fat_write_inode(inode, 1);
+	return __fat_write_inode(inode, 1);
 }
 
 EXPORT_SYMBOL_GPL(fat_sync_inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5f2721b1e4be..76fc4d594acb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -381,10 +381,10 @@ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
 
-static int write_inode(struct inode *inode, int sync)
+static int write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
-		return inode->i_sb->s_op->write_inode(inode, sync);
+		return inode->i_sb->s_op->write_inode(inode, wbc);
 	return 0;
 }
 
@@ -421,7 +421,6 @@ static int
 writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
-	int wait = wbc->sync_mode == WB_SYNC_ALL;
 	unsigned dirty;
 	int ret;
 
@@ -439,7 +438,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * We'll have another go at writing back this inode when we
 		 * completed a full scan of b_io.
 		 */
-		if (!wait) {
+		if (wbc->sync_mode != WB_SYNC_ALL) {
 			requeue_io(inode);
 			return 0;
 		}
@@ -466,7 +465,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * This is important for filesystems that modify metadata on data
 	 * I/O completion.
 	 */
-	if (wait) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		int err = filemap_fdatawait(mapping);
 		if (ret == 0)
 			ret = err;
@@ -474,7 +473,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
-		int err = write_inode(inode, wait);
+		int err = write_inode(inode, wbc);
 		if (ret == 0)
 			ret = err;
 	}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e5e22629da67..ca87598ead7f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -22,6 +22,7 @@
 #include <linux/crc32.h>
 #include <linux/time.h>
 #include <linux/wait.h>
+#include <linux/writeback.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -711,7 +712,7 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
  * Returns: errno
  */
 
-static int gfs2_write_inode(struct inode *inode, int sync)
+static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -745,7 +746,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
 do_unlock:
 	gfs2_glock_dq_uninit(&gh);
 do_flush:
-	if (sync != 0)
+	if (wbc->sync_mode == WB_SYNC_ALL)
 		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
 	return ret;
 }
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 052387e11671..fe35e3b626c4 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -188,7 +188,7 @@ extern const struct address_space_operations hfs_btree_aops;
 
 extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
-extern int hfs_write_inode(struct inode *, int);
+extern int hfs_write_inode(struct inode *, struct writeback_control *);
 extern int hfs_inode_setattr(struct dentry *, struct iattr *);
 extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext,
 			__be32 log_size, __be32 phys_size, u32 clump_size);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a1cbff2b4d99..14f5cb1b9fdc 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -381,7 +381,7 @@ void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext,
 					 HFS_SB(inode->i_sb)->alloc_blksz);
 }
 
-int hfs_write_inode(struct inode *inode, int unused)
+int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct inode *main_inode = inode;
 	struct hfs_find_data fd;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 43022f3d5148..74b473a8ef92 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -87,7 +87,8 @@ bad_inode:
 	return ERR_PTR(err);
 }
 
-static int hfsplus_write_inode(struct inode *inode, int unused)
+static int hfsplus_write_inode(struct inode *inode,
+		struct writeback_control *wbc)
 {
 	struct hfsplus_vh *vhdr;
 	int ret = 0;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77ba..182b78cc3e62 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/buffer_head.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#include <linux/writeback.h>
 #include "jfs_incore.h"
 #include "jfs_inode.h"
 #include "jfs_filsys.h"
@@ -120,8 +121,10 @@ int jfs_commit_inode(struct inode *inode, int wait)
 	return rc;
 }
 
-int jfs_write_inode(struct inode *inode, int wait)
+int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
+	int wait = wbc->sync_mode == WB_SYNC_ALL;
+
 	if (test_cflag(COMMIT_Nolink, inode))
 		return 0;
 	/*
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d63..15902b03c2a7 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -26,7 +26,7 @@ extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
 extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
 extern struct inode *jfs_iget(struct super_block *, unsigned long);
 extern int jfs_commit_inode(struct inode *, int);
-extern int jfs_write_inode(struct inode*, int);
+extern int jfs_write_inode(struct inode *, struct writeback_control *);
 extern void jfs_delete_inode(struct inode *);
 extern void jfs_dirty_inode(struct inode *);
 extern void jfs_truncate(struct inode *);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 74ea82d72164..756f8c93780c 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,8 +17,10 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/vfs.h>
+#include <linux/writeback.h>
 
-static int minix_write_inode(struct inode * inode, int wait);
+static int minix_write_inode(struct inode *inode,
+		struct writeback_control *wbc);
 static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
@@ -552,7 +554,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static int minix_write_inode(struct inode *inode, int wait)
+static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err = 0;
 	struct buffer_head *bh;
@@ -563,7 +565,7 @@ static int minix_write_inode(struct inode *inode, int wait)
 		bh = V2_minix_update_inode(inode);
 	if (!bh)
 		return -EIO;
-	if (wait && buffer_dirty(bh)) {
+	if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh)) {
 			printk("IO error syncing minix inode [%s:%08lx]\n",
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5ecd952cae1d..7f9ecc46f3fb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,11 +97,12 @@ u64 nfs_compat_user_ino64(u64 fileid)
 	return ino;
 }
 
-int nfs_write_inode(struct inode *inode, int sync)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
 
-	ret = nfs_commit_inode(inode, sync ? FLUSH_SYNC : 0);
+	ret = nfs_commit_inode(inode,
+			wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0);
 	if (ret >= 0)
 		return 0;
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d23b32..11f82f03c5de 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,7 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
+extern int nfs_write_inode(struct inode *, struct writeback_control *);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 5a9e34475e37..9173e82a45d1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1545,7 +1545,7 @@ static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
  		write_inode_now(bmp_vi, !datasync);
 		iput(bmp_vi);
 	}
-	ret = ntfs_write_inode(vi, 1);
+	ret = __ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
 	err = sync_blockdev(vi->i_sb->s_bdev);
 	if (unlikely(err && !ret))
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 43179ddd336f..b681c71d7069 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2182,7 +2182,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
 	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
 	BUG_ON(S_ISDIR(vi->i_mode));
 	if (!datasync || !NInoNonResident(NTFS_I(vi)))
-		ret = ntfs_write_inode(vi, 1);
+		ret = __ntfs_write_inode(vi, 1);
 	write_inode_now(vi, !datasync);
 	/*
 	 * NOTE: If we were to use mapping->private_list (see ext2 and
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index dc2505abb6d7..4b57fb1eac2a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2957,7 +2957,7 @@ out:
  *
  * Return 0 on success and -errno on error.
  */
-int ntfs_write_inode(struct inode *vi, int sync)
+int __ntfs_write_inode(struct inode *vi, int sync)
 {
 	sle64 nt;
 	ntfs_inode *ni = NTFS_I(vi);
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 117eaf8032a3..9a113544605d 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -307,12 +307,12 @@ extern void ntfs_truncate_vfs(struct inode *vi);
 
 extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr);
 
-extern int ntfs_write_inode(struct inode *vi, int sync);
+extern int __ntfs_write_inode(struct inode *vi, int sync);
 
 static inline void ntfs_commit_inode(struct inode *vi)
 {
 	if (!is_bad_inode(vi))
-		ntfs_write_inode(vi, 1);
+		__ntfs_write_inode(vi, 1);
 	return;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 80b04770e8e9..1cf39dfaee7a 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -39,6 +39,7 @@
 #include "dir.h"
 #include "debug.h"
 #include "index.h"
+#include "inode.h"
 #include "aops.h"
 #include "layout.h"
 #include "malloc.h"
@@ -2662,6 +2663,13 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 	return 0;
 }
 
+#ifdef NTFS_RW
+static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
+{
+	return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
+}
+#endif
+
 /**
  * The complete super operations.
  */
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index f3b7c1541f3a..75d9b5ba1d45 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
 #include <linux/vmalloc.h>
+#include <linux/writeback.h>
 #include <linux/crc-itu-t.h>
 #include "omfs.h"
 
@@ -89,7 +90,7 @@ static void omfs_update_checksums(struct omfs_inode *oi)
 	oi->i_head.h_check_xor = xor;
 }
 
-static int omfs_write_inode(struct inode *inode, int wait)
+static int __omfs_write_inode(struct inode *inode, int wait)
 {
 	struct omfs_inode *oi;
 	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
@@ -162,9 +163,14 @@ out:
 	return ret;
 }
 
+static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int omfs_sync_inode(struct inode *inode)
 {
-	return omfs_write_inode(inode, 1);
+	return __omfs_write_inode(inode, 1);
 }
 
 /*
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 2df0f5c7c60b..0d651f980a8d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1615,7 +1615,7 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
 ** to properly mark inodes for datasync and such, but only actually
 ** does something when called for a synchronous update.
 */
-int reiserfs_write_inode(struct inode *inode, int do_sync)
+int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct reiserfs_transaction_handle th;
 	int jbegin_count = 1;
@@ -1627,7 +1627,7 @@ int reiserfs_write_inode(struct inode *inode, int do_sync)
 	 ** inode needs to reach disk for safety, and they can safely be
 	 ** ignored because the altered inode has already been logged.
 	 */
-	if (do_sync && !(current->flags & PF_MEMALLOC)) {
+	if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
 		reiserfs_write_lock(inode->i_sb);
 		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
 			reiserfs_update_sd(&th, inode);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9824743832a7..4573734d723d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include <linux/writeback.h>
 #include <linux/namei.h>
 #include <asm/byteorder.h>
 #include "sysv.h"
@@ -246,7 +247,7 @@ bad_inode:
 	return ERR_PTR(-EIO);
 }
 
-int sysv_write_inode(struct inode *inode, int wait)
+static int __sysv_write_inode(struct inode *inode, int wait)
 {
 	struct super_block * sb = inode->i_sb;
 	struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -296,9 +297,14 @@ int sysv_write_inode(struct inode *inode, int wait)
 	return 0;
 }
 
+int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
 int sysv_sync_inode(struct inode *inode)
 {
-	return sysv_write_inode(inode, 1);
+	return __sysv_write_inode(inode, 1);
 }
 
 static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 53786eb5cf60..94cb9b4d76c2 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -142,7 +142,7 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
 
 /* inode.c */
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
-extern int sysv_write_inode(struct inode *, int);
+extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
 extern int sysv_sync_inode(struct inode *);
 extern void sysv_set_inode(struct inode *, dev_t);
 extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 552fb0111fff..401e503d44a1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1120,7 +1120,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (release)
 		ubifs_release_budget(c, &ino_req);
 	if (IS_SYNC(old_inode))
-		err = old_inode->i_sb->s_op->write_inode(old_inode, 1);
+		err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
 	return err;
 
 out_cancel:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 16a6444330ec..e26c02ab6cd5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1011,7 +1011,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 	/* Is the page fully inside @i_size? */
 	if (page->index < end_index) {
 		if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
-			err = inode->i_sb->s_op->write_inode(inode, 1);
+			err = inode->i_sb->s_op->write_inode(inode, NULL);
 			if (err)
 				goto out_unlock;
 			/*
@@ -1039,7 +1039,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 	kunmap_atomic(kaddr, KM_USER0);
 
 	if (i_size > synced_i_size) {
-		err = inode->i_sb->s_op->write_inode(inode, 1);
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
 		if (err)
 			goto out_unlock;
 	}
@@ -1242,7 +1242,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	if (release)
 		ubifs_release_budget(c, &req);
 	if (IS_SYNC(inode))
-		err = inode->i_sb->s_op->write_inode(inode, 1);
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
 	return err;
 
 out:
@@ -1316,7 +1316,7 @@ int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	 * the inode unless this is a 'datasync()' call.
 	 */
 	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
-		err = inode->i_sb->s_op->write_inode(inode, 1);
+		err = inode->i_sb->s_op->write_inode(inode, NULL);
 		if (err)
 			return err;
 	}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 43f9d19a6f33..4d2f2157dd3f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -283,7 +283,7 @@ static void ubifs_destroy_inode(struct inode *inode)
 /*
  * Note, Linux write-back code calls this without 'i_mutex'.
  */
-static int ubifs_write_inode(struct inode *inode, int wait)
+static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err = 0;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 378a7592257c..b02089247296 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1373,12 +1373,12 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
 	return mode;
 }
 
-int udf_write_inode(struct inode *inode, int sync)
+int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
 
 	lock_kernel();
-	ret = udf_update_inode(inode, sync);
+	ret = udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 	unlock_kernel();
 
 	return ret;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8d46f4294ee7..4223ac855da9 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -142,7 +142,7 @@ extern void udf_truncate(struct inode *);
 extern void udf_read_inode(struct inode *);
 extern void udf_delete_inode(struct inode *);
 extern void udf_clear_inode(struct inode *);
-extern int udf_write_inode(struct inode *, int);
+extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
 extern int udf_extend_file(struct inode *, struct extent_position *,
 			   struct kernel_long_ad *, sector_t);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7cf33379fd46..0a627e08610b 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 
 #include "ufs_fs.h"
 #include "ufs.h"
@@ -890,11 +891,11 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 	return 0;
 }
 
-int ufs_write_inode (struct inode * inode, int wait)
+int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
 	lock_kernel();
-	ret = ufs_update_inode (inode, wait);
+	ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
 	unlock_kernel();
 	return ret;
 }
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 01d0e2a3b230..43f9f5d5670e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -106,7 +106,7 @@ extern struct inode * ufs_new_inode (struct inode *, int);
 
 /* inode.c */
 extern struct inode *ufs_iget(struct super_block *, unsigned long);
-extern int ufs_write_inode (struct inode *, int);
+extern int ufs_write_inode (struct inode *, struct writeback_control *);
 extern int ufs_sync_inode (struct inode *);
 extern void ufs_delete_inode (struct inode *);
 extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 8f117db6070e..71345a370d9f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1063,7 +1063,7 @@ xfs_log_inode(
 STATIC int
 xfs_fs_write_inode(
 	struct inode		*inode,
-	int			sync)
+	struct writeback_control *wbc)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1074,7 +1074,7 @@ xfs_fs_write_inode(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	if (sync) {
+	if (wbc->sync_mode == WB_SYNC_ALL) {
 		/*
 		 * Make sure the inode has hit stable storage.  By using the
 		 * log and the fsync transactions we reduce the IOs we have
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 6b049030fbe6..deac2566450e 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -877,7 +877,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	int create);
 
 extern struct inode *ext3_iget(struct super_block *, unsigned long);
-extern int  ext3_write_inode (struct inode *, int);
+extern int  ext3_write_inode (struct inode *, struct writeback_control *);
 extern int  ext3_setattr (struct dentry *, struct iattr *);
 extern void ext3_delete_inode (struct inode *);
 extern int  ext3_sync_inode (handle_t *, struct inode *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b3182c7eb5f..45689621a851 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1557,7 +1557,7 @@ struct super_operations {
 	void (*destroy_inode)(struct inode *);
 
    	void (*dirty_inode) (struct inode *);
-	int (*write_inode) (struct inode *, int);
+	int (*write_inode) (struct inode *, struct writeback_control *wbc);
 	void (*drop_inode) (struct inode *);
 	void (*delete_inode) (struct inode *);
 	void (*put_super) (struct super_block *);
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 1ba3cf6edfbb..3b603f474186 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -2034,7 +2034,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
 int reiserfs_find_actor(struct inode *inode, void *p);
 int reiserfs_init_locked_inode(struct inode *inode, void *p);
 void reiserfs_delete_inode(struct inode *inode);
-int reiserfs_write_inode(struct inode *inode, int);
+int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 int reiserfs_get_block(struct inode *inode, sector_t block,
 		       struct buffer_head *bh_result, int create);
 struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
-- 
cgit v1.2.3


From 8fc795f703c5138e1a8bfb88c69f52632031aa6a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 16:46:56 -0800
Subject: NFS: Cleanup - move nfs_write_inode() into fs/nfs/write.c

The sole purpose of nfs_write_inode is to commit unstable writes, so
move it into fs/nfs/write.c, and make nfs_commit_inode static.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 12 ------------
 fs/nfs/write.c         | 24 +++++++++++++++++++++++-
 include/linux/nfs_fs.h |  7 -------
 3 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7f9ecc46f3fb..89e98312599d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,18 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
 	return ino;
 }
 
-int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
-	int ret;
-
-	ret = nfs_commit_inode(inode,
-			wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0);
-	if (ret >= 0)
-		return 0;
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	return ret;
-}
-
 void nfs_clear_inode(struct inode *inode)
 {
 	/*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d63d964a0392..09e97097baaa 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1391,7 +1391,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
 	.rpc_release = nfs_commit_release,
 };
 
-int nfs_commit_inode(struct inode *inode, int how)
+static int nfs_commit_inode(struct inode *inode, int how)
 {
 	LIST_HEAD(head);
 	int res;
@@ -1406,13 +1406,35 @@ int nfs_commit_inode(struct inode *inode, int how)
 	}
 	return res;
 }
+
+static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret;
+
+	ret = nfs_commit_inode(inode,
+			wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0);
+	if (ret >= 0)
+		return 0;
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	return ret;
+}
 #else
 static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
 	return 0;
 }
+
+static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+{
+	return 0;
+}
 #endif
 
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	return nfs_commit_unstable_pages(inode, wbc);
+}
+
 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
 {
 	struct inode *inode = mapping->host;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d09db1bc9083..384ea3ef2863 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -483,15 +483,8 @@ extern int nfs_wb_nocommit(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-extern int  nfs_commit_inode(struct inode *, int);
 extern struct nfs_write_data *nfs_commitdata_alloc(void);
 extern void nfs_commit_free(struct nfs_write_data *wdata);
-#else
-static inline int
-nfs_commit_inode(struct inode *inode, int how)
-{
-	return 0;
-}
 #endif
 
 static inline int
-- 
cgit v1.2.3


From ff778d02bf867e1733a09b34ad6dbb723b024814 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 16:53:39 -0800
Subject: NFS: Add a count of the number of unstable writes carried by an inode

In order to know when we should do opportunistic commits of the unstable
writes, when the VM is doing a background flush, we add a field to count
the number of unstable writes.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  1 +
 fs/nfs/write.c         | 14 ++++++++++----
 include/linux/nfs_fs.h |  1 +
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 89e98312599d..aa5a831001ab 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1404,6 +1404,7 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
 	INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
 	nfsi->npages = 0;
+	nfsi->ncommit = 0;
 	atomic_set(&nfsi->silly_count, 1);
 	INIT_HLIST_HEAD(&nfsi->silly_list);
 	init_waitqueue_head(&nfsi->waitqueue);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 09e97097baaa..dc08a6fbde67 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -438,6 +438,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 	radix_tree_tag_set(&nfsi->nfs_page_tree,
 			req->wb_index,
 			NFS_PAGE_TAG_COMMIT);
+	nfsi->ncommit++;
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
@@ -573,11 +574,15 @@ static int
 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret;
 
 	if (!nfs_need_commit(nfsi))
 		return 0;
 
-	return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
+	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
+	if (ret > 0)
+		nfsi->ncommit -= ret;
+	return ret;
 }
 #else
 static inline int nfs_need_commit(struct nfs_inode *nfsi)
@@ -642,9 +647,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		spin_lock(&inode->i_lock);
 	}
 
-	if (nfs_clear_request_commit(req))
-		radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
-				req->wb_index, NFS_PAGE_TAG_COMMIT);
+	if (nfs_clear_request_commit(req) &&
+			radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
+				req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL)
+		NFS_I(inode)->ncommit--;
 
 	/* Okay, the request matches. Update the region */
 	if (offset < req->wb_offset) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 384ea3ef2863..309217f46e28 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -166,6 +166,7 @@ struct nfs_inode {
 	struct radix_tree_root	nfs_page_tree;
 
 	unsigned long		npages;
+	unsigned long		ncommit;
 
 	/* Open contexts for shared mmap writes */
 	struct list_head	open_files;
-- 
cgit v1.2.3


From 420e3646bb7d93a571734034249fbb1ae1a7a5c7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 17:00:02 -0800
Subject: NFS: Reduce the number of unnecessary COMMIT calls

If the caller is doing a non-blocking flush, and there are still writebacks
pending on the wire, we can usually defer the COMMIT call until those
writes are done.

Also ensure that we honour the wbc->nonblocking flag.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index dc08a6fbde67..fc05e35da6a9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1415,12 +1415,30 @@ static int nfs_commit_inode(struct inode *inode, int how)
 
 static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
 {
-	int ret;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int flags = FLUSH_SYNC;
+	int ret = 0;
 
-	ret = nfs_commit_inode(inode,
-			wbc->sync_mode == WB_SYNC_ALL ? FLUSH_SYNC : 0);
-	if (ret >= 0)
+	/* Don't commit yet if this is a non-blocking flush and there are
+	 * lots of outstanding writes for this mapping.
+	 */
+	if (wbc->sync_mode == WB_SYNC_NONE &&
+	    nfsi->ncommit <= (nfsi->npages >> 1))
+		goto out_mark_dirty;
+
+	if (wbc->nonblocking)
+		flags = 0;
+	ret = nfs_commit_inode(inode, flags);
+	if (ret >= 0) {
+		if (wbc->sync_mode == WB_SYNC_NONE) {
+			if (ret < wbc->nr_to_write)
+				wbc->nr_to_write -= ret;
+			else
+				wbc->nr_to_write = 0;
+		}
 		return 0;
+	}
+out_mark_dirty:
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 }
-- 
cgit v1.2.3


From 5bad5abec4058c5214bfc72cec418348d6747977 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 17:02:24 -0800
Subject: NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background
 is set

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fc05e35da6a9..704e67d392e5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1426,7 +1426,7 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
 	    nfsi->ncommit <= (nfsi->npages >> 1))
 		goto out_mark_dirty;
 
-	if (wbc->nonblocking)
+	if (wbc->nonblocking || wbc->for_background)
 		flags = 0;
 	ret = nfs_commit_inode(inode, flags);
 	if (ret >= 0) {
-- 
cgit v1.2.3


From 2928db1ffeacc9717c2d5c230d450bcc377b3ae9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 17:03:18 -0800
Subject: NFS: Ensure inode is always marked I_DIRTY_DATASYNC, if it has
 unstable pages

Since nfs_scan_list() doesn't wait for locked pages, we have a race in
which it is possible to end up with an inode that needs to send a COMMIT,
but which does not have the I_DIRTY_DATASYNC flag set.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 704e67d392e5..e40e949598fd 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -582,6 +582,8 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
 	ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
 	if (ret > 0)
 		nfsi->ncommit -= ret;
+	if (nfs_need_commit(NFS_I(inode)))
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 }
 #else
-- 
cgit v1.2.3


From c988950eb6dd6f8e6d98503ca094622729e9aa13 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 17:03:21 -0800
Subject: NFS: Simplify nfs_wb_page_cancel()

In all cases we should be able to just remove the request and call
cancel_dirty_page().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c         | 39 +--------------------------------------
 include/linux/nfs_fs.h |  2 --
 2 files changed, 1 insertion(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e40e949598fd..dc7f5e9a23b4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -540,19 +540,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, u
 	return res;
 }
 
-static void nfs_cancel_commit_list(struct list_head *head)
-{
-	struct nfs_page *req;
-
-	while(!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_clear_request_commit(req);
-		nfs_inode_remove_request(req);
-		nfs_unlock_request(req);
-	}
-}
-
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static int
 nfs_need_commit(struct nfs_inode *nfsi)
@@ -1495,13 +1482,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 		pages = nfs_scan_commit(inode, &head, idx_start, npages);
 		if (pages == 0)
 			break;
-		if (how & FLUSH_INVALIDATE) {
-			spin_unlock(&inode->i_lock);
-			nfs_cancel_commit_list(&head);
-			ret = pages;
-			spin_lock(&inode->i_lock);
-			continue;
-		}
 		pages += nfs_scan_commit(inode, &head, 0, 0);
 		spin_unlock(&inode->i_lock);
 		ret = nfs_commit_list(inode, &head, how);
@@ -1558,26 +1538,13 @@ int nfs_wb_nocommit(struct inode *inode)
 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 {
 	struct nfs_page *req;
-	loff_t range_start = page_offset(page);
-	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
-	struct writeback_control wbc = {
-		.bdi = page->mapping->backing_dev_info,
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
-		.range_start = range_start,
-		.range_end = range_end,
-	};
 	int ret = 0;
 
 	BUG_ON(!PageLocked(page));
 	for (;;) {
 		req = nfs_page_find_request(page);
 		if (req == NULL)
-			goto out;
-		if (test_bit(PG_CLEAN, &req->wb_flags)) {
-			nfs_release_request(req);
 			break;
-		}
 		if (nfs_lock_request_dontget(req)) {
 			nfs_inode_remove_request(req);
 			/*
@@ -1591,12 +1558,8 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 		ret = nfs_wait_on_request(req);
 		nfs_release_request(req);
 		if (ret < 0)
-			goto out;
+			break;
 	}
-	if (!PagePrivate(page))
-		return 0;
-	ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
-out:
 	return ret;
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 309217f46e28..1083134c02ff 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -34,8 +34,6 @@
 #define FLUSH_LOWPRI		8	/* low priority background flush */
 #define FLUSH_HIGHPRI		16	/* high priority memory reclaim flush */
 #define FLUSH_NOCOMMIT		32	/* Don't send the NFSv3/v4 COMMIT */
-#define FLUSH_INVALIDATE	64	/* Invalidate the page cache */
-#define FLUSH_NOWRITEPAGE	128	/* Don't call writepage() */
 
 #ifdef __KERNEL__
 
-- 
cgit v1.2.3


From acdc53b2146c7ee67feb1f02f7bc3020126514b8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 17:03:26 -0800
Subject: NFS: Replace __nfs_write_mapping with sync_inode()

Now that we have correct COMMIT semantics in writeback_single_inode, we can
reduce and simplify nfs_wb_all(). Also replace nfs_wb_nocommit() with a
call to filemap_write_and_wait(), which doesn't need to hold the
inode->i_mutex.

With that done, we can eliminate nfs_write_mapping() altogether.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         | 15 +++++----------
 fs/nfs/write.c         | 42 +++++-------------------------------------
 include/linux/nfs_fs.h |  2 --
 3 files changed, 10 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index aa5a831001ab..443772df9b17 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -495,17 +495,11 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
 	int err;
 
-	/*
-	 * Flush out writes to the server in order to update c/mtime.
-	 *
-	 * Hold the i_mutex to suspend application writes temporarily;
-	 * this prevents long-running writing applications from blocking
-	 * nfs_wb_nocommit.
-	 */
+	/* Flush out writes to the server in order to update c/mtime.  */
 	if (S_ISREG(inode->i_mode)) {
-		mutex_lock(&inode->i_mutex);
-		nfs_wb_nocommit(inode);
-		mutex_unlock(&inode->i_mutex);
+		err = filemap_write_and_wait(inode->i_mapping);
+		if (err)
+			goto out;
 	}
 
 	/*
@@ -529,6 +523,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 		generic_fillattr(inode, stat);
 		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
 	}
+out:
 	return err;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index dc7f5e9a23b4..0b323091b481 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1454,7 +1454,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 	pgoff_t idx_start, idx_end;
 	unsigned int npages = 0;
 	LIST_HEAD(head);
-	int nocommit = how & FLUSH_NOCOMMIT;
 	long pages, ret;
 
 	/* FIXME */
@@ -1471,14 +1470,11 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 				npages = 0;
 		}
 	}
-	how &= ~FLUSH_NOCOMMIT;
 	spin_lock(&inode->i_lock);
 	do {
 		ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
 		if (ret != 0)
 			continue;
-		if (nocommit)
-			break;
 		pages = nfs_scan_commit(inode, &head, idx_start, npages);
 		if (pages == 0)
 			break;
@@ -1492,47 +1488,19 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
 	return ret;
 }
 
-static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
-{
-	int ret;
-
-	ret = nfs_writepages(mapping, wbc);
-	if (ret < 0)
-		goto out;
-	ret = nfs_sync_mapping_wait(mapping, wbc, how);
-	if (ret < 0)
-		goto out;
-	return 0;
-out:
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-	return ret;
-}
-
-/* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */
-static int nfs_write_mapping(struct address_space *mapping, int how)
+/*
+ * flush the inode to disk.
+ */
+int nfs_wb_all(struct inode *inode)
 {
 	struct writeback_control wbc = {
-		.bdi = mapping->backing_dev_info,
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
 	};
 
-	return __nfs_write_mapping(mapping, &wbc, how);
-}
-
-/*
- * flush the inode to disk.
- */
-int nfs_wb_all(struct inode *inode)
-{
-	return nfs_write_mapping(inode->i_mapping, 0);
-}
-
-int nfs_wb_nocommit(struct inode *inode)
-{
-	return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT);
+	return sync_inode(inode, &wbc);
 }
 
 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1083134c02ff..93f439e7c5bf 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -33,7 +33,6 @@
 #define FLUSH_STABLE		4	/* commit to stable storage */
 #define FLUSH_LOWPRI		8	/* low priority background flush */
 #define FLUSH_HIGHPRI		16	/* high priority memory reclaim flush */
-#define FLUSH_NOCOMMIT		32	/* Don't send the NFSv3/v4 COMMIT */
 
 #ifdef __KERNEL__
 
@@ -478,7 +477,6 @@ extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
  */
 extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int);
 extern int nfs_wb_all(struct inode *inode);
-extern int nfs_wb_nocommit(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-- 
cgit v1.2.3


From 7f2f12d963e7c33a93bfb0b22f0178eb1e6a4196 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 17:03:28 -0800
Subject: NFS: Simplify nfs_wb_page()

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c         | 120 ++++++++++---------------------------------------
 include/linux/nfs_fs.h |   1 -
 2 files changed, 23 insertions(+), 98 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0b323091b481..53ff70e23993 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -502,44 +502,6 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
 }
 #endif
 
-/*
- * Wait for a request to complete.
- *
- * Interruptible by fatal signals only.
- */
-static int nfs_wait_on_requests_locked(struct inode *inode, pgoff_t idx_start, unsigned int npages)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_page *req;
-	pgoff_t idx_end, next;
-	unsigned int		res = 0;
-	int			error;
-
-	if (npages == 0)
-		idx_end = ~0;
-	else
-		idx_end = idx_start + npages - 1;
-
-	next = idx_start;
-	while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_LOCKED)) {
-		if (req->wb_index > idx_end)
-			break;
-
-		next = req->wb_index + 1;
-		BUG_ON(!NFS_WBACK_BUSY(req));
-
-		kref_get(&req->wb_kref);
-		spin_unlock(&inode->i_lock);
-		error = nfs_wait_on_request(req);
-		nfs_release_request(req);
-		spin_lock(&inode->i_lock);
-		if (error < 0)
-			return error;
-		res++;
-	}
-	return res;
-}
-
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static int
 nfs_need_commit(struct nfs_inode *nfsi)
@@ -1432,7 +1394,7 @@ out_mark_dirty:
 	return ret;
 }
 #else
-static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
+static int nfs_commit_inode(struct inode *inode, int how)
 {
 	return 0;
 }
@@ -1448,46 +1410,6 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return nfs_commit_unstable_pages(inode, wbc);
 }
 
-long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
-{
-	struct inode *inode = mapping->host;
-	pgoff_t idx_start, idx_end;
-	unsigned int npages = 0;
-	LIST_HEAD(head);
-	long pages, ret;
-
-	/* FIXME */
-	if (wbc->range_cyclic)
-		idx_start = 0;
-	else {
-		idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (idx_end > idx_start) {
-			pgoff_t l_npages = 1 + idx_end - idx_start;
-			npages = l_npages;
-			if (sizeof(npages) != sizeof(l_npages) &&
-					(pgoff_t)npages != l_npages)
-				npages = 0;
-		}
-	}
-	spin_lock(&inode->i_lock);
-	do {
-		ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
-		if (ret != 0)
-			continue;
-		pages = nfs_scan_commit(inode, &head, idx_start, npages);
-		if (pages == 0)
-			break;
-		pages += nfs_scan_commit(inode, &head, 0, 0);
-		spin_unlock(&inode->i_lock);
-		ret = nfs_commit_list(inode, &head, how);
-		spin_lock(&inode->i_lock);
-
-	} while (ret >= 0);
-	spin_unlock(&inode->i_lock);
-	return ret;
-}
-
 /*
  * flush the inode to disk.
  */
@@ -1531,45 +1453,49 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 	return ret;
 }
 
-static int nfs_wb_page_priority(struct inode *inode, struct page *page,
-				int how)
+/*
+ * Write back all requests on one page - we do this before reading it.
+ */
+int nfs_wb_page(struct inode *inode, struct page *page)
 {
 	loff_t range_start = page_offset(page);
 	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
 	struct writeback_control wbc = {
-		.bdi = page->mapping->backing_dev_info,
 		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = LONG_MAX,
+		.nr_to_write = 0,
 		.range_start = range_start,
 		.range_end = range_end,
 	};
+	struct nfs_page *req;
+	int need_commit;
 	int ret;
 
-	do {
+	while(PagePrivate(page)) {
 		if (clear_page_dirty_for_io(page)) {
 			ret = nfs_writepage_locked(page, &wbc);
 			if (ret < 0)
 				goto out_error;
-		} else if (!PagePrivate(page))
+		}
+		req = nfs_find_and_lock_request(page);
+		if (!req)
 			break;
-		ret = nfs_sync_mapping_wait(page->mapping, &wbc, how);
-		if (ret < 0)
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
 			goto out_error;
-	} while (PagePrivate(page));
+		}
+		need_commit = test_bit(PG_CLEAN, &req->wb_flags);
+		nfs_clear_page_tag_locked(req);
+		if (need_commit) {
+			ret = nfs_commit_inode(inode, FLUSH_SYNC);
+			if (ret < 0)
+				goto out_error;
+		}
+	}
 	return 0;
 out_error:
-	__mark_inode_dirty(inode, I_DIRTY_PAGES);
 	return ret;
 }
 
-/*
- * Write back all requests on one page - we do this before reading it.
- */
-int nfs_wb_page(struct inode *inode, struct page* page)
-{
-	return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
-}
-
 #ifdef CONFIG_MIGRATION
 int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
 		struct page *page)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 93f439e7c5bf..b789d85bff82 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -475,7 +475,6 @@ extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
  * Try to write back everything synchronously (but check the
  * return value!)
  */
-extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int);
 extern int nfs_wb_all(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page* page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
-- 
cgit v1.2.3


From 5cf95214ccb915591e2214f81de4659302d3e452 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 17:03:29 -0800
Subject: NFS: Clean up nfs_sync_mapping

Remove the redundant call to filemap_write_and_wait().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 443772df9b17..e8b41170d295 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -114,16 +114,12 @@ void nfs_clear_inode(struct inode *inode)
  */
 int nfs_sync_mapping(struct address_space *mapping)
 {
-	int ret;
+	int ret = 0;
 
-	if (mapping->nrpages == 0)
-		return 0;
-	unmap_mapping_range(mapping, 0, 0, 0);
-	ret = filemap_write_and_wait(mapping);
-	if (ret != 0)
-		goto out;
-	ret = nfs_wb_all(mapping->host);
-out:
+	if (mapping->nrpages != 0) {
+		unmap_mapping_range(mapping, 0, 0, 0);
+		ret = nfs_wb_all(mapping->host);
+	}
 	return ret;
 }
 
-- 
cgit v1.2.3


From 1cda707d52e51a6cafac0aef12d2bd7052d572e6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Feb 2010 17:03:30 -0800
Subject: NFS: Remove requirement for inode->i_mutex from
 nfs_invalidate_mapping

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           |  2 +-
 fs/nfs/inode.c         | 41 +----------------------------------------
 fs/nfs/symlink.c       |  2 +-
 include/linux/nfs_fs.h |  1 -
 4 files changed, 3 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3c7f03b669fb..a1f6b4438fb1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -560,7 +560,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	desc->entry = &my_entry;
 
 	nfs_block_sillyrename(dentry);
-	res = nfs_revalidate_mapping_nolock(inode, filp->f_mapping);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (res < 0)
 		goto out;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e8b41170d295..dbaaf7d2a188 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -754,7 +754,7 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	return __nfs_revalidate_inode(server, inode);
 }
 
-static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
+static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	
@@ -775,49 +775,10 @@ static int nfs_invalidate_mapping_nolock(struct inode *inode, struct address_spa
 	return 0;
 }
 
-static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
-{
-	int ret = 0;
-
-	mutex_lock(&inode->i_mutex);
-	if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_DATA) {
-		ret = nfs_sync_mapping(mapping);
-		if (ret == 0)
-			ret = nfs_invalidate_mapping_nolock(inode, mapping);
-	}
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
-/**
- * nfs_revalidate_mapping_nolock - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- */
-int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	int ret = 0;
-
-	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
-			|| nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
-		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-		if (ret < 0)
-			goto out;
-	}
-	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
-		ret = nfs_invalidate_mapping_nolock(inode, mapping);
-out:
-	return ret;
-}
-
 /**
  * nfs_revalidate_mapping - Revalidate the pagecache
  * @inode - pointer to host inode
  * @mapping - pointer to mapping
- *
- * This version of the function will take the inode->i_mutex and attempt to
- * flush out all dirty data if it needs to invalidate the page cache.
  */
 int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 412738dbfbc7..2ea9e5c27e55 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -50,7 +50,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	struct page *page;
 	void *err;
 
-	err = ERR_PTR(nfs_revalidate_mapping_nolock(inode, inode->i_mapping));
+	err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 	if (err)
 		goto read_failed;
 	page = read_cache_page(&inode->i_data, 0,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b789d85bff82..1a0b85aa151e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -347,7 +347,6 @@ extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
-extern int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
-- 
cgit v1.2.3