From fbb18a277a6f192404aa20ece49529acb1e1e76d Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 26 Mar 2006 23:13:39 +0100 Subject: [SERIAL] amba-pl010: allow platforms to specify modem control method The amba-pl010 hardware does not provide RTS and DTR control lines; it is expected that these will be implemented using GPIO. Allow platforms to supply a function to implement manipulation of modem control lines. Signed-off-by: Russell King --- include/linux/amba/serial.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index dc726ffccebd..48ee32a18ac5 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -158,4 +158,10 @@ #define UART01x_RSR_ANY (UART01x_RSR_OE|UART01x_RSR_BE|UART01x_RSR_PE|UART01x_RSR_FE) #define UART01x_FR_MODEM_ANY (UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS) +#ifndef __ASSEMBLY__ +struct amba_pl010_data { + void (*set_mctrl)(struct amba_device *dev, void __iomem *base, unsigned int mctrl); +}; +#endif + #endif -- cgit v1.2.3 From 837c7878771c15ed8d85ecf814ece7fcb4551b46 Mon Sep 17 00:00:00 2001 From: Ben Woodard Date: Wed, 22 Mar 2006 08:09:31 +0100 Subject: [BLOCK] increase size of disk stat counters The kernel's representation of the disk statistics uses the type unsigned, which is 32b on both 32b and 64b platforms. Unfortunately, most system tools that work with the numbers exported in /proc/diskstats, including iostat, read them into unsigned longs. This works fine on 32b platforms, and on 64b platforms as long as the number of IO transactions is small. However, when the counters wrap on a 64b platform, reading them into unsigned longs and comparing them to previous readings yields an unsigned representation of a negative number. This looks like a very large 64b number and gives you bizarre readouts in iostat: ilc4: Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s rkB/s wkB/s avgrq-sz avgqu-sz await svctm %util ilc4: sda 5.50 0.00 143.96 0.00 307496983987862656.00 0.00 153748491993931328.00 0.00 2136028725038430.00 7.94 55.12 5.59 80.42 Fixing iostat in user space is possible, but a quick survey indicates that several other similar tools also use unsigned longs when processing /proc/diskstats. Therefore, it seems better to extend the disk_stats counters to 64 bits (unsigned long) on 64b architectures. The following patch does that. It should not affect the operation on 32b platforms.
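For illustration only (not part of either patch above): the wraparound described here can be reproduced with a few lines of userspace C that mimic how iostat keeps the previous sample and computes a delta. All names below are invented.

    #include <stdio.h>

    int main(void)
    {
        /* The kernel counter is a 32-bit "unsigned"; iostat keeps the value
         * it read last interval in an unsigned long (64-bit on 64b). */
        unsigned kernel_prev = 4294967000u;          /* just below UINT_MAX */
        unsigned kernel_curr = kernel_prev + 1000u;  /* wraps to a small value */

        unsigned long prev = kernel_prev;            /* previous sample */
        unsigned long curr = kernel_curr;            /* current sample */

        /* Prints a huge 64-bit number instead of 1000 - the "bizarre
         * readouts" quoted above. */
        printf("delta = %lu\n", curr - prev);
        return 0;
    }

Once the in-kernel counters are unsigned long, they effectively never wrap on 64b platforms, so the userspace subtraction stays sane.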
Signed-off-by: Ben Woodard Cc: Rick Lindsley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/genhd.c | 6 +++--- include/linux/genhd.h | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 64510fd88621..db4c60c802d6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -454,8 +454,8 @@ static ssize_t disk_stats_read(struct gendisk * disk, char *page) disk_round_stats(disk); preempt_enable(); return sprintf(page, - "%8u %8u %8llu %8u " - "%8u %8u %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " "%8u %8u %8u" "\n", disk_stat_read(disk, ios[READ]), @@ -649,7 +649,7 @@ static int diskstats_show(struct seq_file *s, void *v) preempt_disable(); disk_round_stats(gp); preempt_enable(); - seq_printf(s, "%4d %4d %s %u %u %llu %u %u %u %llu %u %u %u %u\n", + seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n", gp->major, n + gp->first_minor, disk_name(gp, n, buf), disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]), (unsigned long long)disk_stat_read(gp, sectors[0]), diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fd647fde5ec1..179fea53fc81 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -89,12 +89,12 @@ struct hd_struct { #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 struct disk_stats { - unsigned sectors[2]; /* READs and WRITEs */ - unsigned ios[2]; - unsigned merges[2]; - unsigned ticks[2]; - unsigned io_ticks; - unsigned time_in_queue; + unsigned long sectors[2]; /* READs and WRITEs */ + unsigned long ios[2]; + unsigned long merges[2]; + unsigned long ticks[2]; + unsigned long io_ticks; + unsigned long time_in_queue; }; struct gendisk { -- cgit v1.2.3 From f75ba3ade8a4599d67040a9493d75a864e7b329c Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:52 -0800 Subject: [PATCH] autofs4: increase module version Update autofs4 version. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/auto_fs4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index 9343c89d843c..d998ddcf7288 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 4 -#define AUTOFS_PROTO_SUBVERSION 7 +#define AUTOFS_PROTO_SUBVERSION 10 /* Mask for expire behaviour */ #define AUTOFS_EXP_IMMEDIATE 1 -- cgit v1.2.3 From 5c0a32fc2cd0be912511199449a37a4a6f0f582d Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 27 Mar 2006 01:14:55 -0800 Subject: [PATCH] autofs4: add new packet type for v5 communications This patch define a new autofs packet for autofs v5 and updates the waitq.c functions to handle the additional packet type. 
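As a rough sketch only (not part of this patch), a v5-aware automount daemon could consume the new packet like this; struct autofs_v5_packet and the autofs_ptype_* constants come from the header change below, while the pipe descriptor and the printf reporting are invented for illustration.

    #include <stdio.h>
    #include <unistd.h>
    #include <linux/auto_fs4.h>

    /* Read one kernel request from the autofs pipe and dispatch on its type. */
    static int handle_one_packet(int pipefd)
    {
        struct autofs_v5_packet pkt;

        if (read(pipefd, &pkt, sizeof(pkt)) != (ssize_t)sizeof(pkt))
            return -1;

        switch (pkt.hdr.type) {
        case autofs_ptype_missing_indirect:
        case autofs_ptype_missing_direct:
            /* mount request: path component plus the new credential fields */
            printf("mount %.*s (dev=%u ino=%llu uid=%u pid=%u)\n",
                   (int)pkt.len, pkt.name, pkt.dev,
                   (unsigned long long)pkt.ino, pkt.uid, pkt.pid);
            break;
        case autofs_ptype_expire_indirect:
        case autofs_ptype_expire_direct:
            printf("expire %.*s\n", (int)pkt.len, pkt.name);
            break;
        default:
            return -1;    /* v4 packet type, or a protocol mismatch */
        }
        return 0;
    }

The kernel side below writes sizeof(struct autofs_v5_packet) bytes per request, which is why a single fixed-size read is enough.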
Signed-off-by: Ian Kent Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 23 +++++++++---- fs/autofs4/waitq.c | 86 +++++++++++++++++++++++++++++++++++++++++------- include/linux/auto_fs4.h | 51 +++++++++++++++++++++++++--- 3 files changed, 136 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index ed388a1d8fc4..37c8d909d1e9 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -77,6 +77,12 @@ struct autofs_wait_queue { int hash; int len; char *name; + u32 dev; + u64 ino; + uid_t uid; + gid_t gid; + pid_t pid; + pid_t tgid; /* This is for status reporting upon return */ int status; atomic_t notified; @@ -180,13 +186,6 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *, struct autofs_sb_info /* Queue management functions */ -enum autofs_notify -{ - NFY_NONE, - NFY_MOUNT, - NFY_EXPIRE -}; - int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); @@ -204,6 +203,16 @@ static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **de return res; } +static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +{ + return new_encode_dev(sbi->sb->s_dev); +} + +static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +{ + return sbi->sb->s_root->d_inode->i_ino; +} + static inline int simple_positive(struct dentry *dentry) { return dentry->d_inode && !d_unhashed(dentry); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index b0bb9d43bcd9..12da2c977b0a 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -3,7 +3,7 @@ * linux/fs/autofs/waitq.c * * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 2001-2003 Ian Kent + * Copyright 2001-2006 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -97,7 +97,10 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; - if (type == autofs_ptype_missing) { + switch (type) { + /* Kernel protocol v4 missing and expire packets */ + case autofs_ptype_missing: + { struct autofs_packet_missing *mp = &pkt.missing; pktsz = sizeof(*mp); @@ -106,7 +109,10 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mp->len = wq->len; memcpy(mp->name, wq->name, wq->len); mp->name[wq->len] = '\0'; - } else if (type == autofs_ptype_expire_multi) { + break; + } + case autofs_ptype_expire_multi: + { struct autofs_packet_expire_multi *ep = &pkt.expire_multi; pktsz = sizeof(*ep); @@ -115,7 +121,34 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, ep->len = wq->len; memcpy(ep->name, wq->name, wq->len); ep->name[wq->len] = '\0'; - } else { + break; + } + /* + * Kernel protocol v5 packet for handling indirect and direct + * mount missing and expire requests + */ + case autofs_ptype_missing_indirect: + case autofs_ptype_expire_indirect: + case autofs_ptype_missing_direct: + case autofs_ptype_expire_direct: + { + struct autofs_v5_packet *packet = &pkt.v5_packet; + + pktsz = sizeof(*packet); + + packet->wait_queue_token = wq->wait_queue_token; + packet->len = wq->len; + memcpy(packet->name, wq->name, wq->len); + packet->name[wq->len] = '\0'; + packet->dev = wq->dev; + packet->ino = wq->ino; + packet->uid = wq->uid; + packet->gid = wq->gid; + 
packet->pid = wq->pid; + packet->tgid = wq->tgid; + break; + } + default: printk("autofs4_notify_daemon: bad type %d!\n", type); return; } @@ -161,7 +194,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, { struct autofs_wait_queue *wq; char *name; - int len, status; + unsigned int len = 0; + unsigned int hash = 0; + int status; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) @@ -171,11 +206,17 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (!name) return -ENOMEM; - len = autofs4_getpath(sbi, dentry, &name); - if (!len) { - kfree(name); - return -ENOENT; + /* If this is a direct mount request create a dummy name */ + if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYP_DIRECT)) + len = sprintf(name, "%p", dentry); + else { + len = autofs4_getpath(sbi, dentry, &name); + if (!len) { + kfree(name); + return -ENOENT; + } } + hash = full_name_hash(name, len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { kfree(name); @@ -211,9 +252,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); - wq->hash = dentry->d_name.hash; + wq->hash = hash; wq->name = name; wq->len = len; + wq->dev = autofs4_get_dev(sbi); + wq->ino = autofs4_get_ino(sbi); + wq->uid = current->uid; + wq->gid = current->gid; + wq->pid = current->pid; + wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ atomic_set(&wq->wait_ctr, 2); atomic_set(&wq->notified, 1); @@ -227,8 +274,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, } if (notify != NFY_NONE && atomic_dec_and_test(&wq->notified)) { - int type = (notify == NFY_MOUNT ? - autofs_ptype_missing : autofs_ptype_expire_multi); + int type; + + if (sbi->version < 5) { + if (notify == NFY_MOUNT) + type = autofs_ptype_missing; + else + type = autofs_ptype_expire_multi; + } else { + if (notify == NFY_MOUNT) + type = (sbi->type & AUTOFS_TYP_DIRECT) ? + autofs_ptype_missing_direct : + autofs_ptype_missing_indirect; + else + type = (sbi->type & AUTOFS_TYP_DIRECT) ? + autofs_ptype_expire_direct : + autofs_ptype_expire_indirect; + } DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); diff --git a/include/linux/auto_fs4.h b/include/linux/auto_fs4.h index d998ddcf7288..0a6bc52ffe88 100644 --- a/include/linux/auto_fs4.h +++ b/include/linux/auto_fs4.h @@ -19,18 +19,37 @@ #undef AUTOFS_MIN_PROTO_VERSION #undef AUTOFS_MAX_PROTO_VERSION -#define AUTOFS_PROTO_VERSION 4 +#define AUTOFS_PROTO_VERSION 5 #define AUTOFS_MIN_PROTO_VERSION 3 -#define AUTOFS_MAX_PROTO_VERSION 4 +#define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 10 +#define AUTOFS_PROTO_SUBVERSION 0 /* Mask for expire behaviour */ #define AUTOFS_EXP_IMMEDIATE 1 #define AUTOFS_EXP_LEAVES 2 -/* New message type */ -#define autofs_ptype_expire_multi 2 /* Expire entry (umount request) */ +/* Daemon notification packet types */ +enum autofs_notify { + NFY_NONE, + NFY_MOUNT, + NFY_EXPIRE +}; + +/* Kernel protocol version 4 packet types */ + +/* Expire entry (umount request) */ +#define autofs_ptype_expire_multi 2 + +/* Kernel protocol version 5 packet types */ + +/* Indirect mount missing and expire requests. 
*/ +#define autofs_ptype_missing_indirect 3 +#define autofs_ptype_expire_indirect 4 + +/* Direct mount missing and expire requests */ +#define autofs_ptype_missing_direct 5 +#define autofs_ptype_expire_direct 6 /* v4 multi expire (via pipe) */ struct autofs_packet_expire_multi { @@ -40,14 +59,36 @@ struct autofs_packet_expire_multi { char name[NAME_MAX+1]; }; +/* autofs v5 common packet struct */ +struct autofs_v5_packet { + struct autofs_packet_hdr hdr; + autofs_wqt_t wait_queue_token; + __u32 dev; + __u64 ino; + __u32 uid; + __u32 gid; + __u32 pid; + __u32 tgid; + __u32 len; + char name[NAME_MAX+1]; +}; + +typedef struct autofs_v5_packet autofs_packet_missing_indirect_t; +typedef struct autofs_v5_packet autofs_packet_expire_indirect_t; +typedef struct autofs_v5_packet autofs_packet_missing_direct_t; +typedef struct autofs_v5_packet autofs_packet_expire_direct_t; + union autofs_packet_union { struct autofs_packet_hdr hdr; struct autofs_packet_missing missing; struct autofs_packet_expire expire; struct autofs_packet_expire_multi expire_multi; + struct autofs_v5_packet v5_packet; }; #define AUTOFS_IOC_EXPIRE_MULTI _IOW(0x93,0x66,int) +#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI +#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI #define AUTOFS_IOC_PROTOSUBVER _IOR(0x93,0x67,int) #define AUTOFS_IOC_ASKREGHOST _IOR(0x93,0x68,int) #define AUTOFS_IOC_TOGGLEREGHOST _IOR(0x93,0x69,int) -- cgit v1.2.3 From efc36aa5608f5717338747e152c23f2cfdb14697 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:14:59 -0800 Subject: [PATCH] knfsd: Change the store of auth_domains to not be a 'cache' The 'auth_domain's are simply handles on internal data structures. They do not cache information from user-space, and forcing them into the mold of a 'cache' misrepresents their true nature and causes confusion. 
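To make the new contract concrete, here is a small userspace analogue (an illustration with invented names, not kernel code) of what auth_domain_lookup() becomes after this patch: a plain hash table keyed by name, guarded by a single lock, holding reference-counted entries.

    #include <pthread.h>
    #include <string.h>

    struct domain {
        struct domain *next;    /* hash chain */
        int ref;                /* stands in for struct kref */
        char *name;
    };

    #define HASHSZ 64
    static struct domain *table[HASHSZ];
    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

    static unsigned hash_name(const char *s)
    {
        unsigned h = 0;
        while (*s)
            h = h * 31 + (unsigned char)*s++;
        return h % HASHSZ;
    }

    /* Return an existing entry (taking a reference), or link in 'new'. */
    struct domain *domain_lookup(const char *name, struct domain *new)
    {
        unsigned h = hash_name(name);
        struct domain *d;

        pthread_mutex_lock(&table_lock);
        for (d = table[h]; d; d = d->next) {
            if (strcmp(d->name, name) == 0) {
                d->ref++;
                pthread_mutex_unlock(&table_lock);
                return d;
            }
        }
        if (new) {
            new->next = table[h];
            table[h] = new;
            new->ref++;        /* the table's reference */
        }
        pthread_mutex_unlock(&table_lock);
        return new;
    }

The matching put would mirror auth_domain_put() in the patch: drop the count under the same lock and, when it reaches zero, unhash and release the entry.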
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 5 +- include/linux/sunrpc/svcauth.h | 12 ++-- net/sunrpc/auth_gss/svcauth_gss.c | 14 ++--- net/sunrpc/sunrpc_syms.c | 4 +- net/sunrpc/svcauth.c | 122 +++++++++++--------------------------- net/sunrpc/svcauth_unix.c | 69 ++++++++++----------- 6 files changed, 81 insertions(+), 145 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 417ec02df44f..ac0997731fce 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -242,7 +242,7 @@ static inline int svc_expkey_match (struct svc_expkey *a, struct svc_expkey *b) static inline void svc_expkey_init(struct svc_expkey *new, struct svc_expkey *item) { - cache_get(&item->ek_client->h); + kref_get(&item->ek_client->ref); new->ek_client = item->ek_client; new->ek_fsidtype = item->ek_fsidtype; new->ek_fsid[0] = item->ek_fsid[0]; @@ -474,7 +474,7 @@ static inline int svc_export_match(struct svc_export *a, struct svc_export *b) } static inline void svc_export_init(struct svc_export *new, struct svc_export *item) { - cache_get(&item->ex_client->h); + kref_get(&item->ex_client->ref); new->ex_client = item->ex_client; new->ex_dentry = dget(item->ex_dentry); new->ex_mnt = mntget(item->ex_mnt); @@ -1129,7 +1129,6 @@ exp_delclient(struct nfsctl_client *ncp) */ if (dom) { err = auth_unix_forget_old(dom); - dom->h.expiry_time = get_seconds(); auth_domain_put(dom); } diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index c119ce7cbd22..2fe2087edd66 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -45,9 +45,10 @@ struct svc_rqst; /* forward decl */ * of ip addresses to the given client. */ struct auth_domain { - struct cache_head h; + struct kref ref; + struct hlist_node hash; char *name; - int flavour; + struct auth_ops *flavour; }; /* @@ -86,6 +87,9 @@ struct auth_domain { * * domain_release() * This call releases a domain. + * set_client() + * Givens a pending request (struct svc_rqst), finds and assigns + * an appropriate 'auth_domain' as the client. 
*/ struct auth_ops { char * name; @@ -117,7 +121,7 @@ extern void svc_auth_unregister(rpc_authflavor_t flavor); extern struct auth_domain *unix_domain_find(char *name); extern void auth_domain_put(struct auth_domain *item); extern int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom); -extern struct auth_domain *auth_domain_lookup(struct auth_domain *item, int set); +extern struct auth_domain *auth_domain_lookup(char *name, struct auth_domain *new); extern struct auth_domain *auth_domain_find(char *name); extern struct auth_domain *auth_unix_lookup(struct in_addr addr); extern int auth_unix_forget_old(struct auth_domain *dom); @@ -160,8 +164,6 @@ static inline unsigned long hash_mem(char *buf, int length, int bits) return hash >> (BITS_PER_LONG - bits); } -extern struct cache_detail auth_domain_cache, ip_map_cache; - #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_SVCAUTH_H_ */ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 23632d84d8d7..6b073c2e6930 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -645,6 +645,8 @@ find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) return auth_domain_find(name); } +static struct auth_ops svcauthops_gss; + int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { @@ -655,20 +657,18 @@ svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out; - cache_init(&new->h.h); + kref_init(&new->h.ref); new->h.name = kmalloc(strlen(name) + 1, GFP_KERNEL); if (!new->h.name) goto out_free_dom; strcpy(new->h.name, name); - new->h.flavour = RPC_AUTH_GSS; + new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; - new->h.h.expiry_time = NEVER; - test = auth_domain_lookup(&new->h, 1); - if (test == &new->h) { - BUG_ON(atomic_dec_and_test(&new->h.h.refcnt)); - } else { /* XXX Duplicate registration? */ + test = auth_domain_lookup(name, &new->h); + if (test != &new->h) { /* XXX Duplicate registration? */ auth_domain_put(&new->h); + /* dangling ref-count... */ goto out; } return 0; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 9f7373203592..40401196e7de 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -142,6 +142,7 @@ EXPORT_SYMBOL(nlm_debug); extern int register_rpc_pipefs(void); extern void unregister_rpc_pipefs(void); +extern struct cache_detail ip_map_cache; static int __init init_sunrpc(void) @@ -158,7 +159,6 @@ init_sunrpc(void) #ifdef CONFIG_PROC_FS rpc_proc_init(); #endif - cache_register(&auth_domain_cache); cache_register(&ip_map_cache); out: return err; @@ -169,8 +169,6 @@ cleanup_sunrpc(void) { unregister_rpc_pipefs(); rpc_destroy_mempool(); - if (cache_unregister(&auth_domain_cache)) - printk(KERN_ERR "sunrpc: failed to unregister auth_domain cache\n"); if (cache_unregister(&ip_map_cache)) printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); #ifdef RPC_DEBUG diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index dda4f0c63511..5b28c6176806 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -106,112 +106,56 @@ svc_auth_unregister(rpc_authflavor_t flavor) EXPORT_SYMBOL(svc_auth_unregister); /************************************************** - * cache for domain name to auth_domain - * Entries are only added by flavours which will normally - * have a structure that 'inherits' from auth_domain. - * e.g. 
when an IP -> domainname is given to auth_unix, - * and the domain name doesn't exist, it will create a - * auth_unix_domain and add it to this hash table. - * If it finds the name does exist, but isn't AUTH_UNIX, - * it will complain. + * 'auth_domains' are stored in a hash table indexed by name. + * When the last reference to an 'auth_domain' is dropped, + * the object is unhashed and freed. + * If auth_domain_lookup fails to find an entry, it will return + * it's second argument 'new'. If this is non-null, it will + * have been atomically linked into the table. */ -/* - * Auth auth_domain cache is somewhat different to other caches, - * largely because the entries are possibly of different types: - * each auth flavour has it's own type. - * One consequence of this that DefineCacheLookup cannot - * allocate a new structure as it cannot know the size. - * Notice that the "INIT" code fragment is quite different - * from other caches. When auth_domain_lookup might be - * creating a new domain, the new domain is passed in - * complete and it is used as-is rather than being copied into - * another structure. - */ #define DN_HASHBITS 6 #define DN_HASHMAX (1<flavour]->domain_release(dom); -} - - -struct cache_detail auth_domain_cache = { - .owner = THIS_MODULE, - .hash_size = DN_HASHMAX, - .hash_table = auth_domain_table, - .name = "auth.domain", - .cache_put = auth_domain_drop, -}; +static struct hlist_head auth_domain_table[DN_HASHMAX]; +static spinlock_t auth_domain_lock = SPIN_LOCK_UNLOCKED; void auth_domain_put(struct auth_domain *dom) { - auth_domain_drop(&dom->h, &auth_domain_cache); -} - -static inline int auth_domain_hash(struct auth_domain *item) -{ - return hash_str(item->name, DN_HASHBITS); -} -static inline int auth_domain_match(struct auth_domain *tmp, struct auth_domain *item) -{ - return strcmp(tmp->name, item->name) == 0; + if (atomic_dec_and_lock(&dom->ref.refcount, &auth_domain_lock)) { + hlist_del(&dom->hash); + dom->flavour->domain_release(dom); + } } struct auth_domain * -auth_domain_lookup(struct auth_domain *item, int set) +auth_domain_lookup(char *name, struct auth_domain *new) { - struct auth_domain *tmp = NULL; - struct cache_head **hp, **head; - head = &auth_domain_cache.hash_table[auth_domain_hash(item)]; - - if (set) - write_lock(&auth_domain_cache.hash_lock); - else - read_lock(&auth_domain_cache.hash_lock); - for (hp=head; *hp != NULL; hp = &tmp->h.next) { - tmp = container_of(*hp, struct auth_domain, h); - if (!auth_domain_match(tmp, item)) - continue; - if (!set) { - cache_get(&tmp->h); - goto out_noset; + struct auth_domain *hp; + struct hlist_head *head; + struct hlist_node *np; + + head = &auth_domain_table[hash_str(name, DN_HASHBITS)]; + + spin_lock(&auth_domain_lock); + + hlist_for_each_entry(hp, np, head, hash) { + if (strcmp(hp->name, name)==0) { + kref_get(&hp->ref); + spin_unlock(&auth_domain_lock); + return hp; } - *hp = tmp->h.next; - tmp->h.next = NULL; - auth_domain_drop(&tmp->h, &auth_domain_cache); - goto out_set; } - /* Didn't find anything */ - if (!set) - goto out_nada; - auth_domain_cache.entries++; -out_set: - item->h.next = *head; - *head = &item->h; - cache_get(&item->h); - write_unlock(&auth_domain_cache.hash_lock); - cache_fresh(&auth_domain_cache, &item->h, item->h.expiry_time); - cache_get(&item->h); - return item; -out_nada: - tmp = NULL; -out_noset: - read_unlock(&auth_domain_cache.hash_lock); - return tmp; + if (new) { + hlist_add_head(&new->hash, head); + kref_get(&new->ref); + } + spin_unlock(&auth_domain_lock); + return new; } 
struct auth_domain *auth_domain_find(char *name) { - struct auth_domain *rv, ad; - - ad.name = name; - rv = auth_domain_lookup(&ad, 0); - return rv; + return auth_domain_lookup(name, NULL); } diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 3e6c694bbad1..17e8b2a3130c 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -27,41 +27,35 @@ struct unix_domain { /* other stuff later */ }; +extern struct auth_ops svcauth_unix; + struct auth_domain *unix_domain_find(char *name) { - struct auth_domain *rv, ud; - struct unix_domain *new; - - ud.name = name; - - rv = auth_domain_lookup(&ud, 0); - - foundit: - if (rv && rv->flavour != RPC_AUTH_UNIX) { - auth_domain_put(rv); - return NULL; - } - if (rv) - return rv; - - new = kmalloc(sizeof(*new), GFP_KERNEL); - if (new == NULL) - return NULL; - cache_init(&new->h.h); - new->h.name = kstrdup(name, GFP_KERNEL); - new->h.flavour = RPC_AUTH_UNIX; - new->addr_changes = 0; - new->h.h.expiry_time = NEVER; - - rv = auth_domain_lookup(&new->h, 2); - if (rv == &new->h) { - if (atomic_dec_and_test(&new->h.h.refcnt)) BUG(); - } else { - auth_domain_put(&new->h); - goto foundit; + struct auth_domain *rv; + struct unix_domain *new = NULL; + + rv = auth_domain_lookup(name, NULL); + while(1) { + if (rv != &new->h) { + if (new) auth_domain_put(&new->h); + return rv; + } + if (rv && rv->flavour != &svcauth_unix) { + auth_domain_put(rv); + return NULL; + } + if (rv) + return rv; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + kref_init(&new->h.ref); + new->h.name = kstrdup(name, GFP_KERNEL); + new->h.flavour = &svcauth_unix; + new->addr_changes = 0; + rv = auth_domain_lookup(name, &new->h); } - - return rv; } static void svcauth_unix_domain_release(struct auth_domain *dom) @@ -130,7 +124,7 @@ static inline void ip_map_init(struct ip_map *new, struct ip_map *item) } static inline void ip_map_update(struct ip_map *new, struct ip_map *item) { - cache_get(&item->m_client->h.h); + kref_get(&item->m_client->h.ref); new->m_client = item->m_client; new->m_add_change = item->m_add_change; } @@ -272,7 +266,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) struct unix_domain *udom; struct ip_map ip, *ipmp; - if (dom->flavour != RPC_AUTH_UNIX) + if (dom->flavour != &svcauth_unix) return -EINVAL; udom = container_of(dom, struct unix_domain, h); strcpy(ip.m_class, "nfsd"); @@ -295,7 +289,7 @@ int auth_unix_forget_old(struct auth_domain *dom) { struct unix_domain *udom; - if (dom->flavour != RPC_AUTH_UNIX) + if (dom->flavour != &svcauth_unix) return -EINVAL; udom = container_of(dom, struct unix_domain, h); udom->addr_changes++; @@ -323,7 +317,7 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr) rv = NULL; } else { rv = &ipm->m_client->h; - cache_get(&rv->h); + kref_get(&rv->ref); } ip_map_put(&ipm->h, &ip_map_cache); return rv; @@ -332,7 +326,6 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr) void svcauth_unix_purge(void) { cache_purge(&ip_map_cache); - cache_purge(&auth_domain_cache); } static int @@ -361,7 +354,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) return SVC_DENIED; case 0: rqstp->rq_client = &ipm->m_client->h; - cache_get(&rqstp->rq_client->h); + kref_get(&rqstp->rq_client->ref); ip_map_put(&ipm->h, &ip_map_cache); break; } -- cgit v1.2.3 From eab7e2e647c348b418e8715ecaca0177e1b473c7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:00 -0800 Subject: [PATCH] knfsd: Break the hard linkage from svc_expkey to svc_export Current 
svc_expkey holds a pointer to the svc_export structure, so updates to that structure have to be in-place, which is a wart on the whole cache infrastruct. So we break that linkage and just do a second lookup. If this became a performance issue, it would be possible to put a direct link back in which was only used conditionally. i.e. when an object is replaced in the cache, we set a flag in the old object. When dereferencing the link from svc_expkey, if the flag is set, we drop the reference and do a fresh lookup. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 60 ++++++++++++++++++++++++++++++--------------- include/linux/nfsd/export.h | 20 +++------------ 2 files changed, 44 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ac0997731fce..587829ed651c 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -73,8 +73,10 @@ void expkey_put(struct cache_head *item, struct cache_detail *cd) if (cache_put(item, cd)) { struct svc_expkey *key = container_of(item, struct svc_expkey, h); if (test_bit(CACHE_VALID, &item->flags) && - !test_bit(CACHE_NEGATIVE, &item->flags)) - exp_put(key->ek_export); + !test_bit(CACHE_NEGATIVE, &item->flags)) { + dput(key->ek_dentry); + mntput(key->ek_mnt); + } auth_domain_put(key->ek_client); kfree(key); } @@ -164,26 +166,18 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) } else { struct nameidata nd; struct svc_expkey *ek; - struct svc_export *exp; err = path_lookup(buf, 0, &nd); if (err) goto out; dprintk("Found the path %s\n", buf); - exp = exp_get_by_name(dom, nd.mnt, nd.dentry, NULL); - - err = -ENOENT; - if (!exp) - goto out_nd; - key.ek_export = exp; - dprintk("And found export\n"); + key.ek_mnt = nd.mnt; + key.ek_dentry = nd.dentry; ek = svc_expkey_lookup(&key, 1); if (ek) expkey_put(&ek->h, &svc_expkey_cache); - exp_put(exp); err = 0; - out_nd: path_release(&nd); } cache_flush(); @@ -214,7 +208,7 @@ static int expkey_show(struct seq_file *m, if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { seq_printf(m, " "); - seq_path(m, ek->ek_export->ex_mnt, ek->ek_export->ex_dentry, "\\ \t\n"); + seq_path(m, ek->ek_mnt, ek->ek_dentry, "\\ \t\n"); } seq_printf(m, "\n"); return 0; @@ -252,8 +246,8 @@ static inline void svc_expkey_init(struct svc_expkey *new, struct svc_expkey *it static inline void svc_expkey_update(struct svc_expkey *new, struct svc_expkey *item) { - cache_get(&item->ek_export->h); - new->ek_export = item->ek_export; + new->ek_mnt = mntget(item->ek_mnt); + new->ek_dentry = dget(item->ek_dentry); } static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */ @@ -519,7 +513,8 @@ static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, key.ek_client = clp; key.ek_fsidtype = fsid_type; memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); - key.ek_export = exp; + key.ek_mnt = exp->ex_mnt; + key.ek_dentry = exp->ex_dentry; key.h.expiry_time = NEVER; key.h.flags = 0; @@ -741,8 +736,8 @@ exp_export(struct nfsctl_export *nxp) if ((nxp->ex_flags & NFSEXP_FSID) && (fsid_key = exp_get_fsid_key(clp, nxp->ex_dev)) && !IS_ERR(fsid_key) && - fsid_key->ek_export && - fsid_key->ek_export != exp) + fsid_key->ek_mnt && + (fsid_key->ek_mnt != nd.mnt || fsid_key->ek_dentry != nd.dentry) ) goto finish; if (exp) { @@ -912,6 +907,24 @@ out: return err; } +struct svc_export * +exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, + struct cache_req *reqp) +{ + struct svc_export 
*exp; + struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp); + if (!ek || IS_ERR(ek)) + return ERR_PTR(PTR_ERR(ek)); + + exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp); + expkey_put(&ek->h, &svc_expkey_cache); + + if (!exp || IS_ERR(exp)) + return ERR_PTR(PTR_ERR(exp)); + return exp; +} + + /* * Called when we need the filehandle for the root of the pseudofs, * for a given NFSv4 client. The root is defined to be the @@ -922,6 +935,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, struct cache_req *creq) { struct svc_expkey *fsid_key; + struct svc_export *exp; int rv; u32 fsidv[2]; @@ -933,8 +947,14 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, if (!fsid_key || IS_ERR(fsid_key)) return nfserr_perm; - rv = fh_compose(fhp, fsid_key->ek_export, - fsid_key->ek_export->ex_dentry, NULL); + exp = exp_get_by_name(clp, fsid_key->ek_mnt, fsid_key->ek_dentry, creq); + if (exp == NULL) + rv = nfserr_perm; + else if (IS_ERR(exp)) + rv = nfserrno(PTR_ERR(exp)); + else + rv = fh_compose(fhp, exp, + fsid_key->ek_dentry, NULL); expkey_put(&fsid_key->h, &svc_expkey_cache); return rv; } diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 6bad4766d3d9..d52e0b7ad37b 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -67,7 +67,8 @@ struct svc_expkey { int ek_fsidtype; u32 ek_fsid[3]; - struct svc_export * ek_export; + struct vfsmount * ek_mnt; + struct dentry * ek_dentry; }; #define EX_SECURE(exp) (!((exp)->ex_flags & NFSEXP_INSECURE_PORT)) @@ -114,22 +115,9 @@ static inline void exp_get(struct svc_export *exp) { cache_get(&exp->h); } -static inline struct svc_export * +extern struct svc_export * exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, - struct cache_req *reqp) -{ - struct svc_expkey *ek = exp_find_key(clp, fsid_type, fsidv, reqp); - if (ek && !IS_ERR(ek)) { - struct svc_export *exp = ek->ek_export; - int err; - exp_get(exp); - expkey_put(&ek->h, &svc_expkey_cache); - if ((err = cache_check(&svc_export_cache, &exp->h, reqp))) - exp = ERR_PTR(err); - return exp; - } else - return ERR_PTR(PTR_ERR(ek)); -} + struct cache_req *reqp); #endif /* __KERNEL__ */ -- cgit v1.2.3 From 7d317f2c9f1e9dcf4f632fa98f91d1d4a36c4cae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:01 -0800 Subject: [PATCH] knfsd: Get rid of 'inplace' sunrpc caches These were an unnecessary wart. Also only have one 'DefineSimpleCache..' instead of two. 
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 4 ++-- fs/nfsd/nfs4idmap.c | 10 ++-------- include/linux/sunrpc/cache.h | 28 +++++++++++----------------- net/sunrpc/auth_gss/svcauth_gss.c | 4 ++-- net/sunrpc/svcauth_unix.c | 2 +- 5 files changed, 18 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 587829ed651c..c591761a1ad6 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -250,7 +250,7 @@ static inline void svc_expkey_update(struct svc_expkey *new, struct svc_expkey * new->ek_dentry = dget(item->ek_dentry); } -static DefineSimpleCacheLookup(svc_expkey,0) /* no inplace updates */ +static DefineSimpleCacheLookup(svc_expkey, svc_expkey) #define EXPORT_HASHBITS 8 #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) @@ -482,7 +482,7 @@ static inline void svc_export_update(struct svc_export *new, struct svc_export * new->ex_fsid = item->ex_fsid; } -static DefineSimpleCacheLookup(svc_export,1) /* allow inplace updates */ +static DefineSimpleCacheLookup(svc_export, svc_export) struct svc_expkey * diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 13369650cdf9..dea690aa8bb5 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -76,12 +76,6 @@ struct ent { char authname[IDMAP_NAMESZ]; }; -#define DefineSimpleCacheLookupMap(STRUCT, FUNC) \ - DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \ - (struct STRUCT *item, int set), /*no setup */, \ - & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ - STRUCT##_init(new, item), STRUCT##_update(tmp, item), 0) - /* Common entry handling */ #define ENT_HASHBITS 8 @@ -264,7 +258,7 @@ out: return error; } -static DefineSimpleCacheLookupMap(ent, idtoname); +static DefineSimpleCacheLookup(ent, idtoname); /* * Name -> ID cache @@ -390,7 +384,7 @@ out: return (error); } -static DefineSimpleCacheLookupMap(ent, nametoid); +static DefineSimpleCacheLookup(ent, nametoid); /* * Exported API diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index c4e3ea7cf154..405ac14e509a 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -133,14 +133,11 @@ struct cache_deferred_req { * If "set" == 0 : * If an entry is found, it is returned * If no entry is found, a new non-VALID entry is created. - * If "set" == 1 and INPLACE == 0 : + * If "set" == 1 : * If no entry is found a new one is inserted with data from "template" * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE * If a CACHE_VALID entry is found, a new entry is swapped in with data * from "template" - * If set == 1, and INPLACE == 1 : - * As above, except that if a CACHE_VALID entry is found, we UPDATE in place - * instead of swapping in a new entry. * * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not * run but insteead CACHE_NEGATIVE is set in any new item. @@ -159,13 +156,8 @@ struct cache_deferred_req { * TEST tests if "tmp" matches "item" * INIT copies key information from "item" to "new" * UPDATE copies content information from "item" to "tmp" - * INPLACE is true if updates can happen inplace rather than allocating a new structure - * - * WARNING: any substantial changes to this must be reflected in - * net/sunrpc/svcauth.c(auth_domain_lookup) - * which is a similar routine that is open-coded. 
*/ -#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE,INPLACE) \ +#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE) \ RTN *FNAME ARGS \ { \ RTN *tmp, *new=NULL; \ @@ -179,13 +171,13 @@ RTN *FNAME ARGS \ tmp = container_of(*hp, RTN, MEMBER); \ if (TEST) { /* found a match */ \ \ - if (set && !INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ + if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ break; \ \ if (new) \ {INIT;} \ if (set) { \ - if (!INPLACE && test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ + if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ { /* need to swap in new */ \ RTN *t2; \ \ @@ -206,7 +198,7 @@ RTN *FNAME ARGS \ else read_unlock(&(DETAIL)->hash_lock); \ if (set) \ cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \ - if (set && !INPLACE && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ + if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \ return tmp; \ } \ @@ -239,10 +231,12 @@ RTN *FNAME ARGS \ return NULL; \ } -#define DefineSimpleCacheLookup(STRUCT,INPLACE) \ - DefineCacheLookup(struct STRUCT, h, STRUCT##_lookup, (struct STRUCT *item, int set), /*no setup */, \ - & STRUCT##_cache, STRUCT##_hash(item), STRUCT##_match(item, tmp),\ - STRUCT##_init(new, item), STRUCT##_update(tmp, item),INPLACE) +#define DefineSimpleCacheLookup(STRUCT, FUNC) \ + DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \ + (struct STRUCT *item, int set), /*no setup */, \ + & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ + STRUCT##_init(new, item), STRUCT##_update(tmp, item)) + #define cache_for_each(pos, detail, index, member) \ for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 6b073c2e6930..aadb4e8d6aa7 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -259,7 +259,7 @@ static struct cache_detail rsi_cache = { .cache_parse = rsi_parse, }; -static DefineSimpleCacheLookup(rsi, 0) +static DefineSimpleCacheLookup(rsi, rsi) /* * The rpcsec_context cache is used to store a context that is @@ -446,7 +446,7 @@ static struct cache_detail rsc_cache = { .cache_parse = rsc_parse, }; -static DefineSimpleCacheLookup(rsc, 0); +static DefineSimpleCacheLookup(rsc, rsc); static struct rsc * gss_svc_searchbyctx(struct xdr_netobj *handle) diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 17e8b2a3130c..7ddf068b5b25 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -258,7 +258,7 @@ struct cache_detail ip_map_cache = { .cache_show = ip_map_show, }; -static DefineSimpleCacheLookup(ip_map, 0) +static DefineSimpleCacheLookup(ip_map, ip_map) int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) -- cgit v1.2.3 From 15a5f6bd23eddd5b3be80366f364be04fb1c1c99 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:02 -0800 Subject: [PATCH] knfsd: Create cache_lookup function instead of using a macro to declare one The C++-like 'template' approach proves to be too ugly and hard to work with. The old 'template' won't go away until all users are updated. 
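As a sketch of how a converted cache is expected to use the new interface (based on the ops added to struct cache_detail below; the 'example' names are invented and kernel headers are assumed): the owner supplies alloc/match/init/update callbacks and wraps the generic lookup in a small typed helper.

    #include <linux/slab.h>
    #include <linux/sunrpc/cache.h>

    #define EXAMPLE_HASHMAX 256    /* must match the cache_detail's hash_size */

    struct example_ent {
        struct cache_head h;
        int key;
        int value;
    };

    static struct cache_head *example_alloc(void)
    {
        struct example_ent *e = kmalloc(sizeof(*e), GFP_KERNEL);
        return e ? &e->h : NULL;
    }

    static int example_match(struct cache_head *a, struct cache_head *b)
    {
        return container_of(a, struct example_ent, h)->key ==
               container_of(b, struct example_ent, h)->key;
    }

    /* copy key material from 'key' into the freshly allocated entry */
    static void example_init(struct cache_head *new, struct cache_head *key)
    {
        container_of(new, struct example_ent, h)->key =
            container_of(key, struct example_ent, h)->key;
    }

    /* copy content from 'new' into the entry being refreshed */
    static void example_update(struct cache_head *old, struct cache_head *new)
    {
        container_of(old, struct example_ent, h)->value =
            container_of(new, struct example_ent, h)->value;
    }

    /* with .alloc/.match/.init/.update wired into the cache_detail: */
    static struct example_ent *example_lookup(struct cache_detail *cd, int key)
    {
        struct example_ent tmp = { .key = key };
        struct cache_head *ch;

        ch = sunrpc_cache_lookup(cd, &tmp.h, key % EXAMPLE_HASHMAX);
        return ch ? container_of(ch, struct example_ent, h) : NULL;
    }

An update path would build a second example_ent carrying the new content and pass both entries to sunrpc_cache_update(), which either fills in a not-yet-valid entry in place or atomically swaps a replacement into the hash chain.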
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sunrpc/cache.h | 12 ++++++ net/sunrpc/cache.c | 98 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 405ac14e509a..3e17a5ff1dea 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -81,6 +81,11 @@ struct cache_detail { struct cache_detail *cd, struct cache_head *h); + struct cache_head * (*alloc)(void); + int (*match)(struct cache_head *orig, struct cache_head *new); + void (*init)(struct cache_head *orig, struct cache_head *new); + void (*update)(struct cache_head *orig, struct cache_head *new); + /* fields below this comment are for internal use * and should not be touched by cache owners */ @@ -237,6 +242,13 @@ RTN *FNAME ARGS \ & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ STRUCT##_init(new, item), STRUCT##_update(tmp, item)) +extern struct cache_head * +sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash); +extern struct cache_head * +sunrpc_cache_update(struct cache_detail *detail, + struct cache_head *new, struct cache_head *old, int hash); + #define cache_for_each(pos, detail, index, member) \ for (({read_lock(&(detail)->hash_lock); index = (detail)->hash_size;}) ; \ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 0acccfeeb284..4449dc52edf5 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -47,6 +47,104 @@ void cache_init(struct cache_head *h) h->last_refresh = now; } +struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, + struct cache_head *key, int hash) +{ + struct cache_head **head, **hp; + struct cache_head *new = NULL; + + head = &detail->hash_table[hash]; + + read_lock(&detail->hash_lock); + + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { + cache_get(tmp); + read_unlock(&detail->hash_lock); + return tmp; + } + } + read_unlock(&detail->hash_lock); + /* Didn't find anything, insert an empty entry */ + + new = detail->alloc(); + if (!new) + return NULL; + cache_init(new); + + write_lock(&detail->hash_lock); + + /* check if entry appeared while we slept */ + for (hp=head; *hp != NULL ; hp = &(*hp)->next) { + struct cache_head *tmp = *hp; + if (detail->match(tmp, key)) { + cache_get(tmp); + write_unlock(&detail->hash_lock); + detail->cache_put(new, detail); + return tmp; + } + } + detail->init(new, key); + new->next = *head; + *head = new; + detail->entries++; + cache_get(new); + write_unlock(&detail->hash_lock); + + return new; +} +EXPORT_SYMBOL(sunrpc_cache_lookup); + +struct cache_head *sunrpc_cache_update(struct cache_detail *detail, + struct cache_head *new, struct cache_head *old, int hash) +{ + /* The 'old' entry is to be replaced by 'new'. 
+ * If 'old' is not VALID, we update it directly, + * otherwise we need to replace it + */ + struct cache_head **head; + struct cache_head *tmp; + + if (!test_bit(CACHE_VALID, &old->flags)) { + write_lock(&detail->hash_lock); + if (!test_bit(CACHE_VALID, &old->flags)) { + if (test_bit(CACHE_NEGATIVE, &new->flags)) + set_bit(CACHE_NEGATIVE, &old->flags); + else + detail->update(old, new); + /* FIXME cache_fresh should come first */ + write_unlock(&detail->hash_lock); + cache_fresh(detail, old, new->expiry_time); + return old; + } + write_unlock(&detail->hash_lock); + } + /* We need to insert a new entry */ + tmp = detail->alloc(); + if (!tmp) { + detail->cache_put(old, detail); + return NULL; + } + cache_init(tmp); + detail->init(tmp, old); + head = &detail->hash_table[hash]; + + write_lock(&detail->hash_lock); + if (test_bit(CACHE_NEGATIVE, &new->flags)) + set_bit(CACHE_NEGATIVE, &tmp->flags); + else + detail->update(tmp, new); + tmp->next = *head; + *head = tmp; + cache_get(tmp); + write_unlock(&detail->hash_lock); + cache_fresh(detail, tmp, new->expiry_time); + cache_fresh(detail, old, 0); + detail->cache_put(old, detail); + return tmp; +} +EXPORT_SYMBOL(sunrpc_cache_update); static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h); /* -- cgit v1.2.3 From 4d90452cb23b08a9a9dd001010f0ee6b1ee83a45 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:07 -0800 Subject: [PATCH] knfsd: Remove DefineCacheLookup This has been replaced by more traditional code. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sunrpc/cache.h | 113 ------------------------------------------- 1 file changed, 113 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 3e17a5ff1dea..afc481dd02dd 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -128,119 +128,6 @@ struct cache_deferred_req { int too_many); }; -/* - * just like a template in C++, this macro does cache lookup - * for us. - * The function is passed some sort of HANDLE from which a cache_detail - * structure can be determined (via SETUP, DETAIL), a template - * cache entry (type RTN*), and a "set" flag. Using the HASHFN and the - * TEST, the function will try to find a matching cache entry in the cache. - * If "set" == 0 : - * If an entry is found, it is returned - * If no entry is found, a new non-VALID entry is created. - * If "set" == 1 : - * If no entry is found a new one is inserted with data from "template" - * If a non-CACHE_VALID entry is found, it is updated from template using UPDATE - * If a CACHE_VALID entry is found, a new entry is swapped in with data - * from "template" - * - * If the passed handle has the CACHE_NEGATIVE flag set, then UPDATE is not - * run but insteead CACHE_NEGATIVE is set in any new item. - - * In any case, the new entry is returned with a reference count. - * - * - * RTN is a struct type for a cache entry - * MEMBER is the member of the cache which is cache_head, which must be first - * FNAME is the name for the function - * ARGS are arguments to function and must contain RTN *item, int set. May - * also contain something to be usedby SETUP or DETAIL to find cache_detail. - * SETUP locates the cache detail and makes it available as... 
- * DETAIL identifies the cache detail, possibly set up by SETUP - * HASHFN returns a hash value of the cache entry "item" - * TEST tests if "tmp" matches "item" - * INIT copies key information from "item" to "new" - * UPDATE copies content information from "item" to "tmp" - */ -#define DefineCacheLookup(RTN,MEMBER,FNAME,ARGS,SETUP,DETAIL,HASHFN,TEST,INIT,UPDATE) \ -RTN *FNAME ARGS \ -{ \ - RTN *tmp, *new=NULL; \ - struct cache_head **hp, **head; \ - SETUP; \ - head = &(DETAIL)->hash_table[HASHFN]; \ - retry: \ - if (set||new) write_lock(&(DETAIL)->hash_lock); \ - else read_lock(&(DETAIL)->hash_lock); \ - for(hp=head; *hp != NULL; hp = &tmp->MEMBER.next) { \ - tmp = container_of(*hp, RTN, MEMBER); \ - if (TEST) { /* found a match */ \ - \ - if (set && test_bit(CACHE_VALID, &tmp->MEMBER.flags) && !new) \ - break; \ - \ - if (new) \ - {INIT;} \ - if (set) { \ - if (test_bit(CACHE_VALID, &tmp->MEMBER.flags))\ - { /* need to swap in new */ \ - RTN *t2; \ - \ - new->MEMBER.next = tmp->MEMBER.next; \ - *hp = &new->MEMBER; \ - tmp->MEMBER.next = NULL; \ - t2 = tmp; tmp = new; new = t2; \ - } \ - if (test_bit(CACHE_NEGATIVE, &item->MEMBER.flags)) \ - set_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags); \ - else { \ - UPDATE; \ - clear_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags); \ - } \ - } \ - cache_get(&tmp->MEMBER); \ - if (set||new) write_unlock(&(DETAIL)->hash_lock); \ - else read_unlock(&(DETAIL)->hash_lock); \ - if (set) \ - cache_fresh(DETAIL, &tmp->MEMBER, item->MEMBER.expiry_time); \ - if (set && new) cache_fresh(DETAIL, &new->MEMBER, 0); \ - if (new) (DETAIL)->cache_put(&new->MEMBER, DETAIL); \ - return tmp; \ - } \ - } \ - /* Didn't find anything */ \ - if (new) { \ - INIT; \ - new->MEMBER.next = *head; \ - *head = &new->MEMBER; \ - (DETAIL)->entries ++; \ - cache_get(&new->MEMBER); \ - if (set) { \ - tmp = new; \ - if (test_bit(CACHE_NEGATIVE, &item->MEMBER.flags)) \ - set_bit(CACHE_NEGATIVE, &tmp->MEMBER.flags); \ - else {UPDATE;} \ - } \ - } \ - if (set||new) write_unlock(&(DETAIL)->hash_lock); \ - else read_unlock(&(DETAIL)->hash_lock); \ - if (new && set) \ - cache_fresh(DETAIL, &new->MEMBER, item->MEMBER.expiry_time); \ - if (new) \ - return new; \ - new = kmalloc(sizeof(*new), GFP_KERNEL); \ - if (new) { \ - cache_init(&new->MEMBER); \ - goto retry; \ - } \ - return NULL; \ -} - -#define DefineSimpleCacheLookup(STRUCT, FUNC) \ - DefineCacheLookup(struct STRUCT, h, FUNC##_lookup, \ - (struct STRUCT *item, int set), /*no setup */, \ - & FUNC##_cache, FUNC##_hash(item), FUNC##_match(item, tmp), \ - STRUCT##_init(new, item), STRUCT##_update(tmp, item)) extern struct cache_head * sunrpc_cache_lookup(struct cache_detail *detail, -- cgit v1.2.3 From ebd0cb1af3be2729cc1f574681dfba01fcf458d9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:08 -0800 Subject: [PATCH] knfsd: Unexport cache_fresh and fix a small race Cache_fresh is now only used in cache.c, so unexport it. Part of cache_fresh (setting CACHE_VALID) should really be done under the lock, while part (calling cache_revisit_request etc) must be done outside the lock. So we split it up appropriately. 
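In schematic form (restating the change for clarity, not adding new code), the idiom inside cache.c becomes: mark the entry fresh while still holding hash_lock, remember whether CACHE_VALID was newly set, and run the notification side only after dropping the lock.

    write_lock(&detail->hash_lock);
    detail->update(old, new);
    is_new = cache_fresh_locked(old, new->expiry_time);  /* sets CACHE_VALID under the lock */
    write_unlock(&detail->hash_lock);
    cache_fresh_unlocked(old, detail, is_new);           /* revisit/queue outside the lock */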
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sunrpc/cache.h | 2 -- net/sunrpc/cache.c | 51 ++++++++++++++++++++++++++------------------ net/sunrpc/sunrpc_syms.c | 1 - 3 files changed, 30 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index afc481dd02dd..a37fead1873b 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -165,8 +165,6 @@ static inline int cache_put(struct cache_head *h, struct cache_detail *cd) } extern void cache_init(struct cache_head *h); -extern void cache_fresh(struct cache_detail *detail, - struct cache_head *head, time_t expiry); extern int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp); extern void cache_flush(void); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index b242f491cea9..edcda4fd88e8 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -96,6 +96,27 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, } EXPORT_SYMBOL(sunrpc_cache_lookup); + +static void queue_loose(struct cache_detail *detail, struct cache_head *ch); + +static int cache_fresh_locked(struct cache_head *head, time_t expiry) +{ + head->expiry_time = expiry; + head->last_refresh = get_seconds(); + return !test_and_set_bit(CACHE_VALID, &head->flags); +} + +static void cache_fresh_unlocked(struct cache_head *head, + struct cache_detail *detail, int new) +{ + if (new) + cache_revisit_request(head); + if (test_and_clear_bit(CACHE_PENDING, &head->flags)) { + cache_revisit_request(head); + queue_loose(detail, head); + } +} + struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *new, struct cache_head *old, int hash) { @@ -105,6 +126,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, */ struct cache_head **head; struct cache_head *tmp; + int is_new; if (!test_bit(CACHE_VALID, &old->flags)) { write_lock(&detail->hash_lock); @@ -113,9 +135,9 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, set_bit(CACHE_NEGATIVE, &old->flags); else detail->update(old, new); - /* FIXME cache_fresh should come first */ + is_new = cache_fresh_locked(old, new->expiry_time); write_unlock(&detail->hash_lock); - cache_fresh(detail, old, new->expiry_time); + cache_fresh_unlocked(old, detail, is_new); return old; } write_unlock(&detail->hash_lock); @@ -138,9 +160,11 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, tmp->next = *head; *head = tmp; cache_get(tmp); + is_new = cache_fresh_locked(tmp, new->expiry_time); + cache_fresh_locked(old, 0); write_unlock(&detail->hash_lock); - cache_fresh(detail, tmp, new->expiry_time); - cache_fresh(detail, old, 0); + cache_fresh_unlocked(tmp, detail, is_new); + cache_fresh_unlocked(old, detail, 0); detail->cache_put(old, detail); return tmp; } @@ -192,7 +216,8 @@ int cache_check(struct cache_detail *detail, clear_bit(CACHE_PENDING, &h->flags); if (rv == -EAGAIN) { set_bit(CACHE_NEGATIVE, &h->flags); - cache_fresh(detail, h, get_seconds()+CACHE_NEW_EXPIRY); + cache_fresh_unlocked(h, detail, + cache_fresh_locked(h, get_seconds()+CACHE_NEW_EXPIRY)); rv = -ENOENT; } break; @@ -213,22 +238,6 @@ int cache_check(struct cache_detail *detail, return rv; } -static void queue_loose(struct cache_detail *detail, struct cache_head *ch); - -void cache_fresh(struct cache_detail *detail, - struct cache_head *head, time_t expiry) -{ - - head->expiry_time = expiry; - 
head->last_refresh = get_seconds(); - if (!test_and_set_bit(CACHE_VALID, &head->flags)) - cache_revisit_request(head); - if (test_and_clear_bit(CACHE_PENDING, &head->flags)) { - cache_revisit_request(head); - queue_loose(detail, head); - } -} - /* * caches need to be periodically cleaned. * For this we maintain a list of cache_detail and diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 40401196e7de..69b8238f3d10 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -105,7 +105,6 @@ EXPORT_SYMBOL(auth_unix_lookup); EXPORT_SYMBOL(cache_check); EXPORT_SYMBOL(cache_flush); EXPORT_SYMBOL(cache_purge); -EXPORT_SYMBOL(cache_fresh); EXPORT_SYMBOL(cache_init); EXPORT_SYMBOL(cache_register); EXPORT_SYMBOL(cache_unregister); -- cgit v1.2.3 From baab935ff3bdac20c558809da0d8e8f761840219 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:15:09 -0800 Subject: [PATCH] knfsd: Convert sunrpc_cache to use krefs .. it makes some of the code nicer. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 51 ++++++++++++++++++--------------------- fs/nfsd/nfs4idmap.c | 18 ++++++-------- fs/nfsd/nfsfh.c | 2 +- include/linux/nfsd/export.h | 4 +-- include/linux/sunrpc/cache.h | 13 +++++----- net/sunrpc/auth_gss/svcauth_gss.c | 28 +++++++++------------ net/sunrpc/cache.c | 20 +++++++-------- net/sunrpc/svcauth_unix.c | 20 +++++++-------- 8 files changed, 72 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index abd68965822f..cc811a1094cb 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -57,18 +57,17 @@ static int exp_verify_string(char *cp, int max); #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static struct cache_head *expkey_table[EXPKEY_HASHMAX]; -void expkey_put(struct cache_head *item, struct cache_detail *cd) +void expkey_put(struct kref *ref) { - if (cache_put(item, cd)) { - struct svc_expkey *key = container_of(item, struct svc_expkey, h); - if (test_bit(CACHE_VALID, &item->flags) && - !test_bit(CACHE_NEGATIVE, &item->flags)) { - dput(key->ek_dentry); - mntput(key->ek_mnt); - } - auth_domain_put(key->ek_client); - kfree(key); + struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); + + if (test_bit(CACHE_VALID, &key->h.flags) && + !test_bit(CACHE_NEGATIVE, &key->h.flags)) { + dput(key->ek_dentry); + mntput(key->ek_mnt); } + auth_domain_put(key->ek_client); + kfree(key); } static void expkey_request(struct cache_detail *cd, @@ -158,7 +157,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(&key, ek); if (ek) - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); else err = -ENOMEM; } else { struct nameidata nd; @@ -172,7 +171,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) ek = svc_expkey_update(&key, ek); if (ek) - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); else err = -ENOMEM; path_release(&nd); @@ -318,15 +317,13 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) static struct cache_head *export_table[EXPORT_HASHMAX]; -void svc_export_put(struct cache_head *item, struct cache_detail *cd) +static void svc_export_put(struct kref *ref) { - if (cache_put(item, cd)) { - struct svc_export *exp = container_of(item, struct svc_export, h); - dput(exp->ex_dentry); - mntput(exp->ex_mnt); - auth_domain_put(exp->ex_client); - kfree(exp); - } + 
struct svc_export *exp = container_of(ref, struct svc_export, h.ref); + dput(exp->ex_dentry); + mntput(exp->ex_mnt); + auth_domain_put(exp->ex_client); + kfree(exp); } static void svc_export_request(struct cache_detail *cd, @@ -633,7 +630,7 @@ static int exp_set_key(svc_client *clp, int fsid_type, u32 *fsidv, if (ek) ek = svc_expkey_update(&key,ek); if (ek) { - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); return 0; } return -ENOMEM; @@ -762,7 +759,7 @@ static void exp_fsid_unhash(struct svc_export *exp) ek = exp_get_fsid_key(exp->ex_client, exp->ex_fsid); if (ek && !IS_ERR(ek)) { ek->h.expiry_time = get_seconds()-1; - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); } svc_expkey_cache.nextcheck = get_seconds(); } @@ -800,7 +797,7 @@ static void exp_unhash(struct svc_export *exp) ek = exp_get_key(exp->ex_client, inode->i_sb->s_dev, inode->i_ino); if (ek && !IS_ERR(ek)) { ek->h.expiry_time = get_seconds()-1; - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); } svc_expkey_cache.nextcheck = get_seconds(); } @@ -902,7 +899,7 @@ finish: if (exp) exp_put(exp); if (fsid_key && !IS_ERR(fsid_key)) - expkey_put(&fsid_key->h, &svc_expkey_cache); + cache_put(&fsid_key->h, &svc_expkey_cache); if (clp) auth_domain_put(clp); path_release(&nd); @@ -1030,7 +1027,7 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv, return ERR_PTR(PTR_ERR(ek)); exp = exp_get_by_name(clp, ek->ek_mnt, ek->ek_dentry, reqp); - expkey_put(&ek->h, &svc_expkey_cache); + cache_put(&ek->h, &svc_expkey_cache); if (!exp || IS_ERR(exp)) return ERR_PTR(PTR_ERR(exp)); @@ -1068,7 +1065,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, else rv = fh_compose(fhp, exp, fsid_key->ek_dentry, NULL); - expkey_put(&fsid_key->h, &svc_expkey_cache); + cache_put(&fsid_key->h, &svc_expkey_cache); return rv; } @@ -1187,7 +1184,7 @@ static int e_show(struct seq_file *m, void *p) cache_get(&exp->h); if (cache_check(&svc_export_cache, &exp->h, NULL)) return 0; - if (cache_put(&exp->h, &svc_export_cache)) BUG(); + cache_put(&exp->h, &svc_export_cache); return svc_export_show(m, &svc_export_cache, cp); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 75cfbb68b205..4b6aa60dfceb 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -96,12 +96,10 @@ ent_init(struct cache_head *cnew, struct cache_head *citm) } static void -ent_put(struct cache_head *ch, struct cache_detail *cd) +ent_put(struct kref *ref) { - if (cache_put(ch, cd)) { - struct ent *map = container_of(ch, struct ent, h); - kfree(map); - } + struct ent *map = container_of(ref, struct ent, h.ref); + kfree(map); } static struct cache_head * @@ -270,7 +268,7 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) if (res == NULL) goto out; - ent_put(&res->h, &idtoname_cache); + cache_put(&res->h, &idtoname_cache); error = 0; out: @@ -433,7 +431,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) if (res == NULL) goto out; - ent_put(&res->h, &nametoid_cache); + cache_put(&res->h, &nametoid_cache); error = 0; out: kfree(buf1); @@ -562,7 +560,7 @@ do_idmap_lookup_nowait(struct ent *(*lookup_fn)(struct ent *), goto out_put; return 0; out_put: - ent_put(&(*item)->h, detail); + cache_put(&(*item)->h, detail); out_err: *item = NULL; return ret; @@ -613,7 +611,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen if (ret) return ret; *id = item->id; - ent_put(&item->h, &nametoid_cache); + cache_put(&item->h, 
&nametoid_cache); return 0; } @@ -635,7 +633,7 @@ idmap_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name) ret = strlen(item->name); BUG_ON(ret > IDMAP_NAMESZ); memcpy(name, item->name, ret); - ent_put(&item->h, &idtoname_cache); + cache_put(&item->h, &idtoname_cache); return ret; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 7a3e397b4ed3..3f2ec2e6d06c 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -506,7 +506,7 @@ fh_put(struct svc_fh *fhp) nfsd_nr_put++; } if (exp) { - svc_export_put(&exp->h, &svc_export_cache); + cache_put(&exp->h, &svc_export_cache); fhp->fh_export = NULL; } return; diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index d52e0b7ad37b..a6c08a47b25c 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -102,13 +102,11 @@ int exp_rootfh(struct auth_domain *, int exp_pseudoroot(struct auth_domain *, struct svc_fh *fhp, struct cache_req *creq); int nfserrno(int errno); -extern void expkey_put(struct cache_head *item, struct cache_detail *cd); -extern void svc_export_put(struct cache_head *item, struct cache_detail *cd); extern struct cache_detail svc_export_cache, svc_expkey_cache; static inline void exp_put(struct svc_export *exp) { - svc_export_put(&exp->h, &svc_export_cache); + cache_put(&exp->h, &svc_export_cache); } static inline void exp_get(struct svc_export *exp) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index a37fead1873b..ad3f5cbdb770 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -50,7 +50,7 @@ struct cache_head { time_t last_refresh; /* If CACHE_PENDING, this is when upcall * was sent, else this is when update was received */ - atomic_t refcnt; + struct kref ref; unsigned long flags; }; #define CACHE_VALID 0 /* Entry contains valid data */ @@ -68,8 +68,7 @@ struct cache_detail { atomic_t inuse; /* active user-space update or lookup */ char *name; - void (*cache_put)(struct cache_head *, - struct cache_detail*); + void (*cache_put)(struct kref *); void (*cache_request)(struct cache_detail *cd, struct cache_head *h, @@ -151,17 +150,17 @@ extern void cache_clean_deferred(void *owner); static inline struct cache_head *cache_get(struct cache_head *h) { - atomic_inc(&h->refcnt); + kref_get(&h->ref); return h; } -static inline int cache_put(struct cache_head *h, struct cache_detail *cd) +static inline void cache_put(struct cache_head *h, struct cache_detail *cd) { - if (atomic_read(&h->refcnt) <= 2 && + if (atomic_read(&h->ref.refcount) <= 2 && h->expiry_time < cd->nextcheck) cd->nextcheck = h->expiry_time; - return atomic_dec_and_test(&h->refcnt); + kref_put(&h->ref, cd->cache_put); } extern void cache_init(struct cache_head *h); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 380152603d1e..4d7eb9e704da 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -89,13 +89,11 @@ static void rsi_free(struct rsi *rsii) kfree(rsii->out_token.data); } -static void rsi_put(struct cache_head *item, struct cache_detail *cd) +static void rsi_put(struct kref *ref) { - struct rsi *rsii = container_of(item, struct rsi, h); - if (cache_put(item, cd)) { - rsi_free(rsii); - kfree(rsii); - } + struct rsi *rsii = container_of(ref, struct rsi, h.ref); + rsi_free(rsii); + kfree(rsii); } static inline int rsi_hash(struct rsi *item) @@ -267,7 +265,7 @@ static int rsi_parse(struct cache_detail *cd, out: rsi_free(&rsii); if (rsip) - rsi_put(&rsip->h, &rsi_cache); + 
cache_put(&rsip->h, &rsi_cache); else status = -ENOMEM; return status; @@ -357,14 +355,12 @@ static void rsc_free(struct rsc *rsci) put_group_info(rsci->cred.cr_group_info); } -static void rsc_put(struct cache_head *item, struct cache_detail *cd) +static void rsc_put(struct kref *ref) { - struct rsc *rsci = container_of(item, struct rsc, h); + struct rsc *rsci = container_of(ref, struct rsc, h.ref); - if (cache_put(item, cd)) { - rsc_free(rsci); - kfree(rsci); - } + rsc_free(rsci); + kfree(rsci); } static inline int @@ -509,7 +505,7 @@ static int rsc_parse(struct cache_detail *cd, out: rsc_free(&rsci); if (rscp) - rsc_put(&rscp->h, &rsc_cache); + cache_put(&rscp->h, &rsc_cache); else status = -ENOMEM; return status; @@ -1076,7 +1072,7 @@ drop: ret = SVC_DROP; out: if (rsci) - rsc_put(&rsci->h, &rsc_cache); + cache_put(&rsci->h, &rsc_cache); return ret; } @@ -1168,7 +1164,7 @@ out_err: put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; if (gsd->rsci) - rsc_put(&gsd->rsci->h, &rsc_cache); + cache_put(&gsd->rsci->h, &rsc_cache); gsd->rsci = NULL; return stat; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index edcda4fd88e8..dd81e5928172 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -42,7 +42,7 @@ void cache_init(struct cache_head *h) time_t now = get_seconds(); h->next = NULL; h->flags = 0; - atomic_set(&h->refcnt, 1); + kref_init(&h->ref); h->expiry_time = now + CACHE_NEW_EXPIRY; h->last_refresh = now; } @@ -81,7 +81,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail, if (detail->match(tmp, key)) { cache_get(tmp); write_unlock(&detail->hash_lock); - detail->cache_put(new, detail); + cache_put(new, detail); return tmp; } } @@ -145,7 +145,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, /* We need to insert a new entry */ tmp = detail->alloc(); if (!tmp) { - detail->cache_put(old, detail); + cache_put(old, detail); return NULL; } cache_init(tmp); @@ -165,7 +165,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, write_unlock(&detail->hash_lock); cache_fresh_unlocked(tmp, detail, is_new); cache_fresh_unlocked(old, detail, 0); - detail->cache_put(old, detail); + cache_put(old, detail); return tmp; } EXPORT_SYMBOL(sunrpc_cache_update); @@ -234,7 +234,7 @@ int cache_check(struct cache_detail *detail, cache_defer_req(rqstp, h); if (rv) - detail->cache_put(h, detail); + cache_put(h, detail); return rv; } @@ -431,7 +431,7 @@ static int cache_clean(void) if (test_and_clear_bit(CACHE_PENDING, &ch->flags)) queue_loose(current_detail, ch); - if (atomic_read(&ch->refcnt) == 1) + if (atomic_read(&ch->ref.refcount) == 1) break; } if (ch) { @@ -446,7 +446,7 @@ static int cache_clean(void) current_index ++; spin_unlock(&cache_list_lock); if (ch) - d->cache_put(ch, d); + cache_put(ch, d); } else spin_unlock(&cache_list_lock); @@ -723,7 +723,7 @@ cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) !test_bit(CACHE_PENDING, &rq->item->flags)) { list_del(&rq->q.list); spin_unlock(&queue_lock); - cd->cache_put(rq->item, cd); + cache_put(rq->item, cd); kfree(rq->buf); kfree(rq); } else @@ -906,7 +906,7 @@ static void queue_loose(struct cache_detail *detail, struct cache_head *ch) continue; list_del(&cr->q.list); spin_unlock(&queue_lock); - detail->cache_put(cr->item, detail); + cache_put(cr->item, detail); kfree(cr->buf); kfree(cr); return; @@ -1192,7 +1192,7 @@ static int c_show(struct seq_file *m, void *p) ifdebug(CACHE) seq_printf(m, "# expiry=%ld refcnt=%d flags=%lx\n", - 
cp->expiry_time, atomic_read(&cp->refcnt), cp->flags); + cp->expiry_time, atomic_read(&cp->ref.refcount), cp->flags); cache_get(cp); if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 7e38621a20b7..11020c0b7db5 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -84,15 +84,15 @@ struct ip_map { }; static struct cache_head *ip_table[IP_HASHMAX]; -static void ip_map_put(struct cache_head *item, struct cache_detail *cd) +static void ip_map_put(struct kref *kref) { + struct cache_head *item = container_of(kref, struct cache_head, ref); struct ip_map *im = container_of(item, struct ip_map,h); - if (cache_put(item, cd)) { - if (test_bit(CACHE_VALID, &item->flags) && - !test_bit(CACHE_NEGATIVE, &item->flags)) - auth_domain_put(&im->m_client->h); - kfree(im); - } + + if (test_bit(CACHE_VALID, &item->flags) && + !test_bit(CACHE_NEGATIVE, &item->flags)) + auth_domain_put(&im->m_client->h); + kfree(im); } #if IP_HASHBITS == 8 @@ -315,7 +315,7 @@ static int ip_map_update(struct ip_map *ipm, struct unix_domain *udom, time_t ex hash_ip((unsigned long)ipm->m_addr.s_addr)); if (!ch) return -ENOMEM; - ip_map_put(ch, &ip_map_cache); + cache_put(ch, &ip_map_cache); return 0; } @@ -369,7 +369,7 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr) rv = &ipm->m_client->h; kref_get(&rv->ref); } - ip_map_put(&ipm->h, &ip_map_cache); + cache_put(&ipm->h, &ip_map_cache); return rv; } @@ -403,7 +403,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) case 0: rqstp->rq_client = &ipm->m_client->h; kref_get(&rqstp->rq_client->ref); - ip_map_put(&ipm->h, &ip_map_cache); + cache_put(&ipm->h, &ip_map_cache); break; } return SVC_OK; -- cgit v1.2.3 From 74cae61ab45f19a3e8c4d9f53c0e94df129c7915 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 27 Mar 2006 01:15:10 -0800 Subject: [PATCH] fs/nfsd/export.c,net/sunrpc/cache.c: make needlessly global code static We can now make some code static. 
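One practical detail of narrowing linkage this way: a symbol that is referenced above its definition needs a file-local forward declaration once it becomes static, which is why the patch below adds one for svc_expkey_cache in export.c. A minimal stand-alone sketch of the pattern, using invented names rather than the real sunrpc types:

struct demo_cache {
	const char *name;
};

static struct demo_cache demo;			/* forward (tentative) definition */

static int demo_in_use(void)
{
	return demo.name != 0;			/* referenced before the initialized definition below */
}

static struct demo_cache demo = {
	.name = "demo",
};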
Signed-off-by: Adrian Bunk Cc: Neil Brown Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 13 ++++++++----- include/linux/nfsd/export.h | 5 +---- include/linux/sunrpc/cache.h | 1 - net/sunrpc/cache.c | 2 +- net/sunrpc/sunrpc_syms.c | 1 - 5 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cc811a1094cb..c340be0a3f59 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -57,7 +57,7 @@ static int exp_verify_string(char *cp, int max); #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static struct cache_head *expkey_table[EXPKEY_HASHMAX]; -void expkey_put(struct kref *ref) +static void expkey_put(struct kref *ref) { struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); @@ -87,6 +87,8 @@ static void expkey_request(struct cache_detail *cd, static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); +static struct cache_detail svc_expkey_cache; + static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client fsidtype fsid [path] */ @@ -255,7 +257,7 @@ static struct cache_head *expkey_alloc(void) return NULL; } -struct cache_detail svc_expkey_cache = { +static struct cache_detail svc_expkey_cache = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .hash_table = expkey_table, @@ -345,7 +347,8 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } -struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); +static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); static int check_export(struct inode *inode, int flags) @@ -574,7 +577,7 @@ svc_export_lookup(struct svc_export *exp) return NULL; } -struct svc_export * +static struct svc_export * svc_export_update(struct svc_export *new, struct svc_export *old) { struct cache_head *ch; @@ -593,7 +596,7 @@ svc_export_update(struct svc_export *new, struct svc_export *old) } -struct svc_expkey * +static struct svc_expkey * exp_find_key(svc_client *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index a6c08a47b25c..d2a8abb5011a 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -86,9 +86,6 @@ void nfsd_export_shutdown(void); void nfsd_export_flush(void); void exp_readlock(void); void exp_readunlock(void); -struct svc_expkey * exp_find_key(struct auth_domain *clp, - int fsid_type, u32 *fsidv, - struct cache_req *reqp); struct svc_export * exp_get_by_name(struct auth_domain *clp, struct vfsmount *mnt, struct dentry *dentry, @@ -102,7 +99,7 @@ int exp_rootfh(struct auth_domain *, int exp_pseudoroot(struct auth_domain *, struct svc_fh *fhp, struct cache_req *creq); int nfserrno(int errno); -extern struct cache_detail svc_export_cache, svc_expkey_cache; +extern struct cache_detail svc_export_cache; static inline void exp_put(struct svc_export *exp) { diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index ad3f5cbdb770..b5612c958cce 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -163,7 +163,6 @@ static inline void cache_put(struct cache_head *h, struct cache_detail *cd) kref_put(&h->ref, cd->cache_put); } -extern void cache_init(struct cache_head *h); extern 
int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp); extern void cache_flush(void); diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index dd81e5928172..3ac4193a78ed 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -37,7 +37,7 @@ static void cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); -void cache_init(struct cache_head *h) +static void cache_init(struct cache_head *h) { time_t now = get_seconds(); h->next = NULL; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 69b8238f3d10..769114f0f886 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -105,7 +105,6 @@ EXPORT_SYMBOL(auth_unix_lookup); EXPORT_SYMBOL(cache_check); EXPORT_SYMBOL(cache_flush); EXPORT_SYMBOL(cache_purge); -EXPORT_SYMBOL(cache_init); EXPORT_SYMBOL(cache_register); EXPORT_SYMBOL(cache_unregister); EXPORT_SYMBOL(qword_add); -- cgit v1.2.3 From 1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Mon, 27 Mar 2006 01:15:22 -0800 Subject: [PATCH] sched: new sched domain for representing multi-core Add a new sched domain for representing multi-core with shared caches between cores. Consider a dual package system, each package containing two cores and with last level cache shared between cores with in a package. If there are two runnable processes, with this appended patch those two processes will be scheduled on different packages. On such systems, with this patch we have observed 8% perf improvement with specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2 users). This new domain will come into play only on multi-core systems with shared caches. On other systems, this sched domain will be removed by domain degeneration code. This new domain can be also used for implementing power savings policy (see OLS 2005 CMP kernel scheduler paper for more details.. I will post another patch for power savings policy soon) Most of the arch/* file changes are for cpu_coregroup_map() implementation. Signed-off-by: Suresh Siddha Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 9 +++++ arch/i386/kernel/cpu/common.c | 10 +++-- arch/i386/kernel/cpu/intel_cacheinfo.c | 22 +++++++++- arch/i386/kernel/smpboot.c | 24 +++++++++++ arch/x86_64/Kconfig | 9 +++++ arch/x86_64/kernel/setup.c | 3 +- arch/x86_64/kernel/smpboot.c | 24 +++++++++++ include/asm-i386/processor.h | 5 +++ include/asm-i386/topology.h | 2 + include/asm-x86_64/processor.h | 4 ++ include/asm-x86_64/smp.h | 1 + include/asm-x86_64/topology.h | 2 + include/linux/topology.h | 9 +++++ kernel/sched.c | 73 +++++++++++++++++++++++++++++++--- 14 files changed, 186 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f7db71d0b913..f17bd1d2707e 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -231,6 +231,15 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC + bool "Multi-core scheduler support" + depends on SMP + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. 
+ source "kernel/Kconfig.preempt" config X86_UP_APIC diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 7e3d6b6a4e96..a06a49075f10 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -266,7 +266,7 @@ static void __init early_cpu_detect(void) void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; - int junk; + int ebx; if (have_cpuid_p()) { /* Get vendor name */ @@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c) /* Intel-defined flags: level 0x00000001 */ if ( c->cpuid_level >= 0x00000001 ) { u32 capability, excap; - cpuid(0x00000001, &tfms, &junk, &excap, &capability); + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); c->x86_capability[0] = capability; c->x86_capability[4] = excap; c->x86 = (tfms >> 8) & 15; @@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c) if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; +#ifdef CONFIG_SMP + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); +#else + c->apicid = (ebx >> 24) & 0xFF; +#endif } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; @@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) cpuid(1, &eax, &ebx, &ecx, &edx); - c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index ce61921369e5..7e7fd4e67dd0 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ + unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; +#ifdef CONFIG_SMP + unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data); +#endif if (c->cpuid_level > 3) { static int is_initialized; @@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) break; case 2: new_l2 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l2_id = c->apicid >> index_msb; break; case 3: new_l3 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l3_id = c->apicid >> index_msb; break; default: break; @@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l1i) l1i = new_l1i; - if (new_l2) + if (new_l2) { l2 = new_l2; +#ifdef CONFIG_SMP + cpu_llc_id[cpu] = l2_id; +#endif + } - if (new_l3) + if (new_l3) { l3 = new_l3; +#ifdef CONFIG_SMP + cpu_llc_id[cpu] = l3_id; +#endif + } if ( trace ) printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 82371d83bfa9..a6969903f2d6 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; /* Core ID of each logical CPU */ int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; +/* Last level cache ID of each logical CPU */ +int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... 
NR_CPUS-1] = BAD_APICID}; + /* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_sibling_map); @@ -440,6 +443,18 @@ static void __devinit smp_callin(void) static int cpucount; +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. + * TBD: when power saving sched policy is added, we will return + * cpu_core_map when power saving policy is enabled + */ + return c->llc_shared_map; +} + /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; @@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } + cpu_set(cpu, c[cpu].llc_shared_map); + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; c[cpu].booted_cores = 1; @@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu) } for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } if (phys_proc_id[cpu] == phys_proc_id[i]) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 45efe0ca88f8..1cb4aa241c8c 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -250,6 +250,15 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC + bool "Multi-core scheduler support" + depends on SMP + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + source "kernel/Kconfig.preempt" config NUMA diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index a57eec8311a7..d1f3e9272c05 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) cpuid(1, &eax, &ebx, &ecx, &edx); - c->apicid = phys_pkg_id(0); if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; @@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + c->apicid = phys_pkg_id(0); + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 66e98659d077..ea48fa638070 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; /* core ID of each logical CPU */ u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +/* Last level cache ID of each logical CPU */ +u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... 
NR_CPUS-1] = BAD_APICID}; + /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -445,6 +448,18 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. + * TBD: when power saving sched policy is added, we will return + * cpu_core_map when power saving policy is enabled + */ + return c->llc_shared_map; +} + /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; @@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } + cpu_set(cpu, c[cpu].llc_shared_map); + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; c[cpu].booted_cores = 1; @@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu) } for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } if (phys_proc_id[cpu] == phys_proc_id[i]) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index feca5d961e2b..af4bfd012475 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -20,6 +20,7 @@ #include #include #include +#include /* flag for disabling the tsc */ extern int tsc_disable; @@ -67,6 +68,9 @@ struct cpuinfo_x86 { char pad0; int x86_power; unsigned long loops_per_jiffy; +#ifdef CONFIG_SMP + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ +#endif unsigned char x86_max_cores; /* cpuid returned max cores value */ unsigned char booted_cores; /* number of cores as seen by OS */ unsigned char apicid; @@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[]; extern int phys_proc_id[NR_CPUS]; extern int cpu_core_id[NR_CPUS]; +extern int cpu_llc_id[NR_CPUS]; extern char ignore_fpu_irq; extern void identify_cpu(struct cpuinfo_x86 *); diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h index aa958c6ee83e..b94e5eeef917 100644 --- a/include/asm-i386/topology.h +++ b/include/asm-i386/topology.h @@ -112,4 +112,6 @@ extern unsigned long node_remap_size[]; #endif /* CONFIG_NUMA */ +extern cpumask_t cpu_coregroup_map(int cpu); + #endif /* _ASM_I386_TOPOLOGY_H */ diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 8c8d88c036ed..1aa2cee43344 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -20,6 +20,7 @@ #include #include #include +#include #define TF_MASK 0x00000100 #define IF_MASK 0x00000200 @@ -65,6 +66,9 @@ struct cpuinfo_x86 { __u32 x86_power; __u32 extended_cpuid_level; /* Max extended CPUID function supported */ unsigned long loops_per_jiffy; +#ifdef CONFIG_SMP + cpumask_t llc_shared_map; /* cpus sharing the last level cache */ +#endif __u8 apicid; __u8 booted_cores; /* number of cores as seen by OS */ } ____cacheline_aligned; diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 9ccbb2cfd5c0..a4fdaeb5c397 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -56,6 +56,7 @@ extern cpumask_t 
cpu_sibling_map[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; extern u8 phys_proc_id[NR_CPUS]; extern u8 cpu_core_id[NR_CPUS]; +extern u8 cpu_llc_id[NR_CPUS]; #define SMP_TRAMPOLINE_BASE 0x6000 diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index c642f5d9882d..9db54e9d17bb 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -68,4 +68,6 @@ extern int __node_distance(int, int); #include +extern cpumask_t cpu_coregroup_map(int cpu); + #endif diff --git a/include/linux/topology.h b/include/linux/topology.h index e8eb0040ce3a..a305ae2e44b6 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -164,6 +164,15 @@ .nr_balance_failed = 0, \ } +#ifdef CONFIG_SCHED_MC +#ifndef SD_MC_INIT +/* for now its same as SD_CPU_INIT. + * TBD: Tune Domain parameters! + */ +#define SD_MC_INIT SD_CPU_INIT +#endif +#endif + #ifdef CONFIG_NUMA #ifndef SD_NODE_INIT #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! diff --git a/kernel/sched.c b/kernel/sched.c index a96a05d23262..8a8b71b5751b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu) } #endif +#ifdef CONFIG_SCHED_MC +static DEFINE_PER_CPU(struct sched_domain, core_domains); +static struct sched_group sched_group_core[NR_CPUS]; +#endif + +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) +static int cpu_to_core_group(int cpu) +{ + return first_cpu(cpu_sibling_map[cpu]); +} +#elif defined(CONFIG_SCHED_MC) +static int cpu_to_core_group(int cpu) +{ + return cpu; +} +#endif + static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; static int cpu_to_phys_group(int cpu) { -#ifdef CONFIG_SCHED_SMT +#if defined(CONFIG_SCHED_MC) + cpumask_t mask = cpu_coregroup_map(cpu); + return first_cpu(mask); +#elif defined(CONFIG_SCHED_SMT) return first_cpu(cpu_sibling_map[cpu]); #else return cpu; @@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map) sd->parent = p; sd->groups = &sched_group_phys[group]; +#ifdef CONFIG_SCHED_MC + p = sd; + sd = &per_cpu(core_domains, i); + group = cpu_to_core_group(i); + *sd = SD_MC_INIT; + sd->span = cpu_coregroup_map(i); + cpus_and(sd->span, sd->span, *cpu_map); + sd->parent = p; + sd->groups = &sched_group_core[group]; +#endif + #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); @@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map) } #endif +#ifdef CONFIG_SCHED_MC + /* Set up multi-core groups */ + for_each_cpu_mask(i, *cpu_map) { + cpumask_t this_core_map = cpu_coregroup_map(i); + cpus_and(this_core_map, this_core_map, *cpu_map); + if (i != first_cpu(this_core_map)) + continue; + init_sched_build_groups(sched_group_core, this_core_map, + &cpu_to_core_group); + } +#endif + + /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map) power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i); + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + * SCHED_LOAD_SCALE / 10; + sd->groups->cpu_power = power; + + sd = &per_cpu(phys_domains, i); + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once 
for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; +#else sd = &per_cpu(phys_domains, i); power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; +#endif #ifdef CONFIG_NUMA sd = &per_cpu(allnodes_domains, i); @@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map) next_sg: for_each_cpu_mask(j, sg->cpumask) { struct sched_domain *sd; - int power; sd = &per_cpu(phys_domains, j); if (j != first_cpu(sd->groups->cpumask)) { @@ -5833,10 +5896,8 @@ next_sg: */ continue; } - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sg->cpu_power += power; + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; if (sg != sched_group_nodes[i]) @@ -5849,6 +5910,8 @@ next_sg: struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); +#elif defined(CONFIG_SCHED_MC) + sd = &per_cpu(core_domains, i); #else sd = &per_cpu(phys_domains, i); #endif -- cgit v1.2.3 From a117e66ed45ac0569c039ea60bd7a9a61e031858 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:25 -0800 Subject: [PATCH] unify pfn_to_page: generic functions There are 3 memory models, FLATMEM, DISCONTIGMEM, SPARSEMEM. Each arch has its own page_to_pfn(), pfn_to_page() for each models. But most of them can use the same arithmetic. This patch adds asm-generic/memory_model.h, which includes generic page_to_pfn(), pfn_to_page() definitions for each memory model. When CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y, out-of-line functions are used instead of macro. This is enabled by some archs and reduces text size. Signed-off-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Mikael Starvik Cc: David Howells Cc: Yoshinori Sato Cc: Hirokazu Takata Cc: Ralf Baechle Cc: Kyle McMartin Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Paul Mundt Cc: Kazumoto Kojima Cc: Richard Curnow Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Jeff Dike Cc: Paolo 'Blaisorblade' Giarrusso Cc: Miles Bader Cc: Chris Zankel Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/memory_model.h | 77 ++++++++++++++++++++++++++++++++++++++ include/asm-sparc64/page.h | 2 + include/linux/mmzone.h | 11 ------ mm/page_alloc.c | 42 +++++++++++++++++++++ 4 files changed, 121 insertions(+), 11 deletions(-) create mode 100644 include/asm-generic/memory_model.h (limited to 'include/linux') diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h new file mode 100644 index 000000000000..a7bb4978e808 --- /dev/null +++ b/include/asm-generic/memory_model.h @@ -0,0 +1,77 @@ +#ifndef __ASM_MEMORY_MODEL_H +#define __ASM_MEMORY_MODEL_H + +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ + +#if defined(CONFIG_FLATMEM) + +#ifndef ARCH_PFN_OFFSET +#define ARCH_PFN_OFFSET (0UL) +#endif + +#elif defined(CONFIG_DISCONTIGMEM) + +#ifndef arch_pfn_to_nid +#define arch_pfn_to_nid(pfn) pfn_to_nid(pfn) +#endif + +#ifndef arch_local_page_offset +#define arch_local_page_offset(pfn, nid) \ + ((pfn) - NODE_DATA(nid)->node_start_pfn) +#endif + +#endif /* CONFIG_DISCONTIGMEM */ + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +struct page; +/* this is useful when inlined pfn_to_page is too big */ +extern struct page *pfn_to_page(unsigned long pfn); +extern unsigned long page_to_pfn(struct page *page); +#else +/* + * supports 3 memory models. 
+ */ +#if defined(CONFIG_FLATMEM) + +#define pfn_to_page(pfn) (mem_map + ((pfn) - ARCH_PFN_OFFSET)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ + ARCH_PFN_OFFSET) +#elif defined(CONFIG_DISCONTIGMEM) + +#define pfn_to_page(pfn) \ +({ unsigned long __pfn = (pfn); \ + unsigned long __nid = arch_pfn_to_nid(pfn); \ + NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\ +}) + +#define page_to_pfn(pg) \ +({ struct page *__pg = (pg); \ + struct zone *__zone = page_zone(__pg); \ + (unsigned long)(__pg - __zone->zone_mem_map) + \ + __zone->zone_start_pfn; \ +}) + +#elif defined(CONFIG_SPARSEMEM) +/* + * Note: section's mem_map is encorded to reflect its start_pfn. + * section[i].section_mem_map == mem_map's address - start_pfn; + */ +#define page_to_pfn(pg) \ +({ struct page *__pg = (pg); \ + int __sec = page_to_section(__pg); \ + __pg - __section_mem_map_addr(__nr_to_section(__sec)); \ +}) + +#define pfn_to_page(pfn) \ +({ unsigned long __pfn = (pfn); \ + struct mem_section *__sec = __pfn_to_section(__pfn); \ + __section_mem_map_addr(__sec) + __pfn; \ +}) +#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ + +#endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + +#endif diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h index 66fe4ac59fd6..aabb21906724 100644 --- a/include/asm-sparc64/page.h +++ b/include/asm-sparc64/page.h @@ -111,6 +111,8 @@ typedef unsigned long pgprot_t; (_AC(0x0000000070000000,UL)) : \ (_AC(0xfffff80000000000,UL) + (1UL << 32UL))) +#include + #endif /* !(__ASSEMBLY__) */ /* to align the pointer to the (next) page boundary */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ebfc238cc243..0c1c0c0cce65 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -602,17 +602,6 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn) return __nr_to_section(pfn_to_section_nr(pfn)); } -#define pfn_to_page(pfn) \ -({ \ - unsigned long __pfn = (pfn); \ - __section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn; \ -}) -#define page_to_pfn(page) \ -({ \ - page - __section_mem_map_addr(__nr_to_section( \ - page_to_section(page))); \ -}) - static inline int pfn_valid(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 338a02bb004d..349b328763b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2745,3 +2745,45 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +/* + * pfn <-> page translation. out-of-line version. 
+ * (see asm-generic/memory_model.h) + */ +#if defined(CONFIG_FLATMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return mem_map + (pfn - ARCH_PFN_OFFSET); +} +unsigned long page_to_pfn(struct page *page) +{ + return (page - mem_map) + ARCH_PFN_OFFSET; +} +#elif defined(CONFIG_DISCONTIGMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + int nid = arch_pfn_to_nid(pfn); + return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); +} +unsigned long page_to_pfn(struct page *page) +{ + struct zone *zone = page_zone(page); + return (page - zone->zone_mem_map) + zone->zone_start_pfn; + +} +#elif defined(CONFIG_SPARSEMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; +} + +unsigned long page_to_pfn(struct page *page) +{ + long section_id = page_to_section(page); + return page - __section_mem_map_addr(__nr_to_section(section_id)); +} +#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ +EXPORT_SYMBOL(pfn_to_page); +EXPORT_SYMBOL(page_to_pfn); +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ -- cgit v1.2.3 From a0140c1d85637ee5f4ea7c78f066e3611a6a79dc Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:55 -0800 Subject: [PATCH] remove zone_mem_map This patch removes zone_mem_map. pfn_to_page uses pgdat, page_to_pfn uses zone. page_to_pfn can use pgdat instead of zone, which is only one user of zone_mem_map. By modifing it, we can remove zone_mem_map. Signed-off-by: KAMEZAWA Hiroyuki Cc: Dave Hansen Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/mmzone.h | 3 +-- include/asm-generic/memory_model.h | 10 +++++----- include/linux/mmzone.h | 1 - mm/page_alloc.c | 6 ++---- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index c9004398f273..192d80c875b0 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -83,8 +83,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) pte_t pte; \ unsigned long pfn; \ \ - pfn = ((unsigned long)((page)-page_zone(page)->zone_mem_map)) << 32; \ - pfn += page_zone(page)->zone_start_pfn << 32; \ + pfn = page_to_pfn(page) << 32; \ pte_val(pte) = pfn | pgprot_val(pgprot); \ \ pte; \ diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index a7bb4978e808..0cfb086dd373 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -45,11 +45,11 @@ extern unsigned long page_to_pfn(struct page *page); NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\ }) -#define page_to_pfn(pg) \ -({ struct page *__pg = (pg); \ - struct zone *__zone = page_zone(__pg); \ - (unsigned long)(__pg - __zone->zone_mem_map) + \ - __zone->zone_start_pfn; \ +#define page_to_pfn(pg) \ +({ struct page *__pg = (pg); \ + struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg)); \ + (unsigned long)(__pg - __pgdat->node_mem_map) + \ + __pgdat->node_start_pfn; \ }) #elif defined(CONFIG_SPARSEMEM) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c1c0c0cce65..ace31c515a8c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -225,7 +225,6 @@ struct zone { * Discontig memory support fields. 
*/ struct pglist_data *zone_pgdat; - struct page *zone_mem_map; /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 349b328763b7..8dc8f2735d22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2042,7 +2042,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone, zone_wait_table_init(zone, size); pgdat->nr_zones = zone_idx(zone) + 1; - zone->zone_mem_map = pfn_to_page(zone_start_pfn); zone->zone_start_pfn = zone_start_pfn; memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); @@ -2768,9 +2767,8 @@ struct page *pfn_to_page(unsigned long pfn) } unsigned long page_to_pfn(struct page *page) { - struct zone *zone = page_zone(page); - return (page - zone->zone_mem_map) + zone->zone_start_pfn; - + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; } #elif defined(CONFIG_SPARSEMEM) struct page *pfn_to_page(unsigned long pfn) -- cgit v1.2.3 From 8357f8695d58b50fbf2bd507b4b0fc2cd1e43bd6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:57 -0800 Subject: [PATCH] define for_each_online_pgdat This patch defines for_each_online_pgdat() as a replacement of for_each_pgdat() Now, online nodes are managed by node_online_map. But for_each_pgdat() uses pgdat_link to iterate over all nodes(pgdat). This means management structure for online pgdat is duplicated. I think using node_online_map for for_each_pgdat() is simple and sane rather ather than pgdat_link. New macro is named as for_each_online_pgdat(). Following patch will fix callers of for_each_pgdat(). The bootmem allocater uses for_each_pgdat() before pgdat initialization. I don't think it's sane. Following patch will fix it. Signed-off-by: Yasunori Goto Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 108 +++++++++++++++++++++++++---------------------- include/linux/nodemask.h | 4 ++ 2 files changed, 61 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ace31c515a8c..96eb08025092 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -13,6 +13,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -349,57 +350,6 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) -/** - * for_each_pgdat - helper macro to iterate over all nodes - * @pgdat - pointer to a pg_data_t variable - * - * Meant to help with common loops of the form - * pgdat = pgdat_list; - * while(pgdat) { - * ... - * pgdat = pgdat->pgdat_next; - * } - */ -#define for_each_pgdat(pgdat) \ - for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) - -/* - * next_zone - helper magic for for_each_zone() - * Thanks to William Lee Irwin III for this piece of ingenuity. - */ -static inline struct zone *next_zone(struct zone *zone) -{ - pg_data_t *pgdat = zone->zone_pgdat; - - if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) - zone++; - else if (pgdat->pgdat_next) { - pgdat = pgdat->pgdat_next; - zone = pgdat->node_zones; - } else - zone = NULL; - - return zone; -} - -/** - * for_each_zone - helper macro to iterate over all memory zones - * @zone - pointer to struct zone variable - * - * The user only needs to declare the zone variable, for_each_zone - * fills it in. 
This basically means for_each_zone() is an - * easier to read version of this piece of code: - * - * for (pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) - * for (i = 0; i < MAX_NR_ZONES; ++i) { - * struct zone * z = pgdat->node_zones + i; - * ... - * } - * } - */ -#define for_each_zone(zone) \ - for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) - static inline int populated_zone(struct zone *zone) { return (!!zone->present_pages); @@ -471,6 +421,62 @@ extern struct pglist_data contig_page_data; #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +static inline struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} + + +/** + * for_each_pgdat - helper macro to iterate over all nodes + * @pgdat - pointer to a pg_data_t variable + */ +#define for_each_online_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); \ + pgdat; \ + pgdat = next_online_pgdat(pgdat)) + +/* + * next_zone - helper magic for for_each_zone() + * Thanks to William Lee Irwin III for this piece of ingenuity. + */ +static inline struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} + +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone - pointer to struct zone variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. + */ +#define for_each_zone(zone) \ + for (zone = (first_online_pgdat())->node_zones; \ + zone; \ + zone = next_zone(zone)) + #ifdef CONFIG_SPARSEMEM #include #endif diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index b959a4525cbd..1a9ef3e627d1 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -350,11 +350,15 @@ extern nodemask_t node_possible_map; #define num_possible_nodes() nodes_weight(node_possible_map) #define node_online(node) node_isset((node), node_online_map) #define node_possible(node) node_isset((node), node_possible_map) +#define first_online_node first_node(node_online_map) +#define next_online_node(nid) next_node((nid), node_online_map) #else #define num_online_nodes() 1 #define num_possible_nodes() 1 #define node_online(node) ((node) == 0) #define node_possible(node) ((node) == 0) +#define first_online_node 0 +#define next_online_node(nid) (MAX_NUMNODES) #endif #define any_online_node(mask) \ -- cgit v1.2.3 From 679bc9fbb508a0aac9539b2de747eb5849feb428 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:15:58 -0800 Subject: [PATCH] for_each_online_pgdat: for_each_bootmem Add a list_head to bootmem_data_t and make bootmems use it. bootmem list is sorted by node_boot_start. Only nodes against which init_bootmem() is called are linked to the list. (i386 allocates bootmem only from one node(0) not from all online nodes.) A summary: 1. for_each_online_pgdat() traverses all *online* nodes. 2. alloc_bootmem() allocates memory only from initialized-for-bootmem nodes. 
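As a usage sketch (not part of the patch; demo_spanned_pages() is an invented name), a caller now walks online nodes through the new macro instead of chasing pgdat_next pointers:

#include <linux/mmzone.h>

static unsigned long demo_spanned_pages(void)
{
	struct pglist_data *pgdat;
	unsigned long pages = 0;

	for_each_online_pgdat(pgdat)
		pages += pgdat->node_spanned_pages;

	return pages;
}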
Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 1 + mm/bootmem.c | 39 +++++++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 7155452fb4a8..de3eb8d8ae26 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -38,6 +38,7 @@ typedef struct bootmem_data { unsigned long last_pos; unsigned long last_success; /* Previous allocation point. To speed * up searching */ + struct list_head list; } bootmem_data_t; extern unsigned long __init bootmem_bootmap_pages (unsigned long); diff --git a/mm/bootmem.c b/mm/bootmem.c index b55bd39fc5dd..d3e3bd2ffcea 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so * dma_get_required_mask(), which uses * it, can be an inline function */ +static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP /* * If we have booted due to a crash, max_pfn will be a very low value. We need @@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) return mapsize; } +/* + * link bdata in order + */ +static void link_bootmem(bootmem_data_t *bdata) +{ + bootmem_data_t *ent; + if (list_empty(&bdata_list)) { + list_add(&bdata->list, &bdata_list); + return; + } + /* insert in order */ + list_for_each_entry(ent, &bdata_list, list) { + if (bdata->node_boot_start < ent->node_boot_start) { + list_add_tail(&bdata->list, &ent->list); + return; + } + } + list_add_tail(&bdata->list, &bdata_list); + return; +} + /* * Called once to set up the allocator itself. @@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize = ((end - start)+7)/8; - pgdat->pgdat_next = pgdat_list; - pgdat_list = pgdat; - mapsize = ALIGN(mapsize, sizeof(long)); bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); bdata->node_boot_start = (start << PAGE_SHIFT); bdata->node_low_pfn = end; + link_bootmem(bdata); /* * Initially all pages are reserved - setup_arch() has to @@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void) void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, - align, goal, 0))) + list_for_each_entry(bdata, &bdata_list, list) + if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) return(ptr); /* @@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { - pg_data_t *pgdat = pgdat_list; + bootmem_data_t *bdata; void *ptr; - for_each_pgdat(pgdat) - if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, + list_for_each_entry(bdata, &bdata_list, list) + if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, LOW32LIMIT))) return(ptr); -- cgit v1.2.3 From ae0f15fb91274e67d78836d38c99ec363df33073 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:16:01 -0800 Subject: [PATCH] for_each_online_pgdat: remove pgdat_list By using for_each_online_pgdat(), pgdat_list is not necessary now. This patch removes it. 
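A sketch of the resulting iteration style (not taken from the patch; demo_nth_online_pgdat() is a made-up helper), mirroring how the converted frag_start() steps through nodes with first_online_pgdat()/next_online_pgdat() instead of pgdat->pgdat_next:

#include <linux/mmzone.h>

static pg_data_t *demo_nth_online_pgdat(int n)
{
	pg_data_t *pgdat;

	for (pgdat = first_online_pgdat(); pgdat && n; pgdat = next_online_pgdat(pgdat))
		--n;

	return pgdat;	/* NULL if fewer than n + 1 nodes are online */
}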
Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 --- mm/page_alloc.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 96eb08025092..0d12c3cf1f86 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -307,7 +307,6 @@ typedef struct pglist_data { unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; - struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; @@ -324,8 +323,6 @@ typedef struct pglist_data { #include -extern struct pglist_data *pgdat_list; - void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat); void get_zone_counts(unsigned long *active, unsigned long *inactive, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ccc3713dd407..dc523a1f270d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -49,7 +49,6 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; @@ -2169,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos) { pg_data_t *pgdat; loff_t node = *pos; - - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) --node; return pgdat; @@ -2181,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) pg_data_t *pgdat = (pg_data_t *)arg; (*pos)++; - return pgdat->pgdat_next; + return next_online_pgdat(pgdat); } static void frag_stop(struct seq_file *m, void *arg) -- cgit v1.2.3 From 95144c788dc01b6a0ff2c9c2222e37ffdab358b8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 27 Mar 2006 01:16:02 -0800 Subject: [PATCH] uninline zone helpers Helper functions for for_each_online_pgdat/for_each_zone look too big to be inlined. Speed of these helper macro itself is not very important. (inner loops are tend to do more work than this) This patch make helper function to be out-of-lined. inline out-of-line .text 005c0680 005bf6a0 005c0680 - 005bf6a0 = FE0 = 4Kbytes. 
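Call sites are unchanged by the move; a sketch (demo_populated_zones() is an invented name) of a typical loop that now goes through the out-of-line, exported next_zone():

#include <linux/mmzone.h>

static int demo_populated_zones(void)
{
	struct zone *zone;
	int n = 0;

	for_each_zone(zone)
		if (populated_zone(zone))
			n++;

	return n;
}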
Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 38 +++----------------------------------- mm/Makefile | 2 +- mm/mmzone.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 36 deletions(-) create mode 100644 mm/mmzone.c (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0d12c3cf1f86..b5c21122c299 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -418,20 +418,9 @@ extern struct pglist_data contig_page_data; #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -static inline struct pglist_data *first_online_pgdat(void) -{ - return NODE_DATA(first_online_node); -} - -static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) -{ - int nid = next_online_node(pgdat->node_id); - - if (nid == MAX_NUMNODES) - return NULL; - return NODE_DATA(nid); -} - +extern struct pglist_data *first_online_pgdat(void); +extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); +extern struct zone *next_zone(struct zone *zone); /** * for_each_pgdat - helper macro to iterate over all nodes @@ -441,27 +430,6 @@ static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) - -/* - * next_zone - helper magic for for_each_zone() - * Thanks to William Lee Irwin III for this piece of ingenuity. - */ -static inline struct zone *next_zone(struct zone *zone) -{ - pg_data_t *pgdat = zone->zone_pgdat; - - if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) - zone++; - else { - pgdat = next_online_pgdat(pgdat); - if (pgdat) - zone = pgdat->node_zones; - else - zone = NULL; - } - return zone; -} - /** * for_each_zone - helper macro to iterate over all memory zones * @zone - pointer to struct zone variable diff --git a/mm/Makefile b/mm/Makefile index f10c753dce6d..0b8f73f2ed16 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ - prio_tree.o util.o $(mmu-y) + prio_tree.o util.o mmzone.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/mmzone.c b/mm/mmzone.c new file mode 100644 index 000000000000..b022370e612e --- /dev/null +++ b/mm/mmzone.c @@ -0,0 +1,50 @@ +/* + * linux/mm/mmzone.c + * + * management codes for pgdats and zones. 
+ */ + + +#include +#include +#include +#include + +struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +EXPORT_SYMBOL(first_online_pgdat); + +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} +EXPORT_SYMBOL(next_online_pgdat); + + +/* + * next_zone - helper magic for for_each_zone() + */ +struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} +EXPORT_SYMBOL(next_zone); + -- cgit v1.2.3 From 22a9835c350782a5c3257343713932af3ac92ee0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 27 Mar 2006 01:16:04 -0800 Subject: [PATCH] unify PFN_* macros Just about every architecture defines some macros to do operations on pfns. They're all virtually identical. This patch consolidates all of them. One minor glitch is that at least i386 uses them in a very skeletal header file. To keep away from #include dependency hell, I stuck the new definitions in a new, isolated header. Of all of the implementations, sh64 is the only one that varied by a bit. It used some masks to ensure that any sign-extension got ripped away before the arithmetic is done. This has been posted to that sh64 maintainers and the development list. Compiles on x86, x86_64, ia64 and ppc64. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/setup.c | 9 +-------- arch/alpha/mm/numa.c | 4 +--- arch/arm26/mm/init.c | 7 +------ arch/cris/kernel/setup.c | 5 +---- arch/i386/kernel/setup.c | 1 + arch/i386/mm/discontig.c | 1 + arch/m32r/kernel/setup.c | 1 + arch/m32r/mm/discontig.c | 1 + arch/m32r/mm/init.c | 1 + arch/mips/ite-boards/ivr/init.c | 3 --- arch/mips/ite-boards/qed-4n-s01b/init.c | 3 --- arch/mips/kernel/setup.c | 9 +-------- arch/mips/mips-boards/generic/memory.c | 7 ++----- arch/mips/mips-boards/sim/sim_mem.c | 7 ++----- arch/mips/mm/init.c | 4 +--- arch/mips/sgi-ip27/ip27-memory.c | 3 +-- arch/sh/kernel/setup.c | 5 +---- arch/sh64/kernel/setup.c | 1 + arch/um/kernel/physmem.c | 3 +-- include/asm-i386/setup.h | 4 +--- include/asm-m32r/setup.h | 4 ---- include/asm-sh64/platform.h | 5 ----- include/linux/pfn.h | 9 +++++++++ 23 files changed, 29 insertions(+), 68 deletions(-) create mode 100644 include/linux/pfn.h (limited to 'include/linux') diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index b4e5f8ff2b25..9402624453c2 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -34,6 +34,7 @@ #include #include #include +#include #ifdef CONFIG_MAGIC_SYSRQ #include #include @@ -241,9 +242,6 @@ reserve_std_resources(void) request_resource(io, standard_io_resources+i); } -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) #define PFN_MAX PFN_DOWN(0x80000000) #define for_each_mem_cluster(memdesc, cluster, i) \ for ((cluster) = (memdesc)->cluster, (i) = 0; \ @@ -472,11 +470,6 @@ page_is_ram(unsigned long pfn) return 0; } -#undef PFN_UP -#undef PFN_DOWN -#undef PFN_PHYS -#undef PFN_MAX - void __init setup_arch(char **cmdline_p) { diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 6d5251254f68..bf6b65c81bef 100644 --- a/arch/alpha/mm/numa.c +++ 
b/arch/alpha/mm/numa.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -27,9 +28,6 @@ bootmem_data_t node_bdata[MAX_NUMNODES]; #define DBGDCONT(args...) #endif -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) #define for_each_mem_cluster(memdesc, cluster, i) \ for ((cluster) = (memdesc)->cluster, (i) = 0; \ (i) < (memdesc)->numclusters; (i)++, (cluster)++) diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c index e3ecaa453747..7da8a5205678 100644 --- a/arch/arm26/mm/init.c +++ b/arch/arm26/mm/init.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -101,12 +102,6 @@ struct node_info { int bootmap_pages; }; -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_UP(x) (PAGE_ALIGN(x) >> PAGE_SHIFT) -#define PFN_SIZE(x) ((x) >> PAGE_SHIFT) -#define PFN_RANGE(s,e) PFN_SIZE(PAGE_ALIGN((unsigned long)(e)) - \ - (((unsigned long)(s)) & PAGE_MASK)) - /* * FIXME: We really want to avoid allocating the bootmap bitmap * over the top of the initrd. Hopefully, this is located towards diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c index 1ba57efff60d..619a6eefd893 100644 --- a/arch/cris/kernel/setup.c +++ b/arch/cris/kernel/setup.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -88,10 +89,6 @@ setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) - /* min_low_pfn points to the start of DRAM, start_pfn points * to the first DRAM pages after the kernel, and max_low_pfn * to the end of DRAM. diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 6917daa159ab..8c08660b4e5d 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -46,6 +46,7 @@ #include #include #include +#include #include