From 2ef9481e666b4654159ac9f847e6963809e3c470 Mon Sep 17 00:00:00 2001
From: Jon Mason <jdmason@us.ibm.com>
Date: Mon, 23 Jan 2006 10:58:20 -0600
Subject: [PATCH] powerpc: trivial: modify comments to refer to new location of
 files

This patch removes all self references and fixes references to files
in the now defunct arch/ppc64 tree.  I think this accomplises
everything wanted, though there might be a few references I missed.

Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 685c25175d96..3e376202dd48 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -958,7 +958,7 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 	 *
 	 * i386     no
 	 * x86_64   no
-	 * ppc64    yes (see arch/ppc64/kernel/misc.S)
+	 * ppc64    yes (see arch/powerpc/platforms/iseries/misc.S)
 	 *
 	 * This also happens with vm86 emulation in a non-nested manner
 	 * (entries without exits), so this case must be caught.
-- 
cgit v1.2.3


From 1fa44ecad2b86475e038aed81b0bf333fa484f8b Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@steeleye.com>
Date: Thu, 23 Feb 2006 12:43:43 -0600
Subject: [SCSI] add execute_in_process_context() API

We have several points in the SCSI stack (primarily for our device
functions) where we need to guarantee process context, but (given the
place where the last reference was released) we cannot guarantee this.

This API gets around the issue by executing the function directly if
the caller has process context, but scheduling a workqueue to execute
in process context if the caller doesn't have it.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 include/linux/workqueue.h |  6 ++++++
 kernel/workqueue.c        | 29 +++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 86b111300231..957c21c16d62 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -20,6 +20,10 @@ struct work_struct {
 	struct timer_list timer;
 };
 
+struct execute_work {
+	struct work_struct work;
+};
+
 #define __WORK_INITIALIZER(n, f, d) {				\
         .entry	= { &(n).entry, &(n).entry },			\
 	.func = (f),						\
@@ -74,6 +78,8 @@ extern void init_workqueues(void);
 void cancel_rearming_delayed_work(struct work_struct *work);
 void cancel_rearming_delayed_workqueue(struct workqueue_struct *,
 				       struct work_struct *);
+int execute_in_process_context(void (*fn)(void *), void *,
+			       struct execute_work *);
 
 /*
  * Kill off a pending schedule_delayed_work().  Note that the work callback
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b052e2c4c710..e9e464a90376 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -27,6 +27,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/kthread.h>
+#include <linux/hardirq.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -476,6 +477,34 @@ void cancel_rearming_delayed_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(cancel_rearming_delayed_work);
 
+/**
+ * execute_in_process_context - reliably execute the routine with user context
+ * @fn:		the function to execute
+ * @data:	data to pass to the function
+ * @ew:		guaranteed storage for the execute work structure (must
+ *		be available when the work executes)
+ *
+ * Executes the function immediately if process context is available,
+ * otherwise schedules the function for delayed execution.
+ *
+ * Returns:	0 - function was executed
+ *		1 - function was scheduled for execution
+ */
+int execute_in_process_context(void (*fn)(void *data), void *data,
+			       struct execute_work *ew)
+{
+	if (!in_interrupt()) {
+		fn(data);
+		return 0;
+	}
+
+	INIT_WORK(&ew->work, fn, data);
+	schedule_work(&ew->work);
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(execute_in_process_context);
+
 int keventd_up(void)
 {
 	return keventd_wq != NULL;
-- 
cgit v1.2.3


From afc847b7ddcf636e524cf5b0de644bd3a9419a8c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Feb 2006 12:51:55 -0500
Subject: [PATCH] don't do exit_io_context() until we know we won't be doing
 any IO

testcase:

mount /dev/sdb10 /mnt
touch /mnt/tmp/b
umount /mnt
mount /dev/sdb10 /mnt
rm /mnt/tmp/b </mnt/tmp/b
umount /mnt

and watch blkdev_ioc line in /proc/slabinfo.  Vanilla kernel leaks.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 531aadca5530..d1e8d500a7e1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -807,8 +807,6 @@ fastcall NORET_TYPE void do_exit(long code)
 		panic("Attempted to kill the idle task!");
 	if (unlikely(tsk->pid == 1))
 		panic("Attempted to kill init!");
-	if (tsk->io_context)
-		exit_io_context();
 
 	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
 		current->ptrace_message = code;
@@ -822,6 +820,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		printk(KERN_ALERT
 			"Fixing recursive fault but reboot is needed!\n");
+		if (tsk->io_context)
+			exit_io_context();
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
@@ -881,6 +881,9 @@ fastcall NORET_TYPE void do_exit(long code)
 	 */
 	mutex_debug_check_no_locks_held(tsk);
 
+	if (tsk->io_context)
+		exit_io_context();
+
 	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
 	BUG_ON(tsk->flags & PF_DEAD);
-- 
cgit v1.2.3


From 7e7f8a036b8e2b2a300df016da5e7128c8a9192e Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Tue, 31 Jan 2006 16:56:28 -0500
Subject: [PATCH] make vm86 call audit_syscall_exit

hi,

The motivation behind the patch below was to address messages in
/var/log/messages such as:

Jan 31 10:54:15 mets kernel: audit(:0): major=252 name_count=0: freeing
multiple contexts (1)
Jan 31 10:54:15 mets kernel: audit(:0): major=113 name_count=0: freeing
multiple contexts (2)

I can reproduce by running 'get-edid' from:
http://john.fremlin.de/programs/linux/read-edid/.

These messages come about in the log b/c the vm86 calls do not exit via
the normal system call exit paths and thus do not call
'audit_syscall_exit'. The next system call will then free the context for
itself and for the vm86 context, thus generating the above messages. This
patch addresses the issue by simply adding a call to 'audit_syscall_exit'
from the vm86 code.

Besides fixing the above error messages the patch also now allows vm86
system calls to become auditable. This is useful since strace does not
appear to properly record the return values from sys_vm86.

I think this patch is also a step in the right direction in terms of
cleaning up some core auditing code. If we can correct any other paths
that do not properly call the audit exit and entries points, then we can
also eliminate the notion of context chaining.

I've tested this patch by verifying that the log messages no longer
appear, and that the audit records for sys_vm86 appear to be correct.
Also, 'read_edid' produces itentical output.

thanks,

-Jason

Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/i386/kernel/vm86.c | 12 ++++++++++--
 kernel/auditsc.c        |  5 -----
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index f51c894a7da5..aee14fafd13d 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -43,6 +43,7 @@
 #include <linux/smp_lock.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -252,6 +253,7 @@ out:
 static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
 {
 	struct tss_struct *tss;
+	long eax;
 /*
  * make sure the vm86() system call doesn't try to do anything silly
  */
@@ -305,13 +307,19 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	tsk->thread.screen_bitmap = info->screen_bitmap;
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
+	__asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
+	__asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
+
+	/*call audit_syscall_exit since we do not exit via the normal paths */
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, AUDITSC_RESULT(eax), eax);
+
 	__asm__ __volatile__(
-		"xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
 		"movl %0,%%esp\n\t"
 		"movl %1,%%ebp\n\t"
 		"jmp resume_userspace"
 		: /* no outputs */
-		:"r" (&info->regs), "r" (task_thread_info(tsk)) : "ax");
+		:"r" (&info->regs), "r" (task_thread_info(tsk)));
 	/* we never return here */
 }
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d7e7e637b92a..cfaa4a277f08 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -966,11 +966,6 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 	if (context->in_syscall) {
 		struct audit_context *newctx;
 
-#if defined(__NR_vm86) && defined(__NR_vm86old)
-		/* vm86 mode should only be entered once */
-		if (major == __NR_vm86 || major == __NR_vm86old)
-			return;
-#endif
 #if AUDIT_DEBUG
 		printk(KERN_ERR
 		       "audit(:%d) pid=%d in syscall=%d;"
-- 
cgit v1.2.3


From b0dd25a8263dde3c30b0d7d72a8bd92d7ba0e3f5 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 13 Sep 2005 12:47:11 -0700
Subject: [PATCH] AUDIT: kerneldoc for kernel/audit*.c

- add kerneldoc for non-static functions;
- don't init static data to 0;
- limit lines to < 80 columns;
- fix long-format style;
- delete whitespace at end of some lines;

(chrisw: resend and update to current audit-2.6 tree)

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/audit.c   | 134 ++++++++++++++++++++++++++++++++++++++-----------
 kernel/auditsc.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 238 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 0a813d2883e5..973ca5a9e0d6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -72,7 +72,7 @@ static int	audit_failure = AUDIT_FAIL_PRINTK;
  * contains the (non-zero) pid. */
 int		audit_pid;
 
-/* If audit_limit is non-zero, limit the rate of sending audit records
+/* If audit_rate_limit is non-zero, limit the rate of sending audit records
  * to that number per second.  This prevents DoS attacks, but results in
  * audit records being dropped. */
 static int	audit_rate_limit;
@@ -102,7 +102,7 @@ static struct sock *audit_sock;
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
 static DEFINE_SPINLOCK(audit_freelist_lock);
-static int	   audit_freelist_count = 0;
+static int	   audit_freelist_count;
 static LIST_HEAD(audit_freelist);
 
 static struct sk_buff_head audit_skb_queue;
@@ -186,8 +186,14 @@ static inline int audit_rate_check(void)
 	return retval;
 }
 
-/* Emit at least 1 message per second, even if audit_rate_check is
- * throttling. */
+/**
+ * audit_log_lost - conditionally log lost audit message event
+ * @message: the message stating reason for lost audit message
+ *
+ * Emit at least 1 message per second, even if audit_rate_check is
+ * throttling.
+ * Always increment the lost messages counter.
+*/
 void audit_log_lost(const char *message)
 {
 	static unsigned long	last_msg = 0;
@@ -218,7 +224,6 @@ void audit_log_lost(const char *message)
 		       audit_backlog_limit);
 		audit_panic(message);
 	}
-
 }
 
 static int audit_set_rate_limit(int limit, uid_t loginuid)
@@ -302,6 +307,19 @@ static int kauditd_thread(void *dummy)
 	}
 }
 
+/**
+ * audit_send_reply - send an audit reply message via netlink
+ * @pid: process id to send reply to
+ * @seq: sequence number
+ * @type: audit message type
+ * @done: done (last) flag
+ * @multi: multi-part message flag
+ * @payload: payload data
+ * @size: payload size
+ *
+ * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * No failure notifications.
+ */
 void audit_send_reply(int pid, int seq, int type, int done, int multi,
 		      void *payload, int size)
 {
@@ -376,7 +394,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (err)
 		return err;
 
-	/* As soon as there's any sign of userspace auditd, start kauditd to talk to it */
+	/* As soon as there's any sign of userspace auditd,
+	 * start kauditd to talk to it */
 	if (!kauditd_task)
 		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 	if (IS_ERR(kauditd_task)) {
@@ -469,9 +488,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	return err < 0 ? err : 0;
 }
 
-/* Get message from skb (based on rtnetlink_rcv_skb).  Each message is
+/*
+ * Get message from skb (based on rtnetlink_rcv_skb).  Each message is
  * processed by audit_receive_msg.  Malformed skbs with wrong length are
- * discarded silently.  */
+ * discarded silently.
+ */
 static void audit_receive_skb(struct sk_buff *skb)
 {
 	int		err;
@@ -600,7 +621,10 @@ err:
 	return NULL;
 }
 
-/* Compute a serial number for the audit record.  Audit records are
+/**
+ * audit_serial - compute a serial number for the audit record
+ *
+ * Compute a serial number for the audit record.  Audit records are
  * written to user-space as soon as they are generated, so a complete
  * audit record may be written in several pieces.  The timestamp of the
  * record and this serial number are used by the user-space tools to
@@ -612,8 +636,8 @@ err:
  * audit context (for those records that have a context), and emit them
  * all at syscall exit.  However, this could delay the reporting of
  * significant errors until syscall exit (or never, if the system
- * halts). */
-
+ * halts).
+ */
 unsigned int audit_serial(void)
 {
 	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
@@ -649,6 +673,21 @@ static inline void audit_get_stamp(struct audit_context *ctx,
  * will be written at syscall exit.  If there is no associated task, tsk
  * should be NULL. */
 
+/**
+ * audit_log_start - obtain an audit buffer
+ * @ctx: audit_context (may be NULL)
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ *
+ * Returns audit_buffer pointer on success or NULL on error.
+ *
+ * Obtain an audit buffer.  This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format.  If the task (ctx) is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit.  If there is no associated task, then
+ * task context (ctx) should be NULL.
+ */
 struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 				     int type)
 {
@@ -713,6 +752,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 /**
  * audit_expand - expand skb in the audit buffer
  * @ab: audit_buffer
+ * @extra: space to add at tail of the skb
  *
  * Returns 0 (no space) on failed expansion, or available space if
  * successful.
@@ -729,10 +769,12 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
 	return skb_tailroom(skb);
 }
 
-/* Format an audit message into the audit buffer.  If there isn't enough
+/*
+ * Format an audit message into the audit buffer.  If there isn't enough
  * room in the audit buffer, more room will be allocated and vsnprint
  * will be called a second time.  Currently, we assume that a printk
- * can't format message larger than 1024 bytes, so we don't either. */
+ * can't format message larger than 1024 bytes, so we don't either.
+ */
 static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 			      va_list args)
 {
@@ -757,7 +799,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
 		/* The printk buffer is 1024 bytes long, so if we get
 		 * here and AUDIT_BUFSIZ is at least 1024, then we can
 		 * log everything that printk could have logged. */
-		avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+		avail = audit_expand(ab,
+			max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
 		if (!avail)
 			goto out;
 		len = vsnprintf(skb->tail, avail, fmt, args2);
@@ -768,8 +811,14 @@ out:
 	return;
 }
 
-/* Format a message into the audit buffer.  All the work is done in
- * audit_log_vformat. */
+/**
+ * audit_log_format - format a message into the audit buffer.
+ * @ab: audit_buffer
+ * @fmt: format string
+ * @...: optional parameters matching @fmt string
+ *
+ * All the work is done in audit_log_vformat.
+ */
 void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
 {
 	va_list args;
@@ -781,9 +830,18 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
 	va_end(args);
 }
 
-/* This function will take the passed buf and convert it into a string of
- * ascii hex digits. The new string is placed onto the skb. */
-void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, 
+/**
+ * audit_log_hex - convert a buffer to hex and append it to the audit skb
+ * @ab: the audit_buffer
+ * @buf: buffer to convert to hex
+ * @len: length of @buf to be converted
+ *
+ * No return value; failure to expand is silently ignored.
+ *
+ * This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb.
+ */
+void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 		size_t len)
 {
 	int i, avail, new_len;
@@ -812,10 +870,16 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 	skb_put(skb, len << 1); /* new string is twice the old string */
 }
 
-/* This code will escape a string that is passed to it if the string
- * contains a control character, unprintable character, double quote mark, 
+/**
+ * audit_log_unstrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark,
  * or a space. Unescaped strings will start and end with a double quote mark.
- * Strings that are escaped are printed in hex (2 digits per char). */
+ * Strings that are escaped are printed in hex (2 digits per char).
+ */
 void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 {
 	const unsigned char *p = string;
@@ -854,10 +918,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	kfree(path);
 }
 
-/* The netlink_* functions cannot be called inside an irq context, so
- * the audit buffer is places on a queue and a tasklet is scheduled to
+/**
+ * audit_log_end - end one audit record
+ * @ab: the audit_buffer
+ *
+ * The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is placed on a queue and a tasklet is scheduled to
  * remove them from the queue outside the irq context.  May be called in
- * any context. */
+ * any context.
+ */
 void audit_log_end(struct audit_buffer *ab)
 {
 	if (!ab)
@@ -878,9 +947,18 @@ void audit_log_end(struct audit_buffer *ab)
 	audit_buffer_free(ab);
 }
 
-/* Log an audit record.  This is a convenience function that calls
- * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
- * called in any context. */
+/**
+ * audit_log - Log an audit record
+ * @ctx: audit context
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ * @fmt: format string to use
+ * @...: variable parameters matching the format string
+ *
+ * This is a convenience function that calls audit_log_start,
+ * audit_log_vformat, and audit_log_end.  It may be called
+ * in any context.
+ */
 void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 
 	       const char *fmt, ...)
 {
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cfaa4a277f08..51a4f58a4d81 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -330,6 +330,15 @@ static int audit_list_rules(void *_dest)
 	return 0;
 }
 
+/**
+ * audit_receive_filter - apply all rules to the specified message type
+ * @type: audit message type
+ * @pid: target pid for netlink audit messages
+ * @uid: target uid for netlink audit messages
+ * @seq: netlink audit message sequence (serial) number
+ * @data: payload data
+ * @loginuid: loginuid of sender
+ */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 							uid_t loginuid)
 {
@@ -527,7 +536,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
 /* At syscall entry and exit time, this filter is called if the
  * audit_state is not low enough that auditing cannot take place, but is
  * also not high enough that we already know we have to write an audit
- * record (i.e., the state is AUDIT_SETUP_CONTEXT or  AUDIT_BUILD_CONTEXT).
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
  */
 static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 					     struct audit_context *ctx,
@@ -721,10 +730,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
 	return context;
 }
 
-/* Filter on the task information and allocate a per-task audit context
+/**
+ * audit_alloc - allocate an audit context block for a task
+ * @tsk: task
+ *
+ * Filter on the task information and allocate a per-task audit context
  * if necessary.  Doing so turns on system call auditing for the
  * specified task.  This is called from copy_process, so no lock is
- * needed. */
+ * needed.
+ */
 int audit_alloc(struct task_struct *tsk)
 {
 	struct audit_context *context;
@@ -911,8 +925,12 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 	}
 }
 
-/* Free a per-task audit context.  Called from copy_process and
- * __put_task_struct. */
+/**
+ * audit_free - free a per-task audit context
+ * @tsk: task whose audit context block to free
+ *
+ * Called from copy_process and __put_task_struct.
+ */
 void audit_free(struct task_struct *tsk)
 {
 	struct audit_context *context;
@@ -934,13 +952,24 @@ void audit_free(struct task_struct *tsk)
 	audit_free_context(context);
 }
 
-/* Fill in audit context at syscall entry.  This only happens if the
+/**
+ * audit_syscall_entry - fill in an audit record at syscall entry
+ * @tsk: task being audited
+ * @arch: architecture type
+ * @major: major syscall type (function)
+ * @a1: additional syscall register 1
+ * @a2: additional syscall register 2
+ * @a3: additional syscall register 3
+ * @a4: additional syscall register 4
+ *
+ * Fill in audit context at syscall entry.  This only happens if the
  * audit context was created when the task was created and the state or
  * filters demand the audit context be built.  If the state from the
  * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
  * then the record will be written at syscall exit time (otherwise, it
  * will only be written if another part of the kernel requests that it
- * be written). */
+ * be written).
+ */
 void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 			 unsigned long a1, unsigned long a2,
 			 unsigned long a3, unsigned long a4)
@@ -950,7 +979,8 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 
 	BUG_ON(!context);
 
-	/* This happens only on certain architectures that make system
+	/*
+	 * This happens only on certain architectures that make system
 	 * calls in kernel_thread via the entry.S interface, instead of
 	 * with direct calls.  (If you are porting to a new
 	 * architecture, hitting this condition can indicate that you
@@ -1009,11 +1039,18 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
 	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
 }
 
-/* Tear down after system call.  If the audit context has been marked as
+/**
+ * audit_syscall_exit - deallocate audit context after a system call
+ * @tsk: task being audited
+ * @valid: success/failure flag
+ * @return_code: syscall return value
+ *
+ * Tear down after system call.  If the audit context has been marked as
  * auditable (either because of the AUDIT_RECORD_CONTEXT state from
  * filtering, or because some other part of the kernel write an audit
  * message), then write out the syscall information.  In call cases,
- * free the names stored from getname(). */
+ * free the names stored from getname().
+ */
 void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 {
 	struct audit_context *context;
@@ -1048,7 +1085,13 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
 	put_task_struct(tsk);
 }
 
-/* Add a name to the list.  Called from fs/namei.c:getname(). */
+/**
+ * audit_getname - add a name to the list
+ * @name: name to add
+ *
+ * Add a name to the list of audit names for this context.
+ * Called from fs/namei.c:getname().
+ */
 void audit_getname(const char *name)
 {
 	struct audit_context *context = current->audit_context;
@@ -1077,10 +1120,13 @@ void audit_getname(const char *name)
 		
 }
 
-/* Intercept a putname request.  Called from
- * include/linux/fs.h:putname().  If we have stored the name from
- * getname in the audit context, then we delay the putname until syscall
- * exit. */
+/* audit_putname - intercept a putname request
+ * @name: name to intercept and delay for putname
+ *
+ * If we have stored the name from getname in the audit context,
+ * then we delay the putname until syscall exit.
+ * Called from include/linux/fs.h:putname().
+ */
 void audit_putname(const char *name)
 {
 	struct audit_context *context = current->audit_context;
@@ -1117,8 +1163,14 @@ void audit_putname(const char *name)
 #endif
 }
 
-/* Store the inode and device from a lookup.  Called from
- * fs/namei.c:path_lookup(). */
+/**
+ * audit_inode - store the inode and device from a lookup
+ * @name: name being audited
+ * @inode: inode being audited
+ * @flags: lookup flags (as used in path_lookup())
+ *
+ * Called from fs/namei.c:path_lookup().
+ */
 void audit_inode(const char *name, const struct inode *inode, unsigned flags)
 {
 	int idx;
@@ -1154,6 +1206,14 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags)
 	context->names[idx].rdev  = inode->i_rdev;
 }
 
+/**
+ * auditsc_get_stamp - get local copies of audit_context values
+ * @ctx: audit_context for the task
+ * @t: timespec to store time recorded in the audit_context
+ * @serial: serial value that is recorded in the audit_context
+ *
+ * Also sets the context as auditable.
+ */
 void auditsc_get_stamp(struct audit_context *ctx,
 		       struct timespec *t, unsigned int *serial)
 {
@@ -1165,6 +1225,15 @@ void auditsc_get_stamp(struct audit_context *ctx,
 	ctx->auditable = 1;
 }
 
+/**
+ * audit_set_loginuid - set a task's audit_context loginuid
+ * @task: task whose audit context is being modified
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
 int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 {
 	if (task->audit_context) {
@@ -1183,11 +1252,26 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
 	return 0;
 }
 
+/**
+ * audit_get_loginuid - get the loginuid for an audit_context
+ * @ctx: the audit_context
+ *
+ * Returns the context's loginuid or -1 if @ctx is NULL.
+ */
 uid_t audit_get_loginuid(struct audit_context *ctx)
 {
 	return ctx ? ctx->loginuid : -1;
 }
 
+/**
+ * audit_ipc_perms - record audit data for ipc
+ * @qbytes: msgq bytes
+ * @uid: msgq user id
+ * @gid: msgq group id
+ * @mode: msgq mode (permissions)
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
 int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 {
 	struct audit_aux_data_ipcctl *ax;
@@ -1211,6 +1295,13 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 	return 0;
 }
 
+/**
+ * audit_socketcall - record audit data for sys_socketcall
+ * @nargs: number of args
+ * @args: args array
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
 int audit_socketcall(int nargs, unsigned long *args)
 {
 	struct audit_aux_data_socketcall *ax;
@@ -1232,6 +1323,13 @@ int audit_socketcall(int nargs, unsigned long *args)
 	return 0;
 }
 
+/**
+ * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto
+ * @len: data length in user space
+ * @a: data address in kernel space
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
 int audit_sockaddr(int len, void *a)
 {
 	struct audit_aux_data_sockaddr *ax;
@@ -1253,6 +1351,15 @@ int audit_sockaddr(int len, void *a)
 	return 0;
 }
 
+/**
+ * audit_avc_path - record the granting or denial of permissions
+ * @dentry: dentry to record
+ * @mnt: mnt to record
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ *
+ * Called from security/selinux/avc.c::avc_audit()
+ */
 int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
 {
 	struct audit_aux_data_path *ax;
@@ -1274,6 +1381,14 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
 	return 0;
 }
 
+/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
 void audit_signal_info(int sig, struct task_struct *t)
 {
 	extern pid_t audit_sig_pid;
@@ -1290,4 +1405,3 @@ void audit_signal_info(int sig, struct task_struct *t)
 		}
 	}
 }
-
-- 
cgit v1.2.3


From b63862f46547487388e582e8ac9083830d34f058 Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Thu, 3 Nov 2005 15:41:46 +0000
Subject: [PATCH] Filter rule comparators

Currently, audit only supports the "=" and "!=" operators in the -F
filter rules.

This patch reworks the support for "=" and "!=", and adds support
for ">", ">=", "<", and "<=".

This turned out to be a pretty clean, and simply process.  I ended up
using the high order bits of the "field", as suggested by Steve and Amy.
This allowed for no changes whatsoever to the netlink communications.
See the documentation within the patch in the include/linux/audit.h
area, where there is a table that explains the reasoning of the bitmask
assignments clearly.

The patch adds a new function, audit_comparator(left, op, right).
This function will perform the specified comparison (op, which defaults
to "==" for backward compatibility) between two values (left and right).
If the negate bit is on, it will negate whatever that result was.  This
value is returned.

Signed-off-by: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h |  29 ++++++++++++-
 kernel/auditsc.c      | 117 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 103 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index da3c01955f3d..2408cb77899c 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -98,6 +98,13 @@
 #define AUDIT_WORD(nr) ((__u32)((nr)/32))
 #define AUDIT_BIT(nr)  (1 << ((nr) - AUDIT_WORD(nr)*32))
 
+/* This bitmask is used to validate user input.  It represents all bits that
+ * are currently used in an audit field constant understood by the kernel.
+ * If you are adding a new #define AUDIT_<whatever>, please ensure that
+ * AUDIT_UNUSED_BITS is updated if need be. */
+#define AUDIT_UNUSED_BITS	0x0FFFFC00
+
+
 /* Rule fields */
 				/* These are useful when checking the
 				 * task structure at task creation time
@@ -128,8 +135,28 @@
 #define AUDIT_ARG2      (AUDIT_ARG0+2)
 #define AUDIT_ARG3      (AUDIT_ARG0+3)
 
-#define AUDIT_NEGATE    0x80000000
+#define AUDIT_NEGATE			0x80000000
 
+/* These are the supported operators.
+ *	4  2  1
+ *	=  >  <
+ *	-------
+ *	0  0  0		0	nonsense
+ *	0  0  1		1	<
+ *	0  1  0		2	>
+ *	0  1  1		3	!=
+ *	1  0  0		4	=
+ *	1  0  1		5	<=
+ *	1  1  0		6	>=
+ *	1  1  1		7	all operators
+ */
+#define AUDIT_LESS_THAN			0x10000000
+#define AUDIT_GREATER_THAN		0x20000000
+#define AUDIT_NOT_EQUAL			0x30000000
+#define AUDIT_EQUAL			0x40000000
+#define AUDIT_LESS_THAN_OR_EQUAL	(AUDIT_LESS_THAN|AUDIT_EQUAL)
+#define AUDIT_GREATER_THAN_OR_EQUAL	(AUDIT_GREATER_THAN|AUDIT_EQUAL)
+#define AUDIT_OPERATORS			(AUDIT_EQUAL|AUDIT_NOT_EQUAL)
 
 /* Status symbols */
 				/* Mask values */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 51a4f58a4d81..95076fa12202 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2,6 +2,7 @@
  * Handles all system-call specific auditing features.
  *
  * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright (C) 2005 IBM Corporation
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify
@@ -27,6 +28,9 @@
  * this file -- see entry.S) is based on a GPL'd patch written by
  * okir@suse.de and Copyright 2003 SuSE Linux AG.
  *
+ * The support of additional filter rules compares (>, <, >=, <=) was
+ * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
+ *
  */
 
 #include <linux/init.h>
@@ -252,6 +256,7 @@ static inline int audit_add_rule(struct audit_rule *rule,
 				  struct list_head *list)
 {
 	struct audit_entry  *entry;
+	int i;
 
 	/* Do not use the _rcu iterator here, since this is the only
 	 * addition routine. */
@@ -261,6 +266,16 @@ static inline int audit_add_rule(struct audit_rule *rule,
 		}
 	}
 
+	for (i = 0; i < rule->field_count; i++) {
+		if (rule->fields[i] & AUDIT_UNUSED_BITS)
+			return -EINVAL;
+		if ( rule->fields[i] & AUDIT_NEGATE )
+			rule->fields[i] |= AUDIT_NOT_EQUAL;
+		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
+			rule->fields[i] |= AUDIT_EQUAL;
+		rule->fields[i] &= (~AUDIT_NEGATE);
+	}
+
 	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
 		return -ENOMEM;
 	if (audit_copy_rule(&entry->rule, rule)) {
@@ -394,6 +409,26 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 	return err;
 }
 
+static int audit_comparator(const u32 left, const u32 op, const u32 right)
+{
+	switch (op) {
+	case AUDIT_EQUAL:
+		return (left == right);
+	case AUDIT_NOT_EQUAL:
+		return (left != right);
+	case AUDIT_LESS_THAN:
+		return (left < right);
+	case AUDIT_LESS_THAN_OR_EQUAL:
+		return (left <= right);
+	case AUDIT_GREATER_THAN:
+		return (left > right);
+	case AUDIT_GREATER_THAN_OR_EQUAL:
+		return (left >= right);
+	default:
+		return -EINVAL;
+	}
+}
+
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
 static int audit_filter_rules(struct task_struct *tsk,
@@ -404,62 +439,63 @@ static int audit_filter_rules(struct task_struct *tsk,
 	int i, j;
 
 	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
 		u32 value  = rule->values[i];
 		int result = 0;
 
 		switch (field) {
 		case AUDIT_PID:
-			result = (tsk->pid == value);
+			result = audit_comparator(tsk->pid, op, value);
 			break;
 		case AUDIT_UID:
-			result = (tsk->uid == value);
+			result = audit_comparator(tsk->uid, op, value);
 			break;
 		case AUDIT_EUID:
-			result = (tsk->euid == value);
+			result = audit_comparator(tsk->euid, op, value);
 			break;
 		case AUDIT_SUID:
-			result = (tsk->suid == value);
+			result = audit_comparator(tsk->suid, op, value);
 			break;
 		case AUDIT_FSUID:
-			result = (tsk->fsuid == value);
+			result = audit_comparator(tsk->fsuid, op, value);
 			break;
 		case AUDIT_GID:
-			result = (tsk->gid == value);
+			result = audit_comparator(tsk->gid, op, value);
 			break;
 		case AUDIT_EGID:
-			result = (tsk->egid == value);
+			result = audit_comparator(tsk->egid, op, value);
 			break;
 		case AUDIT_SGID:
-			result = (tsk->sgid == value);
+			result = audit_comparator(tsk->sgid, op, value);
 			break;
 		case AUDIT_FSGID:
-			result = (tsk->fsgid == value);
+			result = audit_comparator(tsk->fsgid, op, value);
 			break;
 		case AUDIT_PERS:
-			result = (tsk->personality == value);
+			result = audit_comparator(tsk->personality, op, value);
 			break;
 		case AUDIT_ARCH:
-			if (ctx) 
-				result = (ctx->arch == value);
+ 			if (ctx)
+				result = audit_comparator(ctx->arch, op, value);
 			break;
 
 		case AUDIT_EXIT:
 			if (ctx && ctx->return_valid)
-				result = (ctx->return_code == value);
+				result = audit_comparator(ctx->return_code, op, value);
 			break;
 		case AUDIT_SUCCESS:
 			if (ctx && ctx->return_valid) {
 				if (value)
-					result = (ctx->return_valid == AUDITSC_SUCCESS);
+					result = audit_comparator(ctx->return_valid, op, AUDITSC_SUCCESS);
 				else
-					result = (ctx->return_valid == AUDITSC_FAILURE);
+					result = audit_comparator(ctx->return_valid, op, AUDITSC_FAILURE);
 			}
 			break;
 		case AUDIT_DEVMAJOR:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (MAJOR(ctx->names[j].dev)==value) {
+					if (audit_comparator(MAJOR(ctx->names[j].dev),	op, value)) {
 						++result;
 						break;
 					}
@@ -469,7 +505,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_DEVMINOR:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (MINOR(ctx->names[j].dev)==value) {
+					if (audit_comparator(MINOR(ctx->names[j].dev), op, value)) {
 						++result;
 						break;
 					}
@@ -479,7 +515,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_INODE:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (ctx->names[j].ino == value) {
+					if (audit_comparator(ctx->names[j].ino, op, value)) {
 						++result;
 						break;
 					}
@@ -489,19 +525,17 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_LOGINUID:
 			result = 0;
 			if (ctx)
-				result = (ctx->loginuid == value);
+				result = audit_comparator(ctx->loginuid, op, value);
 			break;
 		case AUDIT_ARG0:
 		case AUDIT_ARG1:
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
 			if (ctx)
-				result = (ctx->argv[field-AUDIT_ARG0]==value);
+				result = audit_comparator(ctx->argv[field-AUDIT_ARG0], op, value);
 			break;
 		}
 
-		if (rule->fields[i] & AUDIT_NEGATE)
-			result = !result;
 		if (!result)
 			return 0;
 	}
@@ -550,49 +584,48 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 
 	rcu_read_lock();
 	if (!list_empty(list)) {
-		    int word = AUDIT_WORD(ctx->major);
-		    int bit  = AUDIT_BIT(ctx->major);
-
-		    list_for_each_entry_rcu(e, list, list) {
-			    if ((e->rule.mask[word] & bit) == bit
-				&& audit_filter_rules(tsk, &e->rule, ctx, &state)) {
-				    rcu_read_unlock();
-				    return state;
-			    }
-		    }
+		int word = AUDIT_WORD(ctx->major);
+		int bit  = AUDIT_BIT(ctx->major);
+
+		list_for_each_entry_rcu(e, list, list) {
+			if ((e->rule.mask[word] & bit) == bit
+					&& audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+				rcu_read_unlock();
+				return state;
+			}
+		}
 	}
 	rcu_read_unlock();
 	return AUDIT_BUILD_CONTEXT;
 }
 
 static int audit_filter_user_rules(struct netlink_skb_parms *cb,
-			      struct audit_rule *rule,
-			      enum audit_state *state)
+				   struct audit_rule *rule,
+				   enum audit_state *state)
 {
 	int i;
 
 	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
 		u32 value  = rule->values[i];
 		int result = 0;
 
 		switch (field) {
 		case AUDIT_PID:
-			result = (cb->creds.pid == value);
+			result = audit_comparator(cb->creds.pid, op, value);
 			break;
 		case AUDIT_UID:
-			result = (cb->creds.uid == value);
+			result = audit_comparator(cb->creds.uid, op, value);
 			break;
 		case AUDIT_GID:
-			result = (cb->creds.gid == value);
+			result = audit_comparator(cb->creds.gid, op, value);
 			break;
 		case AUDIT_LOGINUID:
-			result = (cb->loginuid == value);
+			result = audit_comparator(cb->loginuid, op, value);
 			break;
 		}
 
-		if (rule->fields[i] & AUDIT_NEGATE)
-			result = !result;
 		if (!result)
 			return 0;
 	}
-- 
cgit v1.2.3


From 90d526c074ae5db484388da56c399acf892b6c17 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Thu, 3 Nov 2005 15:48:08 +0000
Subject: [PATCH] Define new range of userspace messages.

The attached patch updates various items for the new user space
messages. Please apply.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/audit.h       | 19 +++++++++++++++----
 kernel/audit.c              |  2 ++
 security/selinux/nlmsgtab.c |  6 ++++--
 3 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 2408cb77899c..fd65078e794a 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -33,11 +33,20 @@
  * 1200 - 1299 messages internal to the audit daemon
  * 1300 - 1399 audit event messages
  * 1400 - 1499 SE Linux use
- * 1500 - 1999 future use
- * 2000 is for otherwise unclassified kernel audit messages
+ * 1500 - 1599 kernel LSPP events
+ * 1600 - 1699 kernel crypto events
+ * 1700 - 1999 future kernel use (maybe integrity labels and related events)
+ * 2000 is for otherwise unclassified kernel audit messages (legacy)
+ * 2001 - 2099 unused (kernel)
+ * 2100 - 2199 user space anomaly records
+ * 2200 - 2299 user space actions taken in response to anomalies
+ * 2300 - 2399 user space generated LSPP events
+ * 2400 - 2499 user space crypto events
+ * 2500 - 2999 future user space (maybe integrity labels and related events)
  *
- * Messages from 1000-1199 are bi-directional. 1200-1299 are exclusively user
- * space. Anything over that is kernel --> user space communication.
+ * Messages from 1000-1199 are bi-directional. 1200-1299 & 2100 - 2999 are
+ * exclusively user space. 1300-2099 is kernel --> user space 
+ * communication.
  */
 #define AUDIT_GET		1000	/* Get status */
 #define AUDIT_SET		1001	/* Set status (enable/disable/auditd) */
@@ -54,6 +63,8 @@
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
 #define AUDIT_LAST_USER_MSG	1199
+#define AUDIT_FIRST_USER_MSG2	2100	/* More user space messages */
+#define AUDIT_LAST_USER_MSG2	2999
  
 #define AUDIT_DAEMON_START      1200    /* Daemon startup record */
 #define AUDIT_DAEMON_END        1201    /* Daemon normal stop record */
diff --git a/kernel/audit.c b/kernel/audit.c
index 973ca5a9e0d6..6d61dd79a605 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -369,6 +369,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+	case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
 		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
 			err = -EPERM;
 		break;
@@ -449,6 +450,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+	case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
 		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
 			return 0;
 
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 69b9329b2054..d7c0e912c5f3 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -145,8 +145,10 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
 		break;
 
 	case SECCLASS_NETLINK_AUDIT_SOCKET:
-		if (nlmsg_type >= AUDIT_FIRST_USER_MSG &&
-		    nlmsg_type <= AUDIT_LAST_USER_MSG) {
+		if ((nlmsg_type >= AUDIT_FIRST_USER_MSG &&
+		     nlmsg_type <= AUDIT_LAST_USER_MSG) ||
+		    (nlmsg_type >= AUDIT_FIRST_USER_MSG2 &&
+                     nlmsg_type <= AUDIT_LAST_USER_MSG2)) {
 			*perm = NETLINK_AUDIT_SOCKET__NLMSG_RELAY;
 		} else {
 			err = nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms,
-- 
cgit v1.2.3


From f38aa94224c5517a40ba56d453779f70d3229803 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 3 Nov 2005 15:57:06 +0000
Subject: [PATCH] Pass dentry, not just name, in fsnotify creation hooks.

The audit hooks (to be added shortly) will want to see dentry->d_inode
too, not just the name.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/namei.c               | 10 +++++-----
 include/linux/fsnotify.h |  9 +++++----
 kernel/auditsc.c         |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/fs/namei.c b/fs/namei.c
index 8dc2b038d5d9..f6619af9e957 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1472,7 +1472,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	DQUOT_INIT(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
 	if (!error)
-		fsnotify_create(dir, dentry->d_name.name);
+		fsnotify_create(dir, dentry);
 	return error;
 }
 
@@ -1793,7 +1793,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	DQUOT_INIT(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
 	if (!error)
-		fsnotify_create(dir, dentry->d_name.name);
+		fsnotify_create(dir, dentry);
 	return error;
 }
 
@@ -1870,7 +1870,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DQUOT_INIT(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
 	if (!error)
-		fsnotify_mkdir(dir, dentry->d_name.name);
+		fsnotify_mkdir(dir, dentry);
 	return error;
 }
 
@@ -2133,7 +2133,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
 	DQUOT_INIT(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
 	if (!error)
-		fsnotify_create(dir, dentry->d_name.name);
+		fsnotify_create(dir, dentry);
 	return error;
 }
 
@@ -2210,7 +2210,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&old_dentry->d_inode->i_mutex);
 	if (!error)
-		fsnotify_create(dir, new_dentry->d_name.name);
+		fsnotify_create(dir, new_dentry);
 	return error;
 }
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 03b8e7932b83..b5ff64d2f092 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -70,19 +70,20 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 /*
  * fsnotify_create - 'name' was linked in
  */
-static inline void fsnotify_create(struct inode *inode, const char *name)
+static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
-	inotify_inode_queue_event(inode, IN_CREATE, 0, name);
+	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name);
 }
 
 /*
  * fsnotify_mkdir - directory 'name' was created
  */
-static inline void fsnotify_mkdir(struct inode *inode, const char *name)
+static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
-	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
+	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
+				  dentry->d_name.name);
 }
 
 /*
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 95076fa12202..55ba331757c5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -515,7 +515,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_INODE:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(ctx->names[j].ino, op, value)) {
+					if ( audit_comparator(ctx->names[j].ino, op, value)) {
 						++result;
 						break;
 					}
-- 
cgit v1.2.3


From 73241ccca0f7786933f1d31b3d86f2456549953a Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 3 Nov 2005 16:00:25 +0000
Subject: [PATCH] Collect more inode information during syscall processing.

This patch augments the collection of inode info during syscall
processing. It represents part of the functionality that was provided
by the auditfs patch included in RHEL4.

Specifically, it:

- Collects information for target inodes created or removed during
  syscalls.  Previous code only collects information for the target
  inode's parent.

- Adds the audit_inode() hook to syscalls that operate on a file
  descriptor (e.g. fchown), enabling audit to do inode filtering for
  these calls.

- Modifies filtering code to check audit context for either an inode #
  or a parent inode # matching a given rule.

- Modifies logging to provide inode # for both parent and child.

- Protect debug info from NULL audit_names.name.

[AV: folded a later typo fix from the same author]

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c               |   1 +
 fs/open.c                |   8 ++-
 fs/xattr.c               |  11 +++-
 include/linux/audit.h    |  18 +++++-
 include/linux/fsnotify.h |   5 ++
 kernel/auditsc.c         | 142 +++++++++++++++++++++++++++++++++++++++--------
 6 files changed, 157 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/fs/namei.c b/fs/namei.c
index f6619af9e957..51cfc9c3ed00 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1353,6 +1353,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 		return -ENOENT;
 
 	BUG_ON(victim->d_parent->d_inode != dir);
+	audit_inode_child(victim->d_name.name, victim->d_inode, dir->i_ino);
 
 	error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
 	if (error)
diff --git a/fs/open.c b/fs/open.c
index 70e0230d8e77..70510004d06e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -27,6 +27,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/rcupdate.h>
+#include <linux/audit.h>
 
 #include <asm/unistd.h>
 
@@ -626,6 +627,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	dentry = file->f_dentry;
 	inode = dentry->d_inode;
 
+	audit_inode(NULL, inode, 0);
+
 	err = -EROFS;
 	if (IS_RDONLY(inode))
 		goto out_putf;
@@ -775,7 +778,10 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
 
 	file = fget(fd);
 	if (file) {
-		error = chown_common(file->f_dentry, user, group);
+		struct dentry * dentry;
+		dentry = file->f_dentry;
+		audit_inode(NULL, dentry->d_inode, 0);
+		error = chown_common(dentry, user, group);
 		fput(file);
 	}
 	return error;
diff --git a/fs/xattr.c b/fs/xattr.c
index 80eca7d3d69f..e416190f5e9c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -17,6 +17,7 @@
 #include <linux/syscalls.h>
 #include <linux/module.h>
 #include <linux/fsnotify.h>
+#include <linux/audit.h>
 #include <asm/uaccess.h>
 
 
@@ -234,12 +235,15 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
 	      size_t size, int flags)
 {
 	struct file *f;
+	struct dentry *dentry;
 	int error = -EBADF;
 
 	f = fget(fd);
 	if (!f)
 		return error;
-	error = setxattr(f->f_dentry, name, value, size, flags);
+	dentry = f->f_dentry;
+	audit_inode(NULL, dentry->d_inode, 0);
+	error = setxattr(dentry, name, value, size, flags);
 	fput(f);
 	return error;
 }
@@ -458,12 +462,15 @@ asmlinkage long
 sys_fremovexattr(int fd, char __user *name)
 {
 	struct file *f;
+	struct dentry *dentry;
 	int error = -EBADF;
 
 	f = fget(fd);
 	if (!f)
 		return error;
-	error = removexattr(f->f_dentry, name);
+	dentry = f->f_dentry;
+	audit_inode(NULL, dentry->d_inode, 0);
+	error = removexattr(dentry, name);
 	fput(f);
 	return error;
 }
diff --git a/include/linux/audit.h b/include/linux/audit.h
index fd65078e794a..739b954cb242 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -260,7 +260,20 @@ extern void audit_syscall_entry(struct task_struct *task, int arch,
 extern void audit_syscall_exit(struct task_struct *task, int failed, long return_code);
 extern void audit_getname(const char *name);
 extern void audit_putname(const char *name);
-extern void audit_inode(const char *name, const struct inode *inode, unsigned flags);
+extern void __audit_inode(const char *name, const struct inode *inode, unsigned flags);
+extern void __audit_inode_child(const char *dname, const struct inode *inode,
+				unsigned long pino);
+static inline void audit_inode(const char *name, const struct inode *inode,
+			       unsigned flags) {
+	if (unlikely(current->audit_context))
+		__audit_inode(name, inode, flags);
+}
+static inline void audit_inode_child(const char *dname, 
+				     const struct inode *inode, 
+				     unsigned long pino) {
+	if (unlikely(current->audit_context))
+		__audit_inode_child(dname, inode, pino);
+}
 
 				/* Private API (for audit.c only) */
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
@@ -283,7 +296,10 @@ extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 #define audit_syscall_exit(t,f,r) do { ; } while (0)
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
+#define __audit_inode(n,i,f) do { ; } while (0)
+#define __audit_inode_child(d,i,p) do { ; } while (0)
 #define audit_inode(n,i,f) do { ; } while (0)
+#define audit_inode_child(d,i,p) do { ; } while (0)
 #define audit_receive_filter(t,p,u,s,d,l) ({ -EOPNOTSUPP; })
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index b5ff64d2f092..94919c376a72 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -15,6 +15,7 @@
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/audit.h>
 
 /*
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
@@ -45,6 +46,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
 	}
+	audit_inode_child(old_name, source, old_dir->i_ino);
+	audit_inode_child(new_name, target, new_dir->i_ino);
 }
 
 /*
@@ -74,6 +77,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name);
+	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
 }
 
 /*
@@ -84,6 +88,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
 				  dentry->d_name.name);
+	audit_inode_child(dentry->d_name.name, dentry->d_inode, inode->i_ino);
 }
 
 /*
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 55ba331757c5..73f932b7deb6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2,6 +2,7 @@
  * Handles all system-call specific auditing features.
  *
  * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
  * Copyright (C) 2005 IBM Corporation
  * All Rights Reserved.
  *
@@ -31,11 +32,16 @@
  * The support of additional filter rules compares (>, <, >=, <=) was
  * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
  *
+ * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
+ * filesystem information.
  */
 
 #include <linux/init.h>
 #include <asm/types.h>
 #include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -97,12 +103,12 @@ enum audit_state {
 struct audit_names {
 	const char	*name;
 	unsigned long	ino;
+	unsigned long	pino;
 	dev_t		dev;
 	umode_t		mode;
 	uid_t		uid;
 	gid_t		gid;
 	dev_t		rdev;
-	unsigned	flags;
 };
 
 struct audit_aux_data {
@@ -515,7 +521,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_INODE:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if ( audit_comparator(ctx->names[j].ino, op, value)) {
+					if (audit_comparator(ctx->names[j].ino, op, value) ||
+					    audit_comparator(ctx->names[j].pino, op, value)) {
 						++result;
 						break;
 					}
@@ -696,17 +703,17 @@ static inline void audit_free_names(struct audit_context *context)
 #if AUDIT_DEBUG == 2
 	if (context->auditable
 	    ||context->put_count + context->ino_count != context->name_count) {
-		printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+		printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
 		       " name_count=%d put_count=%d"
 		       " ino_count=%d [NOT freeing]\n",
-		       __LINE__,
+		       __FILE__, __LINE__,
 		       context->serial, context->major, context->in_syscall,
 		       context->name_count, context->put_count,
 		       context->ino_count);
 		for (i = 0; i < context->name_count; i++)
 			printk(KERN_ERR "names[%d] = %p = %s\n", i,
 			       context->names[i].name,
-			       context->names[i].name);
+			       context->names[i].name ?: "(null)");
 		dump_stack();
 		return;
 	}
@@ -932,27 +939,34 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 		}
 	}
 	for (i = 0; i < context->name_count; i++) {
+		unsigned long ino  = context->names[i].ino;
+		unsigned long pino = context->names[i].pino;
+
 		ab = audit_log_start(context, gfp_mask, AUDIT_PATH);
 		if (!ab)
 			continue; /* audit_panic has been called */
 
 		audit_log_format(ab, "item=%d", i);
-		if (context->names[i].name) {
-			audit_log_format(ab, " name=");
+
+		audit_log_format(ab, " name=");
+		if (context->names[i].name)
 			audit_log_untrustedstring(ab, context->names[i].name);
-		}
-		audit_log_format(ab, " flags=%x\n", context->names[i].flags);
-			 
-		if (context->names[i].ino != (unsigned long)-1)
-			audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
-					     " ouid=%u ogid=%u rdev=%02x:%02x",
-					 context->names[i].ino,
-					 MAJOR(context->names[i].dev),
-					 MINOR(context->names[i].dev),
-					 context->names[i].mode,
-					 context->names[i].uid,
-					 context->names[i].gid,
-					 MAJOR(context->names[i].rdev),
+		else
+			audit_log_format(ab, "(null)");
+
+		if (pino != (unsigned long)-1)
+			audit_log_format(ab, " parent=%lu",  pino);
+		if (ino != (unsigned long)-1)
+			audit_log_format(ab, " inode=%lu",  ino);
+		if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1))
+			audit_log_format(ab, " dev=%02x:%02x mode=%#o" 
+					 " ouid=%u ogid=%u rdev=%02x:%02x", 
+					 MAJOR(context->names[i].dev), 
+					 MINOR(context->names[i].dev), 
+					 context->names[i].mode, 
+					 context->names[i].uid, 
+					 context->names[i].gid, 
+					 MAJOR(context->names[i].rdev), 
 					 MINOR(context->names[i].rdev));
 		audit_log_end(ab);
 	}
@@ -1174,7 +1188,7 @@ void audit_putname(const char *name)
 			for (i = 0; i < context->name_count; i++)
 				printk(KERN_ERR "name[%d] = %p = %s\n", i,
 				       context->names[i].name,
-				       context->names[i].name);
+				       context->names[i].name ?: "(null)");
 		}
 #endif
 		__putname(name);
@@ -1204,7 +1218,7 @@ void audit_putname(const char *name)
  *
  * Called from fs/namei.c:path_lookup().
  */
-void audit_inode(const char *name, const struct inode *inode, unsigned flags)
+void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
@@ -1230,13 +1244,93 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags)
 		++context->ino_count;
 #endif
 	}
-	context->names[idx].flags = flags;
-	context->names[idx].ino   = inode->i_ino;
 	context->names[idx].dev	  = inode->i_sb->s_dev;
 	context->names[idx].mode  = inode->i_mode;
 	context->names[idx].uid   = inode->i_uid;
 	context->names[idx].gid   = inode->i_gid;
 	context->names[idx].rdev  = inode->i_rdev;
+	if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && 
+	    (strcmp(name, ".") != 0)) {
+		context->names[idx].ino   = (unsigned long)-1;
+		context->names[idx].pino  = inode->i_ino;
+	} else {
+		context->names[idx].ino   = inode->i_ino;
+		context->names[idx].pino  = (unsigned long)-1;
+	}
+}
+
+/**
+ * audit_inode_child - collect inode info for created/removed objects
+ * @dname: inode's dentry name
+ * @inode: inode being audited
+ * @pino: inode number of dentry parent
+ *
+ * For syscalls that create or remove filesystem objects, audit_inode
+ * can only collect information for the filesystem object's parent.
+ * This call updates the audit context with the child's information.
+ * Syscalls that create a new filesystem object must be hooked after
+ * the object is created.  Syscalls that remove a filesystem object
+ * must be hooked prior, in order to capture the target inode during
+ * unsuccessful attempts.
+ */
+void __audit_inode_child(const char *dname, const struct inode *inode,
+			 unsigned long pino)
+{
+	int idx;
+	struct audit_context *context = current->audit_context;
+
+	if (!context->in_syscall)
+		return;
+
+	/* determine matching parent */
+	if (dname)
+		for (idx = 0; idx < context->name_count; idx++)
+			if (context->names[idx].pino == pino) {
+				const char *n;
+				const char *name = context->names[idx].name;
+				int dlen = strlen(dname);
+				int nlen = name ? strlen(name) : 0;
+
+				if (nlen < dlen)
+					continue;
+				
+				/* disregard trailing slashes */
+				n = name + nlen - 1;
+				while ((*n == '/') && (n > name))
+					n--;
+
+				/* find last path component */
+				n = n - dlen + 1;
+				if (n < name)
+					continue;
+				else if (n > name) {
+					if (*--n != '/')
+						continue;
+					else
+						n++;
+				}
+
+				if (strncmp(n, dname, dlen) == 0)
+					goto update_context;
+			}
+
+	/* catch-all in case match not found */
+	idx = context->name_count++;
+	context->names[idx].name  = NULL;
+	context->names[idx].pino  = pino;
+#if AUDIT_DEBUG
+	context->ino_count++;
+#endif
+
+update_context:
+	if (inode) {
+		context->names[idx].ino   = inode->i_ino;
+		context->names[idx].dev	  = inode->i_sb->s_dev;
+		context->names[idx].mode  = inode->i_mode;
+		context->names[idx].uid   = inode->i_uid;
+		context->names[idx].gid   = inode->i_gid;
+		context->names[idx].rdev  = inode->i_rdev;
+	}
 }
 
 /**
-- 
cgit v1.2.3


From c8edc80c8b8c397c53f4f659a05b9ea6208029bf Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Thu, 3 Nov 2005 16:12:36 +0000
Subject: [PATCH] Exclude messages by message type

    - Add a new, 5th filter called "exclude".
    - And add a new field AUDIT_MSGTYPE.
    - Define a new function audit_filter_exclude() that takes a message type
      as input and examines all rules in the filter.  It returns '1' if the
      message is to be excluded, and '0' otherwise.
    - Call the audit_filter_exclude() function near the top of
      audit_log_start() just after asserting audit_initialized.  If the
      message type is not to be audited, return NULL very early, before
      doing a lot of work.
[combined with followup fix for bug in original patch, Nov 4, same author]
[combined with later renaming AUDIT_FILTER_EXCLUDE->AUDIT_FILTER_TYPE
and audit_filter_exclude() -> audit_filter_type()]

Signed-off-by: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  5 ++++-
 kernel/audit.c        |  3 +++
 kernel/auditsc.c      | 35 ++++++++++++++++++++++++++++++++++-
 3 files changed, 41 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 739b954cb242..8fa1a8fbc04d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -92,8 +92,9 @@
 #define AUDIT_FILTER_ENTRY	0x02	/* Apply rule at syscall entry */
 #define AUDIT_FILTER_WATCH	0x03	/* Apply rule to file system watches */
 #define AUDIT_FILTER_EXIT	0x04	/* Apply rule at syscall exit */
+#define AUDIT_FILTER_TYPE	0x05	/* Apply rule at audit_log_start */
 
-#define AUDIT_NR_FILTERS	5
+#define AUDIT_NR_FILTERS	6
 
 #define AUDIT_FILTER_PREPEND	0x10	/* Prepend to front of list */
 
@@ -132,6 +133,7 @@
 #define AUDIT_LOGINUID	9
 #define AUDIT_PERS	10
 #define AUDIT_ARCH	11
+#define AUDIT_MSGTYPE	12
 
 				/* These are ONLY useful when checking
 				 * at syscall exit time (AUDIT_AT_EXIT). */
@@ -289,6 +291,7 @@ extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern void audit_signal_info(int sig, struct task_struct *t);
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
+extern int audit_filter_type(int type);
 #else
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
diff --git a/kernel/audit.c b/kernel/audit.c
index 6d61dd79a605..1c3eb1b12bfc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -702,6 +702,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 	if (!audit_initialized)
 		return NULL;
 
+	if (unlikely(audit_filter_type(type)))
+		return NULL;
+
 	if (gfp_mask & __GFP_WAIT)
 		reserve = 0;
 	else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 73f932b7deb6..31917ac730af 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -187,7 +187,8 @@ static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[2]),
 	LIST_HEAD_INIT(audit_filter_list[3]),
 	LIST_HEAD_INIT(audit_filter_list[4]),
-#if AUDIT_NR_FILTERS != 5
+	LIST_HEAD_INIT(audit_filter_list[5]),
+#if AUDIT_NR_FILTERS != 6
 #error Fix audit_filter_list initialiser
 #endif
 };
@@ -663,6 +664,38 @@ int audit_filter_user(struct netlink_skb_parms *cb, int type)
 	return ret; /* Audit by default */
 }
 
+int audit_filter_type(int type)
+{
+	struct audit_entry *e;
+	int result = 0;
+	
+	rcu_read_lock();
+	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+		goto unlock_and_return;
+
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
+				list) {
+		struct audit_rule *rule = &e->rule;
+		int i;
+		for (i = 0; i < rule->field_count; i++) {
+			u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+			u32 op  = rule->fields[i] & AUDIT_OPERATORS;
+			u32 value  = rule->values[i];
+			if ( field == AUDIT_MSGTYPE ) {
+				result = audit_comparator(type, op, value); 
+				if (!result)
+					break;
+			}
+		}
+		if (result)
+			goto unlock_and_return;
+	}
+unlock_and_return:
+	rcu_read_unlock();
+	return result;
+}
+
+
 /* This should be called with task_lock() held. */
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
-- 
cgit v1.2.3


From 8c8570fb8feef2bc166bee75a85748b25cda22d9 Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Thu, 3 Nov 2005 17:15:16 +0000
Subject: [PATCH] Capture selinux subject/object context information.

This patch extends existing audit records with subject/object context
information. Audit records associated with filesystem inodes, ipc, and
tasks now contain SELinux label information in the field "subj" if the
item is performing the action, or in "obj" if the item is the receiver
of an action.

These labels are collected via hooks in SELinux and appended to the
appropriate record in the audit code.

This additional information is required for Common Criteria Labeled
Security Protection Profile (LSPP).

[AV: fixed kmalloc flags use]
[folded leak fixes]
[folded cleanup from akpm (kfree(NULL)]
[folded audit_inode_context() leak fix]
[folded akpm's fix for audit_ipc_perm() definition in case of !CONFIG_AUDIT]

Signed-off-by: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h    |   8 ++-
 include/linux/security.h |  27 +++++++++
 ipc/msg.c                |   5 +-
 ipc/sem.c                |   5 +-
 ipc/shm.c                |   4 +-
 kernel/audit.c           |   2 +-
 kernel/auditsc.c         | 142 ++++++++++++++++++++++++++++++++++++++++++++---
 security/dummy.c         |   6 ++
 security/selinux/hooks.c |  96 +++++++++++++++-----------------
 9 files changed, 226 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8fa1a8fbc04d..1912d8e8ae90 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -285,13 +285,14 @@ extern void auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec *t, unsigned int *serial);
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
-extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode);
+extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern void audit_signal_info(int sig, struct task_struct *t);
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 extern int audit_filter_type(int type);
+extern int audit_set_macxattr(const char *name);
 #else
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
@@ -306,12 +307,13 @@ extern int audit_filter_type(int type);
 #define audit_receive_filter(t,p,u,s,d,l) ({ -EOPNOTSUPP; })
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
-#define audit_ipc_perms(q,u,g,m) ({ 0; })
+#define audit_ipc_perms(q,u,g,m,i) ({ 0; })
 #define audit_socketcall(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_avc_path(dentry, mnt) ({ 0; })
 #define audit_signal_info(s,t) do { ; } while (0)
 #define audit_filter_user(cb,t) ({ 1; })
+#define audit_set_macxattr(n) do { ; } while (0)
 #endif
 
 #ifdef CONFIG_AUDIT
@@ -340,6 +342,7 @@ extern void		    audit_send_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
+extern void		    audit_panic(const char *message);
 extern struct semaphore audit_netlink_sem;
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
@@ -350,6 +353,7 @@ extern struct semaphore audit_netlink_sem;
 #define audit_log_hex(a,b,l) do { ; } while (0)
 #define audit_log_untrustedstring(a,s) do { ; } while (0)
 #define audit_log_d_path(b,p,d,v) do { ; } while (0)
+#define audit_panic(m) do { ; } while (0)
 #endif
 #endif
 #endif
diff --git a/include/linux/security.h b/include/linux/security.h
index 7cbef482e13a..ec0bbbc3ffc2 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -869,6 +869,11 @@ struct swap_info_struct;
  *	@ipcp contains the kernel IPC permission structure
  *	@flag contains the desired (requested) permission set
  *	Return 0 if permission is granted.
+ * @ipc_getsecurity:
+ *      Copy the security label associated with the ipc object into
+ *      @buffer.  @buffer may be NULL to request the size of the buffer 
+ *      required.  @size indicates the size of @buffer in bytes. Return 
+ *      number of bytes used/required on success.
  *
  * Security hooks for individual messages held in System V IPC message queues
  * @msg_msg_alloc_security:
@@ -1168,6 +1173,7 @@ struct security_operations {
 	int (*inode_getxattr) (struct dentry *dentry, char *name);
 	int (*inode_listxattr) (struct dentry *dentry);
 	int (*inode_removexattr) (struct dentry *dentry, char *name);
+	char *(*inode_xattr_getsuffix) (void);
   	int (*inode_getsecurity)(struct inode *inode, const char *name, void *buffer, size_t size, int err);
   	int (*inode_setsecurity)(struct inode *inode, const char *name, const void *value, size_t size, int flags);
   	int (*inode_listsecurity)(struct inode *inode, char *buffer, size_t buffer_size);
@@ -1217,6 +1223,7 @@ struct security_operations {
 	void (*task_to_inode)(struct task_struct *p, struct inode *inode);
 
 	int (*ipc_permission) (struct kern_ipc_perm * ipcp, short flag);
+	int (*ipc_getsecurity)(struct kern_ipc_perm *ipcp, void *buffer, size_t size);
 
 	int (*msg_msg_alloc_security) (struct msg_msg * msg);
 	void (*msg_msg_free_security) (struct msg_msg * msg);
@@ -1674,6 +1681,11 @@ static inline int security_inode_removexattr (struct dentry *dentry, char *name)
 	return security_ops->inode_removexattr (dentry, name);
 }
 
+static inline const char *security_inode_xattr_getsuffix(void)
+{
+	return security_ops->inode_xattr_getsuffix();
+}
+
 static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	if (unlikely (IS_PRIVATE (inode)))
@@ -1869,6 +1881,11 @@ static inline int security_ipc_permission (struct kern_ipc_perm *ipcp,
 	return security_ops->ipc_permission (ipcp, flag);
 }
 
+static inline int security_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
+{
+	return security_ops->ipc_getsecurity(ipcp, buffer, size);
+}
+
 static inline int security_msg_msg_alloc (struct msg_msg * msg)
 {
 	return security_ops->msg_msg_alloc_security (msg);
@@ -2316,6 +2333,11 @@ static inline int security_inode_removexattr (struct dentry *dentry, char *name)
 	return cap_inode_removexattr(dentry, name);
 }
 
+static inline const char *security_inode_xattr_getsuffix (void)
+{
+	return NULL ;
+}
+
 static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	return -EOPNOTSUPP;
@@ -2499,6 +2521,11 @@ static inline int security_ipc_permission (struct kern_ipc_perm *ipcp,
 	return 0;
 }
 
+static inline int security_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int security_msg_msg_alloc (struct msg_msg * msg)
 {
 	return 0;
diff --git a/ipc/msg.c b/ipc/msg.c
index fbf757064a32..8c30ec2f6e34 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -429,8 +429,6 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf)
 			return -EFAULT;
 		if (copy_msqid_from_user (&setbuf, buf, version))
 			return -EFAULT;
-		if ((err = audit_ipc_perms(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode)))
-			return err;
 		break;
 	case IPC_RMID:
 		break;
@@ -461,6 +459,9 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf)
 	switch (cmd) {
 	case IPC_SET:
 	{
+		if ((err = audit_ipc_perms(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode, ipcp)))
+			goto out_unlock_up;
+
 		err = -EPERM;
 		if (setbuf.qbytes > msg_ctlmnb && !capable(CAP_SYS_RESOURCE))
 			goto out_unlock_up;
diff --git a/ipc/sem.c b/ipc/sem.c
index 31fd4027d2b5..59696a840be1 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -809,8 +809,6 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 	if(cmd == IPC_SET) {
 		if(copy_semid_from_user (&setbuf, arg.buf, version))
 			return -EFAULT;
-		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode)))
-			return err;
 	}
 	sma = sem_lock(semid);
 	if(sma==NULL)
@@ -821,7 +819,6 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 		goto out_unlock;
 	}	
 	ipcp = &sma->sem_perm;
-	
 	if (current->euid != ipcp->cuid && 
 	    current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) {
 	    	err=-EPERM;
@@ -838,6 +835,8 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 		err = 0;
 		break;
 	case IPC_SET:
+		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode, ipcp)))
+			goto out_unlock;
 		ipcp->uid = setbuf.uid;
 		ipcp->gid = setbuf.gid;
 		ipcp->mode = (ipcp->mode & ~S_IRWXUGO)
diff --git a/ipc/shm.c b/ipc/shm.c
index 9162123a7b23..a88c8a02e7f3 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -620,13 +620,13 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
 			err = -EFAULT;
 			goto out;
 		}
-		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode)))
-			return err;
 		down(&shm_ids.sem);
 		shp = shm_lock(shmid);
 		err=-EINVAL;
 		if(shp==NULL)
 			goto out_up;
+		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode, &(shp->shm_perm))))
+			goto out_unlock_up;
 		err = shm_checkid(shp,shmid);
 		if(err)
 			goto out_unlock_up;
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c3eb1b12bfc..45c123ef77a7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -142,7 +142,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
 	nlh->nlmsg_pid = pid;
 }
 
-static void audit_panic(const char *message)
+void audit_panic(const char *message)
 {
 	switch (audit_failure)
 	{
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 31917ac730af..4e2256ec7cf3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -34,6 +34,9 @@
  *
  * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional
  * filesystem information.
+ *
+ * Subject and object context labeling support added by <danjones@us.ibm.com>
+ * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance.
  */
 
 #include <linux/init.h>
@@ -53,6 +56,7 @@
 #include <linux/netlink.h>
 #include <linux/compiler.h>
 #include <asm/unistd.h>
+#include <linux/security.h>
 
 /* 0 = no checking
    1 = put_count checking
@@ -109,6 +113,7 @@ struct audit_names {
 	uid_t		uid;
 	gid_t		gid;
 	dev_t		rdev;
+	char		*ctx;
 };
 
 struct audit_aux_data {
@@ -125,6 +130,7 @@ struct audit_aux_data_ipcctl {
 	uid_t			uid;
 	gid_t			gid;
 	mode_t			mode;
+	char 			*ctx;
 };
 
 struct audit_aux_data_socketcall {
@@ -743,10 +749,11 @@ static inline void audit_free_names(struct audit_context *context)
 		       context->serial, context->major, context->in_syscall,
 		       context->name_count, context->put_count,
 		       context->ino_count);
-		for (i = 0; i < context->name_count; i++)
+		for (i = 0; i < context->name_count; i++) {
 			printk(KERN_ERR "names[%d] = %p = %s\n", i,
 			       context->names[i].name,
 			       context->names[i].name ?: "(null)");
+		}
 		dump_stack();
 		return;
 	}
@@ -756,9 +763,13 @@ static inline void audit_free_names(struct audit_context *context)
 	context->ino_count  = 0;
 #endif
 
-	for (i = 0; i < context->name_count; i++)
+	for (i = 0; i < context->name_count; i++) {
+		char *p = context->names[i].ctx;
+		context->names[i].ctx = NULL;
+		kfree(p);
 		if (context->names[i].name)
 			__putname(context->names[i].name);
+	}
 	context->name_count = 0;
 	if (context->pwd)
 		dput(context->pwd);
@@ -778,6 +789,12 @@ static inline void audit_free_aux(struct audit_context *context)
 			dput(axi->dentry);
 			mntput(axi->mnt);
 		}
+		if ( aux->type == AUDIT_IPC ) {
+			struct audit_aux_data_ipcctl *axi = (void *)aux;
+			if (axi->ctx)
+				kfree(axi->ctx);
+		}
+
 		context->aux = aux->next;
 		kfree(aux);
 	}
@@ -862,7 +879,38 @@ static inline void audit_free_context(struct audit_context *context)
 		printk(KERN_ERR "audit: freed %d contexts\n", count);
 }
 
-static void audit_log_task_info(struct audit_buffer *ab)
+static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask)
+{
+	char *ctx = NULL;
+	ssize_t len = 0;
+
+	len = security_getprocattr(current, "current", NULL, 0);
+	if (len < 0) {
+		if (len != -EINVAL)
+			goto error_path;
+		return;
+	}
+
+	ctx = kmalloc(len, gfp_mask);
+	if (!ctx) {
+		goto error_path;
+		return;
+	}
+
+	len = security_getprocattr(current, "current", ctx, len);
+	if (len < 0 )
+		goto error_path;
+
+	audit_log_format(ab, " subj=%s", ctx);
+
+error_path:
+	if (ctx)
+		kfree(ctx);
+	audit_panic("security_getprocattr error in audit_log_task_context");
+	return;
+}
+
+static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask)
 {
 	char name[sizeof(current->comm)];
 	struct mm_struct *mm = current->mm;
@@ -875,6 +923,10 @@ static void audit_log_task_info(struct audit_buffer *ab)
 	if (!mm)
 		return;
 
+	/*
+	 * this is brittle; all callers that pass GFP_ATOMIC will have
+	 * NULL current->mm and we won't get here.
+	 */
 	down_read(&mm->mmap_sem);
 	vma = mm->mmap;
 	while (vma) {
@@ -888,6 +940,7 @@ static void audit_log_task_info(struct audit_buffer *ab)
 		vma = vma->vm_next;
 	}
 	up_read(&mm->mmap_sem);
+	audit_log_task_context(ab, gfp_mask);
 }
 
 static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
@@ -923,7 +976,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 		  context->gid,
 		  context->euid, context->suid, context->fsuid,
 		  context->egid, context->sgid, context->fsgid);
-	audit_log_task_info(ab);
+	audit_log_task_info(ab, gfp_mask);
 	audit_log_end(ab);
 
 	for (aux = context->aux; aux; aux = aux->next) {
@@ -936,8 +989,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 		case AUDIT_IPC: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab, 
-					 " qbytes=%lx iuid=%u igid=%u mode=%x",
-					 axi->qbytes, axi->uid, axi->gid, axi->mode);
+					 " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s",
+					 axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx);
 			break; }
 
 		case AUDIT_SOCKETCALL: {
@@ -1001,6 +1054,11 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 					 context->names[i].gid, 
 					 MAJOR(context->names[i].rdev), 
 					 MINOR(context->names[i].rdev));
+		if (context->names[i].ctx) {
+			audit_log_format(ab, " obj=%s",
+					context->names[i].ctx);
+		}
+
 		audit_log_end(ab);
 	}
 }
@@ -1243,6 +1301,39 @@ void audit_putname(const char *name)
 #endif
 }
 
+void audit_inode_context(int idx, const struct inode *inode)
+{
+	struct audit_context *context = current->audit_context;
+	char *ctx = NULL;
+	int len = 0;
+
+	if (!security_inode_xattr_getsuffix())
+		return;
+
+	len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), NULL, 0, 0);
+	if (len < 0) 
+		goto error_path;
+
+	ctx = kmalloc(len, GFP_KERNEL);
+	if (!ctx) 
+		goto error_path;
+
+	len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), ctx, len, 0);
+	if (len < 0)
+		goto error_path;
+
+	kfree(context->names[idx].ctx);
+	context->names[idx].ctx = ctx;
+	return;
+
+error_path:
+	if (ctx)
+		kfree(ctx);
+	audit_panic("error in audit_inode_context");
+	return;
+}
+
+
 /**
  * audit_inode - store the inode and device from a lookup
  * @name: name being audited
@@ -1282,6 +1373,7 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
 	context->names[idx].uid   = inode->i_uid;
 	context->names[idx].gid   = inode->i_gid;
 	context->names[idx].rdev  = inode->i_rdev;
+	audit_inode_context(idx, inode);
 	if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && 
 	    (strcmp(name, ".") != 0)) {
 		context->names[idx].ino   = (unsigned long)-1;
@@ -1363,6 +1455,7 @@ update_context:
 		context->names[idx].uid   = inode->i_uid;
 		context->names[idx].gid   = inode->i_gid;
 		context->names[idx].rdev  = inode->i_rdev;
+		audit_inode_context(idx, inode);
 	}
 }
 
@@ -1423,6 +1516,38 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
 	return ctx ? ctx->loginuid : -1;
 }
 
+static char *audit_ipc_context(struct kern_ipc_perm *ipcp)
+{
+	struct audit_context *context = current->audit_context;
+	char *ctx = NULL;
+	int len = 0;
+
+	if (likely(!context))
+		return NULL;
+
+	len = security_ipc_getsecurity(ipcp, NULL, 0);
+	if (len == -EOPNOTSUPP)
+		goto ret;
+	if (len < 0)
+		goto error_path;
+
+	ctx = kmalloc(len, GFP_ATOMIC);
+	if (!ctx)
+		goto error_path;
+
+	len = security_ipc_getsecurity(ipcp, ctx, len);
+	if (len < 0)
+		goto error_path;
+
+	return ctx;
+
+error_path:
+	kfree(ctx);
+	audit_panic("error in audit_ipc_context");
+ret:
+	return NULL;
+}
+
 /**
  * audit_ipc_perms - record audit data for ipc
  * @qbytes: msgq bytes
@@ -1432,7 +1557,7 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
 {
 	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
@@ -1440,7 +1565,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 	if (likely(!context))
 		return 0;
 
-	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
 	if (!ax)
 		return -ENOMEM;
 
@@ -1448,6 +1573,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
 	ax->uid = uid;
 	ax->gid = gid;
 	ax->mode = mode;
+	ax->ctx = audit_ipc_context(ipcp);
 
 	ax->d.type = AUDIT_IPC;
 	ax->d.next = context->aux;
diff --git a/security/dummy.c b/security/dummy.c
index f1a5bd98bf10..6febe7d39fa0 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -558,6 +558,11 @@ static int dummy_ipc_permission (struct kern_ipc_perm *ipcp, short flag)
 	return 0;
 }
 
+static int dummy_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
 static int dummy_msg_msg_alloc_security (struct msg_msg *msg)
 {
 	return 0;
@@ -959,6 +964,7 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, task_reparent_to_init);
  	set_to_dummy_if_null(ops, task_to_inode);
 	set_to_dummy_if_null(ops, ipc_permission);
+	set_to_dummy_if_null(ops, ipc_getsecurity);
 	set_to_dummy_if_null(ops, msg_msg_alloc_security);
 	set_to_dummy_if_null(ops, msg_msg_free_security);
 	set_to_dummy_if_null(ops, msg_queue_alloc_security);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b65c201e9ff5..9c08a19cc81b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -117,6 +117,32 @@ static struct security_operations *secondary_ops = NULL;
 static LIST_HEAD(superblock_security_head);
 static DEFINE_SPINLOCK(sb_security_lock);
 
+/* Return security context for a given sid or just the context 
+   length if the buffer is null or length is 0 */
+static int selinux_getsecurity(u32 sid, void *buffer, size_t size)
+{
+	char *context;
+	unsigned len;
+	int rc;
+
+	rc = security_sid_to_context(sid, &context, &len);
+	if (rc)
+		return rc;
+
+	if (!buffer || !size)
+		goto getsecurity_exit;
+
+	if (size < len) {
+		len = -ERANGE;
+		goto getsecurity_exit;
+	}
+	memcpy(buffer, context, len);
+
+getsecurity_exit:
+	kfree(context);
+	return len;
+}
+
 /* Allocate and free functions for each kind of security blob. */
 
 static int task_alloc_security(struct task_struct *task)
@@ -2209,6 +2235,11 @@ static int selinux_inode_removexattr (struct dentry *dentry, char *name)
 	return -EACCES;
 }
 
+static const char *selinux_inode_xattr_getsuffix(void)
+{
+      return XATTR_SELINUX_SUFFIX;
+}
+
 /*
  * Copy the in-core inode security context value to the user.  If the
  * getxattr() prior to this succeeded, check to see if we need to
@@ -2219,44 +2250,11 @@ static int selinux_inode_removexattr (struct dentry *dentry, char *name)
 static int selinux_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	struct inode_security_struct *isec = inode->i_security;
-	char *context;
-	unsigned len;
-	int rc;
-
-	if (strcmp(name, XATTR_SELINUX_SUFFIX)) {
-		rc = -EOPNOTSUPP;
-		goto out;
-	}
-
-	rc = security_sid_to_context(isec->sid, &context, &len);
-	if (rc)
-		goto out;
-
-	/* Probe for required buffer size */
-	if (!buffer || !size) {
-		rc = len;
-		goto out_free;
-	}
 
-	if (size < len) {
-		rc = -ERANGE;
-		goto out_free;
-	}
+	if (strcmp(name, XATTR_SELINUX_SUFFIX))
+		return -EOPNOTSUPP;
 
-	if (err > 0) {
-		if ((len == err) && !(memcmp(context, buffer, len))) {
-			/* Don't need to canonicalize value */
-			rc = err;
-			goto out_free;
-		}
-		memset(buffer, 0, size);
-	}
-	memcpy(buffer, context, len);
-	rc = len;
-out_free:
-	kfree(context);
-out:
-	return rc;
+	return selinux_getsecurity(isec->sid, buffer, size);
 }
 
 static int selinux_inode_setsecurity(struct inode *inode, const char *name,
@@ -4022,6 +4020,13 @@ static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 	return ipc_has_perm(ipcp, av);
 }
 
+static int selinux_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
+{
+	struct ipc_security_struct *isec = ipcp->security;
+
+	return selinux_getsecurity(isec->sid, buffer, size);
+}
+
 /* module stacking operations */
 static int selinux_register_security (const char *name, struct security_operations *ops)
 {
@@ -4063,8 +4068,7 @@ static int selinux_getprocattr(struct task_struct *p,
 			       char *name, void *value, size_t size)
 {
 	struct task_security_struct *tsec;
-	u32 sid, len;
-	char *context;
+	u32 sid;
 	int error;
 
 	if (current != p) {
@@ -4073,9 +4077,6 @@ static int selinux_getprocattr(struct task_struct *p,
 			return error;
 	}
 
-	if (!size)
-		return -ERANGE;
-
 	tsec = p->security;
 
 	if (!strcmp(name, "current"))
@@ -4092,16 +4093,7 @@ static int selinux_getprocattr(struct task_struct *p,
 	if (!sid)
 		return 0;
 
-	error = security_sid_to_context(sid, &context, &len);
-	if (error)
-		return error;
-	if (len > size) {
-		kfree(context);
-		return -ERANGE;
-	}
-	memcpy(value, context, len);
-	kfree(context);
-	return len;
+	return selinux_getsecurity(sid, value, size);
 }
 
 static int selinux_setprocattr(struct task_struct *p,
@@ -4259,6 +4251,7 @@ static struct security_operations selinux_ops = {
 	.inode_getxattr =		selinux_inode_getxattr,
 	.inode_listxattr =		selinux_inode_listxattr,
 	.inode_removexattr =		selinux_inode_removexattr,
+	.inode_xattr_getsuffix =        selinux_inode_xattr_getsuffix,
 	.inode_getsecurity =            selinux_inode_getsecurity,
 	.inode_setsecurity =            selinux_inode_setsecurity,
 	.inode_listsecurity =           selinux_inode_listsecurity,
@@ -4296,6 +4289,7 @@ static struct security_operations selinux_ops = {
 	.task_to_inode =                selinux_task_to_inode,
 
 	.ipc_permission =		selinux_ipc_permission,
+	.ipc_getsecurity =		selinux_ipc_getsecurity,
 
 	.msg_msg_alloc_security =	selinux_msg_msg_alloc_security,
 	.msg_msg_free_security =	selinux_msg_msg_free_security,
-- 
cgit v1.2.3


From 7306a0b9b3e2056a616c84841288ca2431a05627 Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Wed, 16 Nov 2005 15:53:13 +0000
Subject: [PATCH] Miscellaneous bug and warning fixes

This patch fixes a couple of bugs revealed in new features recently
added to -mm1:
* fixes warnings due to inconsistent use of const struct inode *inode
* fixes bug that prevent a kernel from booting with audit on, and SELinux off
  due to a missing function in security/dummy.c
* fixes a bug that throws spurious audit_panic() messages due to a missing
  return just before an error_path label
* some reasonable house cleaning in audit_ipc_context(),
  audit_inode_context(), and audit_log_task_context()

Signed-off-by: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 include/linux/security.h |  8 ++++----
 kernel/auditsc.c         | 21 ++++++++++++---------
 security/dummy.c         |  8 +++++++-
 security/selinux/hooks.c |  2 +-
 4 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/security.h b/include/linux/security.h
index ec0bbbc3ffc2..2a502250eb5c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1173,8 +1173,8 @@ struct security_operations {
 	int (*inode_getxattr) (struct dentry *dentry, char *name);
 	int (*inode_listxattr) (struct dentry *dentry);
 	int (*inode_removexattr) (struct dentry *dentry, char *name);
-	char *(*inode_xattr_getsuffix) (void);
-  	int (*inode_getsecurity)(struct inode *inode, const char *name, void *buffer, size_t size, int err);
+	const char *(*inode_xattr_getsuffix) (void);
+  	int (*inode_getsecurity)(const struct inode *inode, const char *name, void *buffer, size_t size, int err);
   	int (*inode_setsecurity)(struct inode *inode, const char *name, const void *value, size_t size, int flags);
   	int (*inode_listsecurity)(struct inode *inode, char *buffer, size_t buffer_size);
 
@@ -1686,7 +1686,7 @@ static inline const char *security_inode_xattr_getsuffix(void)
 	return security_ops->inode_xattr_getsuffix();
 }
 
-static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	if (unlikely (IS_PRIVATE (inode)))
 		return 0;
@@ -2338,7 +2338,7 @@ static inline const char *security_inode_xattr_getsuffix (void)
 	return NULL ;
 }
 
-static inline int security_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static inline int security_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4e2256ec7cf3..4ef14515da35 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -892,21 +892,20 @@ static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask)
 	}
 
 	ctx = kmalloc(len, gfp_mask);
-	if (!ctx) {
+	if (!ctx)
 		goto error_path;
-		return;
-	}
 
 	len = security_getprocattr(current, "current", ctx, len);
 	if (len < 0 )
 		goto error_path;
 
 	audit_log_format(ab, " subj=%s", ctx);
+	return;
 
 error_path:
 	if (ctx)
 		kfree(ctx);
-	audit_panic("security_getprocattr error in audit_log_task_context");
+	audit_panic("error in audit_log_task_context");
 	return;
 }
 
@@ -1304,13 +1303,16 @@ void audit_putname(const char *name)
 void audit_inode_context(int idx, const struct inode *inode)
 {
 	struct audit_context *context = current->audit_context;
+	const char *suffix = security_inode_xattr_getsuffix();
 	char *ctx = NULL;
 	int len = 0;
 
-	if (!security_inode_xattr_getsuffix())
-		return;
+	if (!suffix)
+		goto ret;
 
-	len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), NULL, 0, 0);
+	len = security_inode_getsecurity(inode, suffix, NULL, 0, 0);
+	if (len == -EOPNOTSUPP)
+		goto ret;
 	if (len < 0) 
 		goto error_path;
 
@@ -1318,18 +1320,19 @@ void audit_inode_context(int idx, const struct inode *inode)
 	if (!ctx) 
 		goto error_path;
 
-	len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), ctx, len, 0);
+	len = security_inode_getsecurity(inode, suffix, ctx, len, 0);
 	if (len < 0)
 		goto error_path;
 
 	kfree(context->names[idx].ctx);
 	context->names[idx].ctx = ctx;
-	return;
+	goto ret;
 
 error_path:
 	if (ctx)
 		kfree(ctx);
 	audit_panic("error in audit_inode_context");
+ret:
 	return;
 }
 
diff --git a/security/dummy.c b/security/dummy.c
index 6febe7d39fa0..0a553d39729f 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -378,7 +378,7 @@ static int dummy_inode_removexattr (struct dentry *dentry, char *name)
 	return 0;
 }
 
-static int dummy_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static int dummy_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	return -EOPNOTSUPP;
 }
@@ -393,6 +393,11 @@ static int dummy_inode_listsecurity(struct inode *inode, char *buffer, size_t bu
 	return 0;
 }
 
+static const char *dummy_inode_xattr_getsuffix(void)
+{
+	return NULL;
+}
+
 static int dummy_file_permission (struct file *file, int mask)
 {
 	return 0;
@@ -930,6 +935,7 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, inode_getxattr);
 	set_to_dummy_if_null(ops, inode_listxattr);
 	set_to_dummy_if_null(ops, inode_removexattr);
+	set_to_dummy_if_null(ops, inode_xattr_getsuffix);
 	set_to_dummy_if_null(ops, inode_getsecurity);
 	set_to_dummy_if_null(ops, inode_setsecurity);
 	set_to_dummy_if_null(ops, inode_listsecurity);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9c08a19cc81b..81b726b1a419 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2247,7 +2247,7 @@ static const char *selinux_inode_xattr_getsuffix(void)
  *
  * Permission check is handled by selinux_inode_getxattr hook.
  */
-static int selinux_inode_getsecurity(struct inode *inode, const char *name, void *buffer, size_t size, int err)
+static int selinux_inode_getsecurity(const struct inode *inode, const char *name, void *buffer, size_t size, int err)
 {
 	struct inode_security_struct *isec = inode->i_security;
 
-- 
cgit v1.2.3


From fe7752bab26a9ac0651b695ad4f55659761f68f7 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 15 Dec 2005 18:33:52 +0000
Subject: [PATCH] Fix audit record filtering with !CONFIG_AUDITSYSCALL

This fixes the per-user and per-message-type filtering when syscall
auditing isn't enabled.

[AV: folded followup fix from the same author]

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  18 +--
 kernel/Makefile       |   2 +-
 kernel/audit.c        |   1 +
 kernel/audit.h        |  70 ++++++++++
 kernel/auditfilter.c  | 378 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/auditsc.c      | 380 +-------------------------------------------------
 6 files changed, 459 insertions(+), 390 deletions(-)
 create mode 100644 kernel/audit.h
 create mode 100644 kernel/auditfilter.c

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1912d8e8ae90..fbc21d6267f3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -278,8 +278,6 @@ static inline void audit_inode_child(const char *dname,
 }
 
 				/* Private API (for audit.c only) */
-extern int  audit_receive_filter(int type, int pid, int uid, int seq,
-				 void *data, uid_t loginuid);
 extern unsigned int audit_serial(void);
 extern void auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec *t, unsigned int *serial);
@@ -290,8 +288,6 @@ extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
 extern void audit_signal_info(int sig, struct task_struct *t);
-extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
-extern int audit_filter_type(int type);
 extern int audit_set_macxattr(const char *name);
 #else
 #define audit_alloc(t) ({ 0; })
@@ -304,7 +300,6 @@ extern int audit_set_macxattr(const char *name);
 #define __audit_inode_child(d,i,p) do { ; } while (0)
 #define audit_inode(n,i,f) do { ; } while (0)
 #define audit_inode_child(d,i,p) do { ; } while (0)
-#define audit_receive_filter(t,p,u,s,d,l) ({ -EOPNOTSUPP; })
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
 #define audit_ipc_perms(q,u,g,m,i) ({ 0; })
@@ -312,7 +307,6 @@ extern int audit_set_macxattr(const char *name);
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_avc_path(dentry, mnt) ({ 0; })
 #define audit_signal_info(s,t) do { ; } while (0)
-#define audit_filter_user(cb,t) ({ 1; })
 #define audit_set_macxattr(n) do { ; } while (0)
 #endif
 
@@ -337,13 +331,11 @@ extern void		    audit_log_d_path(struct audit_buffer *ab,
 					     const char *prefix,
 					     struct dentry *dentry,
 					     struct vfsmount *vfsmnt);
-				/* Private API (for auditsc.c only) */
-extern void		    audit_send_reply(int pid, int seq, int type,
-					     int done, int multi,
-					     void *payload, int size);
-extern void		    audit_log_lost(const char *message);
-extern void		    audit_panic(const char *message);
-extern struct semaphore audit_netlink_sem;
+				/* Private API (for audit.c only) */
+extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
+extern int audit_filter_type(int type);
+extern int  audit_receive_filter(int type, int pid, int uid, int seq,
+				 void *data, uid_t loginuid);
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
 #define audit_log_start(c,g,t) ({ NULL; })
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde815d..58cf129e5622 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
-obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 45c123ef77a7..07c5d2bdd38c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -305,6 +305,7 @@ static int kauditd_thread(void *dummy)
 			remove_wait_queue(&kauditd_wait, &wait);
 		}
 	}
+	return 0;
 }
 
 /**
diff --git a/kernel/audit.h b/kernel/audit.h
new file mode 100644
index 000000000000..7643e46daeb2
--- /dev/null
+++ b/kernel/audit.h
@@ -0,0 +1,70 @@
+/* audit -- definition of audit_context structure and supporting types 
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/fs.h>
+#include <linux/audit.h>
+
+/* 0 = no checking
+   1 = put_count checking
+   2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* At task start time, the audit_state is set in the audit_context using
+   a per-task filter.  At syscall entry, the audit_state is augmented by
+   the syscall filter. */
+enum audit_state {
+	AUDIT_DISABLED,		/* Do not create per-task audit_context.
+				 * No syscall-specific audit records can
+				 * be generated. */
+	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
+				 * but don't necessarily fill it in at
+				 * syscall entry time (i.e., filter
+				 * instead). */
+	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
+				 * and always fill it in at syscall
+				 * entry time.  This makes a full
+				 * syscall record available if some
+				 * other part of the kernel decides it
+				 * should be recorded. */
+	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
+				 * always fill it in at syscall entry
+				 * time, and always write out the audit
+				 * record at syscall exit time.  */
+};
+
+/* Rule lists */
+struct audit_entry {
+	struct list_head  list;
+	struct rcu_head   rcu;
+	struct audit_rule rule;
+};
+
+
+extern int audit_pid;
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+
+extern void		    audit_send_reply(int pid, int seq, int type,
+					     int done, int multi,
+					     void *payload, int size);
+extern void		    audit_log_lost(const char *message);
+extern void		    audit_panic(const char *message);
+extern struct semaphore audit_netlink_sem;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
new file mode 100644
index 000000000000..7f347c360876
--- /dev/null
+++ b/kernel/auditfilter.c
@@ -0,0 +1,378 @@
+/* auditfilter.c -- filtering of audit events
+ *
+ * Copyright 2003-2004 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/netlink.h>
+#include "audit.h"
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+	LIST_HEAD_INIT(audit_filter_list[0]),
+	LIST_HEAD_INIT(audit_filter_list[1]),
+	LIST_HEAD_INIT(audit_filter_list[2]),
+	LIST_HEAD_INIT(audit_filter_list[3]),
+	LIST_HEAD_INIT(audit_filter_list[4]),
+	LIST_HEAD_INIT(audit_filter_list[5]),
+#if AUDIT_NR_FILTERS != 6
+#error Fix audit_filter_list initialiser
+#endif
+};
+
+/* Copy rule from user-space to kernel-space.  Called from 
+ * audit_add_rule during AUDIT_ADD. */
+static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+	int i;
+
+	if (s->action != AUDIT_NEVER
+	    && s->action != AUDIT_POSSIBLE
+	    && s->action != AUDIT_ALWAYS)
+		return -1;
+	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+		return -1;
+	if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
+		return -1;
+
+	d->flags	= s->flags;
+	d->action	= s->action;
+	d->field_count	= s->field_count;
+	for (i = 0; i < d->field_count; i++) {
+		d->fields[i] = s->fields[i];
+		d->values[i] = s->values[i];
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+	return 0;
+}
+
+/* Check to see if two rules are identical.  It is called from
+ * audit_add_rule during AUDIT_ADD and 
+ * audit_del_rule during AUDIT_DEL. */
+static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+{
+	int i;
+
+	if (a->flags != b->flags)
+		return 1;
+
+	if (a->action != b->action)
+		return 1;
+
+	if (a->field_count != b->field_count)
+		return 1;
+
+	for (i = 0; i < a->field_count; i++) {
+		if (a->fields[i] != b->fields[i]
+		    || a->values[i] != b->values[i])
+			return 1;
+	}
+
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+		if (a->mask[i] != b->mask[i])
+			return 1;
+
+	return 0;
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_add_rule(struct audit_rule *rule,
+				  struct list_head *list)
+{
+	struct audit_entry  *entry;
+	int i;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * addition routine. */
+	list_for_each_entry(entry, list, list) {
+		if (!audit_compare_rule(rule, &entry->rule)) {
+			return -EEXIST;
+		}
+	}
+
+	for (i = 0; i < rule->field_count; i++) {
+		if (rule->fields[i] & AUDIT_UNUSED_BITS)
+			return -EINVAL;
+		if ( rule->fields[i] & AUDIT_NEGATE )
+			rule->fields[i] |= AUDIT_NOT_EQUAL;
+		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
+			rule->fields[i] |= AUDIT_EQUAL;
+		rule->fields[i] &= (~AUDIT_NEGATE);
+	}
+
+	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+		return -ENOMEM;
+	if (audit_copy_rule(&entry->rule, rule)) {
+		kfree(entry);
+		return -EINVAL;
+	}
+
+	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
+		list_add_rcu(&entry->list, list);
+	} else {
+		list_add_tail_rcu(&entry->list, list);
+	}
+
+	return 0;
+}
+
+static inline void audit_free_rule(struct rcu_head *head)
+{
+	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+	kfree(e);
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_del_rule(struct audit_rule *rule,
+				 struct list_head *list)
+{
+	struct audit_entry  *e;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * deletion routine. */
+	list_for_each_entry(e, list, list) {
+		if (!audit_compare_rule(rule, &e->rule)) {
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule);
+			return 0;
+		}
+	}
+	return -ENOENT;		/* No matching rule */
+}
+
+static int audit_list_rules(void *_dest)
+{
+	int pid, seq;
+	int *dest = _dest;
+	struct audit_entry *entry;
+	int i;
+
+	pid = dest[0];
+	seq = dest[1];
+	kfree(dest);
+
+	down(&audit_netlink_sem);
+
+	/* The *_rcu iterators not needed here because we are
+	   always called with audit_netlink_sem held. */
+	for (i=0; i<AUDIT_NR_FILTERS; i++) {
+		list_for_each_entry(entry, &audit_filter_list[i], list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+	}
+	audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+	
+	up(&audit_netlink_sem);
+	return 0;
+}
+
+/**
+ * audit_receive_filter - apply all rules to the specified message type
+ * @type: audit message type
+ * @pid: target pid for netlink audit messages
+ * @uid: target uid for netlink audit messages
+ * @seq: netlink audit message sequence (serial) number
+ * @data: payload data
+ * @loginuid: loginuid of sender
+ */
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
+							uid_t loginuid)
+{
+	struct task_struct *tsk;
+	int *dest;
+	int		   err = 0;
+	unsigned listnr;
+
+	switch (type) {
+	case AUDIT_LIST:
+		/* We can't just spew out the rules here because we might fill
+		 * the available socket buffer space and deadlock waiting for
+		 * auditctl to read from it... which isn't ever going to
+		 * happen if we're actually running in the context of auditctl
+		 * trying to _send_ the stuff */
+		 
+		dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
+		if (!dest)
+			return -ENOMEM;
+		dest[0] = pid;
+		dest[1] = seq;
+
+		tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
+		if (IS_ERR(tsk)) {
+			kfree(dest);
+			err = PTR_ERR(tsk);
+		}
+		break;
+	case AUDIT_ADD:
+		listnr = ((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+		switch(listnr) {
+		default:
+			return -EINVAL;
+
+		case AUDIT_FILTER_USER:
+		case AUDIT_FILTER_TYPE:
+#ifdef CONFIG_AUDITSYSCALL
+		case AUDIT_FILTER_ENTRY:
+		case AUDIT_FILTER_EXIT:
+		case AUDIT_FILTER_TASK:
+#endif
+			;
+		}
+		err = audit_add_rule(data, &audit_filter_list[listnr]);
+		if (!err)
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				  "auid=%u added an audit rule\n", loginuid);
+		break;
+	case AUDIT_DEL:
+		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+		if (listnr >= AUDIT_NR_FILTERS)
+			return -EINVAL;
+
+		err = audit_del_rule(data, &audit_filter_list[listnr]);
+		if (!err)
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				  "auid=%u removed an audit rule\n", loginuid);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return err;
+}
+
+int audit_comparator(const u32 left, const u32 op, const u32 right)
+{
+	switch (op) {
+	case AUDIT_EQUAL:
+		return (left == right);
+	case AUDIT_NOT_EQUAL:
+		return (left != right);
+	case AUDIT_LESS_THAN:
+		return (left < right);
+	case AUDIT_LESS_THAN_OR_EQUAL:
+		return (left <= right);
+	case AUDIT_GREATER_THAN:
+		return (left > right);
+	case AUDIT_GREATER_THAN_OR_EQUAL:
+		return (left >= right);
+	default:
+		return -EINVAL;
+	}
+}
+
+
+
+static int audit_filter_user_rules(struct netlink_skb_parms *cb,
+				   struct audit_rule *rule,
+				   enum audit_state *state)
+{
+	int i;
+
+	for (i = 0; i < rule->field_count; i++) {
+		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
+		u32 value  = rule->values[i];
+		int result = 0;
+
+		switch (field) {
+		case AUDIT_PID:
+			result = audit_comparator(cb->creds.pid, op, value);
+			break;
+		case AUDIT_UID:
+			result = audit_comparator(cb->creds.uid, op, value);
+			break;
+		case AUDIT_GID:
+			result = audit_comparator(cb->creds.gid, op, value);
+			break;
+		case AUDIT_LOGINUID:
+			result = audit_comparator(cb->loginuid, op, value);
+			break;
+		}
+
+		if (!result)
+			return 0;
+	}
+	switch (rule->action) {
+	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
+	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
+	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	}
+	return 1;
+}
+
+int audit_filter_user(struct netlink_skb_parms *cb, int type)
+{
+	struct audit_entry *e;
+	enum audit_state   state;
+	int ret = 1;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+		if (audit_filter_user_rules(cb, &e->rule, &state)) {
+			if (state == AUDIT_DISABLED)
+				ret = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret; /* Audit by default */
+}
+
+int audit_filter_type(int type)
+{
+	struct audit_entry *e;
+	int result = 0;
+	
+	rcu_read_lock();
+	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
+		goto unlock_and_return;
+
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
+				list) {
+		struct audit_rule *rule = &e->rule;
+		int i;
+		for (i = 0; i < rule->field_count; i++) {
+			u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
+			u32 op  = rule->fields[i] & AUDIT_OPERATORS;
+			u32 value  = rule->values[i];
+			if ( field == AUDIT_MSGTYPE ) {
+				result = audit_comparator(type, op, value); 
+				if (!result)
+					break;
+			}
+		}
+		if (result)
+			goto unlock_and_return;
+	}
+unlock_and_return:
+	rcu_read_unlock();
+	return result;
+}
+
+
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4ef14515da35..17719b303638 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -52,17 +52,15 @@
 #include <linux/audit.h>
 #include <linux/personality.h>
 #include <linux/time.h>
-#include <linux/kthread.h>
 #include <linux/netlink.h>
 #include <linux/compiler.h>
 #include <asm/unistd.h>
 #include <linux/security.h>
+#include <linux/list.h>
 
-/* 0 = no checking
-   1 = put_count checking
-   2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
+#include "audit.h"
+
+extern struct list_head audit_filter_list[];
 
 /* No syscall auditing will take place unless audit_enabled != 0. */
 extern int audit_enabled;
@@ -76,29 +74,6 @@ extern int audit_enabled;
  * path_lookup. */
 #define AUDIT_NAMES_RESERVED 7
 
-/* At task start time, the audit_state is set in the audit_context using
-   a per-task filter.  At syscall entry, the audit_state is augmented by
-   the syscall filter. */
-enum audit_state {
-	AUDIT_DISABLED,		/* Do not create per-task audit_context.
-				 * No syscall-specific audit records can
-				 * be generated. */
-	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
-				 * but don't necessarily fill it in at
-				 * syscall entry time (i.e., filter
-				 * instead). */
-	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
-				 * and always fill it in at syscall
-				 * entry time.  This makes a full
-				 * syscall record available if some
-				 * other part of the kernel decides it
-				 * should be recorded. */
-	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
-				 * always fill it in at syscall entry
-				 * time, and always write out the audit
-				 * record at syscall exit time.  */
-};
-
 /* When fs/namei.c:getname() is called, we store the pointer in name and
  * we don't let putname() free it (instead we free all of the saved
  * pointers at syscall exit time).
@@ -183,264 +158,6 @@ struct audit_context {
 #endif
 };
 
-				/* Public API */
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
-static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
-	LIST_HEAD_INIT(audit_filter_list[0]),
-	LIST_HEAD_INIT(audit_filter_list[1]),
-	LIST_HEAD_INIT(audit_filter_list[2]),
-	LIST_HEAD_INIT(audit_filter_list[3]),
-	LIST_HEAD_INIT(audit_filter_list[4]),
-	LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
-#error Fix audit_filter_list initialiser
-#endif
-};
-
-struct audit_entry {
-	struct list_head  list;
-	struct rcu_head   rcu;
-	struct audit_rule rule;
-};
-
-extern int audit_pid;
-
-/* Copy rule from user-space to kernel-space.  Called from 
- * audit_add_rule during AUDIT_ADD. */
-static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
-{
-	int i;
-
-	if (s->action != AUDIT_NEVER
-	    && s->action != AUDIT_POSSIBLE
-	    && s->action != AUDIT_ALWAYS)
-		return -1;
-	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
-		return -1;
-	if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
-		return -1;
-
-	d->flags	= s->flags;
-	d->action	= s->action;
-	d->field_count	= s->field_count;
-	for (i = 0; i < d->field_count; i++) {
-		d->fields[i] = s->fields[i];
-		d->values[i] = s->values[i];
-	}
-	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
-	return 0;
-}
-
-/* Check to see if two rules are identical.  It is called from
- * audit_add_rule during AUDIT_ADD and 
- * audit_del_rule during AUDIT_DEL. */
-static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
-{
-	int i;
-
-	if (a->flags != b->flags)
-		return 1;
-
-	if (a->action != b->action)
-		return 1;
-
-	if (a->field_count != b->field_count)
-		return 1;
-
-	for (i = 0; i < a->field_count; i++) {
-		if (a->fields[i] != b->fields[i]
-		    || a->values[i] != b->values[i])
-			return 1;
-	}
-
-	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
-		if (a->mask[i] != b->mask[i])
-			return 1;
-
-	return 0;
-}
-
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
- * audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_rule *rule,
-				  struct list_head *list)
-{
-	struct audit_entry  *entry;
-	int i;
-
-	/* Do not use the _rcu iterator here, since this is the only
-	 * addition routine. */
-	list_for_each_entry(entry, list, list) {
-		if (!audit_compare_rule(rule, &entry->rule)) {
-			return -EEXIST;
-		}
-	}
-
-	for (i = 0; i < rule->field_count; i++) {
-		if (rule->fields[i] & AUDIT_UNUSED_BITS)
-			return -EINVAL;
-		if ( rule->fields[i] & AUDIT_NEGATE )
-			rule->fields[i] |= AUDIT_NOT_EQUAL;
-		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
-			rule->fields[i] |= AUDIT_EQUAL;
-		rule->fields[i] &= (~AUDIT_NEGATE);
-	}
-
-	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
-		return -ENOMEM;
-	if (audit_copy_rule(&entry->rule, rule)) {
-		kfree(entry);
-		return -EINVAL;
-	}
-
-	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
-		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
-		list_add_rcu(&entry->list, list);
-	} else {
-		list_add_tail_rcu(&entry->list, list);
-	}
-
-	return 0;
-}
-
-static inline void audit_free_rule(struct rcu_head *head)
-{
-	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
-	kfree(e);
-}
-
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
- * audit_netlink_sem. */
-static inline int audit_del_rule(struct audit_rule *rule,
-				 struct list_head *list)
-{
-	struct audit_entry  *e;
-
-	/* Do not use the _rcu iterator here, since this is the only
-	 * deletion routine. */
-	list_for_each_entry(e, list, list) {
-		if (!audit_compare_rule(rule, &e->rule)) {
-			list_del_rcu(&e->list);
-			call_rcu(&e->rcu, audit_free_rule);
-			return 0;
-		}
-	}
-	return -ENOENT;		/* No matching rule */
-}
-
-static int audit_list_rules(void *_dest)
-{
-	int pid, seq;
-	int *dest = _dest;
-	struct audit_entry *entry;
-	int i;
-
-	pid = dest[0];
-	seq = dest[1];
-	kfree(dest);
-
-	down(&audit_netlink_sem);
-
-	/* The *_rcu iterators not needed here because we are
-	   always called with audit_netlink_sem held. */
-	for (i=0; i<AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry(entry, &audit_filter_list[i], list)
-			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 &entry->rule, sizeof(entry->rule));
-	}
-	audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
-	
-	up(&audit_netlink_sem);
-	return 0;
-}
-
-/**
- * audit_receive_filter - apply all rules to the specified message type
- * @type: audit message type
- * @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
- * @seq: netlink audit message sequence (serial) number
- * @data: payload data
- * @loginuid: loginuid of sender
- */
-int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-							uid_t loginuid)
-{
-	struct task_struct *tsk;
-	int *dest;
-	int		   err = 0;
-	unsigned listnr;
-
-	switch (type) {
-	case AUDIT_LIST:
-		/* We can't just spew out the rules here because we might fill
-		 * the available socket buffer space and deadlock waiting for
-		 * auditctl to read from it... which isn't ever going to
-		 * happen if we're actually running in the context of auditctl
-		 * trying to _send_ the stuff */
-		 
-		dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
-		if (!dest)
-			return -ENOMEM;
-		dest[0] = pid;
-		dest[1] = seq;
-
-		tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
-		if (IS_ERR(tsk)) {
-			kfree(dest);
-			err = PTR_ERR(tsk);
-		}
-		break;
-	case AUDIT_ADD:
-		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
-		if (listnr >= AUDIT_NR_FILTERS)
-			return -EINVAL;
-
-		err = audit_add_rule(data, &audit_filter_list[listnr]);
-		if (!err)
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "auid=%u added an audit rule\n", loginuid);
-		break;
-	case AUDIT_DEL:
-		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
-		if (listnr >= AUDIT_NR_FILTERS)
-			return -EINVAL;
-
-		err = audit_del_rule(data, &audit_filter_list[listnr]);
-		if (!err)
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "auid=%u removed an audit rule\n", loginuid);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return err;
-}
-
-static int audit_comparator(const u32 left, const u32 op, const u32 right)
-{
-	switch (op) {
-	case AUDIT_EQUAL:
-		return (left == right);
-	case AUDIT_NOT_EQUAL:
-		return (left != right);
-	case AUDIT_LESS_THAN:
-		return (left < right);
-	case AUDIT_LESS_THAN_OR_EQUAL:
-		return (left <= right);
-	case AUDIT_GREATER_THAN:
-		return (left > right);
-	case AUDIT_GREATER_THAN_OR_EQUAL:
-		return (left >= right);
-	default:
-		return -EINVAL;
-	}
-}
 
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
@@ -613,95 +330,6 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 	return AUDIT_BUILD_CONTEXT;
 }
 
-static int audit_filter_user_rules(struct netlink_skb_parms *cb,
-				   struct audit_rule *rule,
-				   enum audit_state *state)
-{
-	int i;
-
-	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-		u32 value  = rule->values[i];
-		int result = 0;
-
-		switch (field) {
-		case AUDIT_PID:
-			result = audit_comparator(cb->creds.pid, op, value);
-			break;
-		case AUDIT_UID:
-			result = audit_comparator(cb->creds.uid, op, value);
-			break;
-		case AUDIT_GID:
-			result = audit_comparator(cb->creds.gid, op, value);
-			break;
-		case AUDIT_LOGINUID:
-			result = audit_comparator(cb->loginuid, op, value);
-			break;
-		}
-
-		if (!result)
-			return 0;
-	}
-	switch (rule->action) {
-	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
-	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
-	}
-	return 1;
-}
-
-int audit_filter_user(struct netlink_skb_parms *cb, int type)
-{
-	struct audit_entry *e;
-	enum audit_state   state;
-	int ret = 1;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
-		if (audit_filter_user_rules(cb, &e->rule, &state)) {
-			if (state == AUDIT_DISABLED)
-				ret = 0;
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	return ret; /* Audit by default */
-}
-
-int audit_filter_type(int type)
-{
-	struct audit_entry *e;
-	int result = 0;
-	
-	rcu_read_lock();
-	if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE]))
-		goto unlock_and_return;
-
-	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
-				list) {
-		struct audit_rule *rule = &e->rule;
-		int i;
-		for (i = 0; i < rule->field_count; i++) {
-			u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-			u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-			u32 value  = rule->values[i];
-			if ( field == AUDIT_MSGTYPE ) {
-				result = audit_comparator(type, op, value); 
-				if (!result)
-					break;
-			}
-		}
-		if (result)
-			goto unlock_and_return;
-	}
-unlock_and_return:
-	rcu_read_unlock();
-	return result;
-}
-
-
 /* This should be called with task_lock() held. */
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
-- 
cgit v1.2.3


From d884596f44ef5a0bcd8a66405dc04902aeaa6fc7 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 16 Dec 2005 10:48:28 +0000
Subject: [PATCH] Minor cosmetic cleanups to the code moved into auditfilter.c

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 kernel/auditfilter.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7f347c360876..a3a32752f973 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -69,7 +69,7 @@ static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
 /* Check to see if two rules are identical.  It is called from
  * audit_add_rule during AUDIT_ADD and 
  * audit_del_rule during AUDIT_DEL. */
-static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 {
 	int i;
 
@@ -107,19 +107,18 @@ static inline int audit_add_rule(struct audit_rule *rule,
 	/* Do not use the _rcu iterator here, since this is the only
 	 * addition routine. */
 	list_for_each_entry(entry, list, list) {
-		if (!audit_compare_rule(rule, &entry->rule)) {
+		if (!audit_compare_rule(rule, &entry->rule))
 			return -EEXIST;
-		}
 	}
 
 	for (i = 0; i < rule->field_count; i++) {
 		if (rule->fields[i] & AUDIT_UNUSED_BITS)
 			return -EINVAL;
-		if ( rule->fields[i] & AUDIT_NEGATE )
+		if ( rule->fields[i] & AUDIT_NEGATE)
 			rule->fields[i] |= AUDIT_NOT_EQUAL;
 		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
 			rule->fields[i] |= AUDIT_EQUAL;
-		rule->fields[i] &= (~AUDIT_NEGATE);
+		rule->fields[i] &= ~AUDIT_NEGATE;
 	}
 
 	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
@@ -374,5 +373,3 @@ unlock_and_return:
 	rcu_read_unlock();
 	return result;
 }
-
-
-- 
cgit v1.2.3


From 93315ed6dd12dacfc941f9eb8ca0293aadf99793 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Tue, 7 Feb 2006 12:05:27 -0500
Subject: [PATCH] audit string fields interface + consumer

Updated patch to dynamically allocate audit rule fields in kernel's
internal representation.  Added unlikely() calls for testing memory
allocation result.

Amy Griffis wrote:     [Wed Jan 11 2006, 02:02:31PM EST]
> Modify audit's kernel-userspace interface to allow the specification
> of string fields in audit rules.
>
> Signed-off-by: Amy Griffis <amy.griffis@hp.com>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
(cherry picked from 5ffc4a863f92351b720fe3e9c5cd647accff9e03 commit)
---
 include/linux/audit.h       |  31 ++-
 kernel/audit.c              |  19 +-
 kernel/audit.h              |  23 ++-
 kernel/auditfilter.c        | 467 +++++++++++++++++++++++++++++++++-----------
 kernel/auditsc.c            |  50 +++--
 security/selinux/nlmsgtab.c |   3 +
 6 files changed, 448 insertions(+), 145 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8868c96ca8a2..8a3b98175c25 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -50,15 +50,18 @@
  */
 #define AUDIT_GET		1000	/* Get status */
 #define AUDIT_SET		1001	/* Set status (enable/disable/auditd) */
-#define AUDIT_LIST		1002	/* List syscall filtering rules */
-#define AUDIT_ADD		1003	/* Add syscall filtering rule */
-#define AUDIT_DEL		1004	/* Delete syscall filtering rule */
+#define AUDIT_LIST		1002	/* List syscall rules -- deprecated */
+#define AUDIT_ADD		1003	/* Add syscall rule -- deprecated */
+#define AUDIT_DEL		1004	/* Delete syscall rule -- deprecated */
 #define AUDIT_USER		1005	/* Message from userspace -- deprecated */
 #define AUDIT_LOGIN		1006	/* Define the login id and information */
 #define AUDIT_WATCH_INS		1007	/* Insert file/dir watch entry */
 #define AUDIT_WATCH_REM		1008	/* Remove file/dir watch entry */
 #define AUDIT_WATCH_LIST	1009	/* List all file/dir watches */
 #define AUDIT_SIGNAL_INFO	1010	/* Get info about sender of signal to auditd */
+#define AUDIT_ADD_RULE		1011	/* Add syscall filtering rule */
+#define AUDIT_DEL_RULE		1012	/* Delete syscall filtering rule */
+#define AUDIT_LIST_RULES	1013	/* List syscall filtering rules */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
@@ -229,6 +232,26 @@ struct audit_status {
 	__u32		backlog;	/* messages waiting in queue */
 };
 
+/* audit_rule_data supports filter rules with both integer and string
+ * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
+ * AUDIT_LIST_RULES requests.
+ */
+struct audit_rule_data {
+	__u32		flags;	/* AUDIT_PER_{TASK,CALL}, AUDIT_PREPEND */
+	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
+	__u32		field_count;
+	__u32		mask[AUDIT_BITMASK_SIZE];
+	__u32		fields[AUDIT_MAX_FIELDS];
+	__u32		values[AUDIT_MAX_FIELDS];
+	__u32		fieldflags[AUDIT_MAX_FIELDS];
+	__u32		buflen;	/* total length of string fields */
+	char		buf[0];	/* string fields buffer */
+};
+
+/* audit_rule is supported to maintain backward compatibility with
+ * userspace.  It supports integer fields only and corresponds to
+ * AUDIT_ADD, AUDIT_DEL and AUDIT_LIST requests.
+ */
 struct audit_rule {		/* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */
 	__u32		flags;	/* AUDIT_PER_{TASK,CALL}, AUDIT_PREPEND */
 	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
@@ -338,7 +361,7 @@ extern void		    audit_log_d_path(struct audit_buffer *ab,
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 extern int audit_filter_type(int type);
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
-				 void *data, uid_t loginuid);
+				 void *data, size_t datasz, uid_t loginuid);
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
 #define audit_log_start(c,g,t) ({ NULL; })
diff --git a/kernel/audit.c b/kernel/audit.c
index 07c5d2bdd38c..4eb97b62d7fa 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -52,6 +52,7 @@
 #include <linux/audit.h>
 
 #include <net/sock.h>
+#include <net/netlink.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 
@@ -361,9 +362,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 	switch (msg_type) {
 	case AUDIT_GET:
 	case AUDIT_LIST:
+	case AUDIT_LIST_RULES:
 	case AUDIT_SET:
 	case AUDIT_ADD:
+	case AUDIT_ADD_RULE:
 	case AUDIT_DEL:
+	case AUDIT_DEL_RULE:
 	case AUDIT_SIGNAL_INFO:
 		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
 			err = -EPERM;
@@ -470,12 +474,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		break;
 	case AUDIT_ADD:
 	case AUDIT_DEL:
-		if (nlh->nlmsg_len < sizeof(struct audit_rule))
+		if (nlmsg_len(nlh) < sizeof(struct audit_rule))
 			return -EINVAL;
 		/* fallthrough */
 	case AUDIT_LIST:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
-					   uid, seq, data, loginuid);
+					   uid, seq, data, nlmsg_len(nlh),
+					   loginuid);
+		break;
+	case AUDIT_ADD_RULE:
+	case AUDIT_DEL_RULE:
+		if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+			return -EINVAL;
+		/* fallthrough */
+	case AUDIT_LIST_RULES:
+		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+					   uid, seq, data, nlmsg_len(nlh),
+					   loginuid);
 		break;
 	case AUDIT_SIGNAL_INFO:
 		sig_data.uid = audit_sig_uid;
diff --git a/kernel/audit.h b/kernel/audit.h
index 7643e46daeb2..4b602cdcabef 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -52,10 +52,27 @@ enum audit_state {
 };
 
 /* Rule lists */
+struct audit_field {
+	u32			type;
+	u32			val;
+	u32			op;
+};
+
+struct audit_krule {
+	int			vers_ops;
+	u32			flags;
+	u32			listnr;
+	u32			action;
+	u32			mask[AUDIT_BITMASK_SIZE];
+	u32			buflen; /* for data alloc on list rules */
+	u32			field_count;
+	struct audit_field	*fields;
+};
+
 struct audit_entry {
-	struct list_head  list;
-	struct rcu_head   rcu;
-	struct audit_rule rule;
+	struct list_head	list;
+	struct rcu_head		rcu;
+	struct audit_krule	rule;
 };
 
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a3a32752f973..686d514a3518 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -40,52 +40,279 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #endif
 };
 
-/* Copy rule from user-space to kernel-space.  Called from 
- * audit_add_rule during AUDIT_ADD. */
-static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+static inline void audit_free_rule(struct audit_entry *e)
 {
+	kfree(e->rule.fields);
+	kfree(e);
+}
+
+static inline void audit_free_rule_rcu(struct rcu_head *head)
+{
+	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
+	audit_free_rule(e);
+}
+
+/* Unpack a filter field's string representation from user-space
+ * buffer. */
+static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
+{
+	char *str;
+
+	if (!*bufp || (len == 0) || (len > *remain))
+		return ERR_PTR(-EINVAL);
+
+	/* Of the currently implemented string fields, PATH_MAX
+	 * defines the longest valid length.
+	 */
+	if (len > PATH_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	str = kmalloc(len + 1, GFP_KERNEL);
+	if (unlikely(!str))
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(str, *bufp, len);
+	str[len] = 0;
+	*bufp += len;
+	*remain -= len;
+
+	return str;
+}
+
+/* Common user-space to kernel rule translation. */
+static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
+{
+	unsigned listnr;
+	struct audit_entry *entry;
+	struct audit_field *fields;
+	int i, err;
+
+	err = -EINVAL;
+	listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
+	switch(listnr) {
+	default:
+		goto exit_err;
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+#ifdef CONFIG_AUDITSYSCALL
+	case AUDIT_FILTER_ENTRY:
+	case AUDIT_FILTER_EXIT:
+	case AUDIT_FILTER_TASK:
+#endif
+		;
+	}
+	if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
+	    rule->action != AUDIT_ALWAYS)
+		goto exit_err;
+	if (rule->field_count > AUDIT_MAX_FIELDS)
+		goto exit_err;
+
+	err = -ENOMEM;
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (unlikely(!entry))
+		goto exit_err;
+	fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL);
+	if (unlikely(!fields)) {
+		kfree(entry);
+		goto exit_err;
+	}
+
+	memset(&entry->rule, 0, sizeof(struct audit_krule));
+	memset(fields, 0, sizeof(struct audit_field));
+
+	entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
+	entry->rule.listnr = listnr;
+	entry->rule.action = rule->action;
+	entry->rule.field_count = rule->field_count;
+	entry->rule.fields = fields;
+
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+		entry->rule.mask[i] = rule->mask[i];
+
+	return entry;
+
+exit_err:
+	return ERR_PTR(err);
+}
+
+/* Translate struct audit_rule to kernel's rule respresentation.
+ * Exists for backward compatibility with userspace. */
+static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+{
+	struct audit_entry *entry;
+	int err = 0;
 	int i;
 
-	if (s->action != AUDIT_NEVER
-	    && s->action != AUDIT_POSSIBLE
-	    && s->action != AUDIT_ALWAYS)
-		return -1;
-	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
-		return -1;
-	if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
-		return -1;
-
-	d->flags	= s->flags;
-	d->action	= s->action;
-	d->field_count	= s->field_count;
-	for (i = 0; i < d->field_count; i++) {
-		d->fields[i] = s->fields[i];
-		d->values[i] = s->values[i];
+	entry = audit_to_entry_common(rule);
+	if (IS_ERR(entry))
+		goto exit_nofree;
+
+	for (i = 0; i < rule->field_count; i++) {
+		struct audit_field *f = &entry->rule.fields[i];
+
+		if (rule->fields[i] & AUDIT_UNUSED_BITS) {
+			err = -EINVAL;
+			goto exit_free;
+		}
+
+		f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
+		f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
+		f->val = rule->values[i];
+
+		entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
+		if (f->op & AUDIT_NEGATE)
+			f->op |= AUDIT_NOT_EQUAL;
+		else if (!(f->op & AUDIT_OPERATORS))
+			f->op |= AUDIT_EQUAL;
+		f->op &= ~AUDIT_NEGATE;
 	}
-	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
-	return 0;
+
+exit_nofree:
+	return entry;
+
+exit_free:
+	audit_free_rule(entry);
+	return ERR_PTR(err);
 }
 
-/* Check to see if two rules are identical.  It is called from
- * audit_add_rule during AUDIT_ADD and 
- * audit_del_rule during AUDIT_DEL. */
-static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+/* Translate struct audit_rule_data to kernel's rule respresentation. */
+static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
+					       size_t datasz)
 {
+	int err = 0;
+	struct audit_entry *entry;
+	void *bufp;
+	/* size_t remain = datasz - sizeof(struct audit_rule_data); */
 	int i;
 
-	if (a->flags != b->flags)
-		return 1;
+	entry = audit_to_entry_common((struct audit_rule *)data);
+	if (IS_ERR(entry))
+		goto exit_nofree;
 
-	if (a->action != b->action)
-		return 1;
+	bufp = data->buf;
+	entry->rule.vers_ops = 2;
+	for (i = 0; i < data->field_count; i++) {
+		struct audit_field *f = &entry->rule.fields[i];
+
+		err = -EINVAL;
+		if (!(data->fieldflags[i] & AUDIT_OPERATORS) ||
+		    data->fieldflags[i] & ~AUDIT_OPERATORS)
+			goto exit_free;
+
+		f->op = data->fieldflags[i] & AUDIT_OPERATORS;
+		f->type = data->fields[i];
+		switch(f->type) {
+		/* call type-specific conversion routines here */
+		default:
+			f->val = data->values[i];
+		}
+	}
+
+exit_nofree:
+	return entry;
+
+exit_free:
+	audit_free_rule(entry);
+	return ERR_PTR(err);
+}
+
+/* Pack a filter field's string representation into data block. */
+static inline size_t audit_pack_string(void **bufp, char *str)
+{
+	size_t len = strlen(str);
+
+	memcpy(*bufp, str, len);
+	*bufp += len;
+
+	return len;
+}
+
+/* Translate kernel rule respresentation to struct audit_rule.
+ * Exists for backward compatibility with userspace. */
+static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
+{
+	struct audit_rule *rule;
+	int i;
+
+	rule = kmalloc(sizeof(*rule), GFP_KERNEL);
+	if (unlikely(!rule))
+		return ERR_PTR(-ENOMEM);
+	memset(rule, 0, sizeof(*rule));
+
+	rule->flags = krule->flags | krule->listnr;
+	rule->action = krule->action;
+	rule->field_count = krule->field_count;
+	for (i = 0; i < rule->field_count; i++) {
+		rule->values[i] = krule->fields[i].val;
+		rule->fields[i] = krule->fields[i].type;
+
+		if (krule->vers_ops == 1) {
+			if (krule->fields[i].op & AUDIT_NOT_EQUAL)
+				rule->fields[i] |= AUDIT_NEGATE;
+		} else {
+			rule->fields[i] |= krule->fields[i].op;
+		}
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
+
+	return rule;
+}
 
-	if (a->field_count != b->field_count)
+/* Translate kernel rule respresentation to struct audit_rule_data. */
+static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
+{
+	struct audit_rule_data *data;
+	void *bufp;
+	int i;
+
+	data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+	if (unlikely(!data))
+		return ERR_PTR(-ENOMEM);
+	memset(data, 0, sizeof(*data));
+
+	data->flags = krule->flags | krule->listnr;
+	data->action = krule->action;
+	data->field_count = krule->field_count;
+	bufp = data->buf;
+	for (i = 0; i < data->field_count; i++) {
+		struct audit_field *f = &krule->fields[i];
+
+		data->fields[i] = f->type;
+		data->fieldflags[i] = f->op;
+		switch(f->type) {
+		/* call type-specific conversion routines here */
+		default:
+			data->values[i] = f->val;
+		}
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i];
+
+	return data;
+}
+
+/* Compare two rules in kernel format.  Considered success if rules
+ * don't match. */
+static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
+{
+	int i;
+
+	if (a->flags != b->flags ||
+	    a->listnr != b->listnr ||
+	    a->action != b->action ||
+	    a->field_count != b->field_count)
 		return 1;
 
 	for (i = 0; i < a->field_count; i++) {
-		if (a->fields[i] != b->fields[i]
-		    || a->values[i] != b->values[i])
+		if (a->fields[i].type != b->fields[i].type ||
+		    a->fields[i].op != b->fields[i].op)
 			return 1;
+
+		switch(a->fields[i].type) {
+		/* call type-specific comparison routines here */
+		default:
+			if (a->fields[i].val != b->fields[i].val)
+				return 1;
+		}
 	}
 
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
@@ -95,41 +322,21 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
 	return 0;
 }
 
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
+/* Add rule to given filterlist if not a duplicate.  Protected by
  * audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_rule *rule,
+static inline int audit_add_rule(struct audit_entry *entry,
 				  struct list_head *list)
 {
-	struct audit_entry  *entry;
-	int i;
+	struct audit_entry *e;
 
 	/* Do not use the _rcu iterator here, since this is the only
 	 * addition routine. */
-	list_for_each_entry(entry, list, list) {
-		if (!audit_compare_rule(rule, &entry->rule))
+	list_for_each_entry(e, list, list) {
+		if (!audit_compare_rule(&entry->rule, &e->rule))
 			return -EEXIST;
 	}
 
-	for (i = 0; i < rule->field_count; i++) {
-		if (rule->fields[i] & AUDIT_UNUSED_BITS)
-			return -EINVAL;
-		if ( rule->fields[i] & AUDIT_NEGATE)
-			rule->fields[i] |= AUDIT_NOT_EQUAL;
-		else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 )
-			rule->fields[i] |= AUDIT_EQUAL;
-		rule->fields[i] &= ~AUDIT_NEGATE;
-	}
-
-	if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
-		return -ENOMEM;
-	if (audit_copy_rule(&entry->rule, rule)) {
-		kfree(entry);
-		return -EINVAL;
-	}
-
 	if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
-		entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
 		list_add_rcu(&entry->list, list);
 	} else {
 		list_add_tail_rcu(&entry->list, list);
@@ -138,16 +345,9 @@ static inline int audit_add_rule(struct audit_rule *rule,
 	return 0;
 }
 
-static inline void audit_free_rule(struct rcu_head *head)
-{
-	struct audit_entry *e = container_of(head, struct audit_entry, rcu);
-	kfree(e);
-}
-
-/* Note that audit_add_rule and audit_del_rule are called via
- * audit_receive() in audit.c, and are protected by
+/* Remove an existing rule from filterlist.  Protected by
  * audit_netlink_sem. */
-static inline int audit_del_rule(struct audit_rule *rule,
+static inline int audit_del_rule(struct audit_entry *entry,
 				 struct list_head *list)
 {
 	struct audit_entry  *e;
@@ -155,16 +355,18 @@ static inline int audit_del_rule(struct audit_rule *rule,
 	/* Do not use the _rcu iterator here, since this is the only
 	 * deletion routine. */
 	list_for_each_entry(e, list, list) {
-		if (!audit_compare_rule(rule, &e->rule)) {
+		if (!audit_compare_rule(&entry->rule, &e->rule)) {
 			list_del_rcu(&e->list);
-			call_rcu(&e->rcu, audit_free_rule);
+			call_rcu(&e->rcu, audit_free_rule_rcu);
 			return 0;
 		}
 	}
 	return -ENOENT;		/* No matching rule */
 }
 
-static int audit_list_rules(void *_dest)
+/* List rules using struct audit_rule.  Exists for backward
+ * compatibility with userspace. */
+static int audit_list(void *_dest)
 {
 	int pid, seq;
 	int *dest = _dest;
@@ -180,9 +382,16 @@ static int audit_list_rules(void *_dest)
 	/* The *_rcu iterators not needed here because we are
 	   always called with audit_netlink_sem held. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
-		list_for_each_entry(entry, &audit_filter_list[i], list)
+		list_for_each_entry(entry, &audit_filter_list[i], list) {
+			struct audit_rule *rule;
+
+			rule = audit_krule_to_rule(&entry->rule);
+			if (unlikely(!rule))
+				break;
 			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
-					 &entry->rule, sizeof(entry->rule));
+					 rule, sizeof(*rule));
+			kfree(rule);
+		}
 	}
 	audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
 	
@@ -190,6 +399,40 @@ static int audit_list_rules(void *_dest)
 	return 0;
 }
 
+/* List rules using struct audit_rule_data. */
+static int audit_list_rules(void *_dest)
+{
+	int pid, seq;
+	int *dest = _dest;
+	struct audit_entry *e;
+	int i;
+
+	pid = dest[0];
+	seq = dest[1];
+	kfree(dest);
+
+	down(&audit_netlink_sem);
+
+	/* The *_rcu iterators not needed here because we are
+	   always called with audit_netlink_sem held. */
+	for (i=0; i<AUDIT_NR_FILTERS; i++) {
+		list_for_each_entry(e, &audit_filter_list[i], list) {
+			struct audit_rule_data *data;
+
+			data = audit_krule_to_data(&e->rule);
+			if (unlikely(!data))
+				break;
+			audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
+					 data, sizeof(*data));
+			kfree(data);
+		}
+	}
+	audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+
+	up(&audit_netlink_sem);
+	return 0;
+}
+
 /**
  * audit_receive_filter - apply all rules to the specified message type
  * @type: audit message type
@@ -197,18 +440,20 @@ static int audit_list_rules(void *_dest)
  * @uid: target uid for netlink audit messages
  * @seq: netlink audit message sequence (serial) number
  * @data: payload data
+ * @datasz: size of payload data
  * @loginuid: loginuid of sender
  */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-							uid_t loginuid)
+			 size_t datasz, uid_t loginuid)
 {
 	struct task_struct *tsk;
 	int *dest;
-	int		   err = 0;
-	unsigned listnr;
+	int err = 0;
+	struct audit_entry *entry;
 
 	switch (type) {
 	case AUDIT_LIST:
+	case AUDIT_LIST_RULES:
 		/* We can't just spew out the rules here because we might fill
 		 * the available socket buffer space and deadlock waiting for
 		 * auditctl to read from it... which isn't ever going to
@@ -221,41 +466,48 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 		dest[0] = pid;
 		dest[1] = seq;
 
-		tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
+		if (type == AUDIT_LIST)
+			tsk = kthread_run(audit_list, dest, "audit_list");
+		else
+			tsk = kthread_run(audit_list_rules, dest,
+					  "audit_list_rules");
 		if (IS_ERR(tsk)) {
 			kfree(dest);
 			err = PTR_ERR(tsk);
 		}
 		break;
 	case AUDIT_ADD:
-		listnr = ((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
-		switch(listnr) {
-		default:
-			return -EINVAL;
-
-		case AUDIT_FILTER_USER:
-		case AUDIT_FILTER_TYPE:
-#ifdef CONFIG_AUDITSYSCALL
-		case AUDIT_FILTER_ENTRY:
-		case AUDIT_FILTER_EXIT:
-		case AUDIT_FILTER_TASK:
-#endif
-			;
-		}
-		err = audit_add_rule(data, &audit_filter_list[listnr]);
+	case AUDIT_ADD_RULE:
+		if (type == AUDIT_ADD)
+			entry = audit_rule_to_entry(data);
+		else
+			entry = audit_data_to_entry(data, datasz);
+		if (IS_ERR(entry))
+			return PTR_ERR(entry);
+
+		err = audit_add_rule(entry,
+				     &audit_filter_list[entry->rule.listnr]);
 		if (!err)
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 				  "auid=%u added an audit rule\n", loginuid);
+		else
+			audit_free_rule(entry);
 		break;
 	case AUDIT_DEL:
-		listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
-		if (listnr >= AUDIT_NR_FILTERS)
-			return -EINVAL;
-
-		err = audit_del_rule(data, &audit_filter_list[listnr]);
+	case AUDIT_DEL_RULE:
+		if (type == AUDIT_DEL)
+			entry = audit_rule_to_entry(data);
+		else
+			entry = audit_data_to_entry(data, datasz);
+		if (IS_ERR(entry))
+			return PTR_ERR(entry);
+
+		err = audit_del_rule(entry,
+				     &audit_filter_list[entry->rule.listnr]);
 		if (!err)
 			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 				  "auid=%u removed an audit rule\n", loginuid);
+		audit_free_rule(entry);
 		break;
 	default:
 		return -EINVAL;
@@ -287,29 +539,27 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
 
 
 static int audit_filter_user_rules(struct netlink_skb_parms *cb,
-				   struct audit_rule *rule,
+				   struct audit_krule *rule,
 				   enum audit_state *state)
 {
 	int i;
 
 	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-		u32 value  = rule->values[i];
+		struct audit_field *f = &rule->fields[i];
 		int result = 0;
 
-		switch (field) {
+		switch (f->type) {
 		case AUDIT_PID:
-			result = audit_comparator(cb->creds.pid, op, value);
+			result = audit_comparator(cb->creds.pid, f->op, f->val);
 			break;
 		case AUDIT_UID:
-			result = audit_comparator(cb->creds.uid, op, value);
+			result = audit_comparator(cb->creds.uid, f->op, f->val);
 			break;
 		case AUDIT_GID:
-			result = audit_comparator(cb->creds.gid, op, value);
+			result = audit_comparator(cb->creds.gid, f->op, f->val);
 			break;
 		case AUDIT_LOGINUID:
-			result = audit_comparator(cb->loginuid, op, value);
+			result = audit_comparator(cb->loginuid, f->op, f->val);
 			break;
 		}
 
@@ -354,14 +604,11 @@ int audit_filter_type(int type)
 
 	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE],
 				list) {
-		struct audit_rule *rule = &e->rule;
 		int i;
-		for (i = 0; i < rule->field_count; i++) {
-			u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-			u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-			u32 value  = rule->values[i];
-			if ( field == AUDIT_MSGTYPE ) {
-				result = audit_comparator(type, op, value); 
+		for (i = 0; i < e->rule.field_count; i++) {
+			struct audit_field *f = &e->rule.fields[i];
+			if (f->type == AUDIT_MSGTYPE) {
+				result = audit_comparator(type, f->op, f->val);
 				if (!result)
 					break;
 			}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 17719b303638..ba0878854777 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -162,70 +162,68 @@ struct audit_context {
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise. */
 static int audit_filter_rules(struct task_struct *tsk,
-			      struct audit_rule *rule,
+			      struct audit_krule *rule,
 			      struct audit_context *ctx,
 			      enum audit_state *state)
 {
 	int i, j;
 
 	for (i = 0; i < rule->field_count; i++) {
-		u32 field  = rule->fields[i] & ~AUDIT_OPERATORS;
-		u32 op  = rule->fields[i] & AUDIT_OPERATORS;
-		u32 value  = rule->values[i];
+		struct audit_field *f = &rule->fields[i];
 		int result = 0;
 
-		switch (field) {
+		switch (f->type) {
 		case AUDIT_PID:
-			result = audit_comparator(tsk->pid, op, value);
+			result = audit_comparator(tsk->pid, f->op, f->val);
 			break;
 		case AUDIT_UID:
-			result = audit_comparator(tsk->uid, op, value);
+			result = audit_comparator(tsk->uid, f->op, f->val);
 			break;
 		case AUDIT_EUID:
-			result = audit_comparator(tsk->euid, op, value);
+			result = audit_comparator(tsk->euid, f->op, f->val);
 			break;
 		case AUDIT_SUID:
-			result = audit_comparator(tsk->suid, op, value);
+			result = audit_comparator(tsk->suid, f->op, f->val);
 			break;
 		case AUDIT_FSUID:
-			result = audit_comparator(tsk->fsuid, op, value);
+			result = audit_comparator(tsk->fsuid, f->op, f->val);
 			break;
 		case AUDIT_GID:
-			result = audit_comparator(tsk->gid, op, value);
+			result = audit_comparator(tsk->gid, f->op, f->val);
 			break;
 		case AUDIT_EGID:
-			result = audit_comparator(tsk->egid, op, value);
+			result = audit_comparator(tsk->egid, f->op, f->val);
 			break;
 		case AUDIT_SGID:
-			result = audit_comparator(tsk->sgid, op, value);
+			result = audit_comparator(tsk->sgid, f->op, f->val);
 			break;
 		case AUDIT_FSGID:
-			result = audit_comparator(tsk->fsgid, op, value);
+			result = audit_comparator(tsk->fsgid, f->op, f->val);
 			break;
 		case AUDIT_PERS:
-			result = audit_comparator(tsk->personality, op, value);
+			result = audit_comparator(tsk->personality, f->op, f->val);
 			break;
 		case AUDIT_ARCH:
  			if (ctx)
-				result = audit_comparator(ctx->arch, op, value);
+				result = audit_comparator(ctx->arch, f->op, f->val);
 			break;
 
 		case AUDIT_EXIT:
 			if (ctx && ctx->return_valid)
-				result = audit_comparator(ctx->return_code, op, value);
+				result = audit_comparator(ctx->return_code, f->op, f->val);
 			break;
 		case AUDIT_SUCCESS:
 			if (ctx && ctx->return_valid) {
-				if (value)
-					result = audit_comparator(ctx->return_valid, op, AUDITSC_SUCCESS);
+				if (f->val)
+					result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
 				else
-					result = audit_comparator(ctx->return_valid, op, AUDITSC_FAILURE);
+					result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE);
 			}
 			break;
 		case AUDIT_DEVMAJOR:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(MAJOR(ctx->names[j].dev),	op, value)) {
+					if (audit_comparator(MAJOR(ctx->names[j].dev),	f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -235,7 +233,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_DEVMINOR:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(MINOR(ctx->names[j].dev), op, value)) {
+					if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -245,8 +243,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_INODE:
 			if (ctx) {
 				for (j = 0; j < ctx->name_count; j++) {
-					if (audit_comparator(ctx->names[j].ino, op, value) ||
-					    audit_comparator(ctx->names[j].pino, op, value)) {
+					if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
+					    audit_comparator(ctx->names[j].pino, f->op, f->val)) {
 						++result;
 						break;
 					}
@@ -256,14 +254,14 @@ static int audit_filter_rules(struct task_struct *tsk,
 		case AUDIT_LOGINUID:
 			result = 0;
 			if (ctx)
-				result = audit_comparator(ctx->loginuid, op, value);
+				result = audit_comparator(ctx->loginuid, f->op, f->val);
 			break;
 		case AUDIT_ARG0:
 		case AUDIT_ARG1:
 		case AUDIT_ARG2:
 		case AUDIT_ARG3:
 			if (ctx)
-				result = audit_comparator(ctx->argv[field-AUDIT_ARG0], op, value);
+				result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
 			break;
 		}
 
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index d7c0e912c5f3..73158244cf8c 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -99,6 +99,9 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
 	{ AUDIT_LIST,		NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
 	{ AUDIT_ADD,		NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_DEL,		NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
+	{ AUDIT_LIST_RULES,	NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
+	{ AUDIT_ADD_RULE,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
+	{ AUDIT_DEL_RULE,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_USER,		NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 	{ AUDIT_SIGNAL_INFO,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
 };
-- 
cgit v1.2.3


From 5d3301088f7e412992d9e61cc3604cbdff3090ff Mon Sep 17 00:00:00 2001
From: Steve Grubb <viro@zeniv.linux.org.uk>
Date: Mon, 9 Jan 2006 09:48:17 -0500
Subject: [PATCH] add/remove rule update

Hi,

The following patch adds a little more information to the add/remove rule message emitted
by the kernel.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  2 +-
 kernel/auditfilter.c  | 16 +++++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 8a3b98175c25..d760430c8de3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -240,7 +240,7 @@ struct audit_rule_data {
 	__u32		flags;	/* AUDIT_PER_{TASK,CALL}, AUDIT_PREPEND */
 	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
 	__u32		field_count;
-	__u32		mask[AUDIT_BITMASK_SIZE];
+	__u32		mask[AUDIT_BITMASK_SIZE]; /* syscall(s) affected */
 	__u32		fields[AUDIT_MAX_FIELDS];
 	__u32		values[AUDIT_MAX_FIELDS];
 	__u32		fieldflags[AUDIT_MAX_FIELDS];
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 686d514a3518..35f8fa82bb8b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -487,10 +487,11 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_add_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		if (!err)
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "auid=%u added an audit rule\n", loginuid);
-		else
+		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+			"auid=%u add rule to list=%d res=%d\n",
+			loginuid, entry->rule.listnr, !err);
+
+		if (err)
 			audit_free_rule(entry);
 		break;
 	case AUDIT_DEL:
@@ -504,9 +505,10 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_del_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		if (!err)
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				  "auid=%u removed an audit rule\n", loginuid);
+		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+			"auid=%u remove rule from list=%d res=%d\n",
+			loginuid, entry->rule.listnr, !err);
+
 		audit_free_rule(entry);
 		break;
 	default:
-- 
cgit v1.2.3


From a6c043a887a9db32a545539426ddfc8cc2c28f8f Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Sun, 1 Jan 2006 14:07:00 -0500
Subject: [PATCH] Add tty to syscall audit records

Hi,

>From the RBAC specs:

FAU_SAR.1.1 The TSF shall provide the set of authorized
RBAC administrators with the capability to read the following
audit information from the audit records:

<snip>
(e) The User Session Identifier or Terminal Type

A patch adding the tty for all syscalls is included in this email.
Please apply.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ba0878854777..d3d499272d13 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -57,6 +57,7 @@
 #include <asm/unistd.h>
 #include <linux/security.h>
 #include <linux/list.h>
+#include <linux/tty.h>
 
 #include "audit.h"
 
@@ -573,6 +574,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 	int i;
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
+	const char *tty;
 
 	ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL);
 	if (!ab)
@@ -585,11 +587,15 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 		audit_log_format(ab, " success=%s exit=%ld", 
 				 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 				 context->return_code);
+	if (current->signal->tty && current->signal->tty->name)
+		tty = current->signal->tty->name;
+	else
+		tty = "(none)";
 	audit_log_format(ab,
 		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
 		  " pid=%d auid=%u uid=%u gid=%u"
 		  " euid=%u suid=%u fsuid=%u"
-		  " egid=%u sgid=%u fsgid=%u",
+		  " egid=%u sgid=%u fsgid=%u tty=%s",
 		  context->argv[0],
 		  context->argv[1],
 		  context->argv[2],
@@ -600,7 +606,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 		  context->uid,
 		  context->gid,
 		  context->euid, context->suid, context->fsuid,
-		  context->egid, context->sgid, context->fsgid);
+		  context->egid, context->sgid, context->fsgid, tty);
 	audit_log_task_info(ab, gfp_mask);
 	audit_log_end(ab);
 
-- 
cgit v1.2.3


From d9d9ec6e2c45b22282cd36cf92fcb23d504350a8 Mon Sep 17 00:00:00 2001
From: Dustin Kirkland <dustin.kirkland@us.ibm.com>
Date: Thu, 16 Feb 2006 13:40:01 -0600
Subject: [PATCH] Fix audit operators

Darrel Goeddel initiated a discussion on IRC regarding the possibility
of audit_comparator() returning -EINVAL signaling an invalid operator.

It is possible when creating the rule to assure that the operator is one
of the 6 sane values.  Here's a snip from include/linux/audit.h  Note
that 0 (nonsense) and 7 (all operators) are not valid values for an
operator.

...

/* These are the supported operators.
 *      4  2  1
 *      =  >  <
 *      -------
 *      0  0  0         0       nonsense
 *      0  0  1         1       <
 *      0  1  0         2       >
 *      0  1  1         3       !=
 *      1  0  0         4       =
 *      1  0  1         5       <=
 *      1  1  0         6       >=
 *      1  1  1         7       all operators
 */
...

Furthermore, prior to adding these extended operators, flagging the
AUDIT_NEGATE bit implied !=, and otherwise == was assumed.

The following code forces the operator to be != if the AUDIT_NEGATE bit
was flipped on.  And if no operator was specified, == is assumed.  The
only invalid condition is if the AUDIT_NEGATE bit is off and all of the
AUDIT_EQUAL, AUDIT_LESS_THAN, and AUDIT_GREATER_THAN bits are
on--clearly a nonsensical operator.

Now that this is handled at rule insertion time, the default -EINVAL
return of audit_comparator() is eliminated such that the function can
only return 1 or 0.

If this is acceptable, let's get this applied to the current tree.

:-Dustin

--

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
(cherry picked from 9bf0a8e137040f87d1b563336d4194e38fb2ba1a commit)
---
 kernel/auditfilter.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 35f8fa82bb8b..b85fd8cce11f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -160,11 +160,17 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		f->val = rule->values[i];
 
 		entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
+
+		/* Support for legacy operators where
+		 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
 		if (f->op & AUDIT_NEGATE)
-			f->op |= AUDIT_NOT_EQUAL;
-		else if (!(f->op & AUDIT_OPERATORS))
-			f->op |= AUDIT_EQUAL;
-		f->op &= ~AUDIT_NEGATE;
+			f->op = AUDIT_NOT_EQUAL;
+		else if (!f->op)
+			f->op = AUDIT_EQUAL;
+		else if (f->op == AUDIT_OPERATORS) {
+			err = -EINVAL;
+			goto exit_free;
+		}
 	}
 
 exit_nofree:
@@ -533,9 +539,9 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
 		return (left > right);
 	case AUDIT_GREATER_THAN_OR_EQUAL:
 		return (left >= right);
-	default:
-		return -EINVAL;
 	}
+	BUG();
+	return 0;
 }
 
 
-- 
cgit v1.2.3


From 4023e020807ea249ae83f0d1d851b4c7cf0afd8a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 7 Mar 2006 23:51:39 -0800
Subject: [PATCH] simplify audit_free() locking

Simplify audit_free()'s locking: no need to lock a task that we are tearing
down.  [the extra locking also caused false positives in the lock
validator]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d3d499272d13..b613ec89e99c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -704,10 +704,14 @@ void audit_free(struct task_struct *tsk)
 {
 	struct audit_context *context;
 
-	task_lock(tsk);
+	/*
+	 * No need to lock the task - when we execute audit_free()
+	 * then the task has no external references anymore, and
+	 * we are tearing it down. (The locking also confuses
+	 * DEBUG_LOCKDEP - this freeing may occur in softirq
+	 * contexts as well, via RCU.)
+	 */
 	context = audit_get_context(tsk, 0, 0);
-	task_unlock(tsk);
-
 	if (likely(!context))
 		return;
 
-- 
cgit v1.2.3


From 5a0bbce58bb25bd756f7ec437319d6ed2201a18b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 7 Mar 2006 23:51:38 -0800
Subject: [PATCH] sem2mutex: audit_netlink_sem

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c       |  6 +++---
 kernel/audit.h       |  3 ++-
 kernel/auditfilter.c | 16 ++++++++--------
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 4eb97b62d7fa..6a44e0a7707d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -114,7 +114,7 @@ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 /* The netlink socket is only to be read by 1 CPU, which lets us assume
  * that list additions and deletions never happen simultaneously in
  * auditsc.c */
-DECLARE_MUTEX(audit_netlink_sem);
+DEFINE_MUTEX(audit_netlink_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records.  Since printk uses a 1024 byte buffer, this buffer
@@ -538,14 +538,14 @@ static void audit_receive(struct sock *sk, int length)
 	struct sk_buff  *skb;
 	unsigned int qlen;
 
-	down(&audit_netlink_sem);
+	mutex_lock(&audit_netlink_mutex);
 
 	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		audit_receive_skb(skb);
 		kfree_skb(skb);
 	}
-	up(&audit_netlink_sem);
+	mutex_unlock(&audit_netlink_mutex);
 }
 
 
diff --git a/kernel/audit.h b/kernel/audit.h
index 4b602cdcabef..bc5392076e2b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,6 +19,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/audit.h>
 
@@ -84,4 +85,4 @@ extern void		    audit_send_reply(int pid, int seq, int type,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
 extern void		    audit_panic(const char *message);
-extern struct semaphore audit_netlink_sem;
+extern struct mutex audit_netlink_mutex;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b85fd8cce11f..d3a8539f3a83 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -329,7 +329,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 }
 
 /* Add rule to given filterlist if not a duplicate.  Protected by
- * audit_netlink_sem. */
+ * audit_netlink_mutex. */
 static inline int audit_add_rule(struct audit_entry *entry,
 				  struct list_head *list)
 {
@@ -352,7 +352,7 @@ static inline int audit_add_rule(struct audit_entry *entry,
 }
 
 /* Remove an existing rule from filterlist.  Protected by
- * audit_netlink_sem. */
+ * audit_netlink_mutex. */
 static inline int audit_del_rule(struct audit_entry *entry,
 				 struct list_head *list)
 {
@@ -383,10 +383,10 @@ static int audit_list(void *_dest)
 	seq = dest[1];
 	kfree(dest);
 
-	down(&audit_netlink_sem);
+	mutex_lock(&audit_netlink_mutex);
 
 	/* The *_rcu iterators not needed here because we are
-	   always called with audit_netlink_sem held. */
+	   always called with audit_netlink_mutex held. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
 		list_for_each_entry(entry, &audit_filter_list[i], list) {
 			struct audit_rule *rule;
@@ -401,7 +401,7 @@ static int audit_list(void *_dest)
 	}
 	audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
 	
-	up(&audit_netlink_sem);
+	mutex_unlock(&audit_netlink_mutex);
 	return 0;
 }
 
@@ -417,10 +417,10 @@ static int audit_list_rules(void *_dest)
 	seq = dest[1];
 	kfree(dest);
 
-	down(&audit_netlink_sem);
+	mutex_lock(&audit_netlink_mutex);
 
 	/* The *_rcu iterators not needed here because we are
-	   always called with audit_netlink_sem held. */
+	   always called with audit_netlink_mutex held. */
 	for (i=0; i<AUDIT_NR_FILTERS; i++) {
 		list_for_each_entry(e, &audit_filter_list[i], list) {
 			struct audit_rule_data *data;
@@ -435,7 +435,7 @@ static int audit_list_rules(void *_dest)
 	}
 	audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
 
-	up(&audit_netlink_sem);
+	mutex_unlock(&audit_netlink_mutex);
 	return 0;
 }
 
-- 
cgit v1.2.3


From bf45da97a45f634422559ec61429dddf4d2fffb9 Mon Sep 17 00:00:00 2001
From: "lorenzo@gnu.org" <lorenzo@gnu.org>
Date: Thu, 9 Mar 2006 00:33:47 +0100
Subject: [PATCH] EXPORT_SYMBOL patch for audit_log, audit_log_start,
 audit_log_end and audit_format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hi,

This is a trivial patch that enables the possibility of using some auditing
functions within loadable kernel modules (ie. inside a Linux Security Module).

_

Make the audit_log_start, audit_log_end, audit_format and audit_log
interfaces available to Loadable Kernel Modules, thus making possible
the usage of the audit framework inside LSMs, etc.

Signed-off-by: <Lorenzo Hernández García-Hierro <lorenzo@gnu.org>>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 6a44e0a7707d..c9345d3e8ada 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -994,3 +994,8 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 		audit_log_end(ab);
 	}
 }
+
+EXPORT_SYMBOL(audit_log_start);
+EXPORT_SYMBOL(audit_log_end);
+EXPORT_SYMBOL(audit_log_format);
+EXPORT_SYMBOL(audit_log);
-- 
cgit v1.2.3


From 71e1c784b24a026a490b3de01541fc5ee14ebc09 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Mon, 6 Mar 2006 22:40:05 -0500
Subject: [PATCH] fix audit_init failure path

Make audit_init() failure path handle situations where the audit_panic()
action is not AUDIT_FAIL_PANIC (default is AUDIT_FAIL_PRINTK).  Other uses
of audit_sock are not reached unless audit's netlink message handler is
properly registered.  Bug noticed by Peter Staubach.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index c9345d3e8ada..04fe2e301b61 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -558,8 +558,9 @@ static int __init audit_init(void)
 					   THIS_MODULE);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
+	else
+		audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 
-	audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 	skb_queue_head_init(&audit_skb_queue);
 	audit_initialized = 1;
 	audit_enabled = audit_default;
-- 
cgit v1.2.3


From 51107301b629640f9ab76fe23bf385e187b9ac29 Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Wed, 15 Mar 2006 08:28:55 -0500
Subject: [PATCH] kobject: fix build error if CONFIG_SYSFS=n

Moving uevent_seqnum and uevent_helper to kobject_uevent.c
because they are used even if CONFIG_SYSFS=n
while kernel/ksysfs.c is built only if CONFIG_SYSFS=y,

Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/ksysfs.c      | 3 ---
 lib/kobject_uevent.c | 2 ++
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d5eeae0fa5bc..f2690ed74530 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -15,9 +15,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 
-u64 uevent_seqnum;
-char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
-
 #define KERNEL_ATTR_RO(_name) \
 static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
 
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 086a0c6e888e..982226daf939 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -26,6 +26,8 @@
 #define NUM_ENVP	32	/* number of env pointers */
 
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+u64 uevent_seqnum;
+char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
 static DEFINE_SPINLOCK(sequence_lock);
 static struct sock *uevent_sock;
 
-- 
cgit v1.2.3


From 3fd6805f4dfb02bcfb5634972eabad0e790f119a Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 8 Feb 2006 21:16:45 +0100
Subject: [PATCH] Clean up module.c symbol searching logic

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/module.c | 73 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 41 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5aad477ddc79..2a892b20d68f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -135,6 +135,18 @@ extern const unsigned long __start___kcrctab_gpl[];
 #define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
 #endif
 
+/* lookup symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+	const struct kernel_symbol *start,
+	const struct kernel_symbol *stop)
+{
+	const struct kernel_symbol *ks = start;
+	for (; ks < stop; ks++)
+		if (strcmp(ks->name, name) == 0)
+			return ks;
+	return NULL;
+}
+
 /* Find a symbol, return value, crc and module which owns it */
 static unsigned long __find_symbol(const char *name,
 				   struct module **owner,
@@ -142,39 +154,41 @@ static unsigned long __find_symbol(const char *name,
 				   int gplok)
 {
 	struct module *mod;
-	unsigned int i;
+	const struct kernel_symbol *ks;
 
 	/* Core kernel first. */ 
 	*owner = NULL;
-	for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++) {
-		if (strcmp(__start___ksymtab[i].name, name) == 0) {
-			*crc = symversion(__start___kcrctab, i);
-			return __start___ksymtab[i].value;
-		}
+	ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
+	if (ks) {
+		*crc = symversion(__start___kcrctab, (ks - __start___ksymtab));
+		return ks->value;
 	}
 	if (gplok) {
-		for (i = 0; __start___ksymtab_gpl+i<__stop___ksymtab_gpl; i++)
-			if (strcmp(__start___ksymtab_gpl[i].name, name) == 0) {
-				*crc = symversion(__start___kcrctab_gpl, i);
-				return __start___ksymtab_gpl[i].value;
-			}
+		ks = lookup_symbol(name, __start___ksymtab_gpl,
+					 __stop___ksymtab_gpl);
+		if (ks) {
+			*crc = symversion(__start___kcrctab_gpl,
+					  (ks - __start___ksymtab_gpl));
+			return ks->value;
+		}
 	}
 
 	/* Now try modules. */ 
 	list_for_each_entry(mod, &modules, list) {
 		*owner = mod;
-		for (i = 0; i < mod->num_syms; i++)
-			if (strcmp(mod->syms[i].name, name) == 0) {
-				*crc = symversion(mod->crcs, i);
-				return mod->syms[i].value;
-			}
+		ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
+		if (ks) {
+			*crc = symversion(mod->crcs, (ks - mod->syms));
+			return ks->value;
+		}
 
 		if (gplok) {
-			for (i = 0; i < mod->num_gpl_syms; i++) {
-				if (strcmp(mod->gpl_syms[i].name, name) == 0) {
-					*crc = symversion(mod->gpl_crcs, i);
-					return mod->gpl_syms[i].value;
-				}
+			ks = lookup_symbol(name, mod->gpl_syms,
+					   mod->gpl_syms + mod->num_gpl_syms);
+			if (ks) {
+				*crc = symversion(mod->gpl_crcs,
+						  (ks - mod->gpl_syms));
+				return ks->value;
 			}
 		}
 	}
@@ -1444,18 +1458,13 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 #ifdef CONFIG_KALLSYMS
 int is_exported(const char *name, const struct module *mod)
 {
-	unsigned int i;
-
-	if (!mod) {
-		for (i = 0; __start___ksymtab+i < __stop___ksymtab; i++)
-			if (strcmp(__start___ksymtab[i].name, name) == 0)
-				return 1;
-		return 0;
-	}
-	for (i = 0; i < mod->num_syms; i++)
-		if (strcmp(mod->syms[i].name, name) == 0)
+	if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
+		return 1;
+	else
+		if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
 			return 1;
-	return 0;
+		else
+			return 0;
 }
 
 /* As per nm */
-- 
cgit v1.2.3


From 9f28bb7e1d0188a993403ab39b774785892805e1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Mar 2006 13:17:13 -0800
Subject: [PATCH] add EXPORT_SYMBOL_GPL_FUTURE()

This patch adds the ability to mark symbols that will be changed in the
future, so that kernel modules that don't include MODULE_LICENSE("GPL")
and use the symbols, will be flagged and printed out to the system log.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/m68knommu/kernel/vmlinux.lds.S | 10 ++++
 arch/v850/kernel/vmlinux.lds.S      |  8 ++++
 include/asm-generic/vmlinux.lds.h   | 14 ++++++
 include/linux/module.h              |  9 ++++
 kernel/module.c                     | 49 +++++++++++++++++++-
 scripts/genksyms/keywords.c_shipped | 91 +++++++++++++++++++------------------
 scripts/genksyms/keywords.gperf     |  1 +
 7 files changed, 135 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index ac9de2661c0b..a331cc90797c 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -269,6 +269,11 @@ SECTIONS {
 		*(__ksymtab_gpl)
 		__stop___ksymtab_gpl = .;
 
+		/* Kernel symbol table: GPL-future symbols */
+		__start___ksymtab_gpl_future = .;
+		*(__ksymtab_gpl_future)
+		__stop___ksymtab_gpl_future = .;
+
 		/* Kernel symbol table: Normal symbols */
 		__start___kcrctab = .;
 		*(__kcrctab)
@@ -279,6 +284,11 @@ SECTIONS {
 		*(__kcrctab_gpl)
 		__stop___kcrctab_gpl = .;
 
+		/* Kernel symbol table: GPL-future symbols */
+		__start___kcrctab_gpl_future = .;
+		*(__kcrctab_gpl_future)
+		__stop___kcrctab_gpl_future = .;
+
 		/* Kernel symbol table: strings */
 		*(__ksymtab_strings)
 
diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S
index 5be05f47109e..5b2ffcc6e2b2 100644
--- a/arch/v850/kernel/vmlinux.lds.S
+++ b/arch/v850/kernel/vmlinux.lds.S
@@ -64,6 +64,10 @@
 		___start___ksymtab_gpl = .;				      \
 			*(__ksymtab_gpl)				      \
 		___stop___ksymtab_gpl = .;				      \
+		/* Kernel symbol table: GPL-future symbols */		      \
+		___start___ksymtab_gpl_future = .;			      \
+			*(__ksymtab_gpl_future)				      \
+		___stop___ksymtab_gpl_future = .;			      \
 		/* Kernel symbol table: strings */			      \
 			*(__ksymtab_strings)				      \
 		/* Kernel symbol table: Normal symbols */		      \
@@ -74,6 +78,10 @@
 		___start___kcrctab_gpl = .;				      \
 			*(__kcrctab_gpl)				      \
 		___stop___kcrctab_gpl = .;				      \
+		/* Kernel symbol table: GPL-future symbols */		      \
+		___start___kcrctab_gpl_future = .;			      \
+			*(__kcrctab_gpl_future)				      \
+		___stop___kcrctab_gpl_future = .;			      \
 		/* Built-in module parameters */			      \
 		. = ALIGN (4) ;						      \
 		___start___param = .;					      \
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 35de20cf8fac..9d11550b4818 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -58,6 +58,13 @@
 		VMLINUX_SYMBOL(__stop___ksymtab_gpl) = .;		\
 	}								\
 									\
+	/* Kernel symbol table: GPL-future-only symbols */		\
+	__ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \
+		VMLINUX_SYMBOL(__start___ksymtab_gpl_future) = .;	\
+		*(__ksymtab_gpl_future)					\
+		VMLINUX_SYMBOL(__stop___ksymtab_gpl_future) = .;	\
+	}								\
+									\
 	/* Kernel symbol table: Normal symbols */			\
 	__kcrctab         : AT(ADDR(__kcrctab) - LOAD_OFFSET) {		\
 		VMLINUX_SYMBOL(__start___kcrctab) = .;			\
@@ -72,6 +79,13 @@
 		VMLINUX_SYMBOL(__stop___kcrctab_gpl) = .;		\
 	}								\
 									\
+	/* Kernel symbol table: GPL-future-only symbols */		\
+	__kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \
+		VMLINUX_SYMBOL(__start___kcrctab_gpl_future) = .;	\
+		*(__kcrctab_gpl_future)					\
+		VMLINUX_SYMBOL(__stop___kcrctab_gpl_future) = .;	\
+	}								\
+									\
 	/* Kernel symbol table: strings */				\
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
diff --git a/include/linux/module.h b/include/linux/module.h
index 84d75f3a8aca..a25d5f61548c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -198,6 +198,9 @@ void *__symbol_get_gpl(const char *symbol);
 #define EXPORT_SYMBOL_GPL(sym)					\
 	__EXPORT_SYMBOL(sym, "_gpl")
 
+#define EXPORT_SYMBOL_GPL_FUTURE(sym)				\
+	__EXPORT_SYMBOL(sym, "_gpl_future")
+
 #endif
 
 struct module_ref
@@ -255,6 +258,11 @@ struct module
 	unsigned int num_gpl_syms;
 	const unsigned long *gpl_crcs;
 
+	/* symbols that will be GPL-only in the near future. */
+	const struct kernel_symbol *gpl_future_syms;
+	unsigned int num_gpl_future_syms;
+	const unsigned long *gpl_future_crcs;
+
 	/* Exception table */
 	unsigned int num_exentries;
 	const struct exception_table_entry *extable;
@@ -441,6 +449,7 @@ void module_remove_driver(struct device_driver *);
 #else /* !CONFIG_MODULES... */
 #define EXPORT_SYMBOL(sym)
 #define EXPORT_SYMBOL_GPL(sym)
+#define EXPORT_SYMBOL_GPL_FUTURE(sym)
 
 /* Given an address, look for it in the exception tables. */
 static inline const struct exception_table_entry *
diff --git a/kernel/module.c b/kernel/module.c
index 2a892b20d68f..5ca99fbe9f44 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -126,8 +126,11 @@ extern const struct kernel_symbol __start___ksymtab[];
 extern const struct kernel_symbol __stop___ksymtab[];
 extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
+extern const struct kernel_symbol __start___ksymtab_gpl_future[];
+extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const unsigned long __start___kcrctab[];
 extern const unsigned long __start___kcrctab_gpl[];
+extern const unsigned long __start___kcrctab_gpl_future[];
 
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -172,6 +175,22 @@ static unsigned long __find_symbol(const char *name,
 			return ks->value;
 		}
 	}
+	ks = lookup_symbol(name, __start___ksymtab_gpl_future,
+				 __stop___ksymtab_gpl_future);
+	if (ks) {
+		if (!gplok) {
+			printk(KERN_WARNING "Symbol %s is being used "
+			       "by a non-GPL module, which will not "
+			       "be allowed in the future\n", name);
+			printk(KERN_WARNING "Please see the file "
+			       "Documentation/feature-removal-schedule.txt "
+			       "in the kernel source tree for more "
+			       "details.\n");
+		}
+		*crc = symversion(__start___kcrctab_gpl_future,
+				  (ks - __start___ksymtab_gpl_future));
+		return ks->value;
+	}
 
 	/* Now try modules. */ 
 	list_for_each_entry(mod, &modules, list) {
@@ -191,6 +210,23 @@ static unsigned long __find_symbol(const char *name,
 				return ks->value;
 			}
 		}
+		ks = lookup_symbol(name, mod->gpl_future_syms,
+				   (mod->gpl_future_syms +
+				    mod->num_gpl_future_syms));
+		if (ks) {
+			if (!gplok) {
+				printk(KERN_WARNING "Symbol %s is being used "
+				       "by a non-GPL module, which will not "
+				       "be allowed in the future\n", name);
+				printk(KERN_WARNING "Please see the file "
+				       "Documentation/feature-removal-schedule.txt "
+				       "in the kernel source tree for more "
+				       "details.\n");
+			}
+			*crc = symversion(mod->gpl_future_crcs,
+					  (ks - mod->gpl_future_syms));
+			return ks->value;
+		}
 	}
 	DEBUGP("Failed to find symbol %s\n", name);
  	return 0;
@@ -1546,7 +1582,8 @@ static struct module *load_module(void __user *umod,
 	char *secstrings, *args, *modmagic, *strtab = NULL;
 	unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
 		exportindex, modindex, obsparmindex, infoindex, gplindex,
-		crcindex, gplcrcindex, versindex, pcpuindex;
+		crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
+		gplfuturecrcindex;
 	long arglen;
 	struct module *mod;
 	long err = 0;
@@ -1627,8 +1664,10 @@ static struct module *load_module(void __user *umod,
 	/* Optional sections */
 	exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
 	gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
+	gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
 	crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
 	gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
+	gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
 	setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
 	exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
 	obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1784,10 +1823,16 @@ static struct module *load_module(void __user *umod,
 	mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr;
 	if (gplcrcindex)
 		mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
+	mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
+					sizeof(*mod->gpl_future_syms);
+	mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
+	if (gplfuturecrcindex)
+		mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
 
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !crcindex) || 
-	    (mod->num_gpl_syms && !gplcrcindex)) {
+	    (mod->num_gpl_syms && !gplcrcindex) ||
+	    (mod->num_gpl_future_syms && !gplfuturecrcindex)) {
 		printk(KERN_WARNING "%s: No versions for exported symbols."
 		       " Tainting kernel.\n", mod->name);
 		add_taint(TAINT_FORCED_MODULE);
diff --git a/scripts/genksyms/keywords.c_shipped b/scripts/genksyms/keywords.c_shipped
index ee4647805c58..d8153f572e40 100644
--- a/scripts/genksyms/keywords.c_shipped
+++ b/scripts/genksyms/keywords.c_shipped
@@ -52,9 +52,9 @@ is_reserved_hash (register const char *str, register unsigned int len)
       71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
       71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
       71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
-      71, 71, 71, 71, 71, 71, 71, 71, 71, 15,
-      71, 71, 71, 71, 71, 71, 15, 71, 71, 71,
-      10, 71, 71, 71, 71, 71, 71, 71, 71, 71,
+      71, 71, 71, 71, 71, 71, 71, 71, 71,  0,
+      71, 71, 71, 71, 71, 71, 35, 71, 71, 71,
+       5, 71, 71, 71, 71, 71, 71, 71, 71, 71,
       71, 71, 71, 71, 71,  0, 71,  0, 71,  5,
        5,  0, 10, 20, 71, 25, 71, 71, 20,  0,
       20, 30, 25, 71, 10,  5,  0, 20, 15, 71,
@@ -84,9 +84,9 @@ is_reserved_word (register const char *str, register unsigned int len)
 {
   enum
     {
-      TOTAL_KEYWORDS = 41,
+      TOTAL_KEYWORDS = 42,
       MIN_WORD_LENGTH = 3,
-      MAX_WORD_LENGTH = 17,
+      MAX_WORD_LENGTH = 24,
       MIN_HASH_VALUE = 3,
       MAX_HASH_VALUE = 70
     };
@@ -94,104 +94,105 @@ is_reserved_word (register const char *str, register unsigned int len)
   static const struct resword wordlist[] =
     {
       {""}, {""}, {""},
-#line 24 "scripts/genksyms/keywords.gperf"
+#line 25 "scripts/genksyms/keywords.gperf"
       {"asm", ASM_KEYW},
       {""},
-#line 7 "scripts/genksyms/keywords.gperf"
+#line 8 "scripts/genksyms/keywords.gperf"
       {"__asm", ASM_KEYW},
       {""},
-#line 8 "scripts/genksyms/keywords.gperf"
+#line 9 "scripts/genksyms/keywords.gperf"
       {"__asm__", ASM_KEYW},
       {""},
-#line 21 "scripts/genksyms/keywords.gperf"
+#line 22 "scripts/genksyms/keywords.gperf"
       {"_restrict", RESTRICT_KEYW},
-#line 50 "scripts/genksyms/keywords.gperf"
+#line 51 "scripts/genksyms/keywords.gperf"
       {"__typeof__", TYPEOF_KEYW},
-#line 9 "scripts/genksyms/keywords.gperf"
+#line 10 "scripts/genksyms/keywords.gperf"
       {"__attribute", ATTRIBUTE_KEYW},
-#line 11 "scripts/genksyms/keywords.gperf"
+#line 12 "scripts/genksyms/keywords.gperf"
       {"__const", CONST_KEYW},
-#line 10 "scripts/genksyms/keywords.gperf"
+#line 11 "scripts/genksyms/keywords.gperf"
       {"__attribute__", ATTRIBUTE_KEYW},
-#line 12 "scripts/genksyms/keywords.gperf"
+#line 13 "scripts/genksyms/keywords.gperf"
       {"__const__", CONST_KEYW},
-#line 16 "scripts/genksyms/keywords.gperf"
+#line 17 "scripts/genksyms/keywords.gperf"
       {"__signed__", SIGNED_KEYW},
-#line 42 "scripts/genksyms/keywords.gperf"
+#line 43 "scripts/genksyms/keywords.gperf"
       {"static", STATIC_KEYW},
       {""},
-#line 15 "scripts/genksyms/keywords.gperf"
+#line 16 "scripts/genksyms/keywords.gperf"
       {"__signed", SIGNED_KEYW},
-#line 30 "scripts/genksyms/keywords.gperf"
+#line 31 "scripts/genksyms/keywords.gperf"
       {"char", CHAR_KEYW},
       {""},
-#line 43 "scripts/genksyms/keywords.gperf"
+#line 44 "scripts/genksyms/keywords.gperf"
       {"struct", STRUCT_KEYW},
-#line 22 "scripts/genksyms/keywords.gperf"
-      {"__restrict__", RESTRICT_KEYW},
 #line 23 "scripts/genksyms/keywords.gperf"
+      {"__restrict__", RESTRICT_KEYW},
+#line 24 "scripts/genksyms/keywords.gperf"
       {"restrict", RESTRICT_KEYW},
-#line 33 "scripts/genksyms/keywords.gperf"
+#line 34 "scripts/genksyms/keywords.gperf"
       {"enum", ENUM_KEYW},
-#line 17 "scripts/genksyms/keywords.gperf"
+#line 18 "scripts/genksyms/keywords.gperf"
       {"__volatile", VOLATILE_KEYW},
-#line 34 "scripts/genksyms/keywords.gperf"
+#line 35 "scripts/genksyms/keywords.gperf"
       {"extern", EXTERN_KEYW},
-#line 18 "scripts/genksyms/keywords.gperf"
+#line 19 "scripts/genksyms/keywords.gperf"
       {"__volatile__", VOLATILE_KEYW},
-#line 37 "scripts/genksyms/keywords.gperf"
+#line 38 "scripts/genksyms/keywords.gperf"
       {"int", INT_KEYW},
-      {""},
-#line 31 "scripts/genksyms/keywords.gperf"
-      {"const", CONST_KEYW},
+#line 7 "scripts/genksyms/keywords.gperf"
+      {"EXPORT_SYMBOL_GPL_FUTURE", EXPORT_SYMBOL_KEYW},
 #line 32 "scripts/genksyms/keywords.gperf"
+      {"const", CONST_KEYW},
+#line 33 "scripts/genksyms/keywords.gperf"
       {"double", DOUBLE_KEYW},
       {""},
-#line 13 "scripts/genksyms/keywords.gperf"
+#line 14 "scripts/genksyms/keywords.gperf"
       {"__inline", INLINE_KEYW},
-#line 29 "scripts/genksyms/keywords.gperf"
+#line 30 "scripts/genksyms/keywords.gperf"
       {"auto", AUTO_KEYW},
-#line 14 "scripts/genksyms/keywords.gperf"
+#line 15 "scripts/genksyms/keywords.gperf"
       {"__inline__", INLINE_KEYW},
-#line 41 "scripts/genksyms/keywords.gperf"
+#line 42 "scripts/genksyms/keywords.gperf"
       {"signed", SIGNED_KEYW},
       {""},
-#line 46 "scripts/genksyms/keywords.gperf"
+#line 47 "scripts/genksyms/keywords.gperf"
       {"unsigned", UNSIGNED_KEYW},
       {""},
-#line 40 "scripts/genksyms/keywords.gperf"
+#line 41 "scripts/genksyms/keywords.gperf"
       {"short", SHORT_KEYW},
-#line 49 "scripts/genksyms/keywords.gperf"
+#line 50 "scripts/genksyms/keywords.gperf"
       {"typeof", TYPEOF_KEYW},
-#line 44 "scripts/genksyms/keywords.gperf"
+#line 45 "scripts/genksyms/keywords.gperf"
       {"typedef", TYPEDEF_KEYW},
-#line 48 "scripts/genksyms/keywords.gperf"
+#line 49 "scripts/genksyms/keywords.gperf"
       {"volatile", VOLATILE_KEYW},
       {""},
-#line 35 "scripts/genksyms/keywords.gperf"
+#line 36 "scripts/genksyms/keywords.gperf"
       {"float", FLOAT_KEYW},
       {""}, {""},
-#line 39 "scripts/genksyms/keywords.gperf"
+#line 40 "scripts/genksyms/keywords.gperf"
       {"register", REGISTER_KEYW},
-#line 47 "scripts/genksyms/keywords.gperf"
+#line 48 "scripts/genksyms/keywords.gperf"
       {"void", VOID_KEYW},
       {""},
-#line 36 "scripts/genksyms/keywords.gperf"
+#line 37 "scripts/genksyms/keywords.gperf"
       {"inline", INLINE_KEYW},
       {""},
 #line 5 "scripts/genksyms/keywords.gperf"
       {"EXPORT_SYMBOL", EXPORT_SYMBOL_KEYW},
       {""},
-#line 20 "scripts/genksyms/keywords.gperf"
+#line 21 "scripts/genksyms/keywords.gperf"
       {"_Bool", BOOL_KEYW},
       {""},
 #line 6 "scripts/genksyms/keywords.gperf"
       {"EXPORT_SYMBOL_GPL", EXPORT_SYMBOL_KEYW},
       {""}, {""}, {""}, {""}, {""}, {""},
-#line 38 "scripts/genksyms/keywords.gperf"
+#line 39 "scripts/genksyms/keywords.gperf"
       {"long", LONG_KEYW},
       {""}, {""}, {""}, {""}, {""},
-#line 45 "scripts/genksyms/keywords.gperf"
+#line 46 "scripts/genksyms/keywords.gperf"
       {"union", UNION_KEYW}
     };
 
diff --git a/scripts/genksyms/keywords.gperf b/scripts/genksyms/keywords.gperf
index b6bec765996e..c75e0c8d8f0c 100644
--- a/scripts/genksyms/keywords.gperf
+++ b/scripts/genksyms/keywords.gperf
@@ -4,6 +4,7 @@ struct resword { const char *name; int token; }
 %%
 EXPORT_SYMBOL, EXPORT_SYMBOL_KEYW
 EXPORT_SYMBOL_GPL, EXPORT_SYMBOL_KEYW
+EXPORT_SYMBOL_GPL_FUTURE, EXPORT_SYMBOL_KEYW
 __asm, ASM_KEYW
 __asm__, ASM_KEYW
 __attribute, ATTRIBUTE_KEYW
-- 
cgit v1.2.3


From 01ca70dca5c64cb774a8ac2f50bddff21d60169f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Mar 2006 13:17:13 -0800
Subject: [PATCH] add EXPORT_SYMBOL_GPL_FUTURE() to RCU subsystem

As the RCU symbols are going to be changed to GPL in the near future,
lets warn users that this is going to happen.

Cc: Paul McKenney <paulmck@us.ibm.com>
Acked-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/rcupdate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 8cf15a569fcd..fedf5e369755 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -609,7 +609,7 @@ module_param(qlowmark, int, 0);
 module_param(rsinterval, int, 0);
 #endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
-EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
-EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(call_rcu);	/* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh);	/* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL(synchronize_rcu);
-EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
-- 
cgit v1.2.3


From 03e88ae1b13dfdc8bbaa59b8198e1ca53aad12ac Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 16 Feb 2006 13:50:23 -0800
Subject: [PATCH] fix module sysfs files reference counting

The module files, refcnt, version, and srcversion did not properly
increment the owner's module reference count, allowing the modules to
be removed while the files were open, causing oopses.

This patch fixes this, and also fixes the problem that the version and
srcversion files were not showing up, unless CONFIG_MODULE_UNLOAD was
enabled, which is not correct.

Cc: Nathan Lynch <ntl@pobox.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/module.h |  1 +
 kernel/module.c        | 77 ++++++++++++++++++++------------------------------
 kernel/params.c        | 10 -------
 3 files changed, 32 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index a25d5f61548c..70bd843c71cb 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -245,6 +245,7 @@ struct module
 	/* Sysfs stuff. */
 	struct module_kobject mkobj;
 	struct module_param_attrs *param_attrs;
+	struct module_attribute *modinfo_attrs;
 	const char *version;
 	const char *srcversion;
 
diff --git a/kernel/module.c b/kernel/module.c
index 5ca99fbe9f44..77764f22f021 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -429,7 +429,6 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 }
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_MODULE_UNLOAD
 #define MODINFO_ATTR(field)	\
 static void setup_modinfo_##field(struct module *mod, const char *s)  \
 {                                                                     \
@@ -461,12 +460,7 @@ static struct module_attribute modinfo_##field = {                    \
 MODINFO_ATTR(version);
 MODINFO_ATTR(srcversion);
 
-static struct module_attribute *modinfo_attrs[] = {
-	&modinfo_version,
-	&modinfo_srcversion,
-	NULL,
-};
-
+#ifdef CONFIG_MODULE_UNLOAD
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
@@ -781,6 +775,15 @@ static inline void module_unload_init(struct module *mod)
 }
 #endif /* CONFIG_MODULE_UNLOAD */
 
+static struct module_attribute *modinfo_attrs[] = {
+	&modinfo_version,
+	&modinfo_srcversion,
+#ifdef CONFIG_MODULE_UNLOAD
+	&refcnt,
+#endif
+	NULL,
+};
+
 #ifdef CONFIG_OBSOLETE_MODPARM
 /* Bounds checking done below */
 static int obsparm_copy_string(const char *val, struct kernel_param *kp)
@@ -1106,37 +1109,28 @@ static inline void remove_sect_attrs(struct module *mod)
 }
 #endif /* CONFIG_KALLSYMS */
 
-
-#ifdef CONFIG_MODULE_UNLOAD
-static inline int module_add_refcnt_attr(struct module *mod)
-{
-	return sysfs_create_file(&mod->mkobj.kobj, &refcnt.attr);
-}
-static void module_remove_refcnt_attr(struct module *mod)
-{
-	return sysfs_remove_file(&mod->mkobj.kobj, &refcnt.attr);
-}
-#else
-static inline int module_add_refcnt_attr(struct module *mod)
-{
-	return 0;
-}
-static void module_remove_refcnt_attr(struct module *mod)
-{
-}
-#endif
-
-#ifdef CONFIG_MODULE_UNLOAD
 static int module_add_modinfo_attrs(struct module *mod)
 {
 	struct module_attribute *attr;
+	struct module_attribute *temp_attr;
 	int error = 0;
 	int i;
 
+	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+					(ARRAY_SIZE(modinfo_attrs) + 1)),
+					GFP_KERNEL);
+	if (!mod->modinfo_attrs)
+		return -ENOMEM;
+
+	temp_attr = mod->modinfo_attrs;
 	for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
 		if (!attr->test ||
-		    (attr->test && attr->test(mod)))
-			error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
+		    (attr->test && attr->test(mod))) {
+			memcpy(temp_attr, attr, sizeof(*temp_attr));
+			temp_attr->attr.owner = mod;
+			error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr);
+			++temp_attr;
+		}
 	}
 	return error;
 }
@@ -1146,12 +1140,16 @@ static void module_remove_modinfo_attrs(struct module *mod)
 	struct module_attribute *attr;
 	int i;
 
-	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+	for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+		/* pick a field to test for end of list */
+		if (!attr->attr.name)
+			break;
 		sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
-		attr->free(mod);
+		if (attr->free)
+			attr->free(mod);
 	}
+	kfree(mod->modinfo_attrs);
 }
-#endif
 
 static int mod_sysfs_setup(struct module *mod,
 			   struct kernel_param *kparam,
@@ -1169,19 +1167,13 @@ static int mod_sysfs_setup(struct module *mod,
 	if (err)
 		goto out;
 
-	err = module_add_refcnt_attr(mod);
-	if (err)
-		goto out_unreg;
-
 	err = module_param_sysfs_setup(mod, kparam, num_params);
 	if (err)
 		goto out_unreg;
 
-#ifdef CONFIG_MODULE_UNLOAD
 	err = module_add_modinfo_attrs(mod);
 	if (err)
 		goto out_unreg;
-#endif
 
 	return 0;
 
@@ -1193,10 +1185,7 @@ out:
 
 static void mod_kobject_remove(struct module *mod)
 {
-#ifdef CONFIG_MODULE_UNLOAD
 	module_remove_modinfo_attrs(mod);
-#endif
-	module_remove_refcnt_attr(mod);
 	module_param_sysfs_remove(mod);
 
 	kobject_unregister(&mod->mkobj.kobj);
@@ -1474,7 +1463,6 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
 	return NULL;
 }
 
-#ifdef CONFIG_MODULE_UNLOAD
 static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 			  unsigned int infoindex)
 {
@@ -1489,7 +1477,6 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 						attr->attr.name));
 	}
 }
-#endif
 
 #ifdef CONFIG_KALLSYMS
 int is_exported(const char *name, const struct module *mod)
@@ -1803,10 +1790,8 @@ static struct module *load_module(void __user *umod,
 	if (strcmp(mod->name, "driverloader") == 0)
 		add_taint(TAINT_PROPRIETARY_MODULE);
 
-#ifdef CONFIG_MODULE_UNLOAD
 	/* Set up MODINFO_ATTR fields */
 	setup_modinfo(mod, sechdrs, infoindex);
-#endif
 
 	/* Fix up syms, so that st_value is a pointer to location. */
 	err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
diff --git a/kernel/params.c b/kernel/params.c
index c76ad25e6a21..a29150582310 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -638,13 +638,8 @@ static ssize_t module_attr_show(struct kobject *kobj,
 	if (!attribute->show)
 		return -EIO;
 
-	if (!try_module_get(mk->mod))
-		return -ENODEV;
-
 	ret = attribute->show(attribute, mk->mod, buf);
 
-	module_put(mk->mod);
-
 	return ret;
 }
 
@@ -662,13 +657,8 @@ static ssize_t module_attr_store(struct kobject *kobj,
 	if (!attribute->store)
 		return -EIO;
 
-	if (!try_module_get(mk->mod))
-		return -ENODEV;
-
 	ret = attribute->store(attribute, mk->mod, buf, len);
 
-	module_put(mk->mod);
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 9430d58e34ec3861e1ca72f8e49105b227aad327 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 22 Mar 2006 00:07:33 -0800
Subject: [PATCH] sched: remove sleep_avg multiplier

Remove the sleep_avg multiplier.  This multiplier was necessary back when
we had 10 seconds of dynamic range in sleep_avg, but now that we only have
one second, it causes that one second to be compressed down to 100ms in
some cases.  This is particularly noticeable when compiling a kernel in a
slow NFS mount, and I believe it to be a very likely candidate for other
recently reported network related interactivity problems.

In testing, I can detect no negative impact of this removal.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4d46e90f59c3..6b6e0d70eb30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -706,12 +706,6 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
 				p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
 						DEF_TIMESLICE);
 		} else {
-			/*
-			 * The lower the sleep avg a task has the more
-			 * rapidly it will rise with sleep time.
-			 */
-			sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1;
-
 			/*
 			 * Tasks waking from uninterruptible sleep are
 			 * limited in their sleep_avg rise as they
-- 
cgit v1.2.3


From 06f9d4f94a075285d25253edbf57f2cda07d4ff3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 22 Mar 2006 00:07:40 -0800
Subject: [PATCH] unshare: Error if passed unsupported flags

A bare bones trivial patch to ensure we always get -EINVAL on the
unsupported cases for sys_unshare.  If this goes in before 2.6.16 it allows
us to forward compatible with future applications using sys_unshare.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: JANAK DESAI <janak@us.ibm.com>
Cc: <stable@kerenl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index b373322ca497..9bd7b65ee418 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1534,6 +1534,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 
 	check_unshare_flags(&unshare_flags);
 
+	/* Return -EINVAL for all unsupported flags */
+	err = -EINVAL;
+	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM))
+		goto bad_unshare_out;
+
 	if ((err = unshare_thread(unshare_flags)))
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
-- 
cgit v1.2.3


From 78eef01b0fae087c5fadbd85dd4fe2918c3a015f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:16 -0800
Subject: [PATCH] on_each_cpu(): disable local interrupts

When on_each_cpu() runs the callback on other CPUs, it runs with local
interrupts disabled.  So we should run the function with local interrupts
disabled on this CPU, too.

And do the same for UP, so the callback is run in the same environment on both
UP and SMP.  (strictly it should do preempt_disable() too, but I think
local_irq_disable is sufficiently equivalent).

Also uninlines on_each_cpu().  softirq.c was the most appropriate file I could
find, but it doesn't seem to justify creating a new file.

Oh, and fix up that comment over (under?) x86's smp_call_function().  It
drives me nuts.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smp.c | 28 ++++++++++++----------------
 include/linux/smp.h    | 23 +++++++++--------------
 kernel/softirq.c       | 20 ++++++++++++++++++++
 3 files changed, 41 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 218d725a5a1e..d134e9643a58 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -504,27 +504,23 @@ void unlock_ipi_call_lock(void)
 	spin_unlock_irq(&call_lock);
 }
 
-static struct call_data_struct * call_data;
-
-/*
- * this function sends a 'generic call function' IPI to all other CPUs
- * in the system.
- */
-
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
-			int wait)
-/*
- * [SUMMARY] Run a function on all other CPUs.
- * <func> The function to run. This must be fast and non-blocking.
- * <info> An arbitrary pointer to pass to the function.
- * <nonatomic> currently unused.
- * <wait> If true, wait (atomically) until function has completed on other CPUs.
- * [RETURNS] 0 on success, else a negative status code. Does not return until
+static struct call_data_struct *call_data;
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: currently unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code. Does not return until
  * remote CPUs are nearly ready to execute <<func>> or are or have executed.
  *
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
+int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
+			int wait)
 {
 	struct call_data_struct data;
 	int cpus;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 44153fdf73fc..d699a16b0cb2 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -52,23 +52,12 @@ extern void smp_cpus_done(unsigned int max_cpus);
 /*
  * Call a function on all other processors
  */
-extern int smp_call_function (void (*func) (void *info), void *info,
-			      int retry, int wait);
+int smp_call_function(void(*func)(void *info), void *info, int retry, int wait);
 
 /*
  * Call a function on all processors
  */
-static inline int on_each_cpu(void (*func) (void *info), void *info,
-			      int retry, int wait)
-{
-	int ret = 0;
-
-	preempt_disable();
-	ret = smp_call_function(func, info, retry, wait);
-	func(info);
-	preempt_enable();
-	return ret;
-}
+int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait);
 
 #define MSG_ALL_BUT_SELF	0x8000	/* Assume <32768 CPU's */
 #define MSG_ALL			0x8001
@@ -94,7 +83,13 @@ void smp_prepare_boot_cpu(void);
 #define raw_smp_processor_id()			0
 #define hard_smp_processor_id()			0
 #define smp_call_function(func,info,retry,wait)	({ 0; })
-#define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
+#define on_each_cpu(func,info,retry,wait)	\
+	({					\
+		local_irq_disable();		\
+		func(info);			\
+		local_irq_enable();		\
+		0;				\
+	})
 static inline void smp_send_reschedule(int cpu) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ad3295cdded5..ec8fed42a86f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
+#include <linux/smp.h>
 
 #include <asm/irq.h>
 /*
@@ -495,3 +496,22 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+
+#ifdef CONFIG_SMP
+/*
+ * Call a function on all processors
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
+{
+	int ret = 0;
+
+	preempt_disable();
+	ret = smp_call_function(func, info, retry, wait);
+	local_irq_disable();
+	func(info);
+	local_irq_enable();
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
+#endif
-- 
cgit v1.2.3


From e9028b0ff2bad1954568604dc17725692c8524d6 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 23 Mar 2006 02:59:20 -0800
Subject: [PATCH] fix scheduler deadlock

We have noticed lockups during boot when stress testing kexec on ppc64.
Two cpus would deadlock in scheduler code trying to grab already taken
spinlocks.

The double_rq_lock code uses the address of the runqueue to order the
taking of multiple locks.  This address is a per cpu variable:

	if (rq1 < rq2) {
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}

On the other hand, the code in wake_sleeping_dependent uses the cpu id
order to grab locks:

	for_each_cpu_mask(i, sibling_map)
		spin_lock(&cpu_rq(i)->lock);

This means we rely on the address of per cpu data increasing as cpu ids
increase.  While this will be true for the generic percpu implementation it
may not be true for arch specific implementations.

One way to solve this is to always take runqueues in cpu id order. To do
this we add a cpu variable to the runqueue and check it in the
double runqueue locking functions.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6b6e0d70eb30..a5bd60453eae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -237,6 +237,7 @@ struct runqueue {
 
 	task_t *migration_thread;
 	struct list_head migration_queue;
+	int cpu;
 #endif
 
 #ifdef CONFIG_SCHEDSTATS
@@ -1654,6 +1655,9 @@ unsigned long nr_iowait(void)
 /*
  * double_rq_lock - safely lock two runqueues
  *
+ * We must take them in cpu order to match code in
+ * dependent_sleeper and wake_dependent_sleeper.
+ *
  * Note this does not disable interrupts like task_rq_lock,
  * you need to do so manually before calling.
  */
@@ -1665,7 +1669,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
 		spin_lock(&rq1->lock);
 		__acquire(rq2->lock);	/* Fake it out ;) */
 	} else {
-		if (rq1 < rq2) {
+		if (rq1->cpu < rq2->cpu) {
 			spin_lock(&rq1->lock);
 			spin_lock(&rq2->lock);
 		} else {
@@ -1701,7 +1705,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 	__acquires(this_rq->lock)
 {
 	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest < this_rq) {
+		if (busiest->cpu < this_rq->cpu) {
 			spin_unlock(&this_rq->lock);
 			spin_lock(&busiest->lock);
 			spin_lock(&this_rq->lock);
@@ -6029,6 +6033,7 @@ void __init sched_init(void)
 		rq->push_cpu = 0;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
+		rq->cpu = i;
 #endif
 		atomic_set(&rq->nr_iowait, 0);
 
-- 
cgit v1.2.3


From 2b322ce210aec74ae0d02938d3a01e29fe079469 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 02:59:58 -0800
Subject: [PATCH] revert "swsusp: fix breakage with swap on lvm"

This was a temporary thing for 2.6.16.

Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 2d9d08f72f76..4e90905f0e87 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -153,11 +153,13 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 {
 	int i;
 
+	if (!swsusp_resume_device)
+		return -ENODEV;
 	spin_lock(&swap_lock);
 	for (i = 0; i < MAX_SWAPFILES; i++) {
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
-		if (!swsusp_resume_device || is_resume_device(swap_info + i)) {
+		if (is_resume_device(swap_info + i)) {
 			spin_unlock(&swap_lock);
 			root_swap = i;
 			return 0;
-- 
cgit v1.2.3


From f577eb30afdc68233f25d4d82b04102129262365 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 02:59:59 -0800
Subject: [PATCH] swsusp: low level interface

Introduce the low level interface that can be used for handling the
snapshot of the system memory by the in-kernel swap-writing/reading code of
swsusp and the userland interface code (to be introduced shortly).

Also change the way in which swsusp records the allocated swap pages and,
consequently, simplifies the in-kernel swap-writing/reading code (this is
necessary for the userland interface too).  To this end, it introduces two
helper functions in mm/swapfile.c, so that the swsusp code does not refer
directly to the swap internals.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h    |   5 +-
 kernel/power/disk.c     |  12 +-
 kernel/power/power.h    |  26 +-
 kernel/power/snapshot.c | 326 +++++++++++++++++++++-
 kernel/power/swsusp.c   | 723 +++++++++++++++++-------------------------------
 mm/swapfile.c           |  55 +++-
 6 files changed, 656 insertions(+), 491 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 12415dd94451..54eac8a39a4c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -234,14 +234,15 @@ extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *v
 /* linux/mm/swapfile.c */
 extern long total_swap_pages;
 extern unsigned int nr_swapfiles;
-extern struct swap_info_struct swap_info[];
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
-extern swp_entry_t get_swap_page_of_type(int type);
+extern swp_entry_t get_swap_page_of_type(int);
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern void swap_free(swp_entry_t);
 extern void free_swap_and_cache(swp_entry_t);
+extern int swap_type_of(dev_t);
+extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc980..4eb464b71347 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -26,9 +26,9 @@ extern suspend_disk_method_t pm_disk_mode;
 
 extern int swsusp_shrink_memory(void);
 extern int swsusp_suspend(void);
-extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
+extern int swsusp_write(void);
 extern int swsusp_check(void);
-extern int swsusp_read(struct pbe **pblist_ptr);
+extern int swsusp_read(void);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
 
@@ -70,10 +70,6 @@ static void power_down(suspend_disk_method_t mode)
 	while(1);
 }
 
-
-static int in_suspend __nosavedata = 0;
-
-
 static inline void platform_finish(void)
 {
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -145,7 +141,7 @@ int pm_suspend_disk(void)
 	if (in_suspend) {
 		device_resume();
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write(pagedir_nosave, nr_copy_pages);
+		error = swsusp_write();
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
@@ -216,7 +212,7 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read(&pagedir_nosave))) {
+	if ((error = swsusp_read())) {
 		swsusp_free();
 		goto Thaw;
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba680841..ea7132ed029b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -37,21 +37,31 @@ extern struct subsystem power_subsys;
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
-extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
 
+extern int in_suspend;
+
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
-extern void free_pagedir(struct pbe *pblist);
-extern void release_eaten_pages(void);
-extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void swsusp_free(void);
-extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
-extern unsigned int snapshot_nr_pages(void);
-extern struct pbe *snapshot_pblist(void);
-extern void snapshot_pblist_set(struct pbe *pblist);
+
+struct snapshot_handle {
+	loff_t		offset;
+	unsigned int	page;
+	unsigned int	page_offset;
+	unsigned int	prev;
+	struct pbe	*pbe;
+	void		*buffer;
+	unsigned int	buf_offset;
+};
+
+#define data_of(handle)	((handle).buffer + (handle).buf_offset)
+
+extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
+extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+int snapshot_image_loaded(struct snapshot_handle *handle);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8d5a5986d621..cc349437fb72 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
  */
 
 
+#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/suspend.h>
@@ -34,7 +35,8 @@
 #include "power.h"
 
 struct pbe *pagedir_nosave;
-unsigned int nr_copy_pages;
+static unsigned int nr_copy_pages;
+static unsigned int nr_meta_pages;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -235,7 +237,7 @@ static void copy_data_pages(struct pbe *pblist)
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
 
-void free_pagedir(struct pbe *pblist)
+static void free_pagedir(struct pbe *pblist)
 {
 	struct pbe *pbe;
 
@@ -301,7 +303,7 @@ struct eaten_page {
 
 static struct eaten_page *eaten_pages = NULL;
 
-void release_eaten_pages(void)
+static void release_eaten_pages(void)
 {
 	struct eaten_page *p, *q;
 
@@ -376,7 +378,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 	if (!nr_pages)
 		return NULL;
 
-	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
 	pblist = alloc_image_page(gfp_mask, safe_needed);
 	/* FIXME: rewrite this ugly loop */
 	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -414,6 +415,9 @@ void swsusp_free(void)
 				}
 			}
 	}
+	nr_copy_pages = 0;
+	nr_meta_pages = 0;
+	pagedir_nosave = NULL;
 }
 
 
@@ -437,7 +441,7 @@ static int enough_free_mem(unsigned int nr_pages)
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
-int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
 {
 	struct pbe *p;
 
@@ -504,7 +508,319 @@ asmlinkage int swsusp_save(void)
 	 */
 
 	nr_copy_pages = nr_pages;
+	nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
 	return 0;
 }
+
+static void init_header(struct swsusp_info *info)
+{
+	memset(info, 0, sizeof(struct swsusp_info));
+	info->version_code = LINUX_VERSION_CODE;
+	info->num_physpages = num_physpages;
+	memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
+	info->cpus = num_online_cpus();
+	info->image_pages = nr_copy_pages;
+	info->pages = nr_copy_pages + nr_meta_pages + 1;
+}
+
+/**
+ *	pack_orig_addresses - the .orig_address fields of the PBEs from the
+ *	list starting at @pbe are stored in the array @buf[] (1 page)
+ */
+
+static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+{
+	int j;
+
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		buf[j] = pbe->orig_address;
+		pbe = pbe->next;
+	}
+	if (!pbe)
+		for (; j < PAGE_SIZE / sizeof(long); j++)
+			buf[j] = 0;
+	return pbe;
+}
+
+/**
+ *	snapshot_read_next - used for reading the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	The @count parameter should contain the number of bytes the caller
+ *	wants to read from the snapshot.  It must not be zero.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to read up to the returned number of bytes from the memory
+ *	location computed by the data_of() macro.  The number returned
+ *	may be smaller than @count, but this only happens if the read would
+ *	cross a page boundary otherwise.
+ *
+ *	The function returns 0 to indicate the end of data stream condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_read_next(struct snapshot_handle *handle, size_t count)
+{
+	static unsigned long *buffer;
+
+	if (handle->page > nr_meta_pages + nr_copy_pages)
+		return 0;
+	if (!buffer) {
+		/* This makes the buffer be freed by swsusp_free() */
+		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		if (!buffer)
+			return -ENOMEM;
+	}
+	if (!handle->offset) {
+		init_header((struct swsusp_info *)buffer);
+		handle->buffer = buffer;
+		handle->pbe = pagedir_nosave;
+	}
+	if (handle->prev < handle->page) {
+		if (handle->page <= nr_meta_pages) {
+			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
+			if (!handle->pbe)
+				handle->pbe = pagedir_nosave;
+		} else {
+			handle->buffer = (void *)handle->pbe->address;
+			handle->pbe = handle->pbe->next;
+		}
+		handle->prev = handle->page;
+	}
+	handle->buf_offset = handle->page_offset;
+	if (handle->page_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->page_offset;
+		handle->page_offset = 0;
+		handle->page++;
+	} else {
+		handle->page_offset += count;
+	}
+	handle->offset += count;
+	return count;
+}
+
+/**
+ *	mark_unsafe_pages - mark the pages that cannot be used for storing
+ *	the image during resume, because they conflict with the pages that
+ *	had been used before suspend
+ */
+
+static int mark_unsafe_pages(struct pbe *pblist)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe *p;
+
+	if (!pblist) /* a sanity check */
+		return -EINVAL;
+
+	/* Clear page flags */
+	for_each_zone (zone) {
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+				ClearPageNosaveFree(pfn_to_page(zone_pfn +
+					zone->zone_start_pfn));
+	}
+
+	/* Mark orig addresses */
+	for_each_pbe (p, pblist) {
+		if (virt_addr_valid(p->orig_address))
+			SetPageNosaveFree(virt_to_page(p->orig_address));
+		else
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+{
+	/* We assume both lists contain the same number of elements */
+	while (src) {
+		dst->orig_address = src->orig_address;
+		dst = dst->next;
+		src = src->next;
+	}
+}
+
+static int check_header(struct swsusp_info *info)
+{
+	char *reason = NULL;
+
+	if (info->version_code != LINUX_VERSION_CODE)
+		reason = "kernel version";
+	if (info->num_physpages != num_physpages)
+		reason = "memory size";
+	if (strcmp(info->uts.sysname,system_utsname.sysname))
+		reason = "system type";
+	if (strcmp(info->uts.release,system_utsname.release))
+		reason = "kernel release";
+	if (strcmp(info->uts.version,system_utsname.version))
+		reason = "version";
+	if (strcmp(info->uts.machine,system_utsname.machine))
+		reason = "machine";
+	if (reason) {
+		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
+		return -EPERM;
+	}
+	return 0;
+}
+
+/**
+ *	load header - check the image header and copy data from it
+ */
+
+static int load_header(struct snapshot_handle *handle,
+                              struct swsusp_info *info)
+{
+	int error;
+	struct pbe *pblist;
+
+	error = check_header(info);
+	if (!error) {
+		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
+		if (!pblist)
+			return -ENOMEM;
+		pagedir_nosave = pblist;
+		handle->pbe = pblist;
+		nr_copy_pages = info->image_pages;
+		nr_meta_pages = info->pages - info->image_pages - 1;
+	}
+	return error;
+}
+
+/**
+ *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
+ *	the PBEs in the list starting at @pbe
+ */
+
+static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
+                                                struct pbe *pbe)
+{
+	int j;
+
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		pbe->orig_address = buf[j];
+		pbe = pbe->next;
+	}
+	return pbe;
+}
+
+/**
+ *	create_image - use metadata contained in the PBE list
+ *	pointed to by pagedir_nosave to mark the pages that will
+ *	be overwritten in the process of restoring the system
+ *	memory state from the image and allocate memory for
+ *	the image avoiding these pages
+ */
+
+static int create_image(struct snapshot_handle *handle)
+{
+	int error = 0;
+	struct pbe *p, *pblist;
+
+	p = pagedir_nosave;
+	error = mark_unsafe_pages(p);
+	if (!error) {
+		pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
+		if (pblist)
+			copy_page_backup_list(pblist, p);
+		free_pagedir(p);
+		if (!pblist)
+			error = -ENOMEM;
+	}
+	if (!error)
+		error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+	if (!error) {
+		release_eaten_pages();
+		pagedir_nosave = pblist;
+	} else {
+		pagedir_nosave = NULL;
+		handle->pbe = NULL;
+		nr_copy_pages = 0;
+		nr_meta_pages = 0;
+	}
+	return error;
+}
+
+/**
+ *	snapshot_write_next - used for writing the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	The @count parameter should contain the number of bytes the caller
+ *	wants to write to the image.  It must not be zero.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to write up to the returned number of bytes to the memory
+ *	location computed by the data_of() macro.  The number returned
+ *	may be smaller than @count, but this only happens if the write would
+ *	cross a page boundary otherwise.
+ *
+ *	The function returns 0 to indicate the "end of file" condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_write_next(struct snapshot_handle *handle, size_t count)
+{
+	static unsigned long *buffer;
+	int error = 0;
+
+	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+		return 0;
+	if (!buffer) {
+		/* This makes the buffer be freed by swsusp_free() */
+		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		if (!buffer)
+			return -ENOMEM;
+	}
+	if (!handle->offset)
+		handle->buffer = buffer;
+	if (handle->prev < handle->page) {
+		if (!handle->prev) {
+			error = load_header(handle, (struct swsusp_info *)buffer);
+			if (error)
+				return error;
+		} else if (handle->prev <= nr_meta_pages) {
+			handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
+			if (!handle->pbe) {
+				error = create_image(handle);
+				if (error)
+					return error;
+				handle->pbe = pagedir_nosave;
+				handle->buffer = (void *)handle->pbe->address;
+			}
+		} else {
+			handle->pbe = handle->pbe->next;
+			handle->buffer = (void *)handle->pbe->address;
+		}
+		handle->prev = handle->page;
+	}
+	handle->buf_offset = handle->page_offset;
+	if (handle->page_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->page_offset;
+		handle->page_offset = 0;
+		handle->page++;
+	} else {
+		handle->page_offset += count;
+	}
+	handle->offset += count;
+	return count;
+}
+
+int snapshot_image_loaded(struct snapshot_handle *handle)
+{
+	return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
+		handle->page <= nr_meta_pages + nr_copy_pages);
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4e90905f0e87..457084f50010 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -77,6 +77,8 @@
  */
 unsigned long image_size = 500 * 1024 * 1024;
 
+int in_suspend __nosavedata = 0;
+
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
 int save_highmem(void);
@@ -98,8 +100,6 @@ static struct swsusp_header {
 	char	sig[10];
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
 
-static struct swsusp_info swsusp_info;
-
 /*
  * Saving part...
  */
@@ -129,255 +129,261 @@ static int mark_swapfiles(swp_entry_t start)
 	return error;
 }
 
-/*
- * Check whether the swap device is the specified resume
- * device, irrespective of whether they are specified by
- * identical names.
- *
- * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
- * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
- * and they'll be considered the same device.  This is *necessary* for
- * devfs, since the resume code can only recognize the form /dev/hda4,
- * but the suspend code would see the long name.)
+/**
+ *	swsusp_swap_check - check if the resume device is a swap device
+ *	and get its index (if so)
  */
-static inline int is_resume_device(const struct swap_info_struct *swap_info)
-{
-	struct file *file = swap_info->swap_file;
-	struct inode *inode = file->f_dentry->d_inode;
-
-	return S_ISBLK(inode->i_mode) &&
-		swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
-}
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int i;
-
-	if (!swsusp_resume_device)
-		return -ENODEV;
-	spin_lock(&swap_lock);
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		if (!(swap_info[i].flags & SWP_WRITEOK))
-			continue;
-		if (is_resume_device(swap_info + i)) {
-			spin_unlock(&swap_lock);
-			root_swap = i;
-			return 0;
-		}
+	int res = swap_type_of(swsusp_resume_device);
+
+	if (res >= 0) {
+		root_swap = res;
+		return 0;
 	}
-	spin_unlock(&swap_lock);
-	return -ENODEV;
+	return res;
 }
 
 /**
- *	write_page - Write one page to a fresh swap location.
- *	@addr:	Address we're writing.
- *	@loc:	Place to store the entry we used.
+ *	The bitmap is used for tracing allocated swap pages
  *
- *	Allocate a new swap entry and 'sync' it. Note we discard -EIO
- *	errors. That is an artifact left over from swsusp. It did not
- *	check the return of rw_swap_page_sync() at all, since most pages
- *	written back to swap would return -EIO.
- *	This is a partial improvement, since we will at least return other
- *	errors, though we need to eventually fix the damn code.
+ *	The entire bitmap consists of a number of bitmap_page
+ *	structures linked with the help of the .next member.
+ *	Thus each page can be allocated individually, so we only
+ *	need to make 0-order memory allocations to create
+ *	the bitmap.
  */
-static int write_page(unsigned long addr, swp_entry_t *loc)
-{
-	swp_entry_t entry;
-	int error = -ENOSPC;
 
-	entry = get_swap_page_of_type(root_swap);
-	if (swp_offset(entry)) {
-		error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
-		if (!error || error == -EIO)
-			*loc = entry;
-	}
-	return error;
-}
+#define BITMAP_PAGE_SIZE	(PAGE_SIZE - sizeof(void *))
+#define BITMAP_PAGE_CHUNKS	(BITMAP_PAGE_SIZE / sizeof(long))
+#define BITS_PER_CHUNK		(sizeof(long) * 8)
+#define BITMAP_PAGE_BITS	(BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
+
+struct bitmap_page {
+	unsigned long		chunks[BITMAP_PAGE_CHUNKS];
+	struct bitmap_page	*next;
+};
 
 /**
- *	Swap map-handling functions
+ *	The following functions are used for tracing the allocated
+ *	swap pages, so that they can be freed in case of an error.
  *
- *	The swap map is a data structure used for keeping track of each page
- *	written to the swap.  It consists of many swap_map_page structures
- *	that contain each an array of MAP_PAGE_SIZE swap entries.
- *	These structures are linked together with the help of either the
- *	.next (in memory) or the .next_swap (in swap) member.
- *
- *	The swap map is created during suspend.  At that time we need to keep
- *	it in memory, because we have to free all of the allocated swap
- *	entries if an error occurs.  The memory needed is preallocated
- *	so that we know in advance if there's enough of it.
- *
- *	The first swap_map_page structure is filled with the swap entries that
- *	correspond to the first MAP_PAGE_SIZE data pages written to swap and
- *	so on.  After the all of the data pages have been written, the order
- *	of the swap_map_page structures in the map is reversed so that they
- *	can be read from swap in the original order.  This causes the data
- *	pages to be loaded in exactly the same order in which they have been
- *	saved.
- *
- *	During resume we only need to use one swap_map_page structure
- *	at a time, which means that we only need to use two memory pages for
- *	reading the image - one for reading the swap_map_page structures
- *	and the second for reading the data pages from swap.
+ *	The functions operate on a linked bitmap structure defined
+ *	above
  */
 
-#define MAP_PAGE_SIZE	((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
-			/ sizeof(swp_entry_t))
-
-struct swap_map_page {
-	swp_entry_t		entries[MAP_PAGE_SIZE];
-	swp_entry_t		next_swap;
-	struct swap_map_page	*next;
-};
-
-static inline void free_swap_map(struct swap_map_page *swap_map)
+static void free_bitmap(struct bitmap_page *bitmap)
 {
-	struct swap_map_page *swp;
+	struct bitmap_page *bp;
 
-	while (swap_map) {
-		swp = swap_map->next;
-		free_page((unsigned long)swap_map);
-		swap_map = swp;
+	while (bitmap) {
+		bp = bitmap->next;
+		free_page((unsigned long)bitmap);
+		bitmap = bp;
 	}
 }
 
-static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
+static struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
 {
-	struct swap_map_page *swap_map, *swp;
-	unsigned n = 0;
+	struct bitmap_page *bitmap, *bp;
+	unsigned int n;
 
-	if (!nr_pages)
+	if (!nr_bits)
 		return NULL;
 
-	pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
-	swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-	swp = swap_map;
-	for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
-		swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-		swp = swp->next;
-		if (!swp) {
-			free_swap_map(swap_map);
+	bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+	bp = bitmap;
+	for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
+		bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+		bp = bp->next;
+		if (!bp) {
+			free_bitmap(bitmap);
 			return NULL;
 		}
 	}
-	return swap_map;
+	return bitmap;
 }
 
-/**
- *	reverse_swap_map - reverse the order of pages in the swap map
- *	@swap_map
- */
-
-static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
+static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
 {
-	struct swap_map_page *prev, *next;
-
-	prev = NULL;
-	while (swap_map) {
-		next = swap_map->next;
-		swap_map->next = prev;
-		prev = swap_map;
-		swap_map = next;
+	unsigned int n;
+
+	n = BITMAP_PAGE_BITS;
+	while (bitmap && n <= bit) {
+		n += BITMAP_PAGE_BITS;
+		bitmap = bitmap->next;
 	}
-	return prev;
+	if (!bitmap)
+		return -EINVAL;
+	n -= BITMAP_PAGE_BITS;
+	bit -= n;
+	n = 0;
+	while (bit >= BITS_PER_CHUNK) {
+		bit -= BITS_PER_CHUNK;
+		n++;
+	}
+	bitmap->chunks[n] |= (1UL << bit);
+	return 0;
 }
 
-/**
- *	free_swap_map_entries - free the swap entries allocated to store
- *	the swap map @swap_map (this is only called in case of an error)
- */
-static inline void free_swap_map_entries(struct swap_map_page *swap_map)
+static unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
 {
-	while (swap_map) {
-		if (swap_map->next_swap.val)
-			swap_free(swap_map->next_swap);
-		swap_map = swap_map->next;
+	unsigned long offset;
+
+	offset = swp_offset(get_swap_page_of_type(swap));
+	if (offset) {
+		if (bitmap_set(bitmap, offset)) {
+			swap_free(swp_entry(swap, offset));
+			offset = 0;
+		}
 	}
+	return offset;
 }
 
-/**
- *	save_swap_map - save the swap map used for tracing the data pages
- *	stored in the swap
- */
-
-static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
+static void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 {
-	swp_entry_t entry = (swp_entry_t){0};
-	int error;
+	unsigned int bit, n;
+	unsigned long test;
 
-	while (swap_map) {
-		swap_map->next_swap = entry;
-		if ((error = write_page((unsigned long)swap_map, &entry)))
-			return error;
-		swap_map = swap_map->next;
+	bit = 0;
+	while (bitmap) {
+		for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
+			for (test = 1UL; test; test <<= 1) {
+				if (bitmap->chunks[n] & test)
+					swap_free(swp_entry(swap, bit));
+				bit++;
+			}
+		bitmap = bitmap->next;
 	}
-	*start = entry;
-	return 0;
 }
 
 /**
- *	free_image_entries - free the swap entries allocated to store
- *	the image data pages (this is only called in case of an error)
+ *	write_page - Write one page to given swap location.
+ *	@buf:		Address we're writing.
+ *	@offset:	Offset of the swap page we're writing to.
  */
 
-static inline void free_image_entries(struct swap_map_page *swp)
+static int write_page(void *buf, unsigned long offset)
 {
-	unsigned k;
+	swp_entry_t entry;
+	int error = -ENOSPC;
 
-	while (swp) {
-		for (k = 0; k < MAP_PAGE_SIZE; k++)
-			if (swp->entries[k].val)
-				swap_free(swp->entries[k]);
-		swp = swp->next;
+	if (offset) {
+		entry = swp_entry(root_swap, offset);
+		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
 	}
+	return error;
 }
 
+/*
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to a swap partition.  It consists of many swap_map_page
+ *	structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are stored on the swap and linked together with the
+ *	help of the .next_swap member.
+ *
+ *	The swap map is created during suspend.  The swap map pages are
+ *	allocated and populated one at a time, so we only need one memory
+ *	page to set up the entire structure.
+ *
+ *	During resume we also only need to use one swap_map_page structure
+ *	at a time.
+ */
+
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
+
+struct swap_map_page {
+	unsigned long		entries[MAP_PAGE_ENTRIES];
+	unsigned long		next_swap;
+};
+
 /**
- *	The swap_map_handle structure is used for handling the swap map in
+ *	The swap_map_handle structure is used for handling swap in
  *	a file-alike way
  */
 
 struct swap_map_handle {
 	struct swap_map_page *cur;
+	unsigned long cur_swap;
+	struct bitmap_page *bitmap;
 	unsigned int k;
 };
 
-static inline void init_swap_map_handle(struct swap_map_handle *handle,
-                                        struct swap_map_page *map)
+static void release_swap_writer(struct swap_map_handle *handle)
 {
-	handle->cur = map;
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+	if (handle->bitmap)
+		free_bitmap(handle->bitmap);
+	handle->bitmap = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+	if (!handle->cur)
+		return -ENOMEM;
+	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
+	if (!handle->bitmap) {
+		release_swap_writer(handle);
+		return -ENOMEM;
+	}
+	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+	if (!handle->cur_swap) {
+		release_swap_writer(handle);
+		return -ENOSPC;
+	}
 	handle->k = 0;
+	return 0;
 }
 
-static inline int swap_map_write_page(struct swap_map_handle *handle,
-                                      unsigned long addr)
+static int swap_write_page(struct swap_map_handle *handle, void *buf)
 {
 	int error;
+	unsigned long offset;
 
-	error = write_page(addr, handle->cur->entries + handle->k);
+	if (!handle->cur)
+		return -EINVAL;
+	offset = alloc_swap_page(root_swap, handle->bitmap);
+	error = write_page(buf, offset);
 	if (error)
 		return error;
-	if (++handle->k >= MAP_PAGE_SIZE) {
-		handle->cur = handle->cur->next;
+	handle->cur->entries[handle->k++] = offset;
+	if (handle->k >= MAP_PAGE_ENTRIES) {
+		offset = alloc_swap_page(root_swap, handle->bitmap);
+		if (!offset)
+			return -ENOSPC;
+		handle->cur->next_swap = offset;
+		error = write_page(handle->cur, handle->cur_swap);
+		if (error)
+			return error;
+		memset(handle->cur, 0, PAGE_SIZE);
+		handle->cur_swap = offset;
 		handle->k = 0;
 	}
 	return 0;
 }
 
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur && handle->cur_swap)
+		return write_page(handle->cur, handle->cur_swap);
+	else
+		return -EINVAL;
+}
+
 /**
- *	save_image_data - save the data pages pointed to by the PBEs
- *	from the list @pblist using the swap map handle @handle
- *	(assume there are @nr_pages data pages to save)
+ *	save_image - save the suspend image data
  */
 
-static int save_image_data(struct pbe *pblist,
-                           struct swap_map_handle *handle,
-                           unsigned int nr_pages)
+static int save_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
 {
 	unsigned int m;
-	struct pbe *p;
+	int ret;
 	int error = 0;
 
 	printk("Saving image data pages (%u pages) ...     ", nr_pages);
@@ -385,98 +391,22 @@ static int save_image_data(struct pbe *pblist,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	for_each_pbe (p, pblist) {
-		error = swap_map_write_page(handle, p->address);
-		if (error)
-			break;
-		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
-		nr_pages++;
-	}
+	do {
+		ret = snapshot_read_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_write_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
 	if (!error)
 		printk("\b\b\b\bdone\n");
 	return error;
 }
 
-static void dump_info(void)
-{
-	pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
-	pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
-	pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
-	pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
-	pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
-	pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
-	pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
-	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
-	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
-	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
-	pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
-}
-
-static void init_header(unsigned int nr_pages)
-{
-	memset(&swsusp_info, 0, sizeof(swsusp_info));
-	swsusp_info.version_code = LINUX_VERSION_CODE;
-	swsusp_info.num_physpages = num_physpages;
-	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
-
-	swsusp_info.cpus = num_online_cpus();
-	swsusp_info.image_pages = nr_pages;
-	swsusp_info.pages = nr_pages +
-		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
-}
-
-/**
- *	pack_orig_addresses - the .orig_address fields of the PBEs from the
- *	list starting at @pbe are stored in the array @buf[] (1 page)
- */
-
-static inline struct pbe *pack_orig_addresses(unsigned long *buf,
-                                              struct pbe *pbe)
-{
-	int j;
-
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		buf[j] = pbe->orig_address;
-		pbe = pbe->next;
-	}
-	if (!pbe)
-		for (; j < PAGE_SIZE / sizeof(long); j++)
-			buf[j] = 0;
-	return pbe;
-}
-
-/**
- *	save_image_metadata - save the .orig_address fields of the PBEs
- *	from the list @pblist using the swap map handle @handle
- */
-
-static int save_image_metadata(struct pbe *pblist,
-                               struct swap_map_handle *handle)
-{
-	unsigned long *buf;
-	unsigned int n = 0;
-	struct pbe *p;
-	int error = 0;
-
-	printk("Saving image metadata ... ");
-	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-	p = pblist;
-	while (p) {
-		p = pack_orig_addresses(buf, p);
-		error = swap_map_write_page(handle, (unsigned long)buf);
-		if (error)
-			break;
-		n++;
-	}
-	free_page((unsigned long)buf);
-	if (!error)
-		printk("done (%u pages saved)\n", n);
-	return error;
-}
-
 /**
  *	enough_swap - Make sure we have enough swap to save the image.
  *
@@ -486,8 +416,7 @@ static int save_image_metadata(struct pbe *pblist,
 
 static int enough_swap(unsigned int nr_pages)
 {
-	unsigned int free_swap = swap_info[root_swap].pages -
-		swap_info[root_swap].inuse_pages;
+	unsigned int free_swap = count_swap_pages(root_swap, 1);
 
 	pr_debug("swsusp: free swap pages: %u\n", free_swap);
 	return free_swap > (nr_pages + PAGES_FOR_IO +
@@ -503,57 +432,44 @@ static int enough_swap(unsigned int nr_pages)
  *	correctly, we'll mark system clean, anyway.)
  */
 
-int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
+int swsusp_write(void)
 {
-	struct swap_map_page *swap_map;
 	struct swap_map_handle handle;
-	swp_entry_t start;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
+	unsigned long start;
 	int error;
 
 	if ((error = swsusp_swap_check())) {
 		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
 		return error;
 	}
-	if (!enough_swap(nr_pages)) {
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_read_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	if (!enough_swap(header->pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
 	}
-
-	init_header(nr_pages);
-	swap_map = alloc_swap_map(swsusp_info.pages);
-	if (!swap_map)
-		return -ENOMEM;
-	init_swap_map_handle(&handle, swap_map);
-
-	error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
-	if (!error)
-		error = save_image_metadata(pblist, &handle);
+	error = get_swap_writer(&handle);
+	if (!error) {
+		start = handle.cur_swap;
+		error = swap_write_page(&handle, header);
+	}
 	if (!error)
-		error = save_image_data(pblist, &handle, nr_pages);
-	if (error)
-		goto Free_image_entries;
-
-	swap_map = reverse_swap_map(swap_map);
-	error = save_swap_map(swap_map, &start);
-	if (error)
-		goto Free_map_entries;
-
-	dump_info();
-	printk( "S" );
-	error = mark_swapfiles(start);
-	printk( "|\n" );
+		error = save_image(&handle, &snapshot, header->pages - 1);
+	if (!error) {
+		flush_swap_writer(&handle);
+		printk("S");
+		error = mark_swapfiles(swp_entry(root_swap, start));
+		printk("|\n");
+	}
 	if (error)
-		goto Free_map_entries;
-
-Free_swap_map:
-	free_swap_map(swap_map);
+		free_all_swap_pages(root_swap, handle.bitmap);
+	release_swap_writer(&handle);
 	return error;
-
-Free_map_entries:
-	free_swap_map_entries(swap_map);
-Free_image_entries:
-	free_image_entries(swap_map);
-	goto Free_swap_map;
 }
 
 /**
@@ -663,45 +579,6 @@ int swsusp_resume(void)
 	return error;
 }
 
-/**
- *	mark_unsafe_pages - mark the pages that cannot be used for storing
- *	the image during resume, because they conflict with the pages that
- *	had been used before suspend
- */
-
-static void mark_unsafe_pages(struct pbe *pblist)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-	struct pbe *p;
-
-	if (!pblist) /* a sanity check */
-		return;
-
-	/* Clear page flags */
-	for_each_zone (zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
-				ClearPageNosaveFree(pfn_to_page(zone_pfn +
-					zone->zone_start_pfn));
-	}
-
-	/* Mark orig addresses */
-	for_each_pbe (p, pblist)
-		SetPageNosaveFree(virt_to_page(p->orig_address));
-
-}
-
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
-{
-	/* We assume both lists contain the same number of elements */
-	while (src) {
-		dst->orig_address = src->orig_address;
-		dst = dst->next;
-		src = src->next;
-	}
-}
-
 /*
  *	Using bio to read from swap.
  *	This code requires a bit more work than just using buffer heads
@@ -779,14 +656,14 @@ static int bio_write_page(pgoff_t page_off, void *page)
  *	in a file-alike way
  */
 
-static inline void release_swap_map_reader(struct swap_map_handle *handle)
+static void release_swap_reader(struct swap_map_handle *handle)
 {
 	if (handle->cur)
 		free_page((unsigned long)handle->cur);
 	handle->cur = NULL;
 }
 
-static inline int get_swap_map_reader(struct swap_map_handle *handle,
+static int get_swap_reader(struct swap_map_handle *handle,
                                       swp_entry_t start)
 {
 	int error;
@@ -798,149 +675,80 @@ static inline int get_swap_map_reader(struct swap_map_handle *handle,
 		return -ENOMEM;
 	error = bio_read_page(swp_offset(start), handle->cur);
 	if (error) {
-		release_swap_map_reader(handle);
+		release_swap_reader(handle);
 		return error;
 	}
 	handle->k = 0;
 	return 0;
 }
 
-static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
+static int swap_read_page(struct swap_map_handle *handle, void *buf)
 {
 	unsigned long offset;
 	int error;
 
 	if (!handle->cur)
 		return -EINVAL;
-	offset = swp_offset(handle->cur->entries[handle->k]);
+	offset = handle->cur->entries[handle->k];
 	if (!offset)
-		return -EINVAL;
+		return -EFAULT;
 	error = bio_read_page(offset, buf);
 	if (error)
 		return error;
-	if (++handle->k >= MAP_PAGE_SIZE) {
+	if (++handle->k >= MAP_PAGE_ENTRIES) {
 		handle->k = 0;
-		offset = swp_offset(handle->cur->next_swap);
+		offset = handle->cur->next_swap;
 		if (!offset)
-			release_swap_map_reader(handle);
+			release_swap_reader(handle);
 		else
 			error = bio_read_page(offset, handle->cur);
 	}
 	return error;
 }
 
-static int check_header(void)
-{
-	char *reason = NULL;
-
-	dump_info();
-	if (swsusp_info.version_code != LINUX_VERSION_CODE)
-		reason = "kernel version";
-	if (swsusp_info.num_physpages != num_physpages)
-		reason = "memory size";
-	if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
-		reason = "system type";
-	if (strcmp(swsusp_info.uts.release,system_utsname.release))
-		reason = "kernel release";
-	if (strcmp(swsusp_info.uts.version,system_utsname.version))
-		reason = "version";
-	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
-		reason = "machine";
-	if (reason) {
-		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
-		return -EPERM;
-	}
-	return 0;
-}
-
 /**
- *	load_image_data - load the image data using the swap map handle
- *	@handle and store them using the page backup list @pblist
+ *	load_image - load the image using the swap map handle
+ *	@handle and the snapshot handle @snapshot
  *	(assume there are @nr_pages pages to load)
  */
 
-static int load_image_data(struct pbe *pblist,
-                           struct swap_map_handle *handle,
-                           unsigned int nr_pages)
+static int load_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
 {
-	int error;
 	unsigned int m;
-	struct pbe *p;
+	int ret;
+	int error = 0;
 
-	if (!pblist)
-		return -EINVAL;
 	printk("Loading image data pages (%u pages) ...     ", nr_pages);
 	m = nr_pages / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	p = pblist;
-	while (p) {
-		error = swap_map_read_page(handle, (void *)p->address);
-		if (error)
-			break;
-		p = p->next;
-		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
-		nr_pages++;
-	}
+	do {
+		ret = snapshot_write_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_read_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
 	if (!error)
 		printk("\b\b\b\bdone\n");
+	if (!snapshot_image_loaded(snapshot))
+		error = -ENODATA;
 	return error;
 }
 
-/**
- *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- *	the PBEs in the list starting at @pbe
- */
-
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
-                                                struct pbe *pbe)
-{
-	int j;
-
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		pbe->orig_address = buf[j];
-		pbe = pbe->next;
-	}
-	return pbe;
-}
-
-/**
- *	load_image_metadata - load the image metadata using the swap map
- *	handle @handle and put them into the PBEs in the list @pblist
- */
-
-static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
-{
-	struct pbe *p;
-	unsigned long *buf;
-	unsigned int n = 0;
-	int error = 0;
-
-	printk("Loading image metadata ... ");
-	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-	p = pblist;
-	while (p) {
-		error = swap_map_read_page(handle, buf);
-		if (error)
-			break;
-		p = unpack_orig_addresses(buf, p);
-		n++;
-	}
-	free_page((unsigned long)buf);
-	if (!error)
-		printk("done (%u pages loaded)\n", n);
-	return error;
-}
-
-int swsusp_read(struct pbe **pblist_ptr)
+int swsusp_read(void)
 {
 	int error;
-	struct pbe *p, *pblist;
 	struct swap_map_handle handle;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
 	unsigned int nr_pages;
 
 	if (IS_ERR(resume_bdev)) {
@@ -948,38 +756,19 @@ int swsusp_read(struct pbe **pblist_ptr)
 		return PTR_ERR(resume_bdev);
 	}
 
-	error = get_swap_map_reader(&handle, swsusp_header.image);
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_write_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	error = get_swap_reader(&handle, swsusp_header.image);
 	if (!error)
-		error = swap_map_read_page(&handle, &swsusp_info);
-	if (!error)
-		error = check_header();
-	if (error)
-		return error;
-	nr_pages = swsusp_info.image_pages;
-	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
-	if (!p)
-		return -ENOMEM;
-	error = load_image_metadata(p, &handle);
+		error = swap_read_page(&handle, header);
 	if (!error) {
-		mark_unsafe_pages(p);
-		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
-		if (pblist)
-			copy_page_backup_list(pblist, p);
-		free_pagedir(p);
-		if (!pblist)
-			error = -ENOMEM;
-
-		/* Allocate memory for the image and read the data from swap */
-		if (!error)
-			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
-		if (!error) {
-			release_eaten_pages();
-			error = load_image_data(pblist, &handle, nr_pages);
-		}
-		if (!error)
-			*pblist_ptr = pblist;
+		nr_pages = header->image_pages;
+		error = load_image(&handle, &snapshot, nr_pages);
 	}
-	release_swap_map_reader(&handle);
+	release_swap_reader(&handle);
 
 	blkdev_put(resume_bdev);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 365ed6ff182d..4d11f9d84666 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
 
 struct swap_list_t swap_list = {-1, -1};
 
-struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct swap_info[MAX_SWAPFILES];
 
 static DEFINE_MUTEX(swapon_mutex);
 
@@ -417,6 +417,59 @@ void free_swap_and_cache(swp_entry_t entry)
 	}
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Find the swap type that corresponds to given device (if any)
+ *
+ * This is needed for software suspend and is done in such a way that inode
+ * aliasing is allowed.
+ */
+int swap_type_of(dev_t device)
+{
+	int i;
+
+	if (!device)
+		return -EINVAL;
+	spin_lock(&swap_lock);
+	for (i = 0; i < nr_swapfiles; i++) {
+		struct inode *inode;
+
+		if (!(swap_info[i].flags & SWP_WRITEOK))
+			continue;
+		inode = swap_info->swap_file->f_dentry->d_inode;
+		if (S_ISBLK(inode->i_mode) &&
+		    device == MKDEV(imajor(inode), iminor(inode))) {
+			spin_unlock(&swap_lock);
+			return i;
+		}
+	}
+	spin_unlock(&swap_lock);
+	return -ENODEV;
+}
+
+/*
+ * Return either the total number of swap pages of given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+	unsigned int n = 0;
+
+	if (type < nr_swapfiles) {
+		spin_lock(&swap_lock);
+		if (swap_info[type].flags & SWP_WRITEOK) {
+			n = swap_info[type].pages;
+			if (free)
+				n -= swap_info[type].inuse_pages;
+		}
+		spin_unlock(&swap_lock);
+	}
+	return n;
+}
+#endif
+
 /*
  * No need to decide whether this PTE shares the swap entry with others,
  * just let do_wp_page work it out if a write is requested later - to
-- 
cgit v1.2.3


From 61159a314bca6408320c3173c1282c64f5cdaa76 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:00 -0800
Subject: [PATCH] swsusp: separate swap-writing/reading code

Move the swap-writing/reading code of swsusp to a separate file.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/Makefile |   2 +-
 kernel/power/power.h  |  31 ++-
 kernel/power/swap.c   | 544 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/swsusp.c | 568 +-------------------------------------------------
 4 files changed, 581 insertions(+), 564 deletions(-)
 create mode 100644 kernel/power/swap.c

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 04be7d0d96a7..bb91a0615303 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
 
 obj-y				:= main.o process.o console.o
 obj-$(CONFIG_PM_LEGACY)		+= pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ea7132ed029b..089c84bed895 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -41,8 +41,8 @@ extern struct pbe *pagedir_nosave;
 
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
-
 extern int in_suspend;
+extern dev_t swsusp_resume_device;
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
@@ -65,3 +65,32 @@ struct snapshot_handle {
 extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 int snapshot_image_loaded(struct snapshot_handle *handle);
+
+/**
+ *	The bitmap is used for tracing allocated swap pages
+ *
+ *	The entire bitmap consists of a number of bitmap_page
+ *	structures linked with the help of the .next member.
+ *	Thus each page can be allocated individually, so we only
+ *	need to make 0-order memory allocations to create
+ *	the bitmap.
+ */
+
+#define BITMAP_PAGE_SIZE	(PAGE_SIZE - sizeof(void *))
+#define BITMAP_PAGE_CHUNKS	(BITMAP_PAGE_SIZE / sizeof(long))
+#define BITS_PER_CHUNK		(sizeof(long) * 8)
+#define BITMAP_PAGE_BITS	(BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
+
+struct bitmap_page {
+	unsigned long		chunks[BITMAP_PAGE_CHUNKS];
+	struct bitmap_page	*next;
+};
+
+extern void free_bitmap(struct bitmap_page *bitmap);
+extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
+extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
+extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
+
+extern int swsusp_shrink_memory(void);
+extern int swsusp_suspend(void);
+extern int swsusp_resume(void);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 000000000000..9177f3f73a6c
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,544 @@
+/*
+ * linux/kernel/power/swap.c
+ *
+ * This file provides functions for reading the suspend image from
+ * and writing it to a swap partition.
+ *
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/smp_lock.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/genhd.h>
+#include <linux/device.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+
+#include "power.h"
+
+extern char resume_file[];
+
+#define SWSUSP_SIG	"S1SUSPEND"
+
+static struct swsusp_header {
+	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
+	swp_entry_t image;
+	char	orig_sig[10];
+	char	sig[10];
+} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
+
+/*
+ * Saving part...
+ */
+
+static unsigned short root_swap = 0xffff;
+
+static int mark_swapfiles(swp_entry_t start)
+{
+	int error;
+
+	rw_swap_page_sync(READ,
+			  swp_entry(root_swap, 0),
+			  virt_to_page((unsigned long)&swsusp_header));
+	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
+	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
+		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
+		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
+		swsusp_header.image = start;
+		error = rw_swap_page_sync(WRITE,
+					  swp_entry(root_swap, 0),
+					  virt_to_page((unsigned long)
+						       &swsusp_header));
+	} else {
+		pr_debug("swsusp: Partition is not swap space.\n");
+		error = -ENODEV;
+	}
+	return error;
+}
+
+/**
+ *	swsusp_swap_check - check if the resume device is a swap device
+ *	and get its index (if so)
+ */
+
+static int swsusp_swap_check(void) /* This is called before saving image */
+{
+	int res = swap_type_of(swsusp_resume_device);
+
+	if (res >= 0) {
+		root_swap = res;
+		return 0;
+	}
+	return res;
+}
+
+/**
+ *	write_page - Write one page to given swap location.
+ *	@buf:		Address we're writing.
+ *	@offset:	Offset of the swap page we're writing to.
+ */
+
+static int write_page(void *buf, unsigned long offset)
+{
+	swp_entry_t entry;
+	int error = -ENOSPC;
+
+	if (offset) {
+		entry = swp_entry(root_swap, offset);
+		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
+	}
+	return error;
+}
+
+/*
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to a swap partition.  It consists of many swap_map_page
+ *	structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are stored on the swap and linked together with the
+ *	help of the .next_swap member.
+ *
+ *	The swap map is created during suspend.  The swap map pages are
+ *	allocated and populated one at a time, so we only need one memory
+ *	page to set up the entire structure.
+ *
+ *	During resume we also only need to use one swap_map_page structure
+ *	at a time.
+ */
+
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
+
+struct swap_map_page {
+	unsigned long		entries[MAP_PAGE_ENTRIES];
+	unsigned long		next_swap;
+};
+
+/**
+ *	The swap_map_handle structure is used for handling swap in
+ *	a file-alike way
+ */
+
+struct swap_map_handle {
+	struct swap_map_page *cur;
+	unsigned long cur_swap;
+	struct bitmap_page *bitmap;
+	unsigned int k;
+};
+
+static void release_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+	if (handle->bitmap)
+		free_bitmap(handle->bitmap);
+	handle->bitmap = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+	if (!handle->cur)
+		return -ENOMEM;
+	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
+	if (!handle->bitmap) {
+		release_swap_writer(handle);
+		return -ENOMEM;
+	}
+	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+	if (!handle->cur_swap) {
+		release_swap_writer(handle);
+		return -ENOSPC;
+	}
+	handle->k = 0;
+	return 0;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf)
+{
+	int error;
+	unsigned long offset;
+
+	if (!handle->cur)
+		return -EINVAL;
+	offset = alloc_swap_page(root_swap, handle->bitmap);
+	error = write_page(buf, offset);
+	if (error)
+		return error;
+	handle->cur->entries[handle->k++] = offset;
+	if (handle->k >= MAP_PAGE_ENTRIES) {
+		offset = alloc_swap_page(root_swap, handle->bitmap);
+		if (!offset)
+			return -ENOSPC;
+		handle->cur->next_swap = offset;
+		error = write_page(handle->cur, handle->cur_swap);
+		if (error)
+			return error;
+		memset(handle->cur, 0, PAGE_SIZE);
+		handle->cur_swap = offset;
+		handle->k = 0;
+	}
+	return 0;
+}
+
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur && handle->cur_swap)
+		return write_page(handle->cur, handle->cur_swap);
+	else
+		return -EINVAL;
+}
+
+/**
+ *	save_image - save the suspend image data
+ */
+
+static int save_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
+{
+	unsigned int m;
+	int ret;
+	int error = 0;
+
+	printk("Saving image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	do {
+		ret = snapshot_read_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_write_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
+	if (!error)
+		printk("\b\b\b\bdone\n");
+	return error;
+}
+
+/**
+ *	enough_swap - Make sure we have enough swap to save the image.
+ *
+ *	Returns TRUE or FALSE after checking the total amount of swap
+ *	space avaiable from the resume partition.
+ */
+
+static int enough_swap(unsigned int nr_pages)
+{
+	unsigned int free_swap = count_swap_pages(root_swap, 1);
+
+	pr_debug("swsusp: free swap pages: %u\n", free_swap);
+	return free_swap > (nr_pages + PAGES_FOR_IO +
+		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+}
+
+/**
+ *	swsusp_write - Write entire image and metadata.
+ *
+ *	It is important _NOT_ to umount filesystems at this point. We want
+ *	them synced (in case something goes wrong) but we DO not want to mark
+ *	filesystem clean: it is not. (And it does not matter, if we resume
+ *	correctly, we'll mark system clean, anyway.)
+ */
+
+int swsusp_write(void)
+{
+	struct swap_map_handle handle;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
+	unsigned long start;
+	int error;
+
+	if ((error = swsusp_swap_check())) {
+		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
+		return error;
+	}
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_read_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	if (!enough_swap(header->pages)) {
+		printk(KERN_ERR "swsusp: Not enough free swap\n");
+		return -ENOSPC;
+	}
+	error = get_swap_writer(&handle);
+	if (!error) {
+		start = handle.cur_swap;
+		error = swap_write_page(&handle, header);
+	}
+	if (!error)
+		error = save_image(&handle, &snapshot, header->pages - 1);
+	if (!error) {
+		flush_swap_writer(&handle);
+		printk("S");
+		error = mark_swapfiles(swp_entry(root_swap, start));
+		printk("|\n");
+	}
+	if (error)
+		free_all_swap_pages(root_swap, handle.bitmap);
+	release_swap_writer(&handle);
+	return error;
+}
+
+/*
+ *	Using bio to read from swap.
+ *	This code requires a bit more work than just using buffer heads
+ *	but, it is the recommended way for 2.5/2.6.
+ *	The following are to signal the beginning and end of I/O. Bios
+ *	finish asynchronously, while we want them to happen synchronously.
+ *	A simple atomic_t, and a wait loop take care of this problem.
+ */
+
+static atomic_t io_done = ATOMIC_INIT(0);
+
+static int end_io(struct bio *bio, unsigned int num, int err)
+{
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		panic("I/O error reading memory image");
+	atomic_set(&io_done, 0);
+	return 0;
+}
+
+static struct block_device *resume_bdev;
+
+/**
+ *	submit - submit BIO request.
+ *	@rw:	READ or WRITE.
+ *	@off	physical offset of page.
+ *	@page:	page we're reading or writing.
+ *
+ *	Straight from the textbook - allocate and initialize the bio.
+ *	If we're writing, make sure the page is marked as dirty.
+ *	Then submit it and wait.
+ */
+
+static int submit(int rw, pgoff_t page_off, void *page)
+{
+	int error = 0;
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_ATOMIC, 1);
+	if (!bio)
+		return -ENOMEM;
+	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
+	bio->bi_bdev = resume_bdev;
+	bio->bi_end_io = end_io;
+
+	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
+		error = -EFAULT;
+		goto Done;
+	}
+
+	atomic_set(&io_done, 1);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+	while (atomic_read(&io_done))
+		yield();
+	if (rw == READ)
+		bio_set_pages_dirty(bio);
+ Done:
+	bio_put(bio);
+	return error;
+}
+
+static int bio_read_page(pgoff_t page_off, void *page)
+{
+	return submit(READ, page_off, page);
+}
+
+static int bio_write_page(pgoff_t page_off, void *page)
+{
+	return submit(WRITE, page_off, page);
+}
+
+/**
+ *	The following functions allow us to read data using a swap map
+ *	in a file-alike way
+ */
+
+static void release_swap_reader(struct swap_map_handle *handle)
+{
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+}
+
+static int get_swap_reader(struct swap_map_handle *handle,
+                                      swp_entry_t start)
+{
+	int error;
+
+	if (!swp_offset(start))
+		return -EINVAL;
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
+	if (!handle->cur)
+		return -ENOMEM;
+	error = bio_read_page(swp_offset(start), handle->cur);
+	if (error) {
+		release_swap_reader(handle);
+		return error;
+	}
+	handle->k = 0;
+	return 0;
+}
+
+static int swap_read_page(struct swap_map_handle *handle, void *buf)
+{
+	unsigned long offset;
+	int error;
+
+	if (!handle->cur)
+		return -EINVAL;
+	offset = handle->cur->entries[handle->k];
+	if (!offset)
+		return -EFAULT;
+	error = bio_read_page(offset, buf);
+	if (error)
+		return error;
+	if (++handle->k >= MAP_PAGE_ENTRIES) {
+		handle->k = 0;
+		offset = handle->cur->next_swap;
+		if (!offset)
+			release_swap_reader(handle);
+		else
+			error = bio_read_page(offset, handle->cur);
+	}
+	return error;
+}
+
+/**
+ *	load_image - load the image using the swap map handle
+ *	@handle and the snapshot handle @snapshot
+ *	(assume there are @nr_pages pages to load)
+ */
+
+static int load_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
+{
+	unsigned int m;
+	int ret;
+	int error = 0;
+
+	printk("Loading image data pages (%u pages) ...     ", nr_pages);
+	m = nr_pages / 100;
+	if (!m)
+		m = 1;
+	nr_pages = 0;
+	do {
+		ret = snapshot_write_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_read_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
+	if (!error)
+		printk("\b\b\b\bdone\n");
+	if (!snapshot_image_loaded(snapshot))
+		error = -ENODATA;
+	return error;
+}
+
+int swsusp_read(void)
+{
+	int error;
+	struct swap_map_handle handle;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
+
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return PTR_ERR(resume_bdev);
+	}
+
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_write_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	error = get_swap_reader(&handle, swsusp_header.image);
+	if (!error)
+		error = swap_read_page(&handle, header);
+	if (!error)
+		error = load_image(&handle, &snapshot, header->pages - 1);
+	release_swap_reader(&handle);
+
+	blkdev_put(resume_bdev);
+
+	if (!error)
+		pr_debug("swsusp: Reading resume file was successful\n");
+	else
+		pr_debug("swsusp: Error %d resuming\n", error);
+	return error;
+}
+
+/**
+ *      swsusp_check - Check for swsusp signature in the resume device
+ */
+
+int swsusp_check(void)
+{
+	int error;
+
+	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+	if (!IS_ERR(resume_bdev)) {
+		set_blocksize(resume_bdev, PAGE_SIZE);
+		memset(&swsusp_header, 0, sizeof(swsusp_header));
+		if ((error = bio_read_page(0, &swsusp_header)))
+			return error;
+		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
+			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+			/* Reset swap signature now */
+			error = bio_write_page(0, &swsusp_header);
+		} else {
+			return -EINVAL;
+		}
+		if (error)
+			blkdev_put(resume_bdev);
+		else
+			pr_debug("swsusp: Signature found, resuming\n");
+	} else {
+		error = PTR_ERR(resume_bdev);
+	}
+
+	if (error)
+		pr_debug("swsusp: Error %d check for resume file\n", error);
+
+	return error;
+}
+
+/**
+ *	swsusp_close - close swap device.
+ */
+
+void swsusp_close(void)
+{
+	if (IS_ERR(resume_bdev)) {
+		pr_debug("swsusp: block device not initialised\n");
+		return;
+	}
+
+	blkdev_put(resume_bdev);
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 457084f50010..c4016cbbd3e0 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,41 +31,24 @@
  * Fixed runaway init
  *
  * Rafael J. Wysocki <rjw@sisk.pl>
- * Added the swap map data structure and reworked the handling of swap
+ * Reworked the freeing of memory and the handling of swap
  *
  * More state savers are welcome. Especially for the scsi layer...
  *
  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
  */
 
-#include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/suspend.h>
-#include <linux/smp_lock.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/version.h>
-#include <linux/delay.h>
-#include <linux/bitops.h>
 #include <linux/spinlock.h>
-#include <linux/genhd.h>
 #include <linux/kernel.h>
 #include <linux/major.h>
 #include <linux/swap.h>
 #include <linux/pm.h>
-#include <linux/device.h>
-#include <linux/buffer_head.h>
 #include <linux/swapops.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/highmem.h>
-#include <linux/bio.h>
-
-#include <asm/uaccess.h>
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
 
 #include "power.h"
 
@@ -89,91 +72,15 @@ static int restore_highmem(void) { return 0; }
 static unsigned int count_highmem_pages(void) { return 0; }
 #endif
 
-extern char resume_file[];
-
-#define SWSUSP_SIG	"S1SUSPEND"
-
-static struct swsusp_header {
-	char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
-	swp_entry_t image;
-	char	orig_sig[10];
-	char	sig[10];
-} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
-
-/*
- * Saving part...
- */
-
-static unsigned short root_swap = 0xffff;
-
-static int mark_swapfiles(swp_entry_t start)
-{
-	int error;
-
-	rw_swap_page_sync(READ,
-			  swp_entry(root_swap, 0),
-			  virt_to_page((unsigned long)&swsusp_header));
-	if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
-	    !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
-		memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
-		memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
-		swsusp_header.image = start;
-		error = rw_swap_page_sync(WRITE,
-					  swp_entry(root_swap, 0),
-					  virt_to_page((unsigned long)
-						       &swsusp_header));
-	} else {
-		pr_debug("swsusp: Partition is not swap space.\n");
-		error = -ENODEV;
-	}
-	return error;
-}
-
-/**
- *	swsusp_swap_check - check if the resume device is a swap device
- *	and get its index (if so)
- */
-
-static int swsusp_swap_check(void) /* This is called before saving image */
-{
-	int res = swap_type_of(swsusp_resume_device);
-
-	if (res >= 0) {
-		root_swap = res;
-		return 0;
-	}
-	return res;
-}
-
-/**
- *	The bitmap is used for tracing allocated swap pages
- *
- *	The entire bitmap consists of a number of bitmap_page
- *	structures linked with the help of the .next member.
- *	Thus each page can be allocated individually, so we only
- *	need to make 0-order memory allocations to create
- *	the bitmap.
- */
-
-#define BITMAP_PAGE_SIZE	(PAGE_SIZE - sizeof(void *))
-#define BITMAP_PAGE_CHUNKS	(BITMAP_PAGE_SIZE / sizeof(long))
-#define BITS_PER_CHUNK		(sizeof(long) * 8)
-#define BITMAP_PAGE_BITS	(BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
-
-struct bitmap_page {
-	unsigned long		chunks[BITMAP_PAGE_CHUNKS];
-	struct bitmap_page	*next;
-};
-
 /**
  *	The following functions are used for tracing the allocated
  *	swap pages, so that they can be freed in case of an error.
  *
  *	The functions operate on a linked bitmap structure defined
- *	above
+ *	in power.h
  */
 
-static void free_bitmap(struct bitmap_page *bitmap)
+void free_bitmap(struct bitmap_page *bitmap)
 {
 	struct bitmap_page *bp;
 
@@ -184,7 +91,7 @@ static void free_bitmap(struct bitmap_page *bitmap)
 	}
 }
 
-static struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
+struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
 {
 	struct bitmap_page *bitmap, *bp;
 	unsigned int n;
@@ -227,7 +134,7 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
 	return 0;
 }
 
-static unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
+unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
 {
 	unsigned long offset;
 
@@ -241,7 +148,7 @@ static unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
 	return offset;
 }
 
-static void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
+void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 {
 	unsigned int bit, n;
 	unsigned long test;
@@ -258,220 +165,6 @@ static void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 	}
 }
 
-/**
- *	write_page - Write one page to given swap location.
- *	@buf:		Address we're writing.
- *	@offset:	Offset of the swap page we're writing to.
- */
-
-static int write_page(void *buf, unsigned long offset)
-{
-	swp_entry_t entry;
-	int error = -ENOSPC;
-
-	if (offset) {
-		entry = swp_entry(root_swap, offset);
-		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
-	}
-	return error;
-}
-
-/*
- *	The swap map is a data structure used for keeping track of each page
- *	written to a swap partition.  It consists of many swap_map_page
- *	structures that contain each an array of MAP_PAGE_SIZE swap entries.
- *	These structures are stored on the swap and linked together with the
- *	help of the .next_swap member.
- *
- *	The swap map is created during suspend.  The swap map pages are
- *	allocated and populated one at a time, so we only need one memory
- *	page to set up the entire structure.
- *
- *	During resume we also only need to use one swap_map_page structure
- *	at a time.
- */
-
-#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
-
-struct swap_map_page {
-	unsigned long		entries[MAP_PAGE_ENTRIES];
-	unsigned long		next_swap;
-};
-
-/**
- *	The swap_map_handle structure is used for handling swap in
- *	a file-alike way
- */
-
-struct swap_map_handle {
-	struct swap_map_page *cur;
-	unsigned long cur_swap;
-	struct bitmap_page *bitmap;
-	unsigned int k;
-};
-
-static void release_swap_writer(struct swap_map_handle *handle)
-{
-	if (handle->cur)
-		free_page((unsigned long)handle->cur);
-	handle->cur = NULL;
-	if (handle->bitmap)
-		free_bitmap(handle->bitmap);
-	handle->bitmap = NULL;
-}
-
-static int get_swap_writer(struct swap_map_handle *handle)
-{
-	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
-	if (!handle->cur)
-		return -ENOMEM;
-	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
-	if (!handle->bitmap) {
-		release_swap_writer(handle);
-		return -ENOMEM;
-	}
-	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
-	if (!handle->cur_swap) {
-		release_swap_writer(handle);
-		return -ENOSPC;
-	}
-	handle->k = 0;
-	return 0;
-}
-
-static int swap_write_page(struct swap_map_handle *handle, void *buf)
-{
-	int error;
-	unsigned long offset;
-
-	if (!handle->cur)
-		return -EINVAL;
-	offset = alloc_swap_page(root_swap, handle->bitmap);
-	error = write_page(buf, offset);
-	if (error)
-		return error;
-	handle->cur->entries[handle->k++] = offset;
-	if (handle->k >= MAP_PAGE_ENTRIES) {
-		offset = alloc_swap_page(root_swap, handle->bitmap);
-		if (!offset)
-			return -ENOSPC;
-		handle->cur->next_swap = offset;
-		error = write_page(handle->cur, handle->cur_swap);
-		if (error)
-			return error;
-		memset(handle->cur, 0, PAGE_SIZE);
-		handle->cur_swap = offset;
-		handle->k = 0;
-	}
-	return 0;
-}
-
-static int flush_swap_writer(struct swap_map_handle *handle)
-{
-	if (handle->cur && handle->cur_swap)
-		return write_page(handle->cur, handle->cur_swap);
-	else
-		return -EINVAL;
-}
-
-/**
- *	save_image - save the suspend image data
- */
-
-static int save_image(struct swap_map_handle *handle,
-                      struct snapshot_handle *snapshot,
-                      unsigned int nr_pages)
-{
-	unsigned int m;
-	int ret;
-	int error = 0;
-
-	printk("Saving image data pages (%u pages) ...     ", nr_pages);
-	m = nr_pages / 100;
-	if (!m)
-		m = 1;
-	nr_pages = 0;
-	do {
-		ret = snapshot_read_next(snapshot, PAGE_SIZE);
-		if (ret > 0) {
-			error = swap_write_page(handle, data_of(*snapshot));
-			if (error)
-				break;
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
-		}
-	} while (ret > 0);
-	if (!error)
-		printk("\b\b\b\bdone\n");
-	return error;
-}
-
-/**
- *	enough_swap - Make sure we have enough swap to save the image.
- *
- *	Returns TRUE or FALSE after checking the total amount of swap
- *	space avaiable from the resume partition.
- */
-
-static int enough_swap(unsigned int nr_pages)
-{
-	unsigned int free_swap = count_swap_pages(root_swap, 1);
-
-	pr_debug("swsusp: free swap pages: %u\n", free_swap);
-	return free_swap > (nr_pages + PAGES_FOR_IO +
-		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
-}
-
-/**
- *	swsusp_write - Write entire image and metadata.
- *
- *	It is important _NOT_ to umount filesystems at this point. We want
- *	them synced (in case something goes wrong) but we DO not want to mark
- *	filesystem clean: it is not. (And it does not matter, if we resume
- *	correctly, we'll mark system clean, anyway.)
- */
-
-int swsusp_write(void)
-{
-	struct swap_map_handle handle;
-	struct snapshot_handle snapshot;
-	struct swsusp_info *header;
-	unsigned long start;
-	int error;
-
-	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
-		return error;
-	}
-	memset(&snapshot, 0, sizeof(struct snapshot_handle));
-	error = snapshot_read_next(&snapshot, PAGE_SIZE);
-	if (error < PAGE_SIZE)
-		return error < 0 ? error : -EFAULT;
-	header = (struct swsusp_info *)data_of(snapshot);
-	if (!enough_swap(header->pages)) {
-		printk(KERN_ERR "swsusp: Not enough free swap\n");
-		return -ENOSPC;
-	}
-	error = get_swap_writer(&handle);
-	if (!error) {
-		start = handle.cur_swap;
-		error = swap_write_page(&handle, header);
-	}
-	if (!error)
-		error = save_image(&handle, &snapshot, header->pages - 1);
-	if (!error) {
-		flush_swap_writer(&handle);
-		printk("S");
-		error = mark_swapfiles(swp_entry(root_swap, start));
-		printk("|\n");
-	}
-	if (error)
-		free_all_swap_pages(root_swap, handle.bitmap);
-	release_swap_writer(&handle);
-	return error;
-}
-
 /**
  *	swsusp_shrink_memory -  Try to free as much memory as needed
  *
@@ -578,252 +271,3 @@ int swsusp_resume(void)
 	local_irq_enable();
 	return error;
 }
-
-/*
- *	Using bio to read from swap.
- *	This code requires a bit more work than just using buffer heads
- *	but, it is the recommended way for 2.5/2.6.
- *	The following are to signal the beginning and end of I/O. Bios
- *	finish asynchronously, while we want them to happen synchronously.
- *	A simple atomic_t, and a wait loop take care of this problem.
- */
-
-static atomic_t io_done = ATOMIC_INIT(0);
-
-static int end_io(struct bio *bio, unsigned int num, int err)
-{
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-		panic("I/O error reading memory image");
-	atomic_set(&io_done, 0);
-	return 0;
-}
-
-static struct block_device *resume_bdev;
-
-/**
- *	submit - submit BIO request.
- *	@rw:	READ or WRITE.
- *	@off	physical offset of page.
- *	@page:	page we're reading or writing.
- *
- *	Straight from the textbook - allocate and initialize the bio.
- *	If we're writing, make sure the page is marked as dirty.
- *	Then submit it and wait.
- */
-
-static int submit(int rw, pgoff_t page_off, void *page)
-{
-	int error = 0;
-	struct bio *bio;
-
-	bio = bio_alloc(GFP_ATOMIC, 1);
-	if (!bio)
-		return -ENOMEM;
-	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
-	bio->bi_bdev = resume_bdev;
-	bio->bi_end_io = end_io;
-
-	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk("swsusp: ERROR: adding page to bio at %ld\n",page_off);
-		error = -EFAULT;
-		goto Done;
-	}
-
-
-	atomic_set(&io_done, 1);
-	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
-	while (atomic_read(&io_done))
-		yield();
-	if (rw == READ)
-		bio_set_pages_dirty(bio);
- Done:
-	bio_put(bio);
-	return error;
-}
-
-static int bio_read_page(pgoff_t page_off, void *page)
-{
-	return submit(READ, page_off, page);
-}
-
-static int bio_write_page(pgoff_t page_off, void *page)
-{
-	return submit(WRITE, page_off, page);
-}
-
-/**
- *	The following functions allow us to read data using a swap map
- *	in a file-alike way
- */
-
-static void release_swap_reader(struct swap_map_handle *handle)
-{
-	if (handle->cur)
-		free_page((unsigned long)handle->cur);
-	handle->cur = NULL;
-}
-
-static int get_swap_reader(struct swap_map_handle *handle,
-                                      swp_entry_t start)
-{
-	int error;
-
-	if (!swp_offset(start))
-		return -EINVAL;
-	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-	if (!handle->cur)
-		return -ENOMEM;
-	error = bio_read_page(swp_offset(start), handle->cur);
-	if (error) {
-		release_swap_reader(handle);
-		return error;
-	}
-	handle->k = 0;
-	return 0;
-}
-
-static int swap_read_page(struct swap_map_handle *handle, void *buf)
-{
-	unsigned long offset;
-	int error;
-
-	if (!handle->cur)
-		return -EINVAL;
-	offset = handle->cur->entries[handle->k];
-	if (!offset)
-		return -EFAULT;
-	error = bio_read_page(offset, buf);
-	if (error)
-		return error;
-	if (++handle->k >= MAP_PAGE_ENTRIES) {
-		handle->k = 0;
-		offset = handle->cur->next_swap;
-		if (!offset)
-			release_swap_reader(handle);
-		else
-			error = bio_read_page(offset, handle->cur);
-	}
-	return error;
-}
-
-/**
- *	load_image - load the image using the swap map handle
- *	@handle and the snapshot handle @snapshot
- *	(assume there are @nr_pages pages to load)
- */
-
-static int load_image(struct swap_map_handle *handle,
-                      struct snapshot_handle *snapshot,
-                      unsigned int nr_pages)
-{
-	unsigned int m;
-	int ret;
-	int error = 0;
-
-	printk("Loading image data pages (%u pages) ...     ", nr_pages);
-	m = nr_pages / 100;
-	if (!m)
-		m = 1;
-	nr_pages = 0;
-	do {
-		ret = snapshot_write_next(snapshot, PAGE_SIZE);
-		if (ret > 0) {
-			error = swap_read_page(handle, data_of(*snapshot));
-			if (error)
-				break;
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
-		}
-	} while (ret > 0);
-	if (!error)
-		printk("\b\b\b\bdone\n");
-	if (!snapshot_image_loaded(snapshot))
-		error = -ENODATA;
-	return error;
-}
-
-int swsusp_read(void)
-{
-	int error;
-	struct swap_map_handle handle;
-	struct snapshot_handle snapshot;
-	struct swsusp_info *header;
-	unsigned int nr_pages;
-
-	if (IS_ERR(resume_bdev)) {
-		pr_debug("swsusp: block device not initialised\n");
-		return PTR_ERR(resume_bdev);
-	}
-
-	memset(&snapshot, 0, sizeof(struct snapshot_handle));
-	error = snapshot_write_next(&snapshot, PAGE_SIZE);
-	if (error < PAGE_SIZE)
-		return error < 0 ? error : -EFAULT;
-	header = (struct swsusp_info *)data_of(snapshot);
-	error = get_swap_reader(&handle, swsusp_header.image);
-	if (!error)
-		error = swap_read_page(&handle, header);
-	if (!error) {
-		nr_pages = header->image_pages;
-		error = load_image(&handle, &snapshot, nr_pages);
-	}
-	release_swap_reader(&handle);
-
-	blkdev_put(resume_bdev);
-
-	if (!error)
-		pr_debug("swsusp: Reading resume file was successful\n");
-	else
-		pr_debug("swsusp: Error %d resuming\n", error);
-	return error;
-}
-
-/**
- *      swsusp_check - Check for swsusp signature in the resume device
- */
-
-int swsusp_check(void)
-{
-	int error;
-
-	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
-	if (!IS_ERR(resume_bdev)) {
-		set_blocksize(resume_bdev, PAGE_SIZE);
-		memset(&swsusp_header, 0, sizeof(swsusp_header));
-		if ((error = bio_read_page(0, &swsusp_header)))
-			return error;
-		if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
-			memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
-			/* Reset swap signature now */
-			error = bio_write_page(0, &swsusp_header);
-		} else {
-			return -EINVAL;
-		}
-		if (error)
-			blkdev_put(resume_bdev);
-		else
-			pr_debug("swsusp: Signature found, resuming\n");
-	} else {
-		error = PTR_ERR(resume_bdev);
-	}
-
-	if (error)
-		pr_debug("swsusp: Error %d check for resume file\n", error);
-
-	return error;
-}
-
-/**
- *	swsusp_close - close swap device.
- */
-
-void swsusp_close(void)
-{
-	if (IS_ERR(resume_bdev)) {
-		pr_debug("swsusp: block device not initialised\n");
-		return;
-	}
-
-	blkdev_put(resume_bdev);
-}
-- 
cgit v1.2.3


From 74c7e2efbe37378026f00ad9e7253796d7b2fc99 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 23 Mar 2006 03:00:01 -0800
Subject: [PATCH] kernel/power: move externs to header files

Move externs from C source files to header files.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pm.h   |  3 ++-
 kernel/power/disk.c  | 11 -----------
 kernel/power/power.h |  6 +++++-
 3 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index 5be87ba3b7ac..6df2585c0169 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -188,6 +188,8 @@ extern void device_power_up(void);
 extern void device_resume(void);
 
 #ifdef CONFIG_PM
+extern suspend_disk_method_t pm_disk_mode;
+
 extern int device_suspend(pm_message_t state);
 
 #define device_set_wakeup_enable(dev,val) \
@@ -215,7 +217,6 @@ static inline int dpm_runtime_suspend(struct device * dev, pm_message_t state)
 
 static inline void dpm_runtime_resume(struct device * dev)
 {
-
 }
 
 #endif
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4eb464b71347..4bd68f482f2b 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,17 +22,6 @@
 #include "power.h"
 
 
-extern suspend_disk_method_t pm_disk_mode;
-
-extern int swsusp_shrink_memory(void);
-extern int swsusp_suspend(void);
-extern int swsusp_write(void);
-extern int swsusp_check(void);
-extern int swsusp_read(void);
-extern void swsusp_close(void);
-extern int swsusp_resume(void);
-
-
 static int noresume = 0;
 char resume_file[256] = CONFIG_PM_STD_PARTITION;
 dev_t swsusp_resume_device;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 089c84bed895..5d1abffbb9ce 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -48,7 +48,6 @@ extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
-extern void swsusp_free(void);
 
 struct snapshot_handle {
 	loff_t		offset;
@@ -91,6 +90,11 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
 extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
 extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
 
+extern int swsusp_check(void);
 extern int swsusp_shrink_memory(void);
+extern void swsusp_free(void);
 extern int swsusp_suspend(void);
 extern int swsusp_resume(void);
+extern int swsusp_read(void);
+extern int swsusp_write(void);
+extern void swsusp_close(void);
-- 
cgit v1.2.3


From 543cc27d09643640cbc34189c03a40beb8227aef Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Thu, 23 Mar 2006 03:00:02 -0800
Subject: [PATCH] swsusp: documentation updates

Update suspend-to-RAM documentation with new machines, and makes message
when processes can't be stopped little clearer.  (In one case, waiting
longer actually did help).

From: "Rafael J. Wysocki" <rjw@sisk.pl>

  Warn in the documentation that data may be lost if there are some
  filesystems mounted from USB devices before suspend.

  [Thanks to Alan Stern for providing the answer to the question in the
  Q:-A: part.]

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/swsusp.txt | 51 +++++++++++++++++++++++++----
 Documentation/power/video.txt  | 74 +++++++++++++++++++-----------------------
 kernel/power/process.c         |  2 +-
 3 files changed, 78 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index b28b7f04abb8..d7814a113ee1 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -17,6 +17,11 @@ Some warnings, first.
  * but it will probably only crash.
  *
  * (*) suspend/resume support is needed to make it safe.
+ *
+ * If you have any filesystems on USB devices mounted before suspend,
+ * they won't be accessible after resume and you may lose data, as though
+ * you have unplugged the USB devices with mounted filesystems on them
+ * (see the FAQ below for details).
 
 You need to append resume=/dev/your_swap_partition to kernel command
 line. Then you suspend by
@@ -27,19 +32,18 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state
 
 echo platform > /sys/power/disk; echo disk > /sys/power/state
 
+. If you have SATA disks, you'll need recent kernels with SATA suspend
+support. For suspend and resume to work, make sure your disk drivers
+are built into kernel -- not modules. [There's way to make
+suspend/resume with modular disk drivers, see FAQ, but you probably
+should not do that.]
+
 If you want to limit the suspend image size to N bytes, do
 
 echo N > /sys/power/image_size
 
 before suspend (it is limited to 500 MB by default).
 
-Encrypted suspend image:
-------------------------
-If you want to store your suspend image encrypted with a temporary
-key to prevent data gathering after resume you must compile
-crypto and the aes algorithm into the kernel - modules won't work
-as they cannot be loaded at resume time.
-
 
 Article about goals and implementation of Software Suspend for Linux
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -333,4 +337,37 @@ init=/bin/bash, then swapon and starting suspend sequence manually
 usually does the trick. Then it is good idea to try with latest
 vanilla kernel.
 
+Q: How can distributions ship a swsusp-supporting kernel with modular
+disk drivers (especially SATA)?
+
+A: Well, it can be done, load the drivers, then do echo into
+/sys/power/disk/resume file from initrd. Be sure not to mount
+anything, not even read-only mount, or you are going to lose your
+data.
+
+Q: How do I make suspend more verbose?
+
+A: If you want to see any non-error kernel messages on the virtual
+terminal the kernel switches to during suspend, you have to set the
+kernel console loglevel to at least 5, for example by doing
+
+	echo 5 > /proc/sys/kernel/printk
+
+Q: Is this true that if I have a mounted filesystem on a USB device and
+I suspend to disk, I can lose data unless the filesystem has been mounted
+with "sync"?
+
+A: That's right.  It depends on your hardware, and it could be true even for
+suspend-to-RAM.  In fact, even with "-o sync" you can lose data if your
+programs have information in buffers they haven't written out to disk.
+
+If you're lucky, your hardware will support low-power modes for USB
+controllers while the system is asleep.  Lots of hardware doesn't,
+however.  Shutting off the power to a USB controller is equivalent to
+unplugging all the attached devices.
+
+Remember that it's always a bad idea to unplug a disk drive containing a
+mounted filesystem.  With USB that's true even when your system is asleep!
+The safest thing is to unmount all USB-based filesystems before suspending
+and remount them after resuming.
 
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt
index 912bed87c758..d18a57d1a531 100644
--- a/Documentation/power/video.txt
+++ b/Documentation/power/video.txt
@@ -1,7 +1,7 @@
 
 		Video issues with S3 resume
 		~~~~~~~~~~~~~~~~~~~~~~~~~~~
-		  2003-2005, Pavel Machek
+		  2003-2006, Pavel Machek
 
 During S3 resume, hardware needs to be reinitialized. For most
 devices, this is easy, and kernel driver knows how to do
@@ -15,6 +15,27 @@ run normally so video card is normally initialized. It should not be
 problem for S1 standby, because hardware should retain its state over
 that.
 
+We either have to run video BIOS during early resume, or interpret it
+using vbetool later, or maybe nothing is neccessary on particular
+system because video state is preserved. Unfortunately different
+methods work on different systems, and no known method suits all of
+them.
+
+Userland application called s2ram has been developed; it contains long
+whitelist of systems, and automatically selects working method for a
+given system. It can be downloaded from CVS at
+www.sf.net/projects/suspend . If you get a system that is not in the
+whitelist, please try to find a working solution, and submit whitelist
+entry so that work does not need to be repeated.
+
+Currently, VBE_SAVE method (6 below) works on most
+systems. Unfortunately, vbetool only runs after userland is resumed,
+so it makes debugging of early resume problems
+hard/impossible. Methods that do not rely on userland are preferable.
+
+Details
+~~~~~~~
+
 There are a few types of systems where video works after S3 resume:
 
 (1) systems where video state is preserved over S3.
@@ -104,6 +125,7 @@ HP NX7000			??? (*)
 HP Pavilion ZD7000		vbetool post needed, need open-source nv driver for X
 HP Omnibook XE3	athlon version	none (1)
 HP Omnibook XE3GC		none (1), video is S3 Savage/IX-MV
+HP Omnibook 5150		none (1), (S1 also works OK)
 IBM TP T20, model 2647-44G	none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work.
 IBM TP A31 / Type 2652-M5G      s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(]
 IBM TP R32 / Type 2658-MMG      none (1)
@@ -120,18 +142,24 @@ IBM ThinkPad T42p (2373-GTG)	s3_bios (2)
 IBM TP X20			??? (*)
 IBM TP X30			s3_bios (2)
 IBM TP X31 / Type 2672-XXH      none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-IBM TP X32			none (1), but backlight is on and video is trashed after long suspend
+IBM TP X32			none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results?
 IBM Thinkpad X40 Type 2371-7JG  s3_bios,s3_mode (4)
+IBM TP 600e			none(1), but a switch to console and back to X is needed
 Medion MD4220			??? (*)
 Samsung P35			vbetool needed (6)
-Sharp PC-AR10 (ATI rage)	none (1)
+Sharp PC-AR10 (ATI rage)	none (1), backlight does not switch off
 Sony Vaio PCG-C1VRX/K		s3_bios (2)
 Sony Vaio PCG-F403		??? (*)
+Sony Vaio PCG-GRT995MP		none (1), works with 'nv' X driver
+Sony Vaio PCG-GR7/K		none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
 Sony Vaio PCG-N505SN		??? (*)
 Sony Vaio vgn-s260		X or boot-radeon can init it (5)
+Sony Vaio vgn-S580BH		vga=normal, but suspend from X. Console will be blank unless you return to X.
+Sony Vaio vgn-FS115B		s3_bios (2),s3_mode (4)
 Toshiba Libretto L5		none (1)
-Toshiba Satellite 4030CDT	s3_mode (3)
-Toshiba Satellite 4080XCDT      s3_mode (3)
+Toshiba Portege 3020CT		s3_mode (3)
+Toshiba Satellite 4030CDT	s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4080XCDT      s3_mode (3) (S1 also works OK)
 Toshiba Satellite 4090XCDT      ??? (*)
 Toshiba Satellite P10-554       s3_bios,s3_mode (4)(****)
 Toshiba M30                     (2) xor X with nvidia driver using internal AGP
@@ -151,39 +179,3 @@ Asus A7V8X	    nVidia RIVA TNT2 model 64	  s3_bios,s3_mode (4)
 (***) To be tested with a newer kernel.
 
 (****) Not with SMP kernel, UP only.
-
-VBEtool details
-~~~~~~~~~~~~~~~
-(with thanks to Carl-Daniel Hailfinger)
-
-First, boot into X and run the following script ONCE:
-#!/bin/bash
-statedir=/root/s3/state
-mkdir -p $statedir
-chvt 2
-sleep 1
-vbetool vbestate save >$statedir/vbe
-
-
-To suspend and resume properly, call the following script as root:
-#!/bin/bash
-statedir=/root/s3/state
-curcons=`fgconsole`
-fuser /dev/tty$curcons 2>/dev/null|xargs ps -o comm= -p|grep -q X && chvt 2
-cat /dev/vcsa >$statedir/vcsa
-sync
-echo 3 >/proc/acpi/sleep
-sync
-vbetool post
-vbetool vbestate restore <$statedir/vbe
-cat $statedir/vcsa >/dev/vcsa
-rckbd restart
-chvt $[curcons%6+1]
-chvt $curcons
-
-
-Unless you change your graphics card or other hardware configuration,
-the state once saved will be OK for every resume afterwards.
-NOTE: The "rckbd restart" command may be different for your
-distribution. Simply replace it with the command you would use to
-set the fonts on screen.
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 28de118f7a0b..02a1b3a9fa90 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -83,7 +83,7 @@ int freeze_processes(void)
 		yield();			/* Yield is okay here */
 		if (todo && time_after(jiffies, start_time + TIMEOUT)) {
 			printk( "\n" );
-			printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
+			printk(KERN_ERR " stopping tasks timed out (%d tasks remaining)\n", todo );
 			break;
 		}
 	} while(todo);
-- 
cgit v1.2.3


From 6e1819d615f24ce0726a7d0bd3dd0152d7b21654 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:03 -0800
Subject: [PATCH] swsusp: userland interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a user space interface for swsusp.

The interface is based on a special character device, called the snapshot
device, that allows user space processes to perform suspend and resume-related
operations with the help of some ioctls and the read()/write() functions.
 Additionally it allows these processes to allocate free swap pages from a
selected swap partition, called the resume partition, so that they know which
sectors of the resume partition are available to them.

The interface uses the same low-level system memory snapshot-handling
functions that are used by the built-it swap-writing/reading code of swsusp.

The interface documentation is included in the patch.

The patch assumes that the major and minor numbers of the snapshot device will
be 10 (ie.  misc device) and 231, the registration of which has already been
requested.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/userland-swsusp.txt | 149 ++++++++++++++++
 init/do_mounts_initrd.c                 |   1 +
 kernel/power/Makefile                   |   2 +-
 kernel/power/power.h                    |  14 ++
 kernel/power/snapshot.c                 |   9 +-
 kernel/power/user.c                     | 301 ++++++++++++++++++++++++++++++++
 mm/swapfile.c                           |   6 +-
 7 files changed, 475 insertions(+), 7 deletions(-)
 create mode 100644 Documentation/power/userland-swsusp.txt
 create mode 100644 kernel/power/user.c

(limited to 'kernel')

diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
new file mode 100644
index 000000000000..94058220aaf0
--- /dev/null
+++ b/Documentation/power/userland-swsusp.txt
@@ -0,0 +1,149 @@
+Documentation for userland software suspend interface
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+First, the warnings at the beginning of swsusp.txt still apply.
+
+Second, you should read the FAQ in swsusp.txt _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel.  Such utilities are available, for example, from
+<http://www.sisk.pl/kernel/utilities/suspend>.  You may want to have
+a look at them if you are going to develop your own suspend/resume
+utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in kernel/power/power.h.  The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing.  If open for
+reading, it is considered to be in the suspend mode.  Otherwise it is
+assumed to be in the resume mode.  The device cannot be open for reading
+and writing.  It is also impossible to have the device open more than once
+at a time.
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE - freeze user space processes (the current process is
+	not frozen); this is required for SNAPSHOT_ATOMIC_SNAPSHOT
+	and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_ATOMIC_SNAPSHOT - create a snapshot of the system memory; the
+	last argument of ioctl() should be a pointer to an int variable,
+	the value of which will indicate whether the call returned after
+	creating the snapshot (1) or after restoring the system memory state
+	from it (0) (after resume the system finds itself finishing the
+	SNAPSHOT_ATOMIC_SNAPSHOT ioctl() again); after the snapshot
+	has been created the read() operation can be used to transfer
+	it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
+	uploaded snapshot image; before calling it you should transfer
+	the system memory snapshot back to the kernel using the write()
+	operation; this call will not succeed if the snapshot
+	image is not available to the kernel
+
+SNAPSHOT_FREE - free memory allocated for the snapshot image
+
+SNAPSHOT_SET_IMAGE_SIZE - set the preferred maximum size of the image
+	(the kernel will do its best to ensure the image size will not exceed
+	this number, but if it turns out to be impossible, the kernel will
+	create the smallest image possible)
+
+SNAPSHOT_AVAIL_SWAP - return the amount of available swap in bytes (the last
+	argument should be a pointer to an unsigned int variable that will
+	contain the result if the call is successful).
+
+SNAPSHOT_GET_SWAP_PAGE - allocate a swap page from the resume partition
+	(the last argument should be a pointer to a loff_t variable that
+	will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated with
+	SNAPSHOT_GET_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_FILE - set the resume partition (the last ioctl() argument
+	should specify the device's major and minor numbers in the old
+	two-byte format, as returned by the stat() function in the .st_rdev
+	member of the stat structure); it is recommended to always use this
+	call, because the code to set the resume partition could be removed from
+	future kernels
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel.  It has the following limitations:
+- you cannot read() more than one virtual memory page at a time
+- read()s accross page boundaries are impossible (ie. if ypu read() 1/2 of
+	a page in the previous call, you will only be able to read()
+	_at_ _most_ 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel.  It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_GET_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap parition, called the resume
+partition, as storage space.  However, this is not really required, as they
+can use, for example, a special (blank) suspend partition or a file on a partition
+that is unmounted before SNAPSHOT_ATOMIC_SNAPSHOT and mounted afterwards.
+
+These utilities SHOULD NOT make any assumptions regarding the ordering of
+data within the snapshot image, except for the image header that MAY be
+assumed to start with an swsusp_info structure, as specified in
+kernel/power/power.h.  This structure MAY be used by the userland utilities
+to obtain some information about the snapshot image, such as the size
+of the snapshot image, including the metadata and the header itself,
+contained in the .size member of swsusp_info.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read).  Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header.  If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed.  Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferrably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_ATOMIC_SNAPSHOT
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+1. 	If the value is 1 (ie. the system memory snapshot has just been
+	created and the system is ready for saving it):
+	(a)	The suspending utility MUST NOT close the snapshot device
+		_unless_ the whole suspend procedure is to be cancelled, in
+		which case, if the snapshot image has already been saved, the
+		suspending utility SHOULD destroy it, preferrably by zapping
+		its header.  If the suspend is not to be cancelled, the
+		system MUST be powered off or rebooted after the snapshot
+		image has been saved.
+	(b)	The suspending utility SHOULD NOT attempt to perform any
+		file system operations (including reads) on the file systems
+		that were mounted before SNAPSHOT_ATOMIC_SNAPSHOT has been
+		called.  However, it MAY mount a file system that was not
+		mounted at that time and perform some operations on it (eg.
+		use it for saving the image).
+2.	If the value is 0 (ie. the system state has just been restored from
+	the snapshot image), the suspending utility MUST close the snapshot
+	device.  Afterwards it will be treated as a regular userland process,
+	so it need not exit.
+
+The resuming utility SHOULD NOT attempt to mount any file systems that could
+be mounted before suspend and SHOULD NOT attempt to perform any operations
+involving such file systems.
+
+For details, please refer to the source code.
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index a05cabd0fd10..405f9031af87 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -56,6 +56,7 @@ static void __init handle_initrd(void)
 	sys_chroot(".");
 	mount_devfs_fs ();
 
+	current->flags |= PF_NOFREEZE;
 	pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
 	if (pid > 0) {
 		while (pid != sys_wait4(-1, NULL, 0, NULL))
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index bb91a0615303..8d0af3d37a4b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
 
 obj-y				:= main.o process.o console.o
 obj-$(CONFIG_PM_LEGACY)		+= pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o user.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d1abffbb9ce..42c431c8bdde 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
 	int			cpus;
 	unsigned long		image_pages;
 	unsigned long		pages;
+	unsigned long		size;
 } __attribute__((aligned(PAGE_SIZE)));
 
 
@@ -65,6 +66,19 @@ extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 int snapshot_image_loaded(struct snapshot_handle *handle);
 
+#define SNAPSHOT_IOC_MAGIC	'3'
+#define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
+#define SNAPSHOT_UNFREEZE		_IO(SNAPSHOT_IOC_MAGIC, 2)
+#define SNAPSHOT_ATOMIC_SNAPSHOT	_IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
+#define SNAPSHOT_ATOMIC_RESTORE		_IO(SNAPSHOT_IOC_MAGIC, 4)
+#define SNAPSHOT_FREE			_IO(SNAPSHOT_IOC_MAGIC, 5)
+#define SNAPSHOT_SET_IMAGE_SIZE		_IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
+#define SNAPSHOT_AVAIL_SWAP		_IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
+#define SNAPSHOT_GET_SWAP_PAGE		_IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
+#define SNAPSHOT_FREE_SWAP_PAGES	_IO(SNAPSHOT_IOC_MAGIC, 9)
+#define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
+#define SNAPSHOT_IOC_MAXNR	10
+
 /**
  *	The bitmap is used for tracing allocated swap pages
  *
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cc349437fb72..0036955357e0 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -37,6 +37,7 @@
 struct pbe *pagedir_nosave;
 static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
+static unsigned long *buffer;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -389,7 +390,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 		free_pagedir(pblist);
 		pblist = NULL;
         } else
-        	create_pbe_list(pblist, nr_pages);
+		create_pbe_list(pblist, nr_pages);
 	return pblist;
 }
 
@@ -418,6 +419,7 @@ void swsusp_free(void)
 	nr_copy_pages = 0;
 	nr_meta_pages = 0;
 	pagedir_nosave = NULL;
+	buffer = NULL;
 }
 
 
@@ -523,6 +525,8 @@ static void init_header(struct swsusp_info *info)
 	info->cpus = num_online_cpus();
 	info->image_pages = nr_copy_pages;
 	info->pages = nr_copy_pages + nr_meta_pages + 1;
+	info->size = info->pages;
+	info->size <<= PAGE_SHIFT;
 }
 
 /**
@@ -568,8 +572,6 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
 
 int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 {
-	static unsigned long *buffer;
-
 	if (handle->page > nr_meta_pages + nr_copy_pages)
 		return 0;
 	if (!buffer) {
@@ -774,7 +776,6 @@ static int create_image(struct snapshot_handle *handle)
 
 int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 {
-	static unsigned long *buffer;
 	int error = 0;
 
 	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 000000000000..8cabc405ca10
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,301 @@
+/*
+ * linux/kernel/power/user.c
+ *
+ * This file provides the user space interface for software suspend/resume.
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+
+#include "power.h"
+
+#define SNAPSHOT_MINOR	231
+
+static struct snapshot_data {
+	struct snapshot_handle handle;
+	int swap;
+	struct bitmap_page *bitmap;
+	int mode;
+	char frozen;
+	char ready;
+} snapshot_state;
+
+static atomic_t device_available = ATOMIC_INIT(1);
+
+static int snapshot_open(struct inode *inode, struct file *filp)
+{
+	struct snapshot_data *data;
+
+	if (!atomic_add_unless(&device_available, -1, 0))
+		return -EBUSY;
+
+	if ((filp->f_flags & O_ACCMODE) == O_RDWR)
+		return -ENOSYS;
+
+	nonseekable_open(inode, filp);
+	data = &snapshot_state;
+	filp->private_data = data;
+	memset(&data->handle, 0, sizeof(struct snapshot_handle));
+	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+		data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
+		data->mode = O_RDONLY;
+	} else {
+		data->swap = -1;
+		data->mode = O_WRONLY;
+	}
+	data->bitmap = NULL;
+	data->frozen = 0;
+	data->ready = 0;
+
+	return 0;
+}
+
+static int snapshot_release(struct inode *inode, struct file *filp)
+{
+	struct snapshot_data *data;
+
+	swsusp_free();
+	data = filp->private_data;
+	free_all_swap_pages(data->swap, data->bitmap);
+	free_bitmap(data->bitmap);
+	if (data->frozen) {
+		down(&pm_sem);
+		thaw_processes();
+		enable_nonboot_cpus();
+		up(&pm_sem);
+	}
+	atomic_inc(&device_available);
+	return 0;
+}
+
+static ssize_t snapshot_read(struct file *filp, char __user *buf,
+                             size_t count, loff_t *offp)
+{
+	struct snapshot_data *data;
+	ssize_t res;
+
+	data = filp->private_data;
+	res = snapshot_read_next(&data->handle, count);
+	if (res > 0) {
+		if (copy_to_user(buf, data_of(data->handle), res))
+			res = -EFAULT;
+		else
+			*offp = data->handle.offset;
+	}
+	return res;
+}
+
+static ssize_t snapshot_write(struct file *filp, const char __user *buf,
+                              size_t count, loff_t *offp)
+{
+	struct snapshot_data *data;
+	ssize_t res;
+
+	data = filp->private_data;
+	res = snapshot_write_next(&data->handle, count);
+	if (res > 0) {
+		if (copy_from_user(data_of(data->handle), buf, res))
+			res = -EFAULT;
+		else
+			*offp = data->handle.offset;
+	}
+	return res;
+}
+
+static int snapshot_ioctl(struct inode *inode, struct file *filp,
+                          unsigned int cmd, unsigned long arg)
+{
+	int error = 0;
+	struct snapshot_data *data;
+	loff_t offset, avail;
+
+	if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
+		return -ENOTTY;
+	if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
+		return -ENOTTY;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	data = filp->private_data;
+
+	switch (cmd) {
+
+	case SNAPSHOT_FREEZE:
+		if (data->frozen)
+			break;
+		sys_sync();
+		down(&pm_sem);
+		pm_prepare_console();
+		disable_nonboot_cpus();
+		if (freeze_processes()) {
+			thaw_processes();
+			enable_nonboot_cpus();
+			pm_restore_console();
+			error = -EBUSY;
+		}
+		up(&pm_sem);
+		if (!error)
+			data->frozen = 1;
+		break;
+
+	case SNAPSHOT_UNFREEZE:
+		if (!data->frozen)
+			break;
+		down(&pm_sem);
+		thaw_processes();
+		enable_nonboot_cpus();
+		pm_restore_console();
+		up(&pm_sem);
+		data->frozen = 0;
+		break;
+
+	case SNAPSHOT_ATOMIC_SNAPSHOT:
+		if (data->mode != O_RDONLY || !data->frozen  || data->ready) {
+			error = -EPERM;
+			break;
+		}
+		down(&pm_sem);
+		/* Free memory before shutting down devices. */
+		error = swsusp_shrink_memory();
+		if (!error) {
+			error = device_suspend(PMSG_FREEZE);
+			if (!error) {
+				in_suspend = 1;
+				error = swsusp_suspend();
+				device_resume();
+			}
+		}
+		up(&pm_sem);
+		if (!error)
+			error = put_user(in_suspend, (unsigned int __user *)arg);
+		if (!error)
+			data->ready = 1;
+		break;
+
+	case SNAPSHOT_ATOMIC_RESTORE:
+		if (data->mode != O_WRONLY || !data->frozen ||
+		    !snapshot_image_loaded(&data->handle)) {
+			error = -EPERM;
+			break;
+		}
+		down(&pm_sem);
+		pm_prepare_console();
+		error = device_suspend(PMSG_FREEZE);
+		if (!error) {
+			error = swsusp_resume();
+			device_resume();
+		}
+		pm_restore_console();
+		up(&pm_sem);
+		break;
+
+	case SNAPSHOT_FREE:
+		swsusp_free();
+		memset(&data->handle, 0, sizeof(struct snapshot_handle));
+		data->ready = 0;
+		break;
+
+	case SNAPSHOT_SET_IMAGE_SIZE:
+		image_size = arg;
+		break;
+
+	case SNAPSHOT_AVAIL_SWAP:
+		avail = count_swap_pages(data->swap, 1);
+		avail <<= PAGE_SHIFT;
+		error = put_user(avail, (loff_t __user *)arg);
+		break;
+
+	case SNAPSHOT_GET_SWAP_PAGE:
+		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+			error = -ENODEV;
+			break;
+		}
+		if (!data->bitmap) {
+			data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
+			if (!data->bitmap) {
+				error = -ENOMEM;
+				break;
+			}
+		}
+		offset = alloc_swap_page(data->swap, data->bitmap);
+		if (offset) {
+			offset <<= PAGE_SHIFT;
+			error = put_user(offset, (loff_t __user *)arg);
+		} else {
+			error = -ENOSPC;
+		}
+		break;
+
+	case SNAPSHOT_FREE_SWAP_PAGES:
+		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+			error = -ENODEV;
+			break;
+		}
+		free_all_swap_pages(data->swap, data->bitmap);
+		free_bitmap(data->bitmap);
+		data->bitmap = NULL;
+		break;
+
+	case SNAPSHOT_SET_SWAP_FILE:
+		if (!data->bitmap) {
+			/*
+			 * User space encodes device types as two-byte values,
+			 * so we need to recode them
+			 */
+			if (old_decode_dev(arg)) {
+				data->swap = swap_type_of(old_decode_dev(arg));
+				if (data->swap < 0)
+					error = -ENODEV;
+			} else {
+				data->swap = -1;
+				error = -EINVAL;
+			}
+		} else {
+			error = -EPERM;
+		}
+		break;
+
+	default:
+		error = -ENOTTY;
+
+	}
+
+	return error;
+}
+
+static struct file_operations snapshot_fops = {
+	.open = snapshot_open,
+	.release = snapshot_release,
+	.read = snapshot_read,
+	.write = snapshot_write,
+	.llseek = no_llseek,
+	.ioctl = snapshot_ioctl,
+};
+
+static struct miscdevice snapshot_device = {
+	.minor = SNAPSHOT_MINOR,
+	.name = "snapshot",
+	.fops = &snapshot_fops,
+};
+
+static int __init snapshot_device_init(void)
+{
+	return misc_register(&snapshot_device);
+};
+
+device_initcall(snapshot_device_init);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4d11f9d84666..39aa9d129612 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -428,14 +428,16 @@ int swap_type_of(dev_t device)
 {
 	int i;
 
-	if (!device)
-		return -EINVAL;
 	spin_lock(&swap_lock);
 	for (i = 0; i < nr_swapfiles; i++) {
 		struct inode *inode;
 
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
+		if (!device) {
+			spin_unlock(&swap_lock);
+			return i;
+		}
 		inode = swap_info->swap_file->f_dentry->d_inode;
 		if (S_ISBLK(inode->i_mode) &&
 		    device == MKDEV(imajor(inode), iminor(inode))) {
-- 
cgit v1.2.3


From 02aaeb9b952f30b1ad6284d5d45be02030f679db Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:04 -0800
Subject: [PATCH] swsusp: freeze user space processes first

Allow swsusp to freeze processes successfully under heavy load by freezing
userspace processes before kernel threads.

[Thanks to Nigel Cunningham <nigel@suspend2.net> for suggesting the
way to go.]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c    |  1 -
 kernel/power/process.c | 61 +++++++++++++++++++++++++++++++++++++-------------
 kernel/power/user.c    |  1 -
 3 files changed, 46 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4bd68f482f2b..81d4d982f3f0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -72,7 +72,6 @@ static int prepare_processes(void)
 	int error;
 
 	pm_prepare_console();
-	sys_sync();
 	disable_nonboot_cpus();
 
 	if (freeze_processes()) {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 02a1b3a9fa90..8ac7c35fad77 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,11 +12,12 @@
 #include <linux/interrupt.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
+#include <linux/syscalls.h>
 
 /* 
  * Timeout for stopping processes
  */
-#define TIMEOUT	(6 * HZ)
+#define TIMEOUT	(20 * HZ)
 
 
 static inline int freezeable(struct task_struct * p)
@@ -54,38 +55,62 @@ void refrigerator(void)
 	current->state = save;
 }
 
+static inline void freeze_process(struct task_struct *p)
+{
+	unsigned long flags;
+
+	if (!freezing(p)) {
+		freeze(p);
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		signal_wake_up(p, 0);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+}
+
 /* 0 = success, else # of processes that we failed to stop */
 int freeze_processes(void)
 {
-	int todo;
+	int todo, nr_user, user_frozen;
 	unsigned long start_time;
 	struct task_struct *g, *p;
 	unsigned long flags;
 
 	printk( "Stopping tasks: " );
 	start_time = jiffies;
+	user_frozen = 0;
 	do {
-		todo = 0;
+		nr_user = todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
 			if (!freezeable(p))
 				continue;
 			if (frozen(p))
 				continue;
-
-			freeze(p);
-			spin_lock_irqsave(&p->sighand->siglock, flags);
-			signal_wake_up(p, 0);
-			spin_unlock_irqrestore(&p->sighand->siglock, flags);
-			todo++;
+			if (p->mm && !(p->flags & PF_BORROWED_MM)) {
+				/* The task is a user-space one.
+				 * Freeze it unless there's a vfork completion
+				 * pending
+				 */
+				if (!p->vfork_done)
+					freeze_process(p);
+				nr_user++;
+			} else {
+				/* Freeze only if the user space is frozen */
+				if (user_frozen)
+					freeze_process(p);
+				todo++;
+			}
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
+		todo += nr_user;
+		if (!user_frozen && !nr_user) {
+			sys_sync();
+			start_time = jiffies;
+		}
+		user_frozen = !nr_user;
 		yield();			/* Yield is okay here */
-		if (todo && time_after(jiffies, start_time + TIMEOUT)) {
-			printk( "\n" );
-			printk(KERN_ERR " stopping tasks timed out (%d tasks remaining)\n", todo );
+		if (todo && time_after(jiffies, start_time + TIMEOUT))
 			break;
-		}
 	} while(todo);
 
 	/* This does not unfreeze processes that are already frozen
@@ -94,8 +119,14 @@ int freeze_processes(void)
 	 * but it cleans up leftover PF_FREEZE requests.
 	 */
 	if (todo) {
+		printk( "\n" );
+		printk(KERN_ERR " stopping tasks timed out "
+			"after %d seconds (%d tasks remaining):\n",
+			TIMEOUT / HZ, todo);
 		read_lock(&tasklist_lock);
-		do_each_thread(g, p)
+		do_each_thread(g, p) {
+			if (freezeable(p) && !frozen(p))
+				printk(KERN_ERR "  %s\n", p->comm);
 			if (freezing(p)) {
 				pr_debug("  clean up: %s\n", p->comm);
 				p->flags &= ~PF_FREEZE;
@@ -103,7 +134,7 @@ int freeze_processes(void)
 				recalc_sigpending_tsk(p);
 				spin_unlock_irqrestore(&p->sighand->siglock, flags);
 			}
-		while_each_thread(g, p);
+		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 		return todo;
 	}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 8cabc405ca10..a97406b86ef3 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -138,7 +138,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 	case SNAPSHOT_FREEZE:
 		if (data->frozen)
 			break;
-		sys_sync();
 		down(&pm_sem);
 		pm_prepare_console();
 		disable_nonboot_cpus();
-- 
cgit v1.2.3


From ce6ed29f3136bc4b3644ecf4091d6390d444f628 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Thu, 23 Mar 2006 03:00:05 -0800
Subject: [PATCH] suspend: make progress printing prettier

Combination of printk/pr_debug led to <7> in the middle of the line, and we
printed way too many dots.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0036955357e0..1b46c2da5a50 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -83,7 +83,7 @@ static int save_highmem_zone(struct zone *zone)
 		void *kaddr;
 		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
 
-		if (!(pfn%1000))
+		if (!(pfn%10000))
 			printk(".");
 		if (!pfn_valid(pfn))
 			continue;
@@ -122,13 +122,14 @@ int save_highmem(void)
 	struct zone *zone;
 	int res = 0;
 
-	pr_debug("swsusp: Saving Highmem\n");
+	pr_debug("swsusp: Saving Highmem");
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			res = save_highmem_zone(zone);
 		if (res)
 			return res;
 	}
+	printk("\n");
 	return 0;
 }
 
-- 
cgit v1.2.3


From fc558a7496bfab3d29a68953b07a95883fdcfbb1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:05 -0800
Subject: [PATCH] swsusp: finally solve mysqld problem

This patch from Pavel moves userland freeze signals handling into more logical
place.  It now hits even with mysqld running.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/signal.c   | 4 ----
 arch/x86_64/kernel/signal.c | 4 ----
 kernel/signal.c             | 3 ++-
 3 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 608eac4a7324..5c352c3a9e7f 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -583,9 +583,6 @@ static void fastcall do_signal(struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (try_to_freeze())
-		goto no_signal;
-
 	if (test_thread_flag(TIF_RESTORE_SIGMASK))
 		oldset = &current->saved_sigmask;
 	else
@@ -614,7 +611,6 @@ static void fastcall do_signal(struct pt_regs *regs)
 		return;
 	}
 
-no_signal:
 	/* Did we come from a system call? */
 	if (regs->orig_eax >= 0) {
 		/* Restart the system call - no handlers present */
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 5876df116c92..e5f5ce7909a3 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -443,9 +443,6 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (try_to_freeze())
-		goto no_signal;
-
 	if (!oldset)
 		oldset = &current->blocked;
 
@@ -463,7 +460,6 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 		return handle_signal(signr, &info, &ka, oldset, regs);
 	}
 
- no_signal:
 	/* Did we come from a system call? */
 	if ((long)regs->orig_rax >= 0) {
 		/* Restart the system call - no handlers present */
diff --git a/kernel/signal.c b/kernel/signal.c
index ea154104a00b..dfb09ba5c013 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1922,6 +1922,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 	sigset_t *mask = &current->blocked;
 	int signr = 0;
 
+	try_to_freeze();
+
 relock:
 	spin_lock_irq(&current->sighand->siglock);
 	for (;;) {
@@ -2307,7 +2309,6 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 
 			timeout = schedule_timeout_interruptible(timeout);
 
-			try_to_freeze();
 			spin_lock_irq(&current->sighand->siglock);
 			sig = dequeue_signal(current, &these, &info);
 			current->blocked = current->real_blocked;
-- 
cgit v1.2.3


From e4e4d665560c75afb6060cb43bb6738777648ca1 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 23 Mar 2006 03:00:06 -0800
Subject: [PATCH] swsusp: drain high mem pages

Highmem could be in pcp list as well.

Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1b46c2da5a50..c5863d02c89e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -123,6 +123,7 @@ int save_highmem(void)
 	int res = 0;
 
 	pr_debug("swsusp: Saving Highmem");
+	drain_local_pages();
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			res = save_highmem_zone(zone);
-- 
cgit v1.2.3


From 94c188d32996beac00426740974310e32f162c14 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:08 -0800
Subject: [PATCH] swsusp: let userland tools switch console on suspend

Remove the console-switching code from the suspend part of the swsusp userland
interface and let the userland tools switch the console.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/user.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index a97406b86ef3..bbd4842104aa 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -139,12 +139,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		if (data->frozen)
 			break;
 		down(&pm_sem);
-		pm_prepare_console();
 		disable_nonboot_cpus();
 		if (freeze_processes()) {
 			thaw_processes();
 			enable_nonboot_cpus();
-			pm_restore_console();
 			error = -EBUSY;
 		}
 		up(&pm_sem);
@@ -158,7 +156,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		down(&pm_sem);
 		thaw_processes();
 		enable_nonboot_cpus();
-		pm_restore_console();
 		up(&pm_sem);
 		data->frozen = 0;
 		break;
-- 
cgit v1.2.3


From 9b238205ba5d79a8a242d7a5ddb82b89e4dc4e48 Mon Sep 17 00:00:00 2001
From: Luca Tettamanti <kronos.it@gmail.com>
Date: Thu, 23 Mar 2006 03:00:09 -0800
Subject: [PATCH] swsusp: add s2ram ioctl to userland interface

Add the SNAPSHOT_S2RAM ioctl to the snapshot device.

This ioctl allows a userland application to make the system (previously frozen
with the SNAPSHOT_FREE ioctl) enter the S3 state without freezing processes
and disabling nonboot CPUs for the second time.

This will allow us to implement the suspend-to-disk-and-RAM (STDR)
functionality in the userland suspend tools.

Signed-off-by: Luca Tettamanti <kronos.it@gmail.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/main.c  |  2 +-
 kernel/power/power.h |  4 +++-
 kernel/power/user.c  | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9cb235cba4a9..ee371f50ccaa 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -103,7 +103,7 @@ static int suspend_prepare(suspend_state_t state)
 }
 
 
-static int suspend_enter(suspend_state_t state)
+int suspend_enter(suspend_state_t state)
 {
 	int error = 0;
 	unsigned long flags;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 42c431c8bdde..f06f12f21767 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -77,7 +77,8 @@ int snapshot_image_loaded(struct snapshot_handle *handle);
 #define SNAPSHOT_GET_SWAP_PAGE		_IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
 #define SNAPSHOT_FREE_SWAP_PAGES	_IO(SNAPSHOT_IOC_MAGIC, 9)
 #define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
-#define SNAPSHOT_IOC_MAXNR	10
+#define SNAPSHOT_S2RAM			_IO(SNAPSHOT_IOC_MAGIC, 11)
+#define SNAPSHOT_IOC_MAXNR	11
 
 /**
  *	The bitmap is used for tracing allocated swap pages
@@ -112,3 +113,4 @@ extern int swsusp_resume(void);
 extern int swsusp_read(void);
 extern int swsusp_write(void);
 extern void swsusp_close(void);
+extern int suspend_enter(suspend_state_t state);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bbd4842104aa..3f1539fbe48a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -266,6 +266,42 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		}
 		break;
 
+	case SNAPSHOT_S2RAM:
+		if (!data->frozen) {
+			error = -EPERM;
+			break;
+		}
+
+		if (down_trylock(&pm_sem)) {
+			error = -EBUSY;
+			break;
+		}
+
+		if (pm_ops->prepare) {
+			error = pm_ops->prepare(PM_SUSPEND_MEM);
+			if (error)
+				goto OutS3;
+		}
+
+		/* Put devices to sleep */
+		error = device_suspend(PMSG_SUSPEND);
+		if (error) {
+			printk(KERN_ERR "Failed to suspend some devices.\n");
+		} else {
+			/* Enter S3, system is already frozen */
+			suspend_enter(PM_SUSPEND_MEM);
+
+			/* Wake up devices */
+			device_resume();
+		}
+
+		if (pm_ops->finish)
+			pm_ops->finish(PM_SUSPEND_MEM);
+
+OutS3:
+		up(&pm_sem);
+		break;
+
 	default:
 		error = -ENOTTY;
 
-- 
cgit v1.2.3


From 0c9e63fd38a2fb2181668a0cdd622a3c23cfd567 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 23 Mar 2006 03:00:12 -0800
Subject: [PATCH] Shrinks sizeof(files_struct) and better layout

1) Reduce the size of (struct fdtable) to exactly 64 bytes on 32bits
   platforms, lowering kmalloc() allocated space by 50%.

2) Reduce the size of (files_struct), using a special 32 bits (or
   64bits) embedded_fd_set, instead of a 1024 bits fd_set for the
   close_on_exec_init and open_fds_init fields.  This save some ram (248
   bytes per task) as most tasks dont open more than 32 files.  D-Cache
   footprint for such tasks is also reduced to the minimum.

3) Reduce size of allocated fdset.  Currently two full pages are
   allocated, that is 32768 bits on x86 for example, and way too much.  The
   minimum is now L1_CACHE_BYTES.

UP and SMP should benefit from this patch, because most tasks will touch
only one cache line when open()/close() stdin/stdout/stderr (0/1/2),
(next_fd, close_on_exec_init, open_fds_init, fd_array[0 ..  2] being in the
same cache line)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fcntl.c                |  9 ++++-----
 fs/file.c                 | 34 ++++++++++++++--------------------
 fs/open.c                 |  8 ++++----
 include/linux/file.h      | 28 ++++++++++++++++++++++++----
 include/linux/init_task.h | 10 +++++-----
 kernel/fork.c             |  8 ++++----
 6 files changed, 55 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index dc4a7007f4e7..03c789560fb8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -73,8 +73,8 @@ repeat:
 	 * orig_start..fdt->next_fd
 	 */
 	start = orig_start;
-	if (start < fdt->next_fd)
-		start = fdt->next_fd;
+	if (start < files->next_fd)
+		start = files->next_fd;
 
 	newfd = start;
 	if (start < fdt->max_fdset) {
@@ -102,9 +102,8 @@ repeat:
 	 * we reacquire the fdtable pointer and use it while holding
 	 * the lock, no one can free it during that time.
 	 */
-	fdt = files_fdtable(files);
-	if (start <= fdt->next_fd)
-		fdt->next_fd = newfd + 1;
+	if (start <= files->next_fd)
+		files->next_fd = newfd + 1;
 
 	error = newfd;
 	
diff --git a/fs/file.c b/fs/file.c
index cea7cbea11d0..bbc743314730 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -125,7 +125,8 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 		kmem_cache_free(files_cachep, fdt->free_files);
 		return;
 	}
-	if (fdt->max_fdset <= __FD_SETSIZE && fdt->max_fds <= NR_OPEN_DEFAULT) {
+	if (fdt->max_fdset <= EMBEDDED_FD_SET_SIZE &&
+		fdt->max_fds <= NR_OPEN_DEFAULT) {
 		/*
 		 * The fdtable was embedded
 		 */
@@ -155,8 +156,9 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 
 void free_fdtable(struct fdtable *fdt)
 {
-	if (fdt->free_files || fdt->max_fdset > __FD_SETSIZE ||
-					fdt->max_fds > NR_OPEN_DEFAULT)
+	if (fdt->free_files ||
+		fdt->max_fdset > EMBEDDED_FD_SET_SIZE ||
+		fdt->max_fds > NR_OPEN_DEFAULT)
 		call_rcu(&fdt->rcu, free_fdtable_rcu);
 }
 
@@ -199,7 +201,6 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
 		       (nfdt->max_fds - fdt->max_fds) *
 					sizeof(struct file *));
 	}
-	nfdt->next_fd = fdt->next_fd;
 }
 
 /*
@@ -220,11 +221,9 @@ fd_set * alloc_fdset(int num)
 
 void free_fdset(fd_set *array, int num)
 {
-	int size = num / 8;
-
-	if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */
+	if (num <= EMBEDDED_FD_SET_SIZE) /* Don't free an embedded fdset */
 		return;
-	else if (size <= PAGE_SIZE)
+	else if (num <= 8 * PAGE_SIZE)
 		kfree(array);
 	else
 		vfree(array);
@@ -237,22 +236,17 @@ static struct fdtable *alloc_fdtable(int nr)
   	fd_set *new_openset = NULL, *new_execset = NULL;
 	struct file **new_fds;
 
-	fdt = kmalloc(sizeof(*fdt), GFP_KERNEL);
+	fdt = kzalloc(sizeof(*fdt), GFP_KERNEL);
 	if (!fdt)
   		goto out;
-	memset(fdt, 0, sizeof(*fdt));
 
-	nfds = __FD_SETSIZE;
+	nfds = 8 * L1_CACHE_BYTES;
   	/* Expand to the max in easy steps */
-  	do {
-		if (nfds < (PAGE_SIZE * 8))
-			nfds = PAGE_SIZE * 8;
-		else {
-			nfds = nfds * 2;
-			if (nfds > NR_OPEN)
-				nfds = NR_OPEN;
-		}
-	} while (nfds <= nr);
+  	while (nfds <= nr) {
+		nfds = nfds * 2;
+		if (nfds > NR_OPEN)
+			nfds = NR_OPEN;
+	}
 
   	new_openset = alloc_fdset(nfds);
   	new_execset = alloc_fdset(nfds);
diff --git a/fs/open.c b/fs/open.c
index 70e0230d8e77..1091dadd6c38 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -973,7 +973,7 @@ repeat:
 	fdt = files_fdtable(files);
  	fd = find_next_zero_bit(fdt->open_fds->fds_bits,
 				fdt->max_fdset,
-				fdt->next_fd);
+				files->next_fd);
 
 	/*
 	 * N.B. For clone tasks sharing a files structure, this test
@@ -998,7 +998,7 @@ repeat:
 
 	FD_SET(fd, fdt->open_fds);
 	FD_CLR(fd, fdt->close_on_exec);
-	fdt->next_fd = fd + 1;
+	files->next_fd = fd + 1;
 #if 1
 	/* Sanity check */
 	if (fdt->fd[fd] != NULL) {
@@ -1019,8 +1019,8 @@ static void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
 	struct fdtable *fdt = files_fdtable(files);
 	__FD_CLR(fd, fdt->open_fds);
-	if (fd < fdt->next_fd)
-		fdt->next_fd = fd;
+	if (fd < files->next_fd)
+		files->next_fd = fd;
 }
 
 void fastcall put_unused_fd(unsigned int fd)
diff --git a/include/linux/file.h b/include/linux/file.h
index 9901b850f2e4..9f7c2513866f 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -10,6 +10,7 @@
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/types.h>
 
 /*
  * The default fd array needs to be at least BITS_PER_LONG,
@@ -17,10 +18,22 @@
  */
 #define NR_OPEN_DEFAULT BITS_PER_LONG
 
+/*
+ * The embedded_fd_set is a small fd_set,
+ * suitable for most tasks (which open <= BITS_PER_LONG files)
+ */
+struct embedded_fd_set {
+	unsigned long fds_bits[1];
+};
+
+/*
+ * More than this number of fds: we use a separately allocated fd_set
+ */
+#define EMBEDDED_FD_SET_SIZE (BITS_PER_BYTE * sizeof(struct embedded_fd_set))
+
 struct fdtable {
 	unsigned int max_fds;
 	int max_fdset;
-	int next_fd;
 	struct file ** fd;      /* current fd array */
 	fd_set *close_on_exec;
 	fd_set *open_fds;
@@ -33,13 +46,20 @@ struct fdtable {
  * Open file table structure
  */
 struct files_struct {
+  /*
+   * read mostly part
+   */
 	atomic_t count;
 	struct fdtable *fdt;
 	struct fdtable fdtab;
-	fd_set close_on_exec_init;
-	fd_set open_fds_init;
+  /*
+   * written part on a separate cache line in SMP
+   */
+	spinlock_t file_lock ____cacheline_aligned_in_smp;
+	int next_fd;
+	struct embedded_fd_set close_on_exec_init;
+	struct embedded_fd_set open_fds_init;
 	struct file * fd_array[NR_OPEN_DEFAULT];
-	spinlock_t file_lock;     /* Protects concurrent writers.  Nests inside tsk->alloc_lock */
 };
 
 #define files_fdtable(files) (rcu_dereference((files)->fdt))
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index dcfd2ecccb5d..92146f3b7423 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -7,11 +7,10 @@
 #define INIT_FDTABLE \
 {							\
 	.max_fds	= NR_OPEN_DEFAULT, 		\
-	.max_fdset	= __FD_SETSIZE, 		\
-	.next_fd	= 0, 				\
+	.max_fdset	= EMBEDDED_FD_SET_SIZE,		\
 	.fd		= &init_files.fd_array[0], 	\
-	.close_on_exec	= &init_files.close_on_exec_init, \
-	.open_fds	= &init_files.open_fds_init, 	\
+	.close_on_exec	= (fd_set *)&init_files.close_on_exec_init, \
+	.open_fds	= (fd_set *)&init_files.open_fds_init, 	\
 	.rcu		= RCU_HEAD_INIT, 		\
 	.free_files	= NULL,		 		\
 	.next		= NULL,		 		\
@@ -20,9 +19,10 @@
 #define INIT_FILES \
 { 							\
 	.count		= ATOMIC_INIT(1), 		\
-	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
 	.fdt		= &init_files.fdtab, 		\
 	.fdtab		= INIT_FDTABLE,			\
+	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
+	.next_fd	= 0, 				\
 	.close_on_exec_init = { { 0, } }, 		\
 	.open_fds_init	= { { 0, } }, 			\
 	.fd_array	= { NULL, } 			\
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd7b65ee418..c79ae0b19a49 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -607,12 +607,12 @@ static struct files_struct *alloc_files(void)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->next_fd = 0;
 	fdt = &newf->fdtab;
-	fdt->next_fd = 0;
 	fdt->max_fds = NR_OPEN_DEFAULT;
-	fdt->max_fdset = __FD_SETSIZE;
-	fdt->close_on_exec = &newf->close_on_exec_init;
-	fdt->open_fds = &newf->open_fds_init;
+	fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
+	fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+	fdt->open_fds = (fd_set *)&newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
 	INIT_RCU_HEAD(&fdt->rcu);
 	fdt->free_files = NULL;
-- 
cgit v1.2.3


From 2dd0ebcd2ab7b18a50c0810ddb45a84316e4ee2e Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Thu, 23 Mar 2006 03:00:13 -0800
Subject: [PATCH] Avoid taking global tasklist_lock for single threadedprocess
 at getrusage()

Avoid taking the global tasklist_lock when possible, if a process is single
threaded during getrusage().  Any avoidance of tasklist_lock is good for
NUMA boxes (and possibly for large SMPs).  Thanks to Oleg Nesterov for
review and suggestions.

Signed-off-by: Nippun Goel <nippung@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index f91218a5463e..4941b9b14b97 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1677,9 +1677,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
  * a lot simpler!  (Which we're not doing right now because we're not
  * measuring them yet).
  *
- * This expects to be called with tasklist_lock read-locked or better,
- * and the siglock not locked.  It may momentarily take the siglock.
- *
  * When sampling multiple threads for RUSAGE_SELF, under SMP we might have
  * races with threads incrementing their own counters.  But since word
  * reads are atomic, we either get new values or old values and we don't
@@ -1687,6 +1684,25 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
  * the c* fields from p->signal from races with exit.c updating those
  * fields when reaping, so a sample either gets all the additions of a
  * given child after it's reaped, or none so this sample is before reaping.
+ *
+ * tasklist_lock locking optimisation:
+ * If we are current and single threaded, we do not need to take the tasklist
+ * lock or the siglock.  No one else can take our signal_struct away,
+ * no one else can reap the children to update signal->c* counters, and
+ * no one else can race with the signal-> fields.
+ * If we do not take the tasklist_lock, the signal-> fields could be read
+ * out of order while another thread was just exiting. So we place a
+ * read memory barrier when we avoid the lock.  On the writer side,
+ * write memory barrier is implied in  __exit_signal as __exit_signal releases
+ * the siglock spinlock after updating the signal-> fields.
+ *
+ * We don't really need the siglock when we access the non c* fields
+ * of the signal_struct (for RUSAGE_SELF) even in multithreaded
+ * case, since we take the tasklist lock for read and the non c* signal->
+ * fields are updated only in __exit_signal, which is called with
+ * tasklist_lock taken for write, hence these two threads cannot execute
+ * concurrently.
+ *
  */
 
 static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
@@ -1694,13 +1710,23 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	struct task_struct *t;
 	unsigned long flags;
 	cputime_t utime, stime;
+	int need_lock = 0;
 
 	memset((char *) r, 0, sizeof *r);
+	utime = stime = cputime_zero;
 
-	if (unlikely(!p->signal))
-		return;
+	if (p != current || !thread_group_empty(p))
+		need_lock = 1;
 
-	utime = stime = cputime_zero;
+	if (need_lock) {
+		read_lock(&tasklist_lock);
+		if (unlikely(!p->signal)) {
+			read_unlock(&tasklist_lock);
+			return;
+		}
+	} else
+		/* See locking comments above */
+		smp_rmb();
 
 	switch (who) {
 		case RUSAGE_BOTH:
@@ -1740,6 +1766,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			BUG();
 	}
 
+	if (need_lock)
+		read_unlock(&tasklist_lock);
 	cputime_to_timeval(utime, &r->ru_utime);
 	cputime_to_timeval(stime, &r->ru_stime);
 }
@@ -1747,9 +1775,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 {
 	struct rusage r;
-	read_lock(&tasklist_lock);
 	k_getrusage(p, who, &r);
-	read_unlock(&tasklist_lock);
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
 }
 
-- 
cgit v1.2.3


From 3d3f26a7baaa921a0e790b4c72d20f0de91a5d65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:18 -0800
Subject: [PATCH] kernel/cpuset.c, mutex conversion

convert cpuset.c's callback_sem and manage_sem to mutexes.
Build and boot tested by Ingo.
Build, boot, unit and stress tested by pj.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 212 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 103 insertions(+), 109 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12815d3f1a05..c86ee051b734 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -53,7 +53,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #define CPUSET_SUPER_MAGIC		0x27e0eb
 
@@ -168,63 +168,57 @@ static struct vfsmount *cpuset_mount;
 static struct super_block *cpuset_sb;
 
 /*
- * We have two global cpuset semaphores below.  They can nest.
- * It is ok to first take manage_sem, then nest callback_sem.  We also
+ * We have two global cpuset mutexes below.  They can nest.
+ * It is ok to first take manage_mutex, then nest callback_mutex.  We also
  * require taking task_lock() when dereferencing a tasks cpuset pointer.
  * See "The task_lock() exception", at the end of this comment.
  *
- * A task must hold both semaphores to modify cpusets.  If a task
- * holds manage_sem, then it blocks others wanting that semaphore,
- * ensuring that it is the only task able to also acquire callback_sem
+ * A task must hold both mutexes to modify cpusets.  If a task
+ * holds manage_mutex, then it blocks others wanting that mutex,
+ * ensuring that it is the only task able to also acquire callback_mutex
  * and be able to modify cpusets.  It can perform various checks on
  * the cpuset structure first, knowing nothing will change.  It can
- * also allocate memory while just holding manage_sem.  While it is
+ * also allocate memory while just holding manage_mutex.  While it is
  * performing these checks, various callback routines can briefly
- * acquire callback_sem to query cpusets.  Once it is ready to make
- * the changes, it takes callback_sem, blocking everyone else.
+ * acquire callback_mutex to query cpusets.  Once it is ready to make
+ * the changes, it takes callback_mutex, blocking everyone else.
  *
  * Calls to the kernel memory allocator can not be made while holding
- * callback_sem, as that would risk double tripping on callback_sem
+ * callback_mutex, as that would risk double tripping on callback_mutex
  * from one of the callbacks into the cpuset code from within
  * __alloc_pages().
  *
- * If a task is only holding callback_sem, then it has read-only
+ * If a task is only holding callback_mutex, then it has read-only
  * access to cpusets.
  *
  * The task_struct fields mems_allowed and mems_generation may only
  * be accessed in the context of that task, so require no locks.
  *
  * Any task can increment and decrement the count field without lock.
- * So in general, code holding manage_sem or callback_sem can't rely
+ * So in general, code holding manage_mutex or callback_mutex can't rely
  * on the count field not changing.  However, if the count goes to
- * zero, then only attach_task(), which holds both semaphores, can
+ * zero, then only attach_task(), which holds both mutexes, can
  * increment it again.  Because a count of zero means that no tasks
  * are currently attached, therefore there is no way a task attached
  * to that cpuset can fork (the other way to increment the count).
- * So code holding manage_sem or callback_sem can safely assume that
+ * So code holding manage_mutex or callback_mutex can safely assume that
  * if the count is zero, it will stay zero.  Similarly, if a task
- * holds manage_sem or callback_sem on a cpuset with zero count, it
+ * holds manage_mutex or callback_mutex on a cpuset with zero count, it
  * knows that the cpuset won't be removed, as cpuset_rmdir() needs
- * both of those semaphores.
- *
- * A possible optimization to improve parallelism would be to make
- * callback_sem a R/W semaphore (rwsem), allowing the callback routines
- * to proceed in parallel, with read access, until the holder of
- * manage_sem needed to take this rwsem for exclusive write access
- * and modify some cpusets.
+ * both of those mutexes.
  *
  * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds manage_sem across the entire operation,
+ * the cpuset hierarchy holds manage_mutex across the entire operation,
  * single threading all such cpuset modifications across the system.
  *
- * The cpuset_common_file_read() handlers only hold callback_sem across
+ * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
  *
  * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
- * (usually) take either semaphore.  These are the two most performance
+ * (usually) take either mutex.  These are the two most performance
  * critical pieces of code here.  The exception occurs on cpuset_exit(),
- * when a task in a notify_on_release cpuset exits.  Then manage_sem
+ * when a task in a notify_on_release cpuset exits.  Then manage_mutex
  * is taken, and if the cpuset count is zero, a usermode call made
  * to /sbin/cpuset_release_agent with the name of the cpuset (path
  * relative to the root of cpuset file system) as the argument.
@@ -242,9 +236,9 @@ static struct super_block *cpuset_sb;
  *
  * The need for this exception arises from the action of attach_task(),
  * which overwrites one tasks cpuset pointer with another.  It does
- * so using both semaphores, however there are several performance
+ * so using both mutexes, however there are several performance
  * critical places that need to reference task->cpuset without the
- * expense of grabbing a system global semaphore.  Therefore except as
+ * expense of grabbing a system global mutex.  Therefore except as
  * noted below, when dereferencing or, as in attach_task(), modifying
  * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
  * (task->alloc_lock) already in the task_struct routinely used for
@@ -256,8 +250,8 @@ static struct super_block *cpuset_sb;
  * the routine cpuset_update_task_memory_state().
  */
 
-static DECLARE_MUTEX(manage_sem);
-static DECLARE_MUTEX(callback_sem);
+static DEFINE_MUTEX(manage_mutex);
+static DEFINE_MUTEX(callback_mutex);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -432,7 +426,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 }
 
 /*
- * Call with manage_sem held.  Writes path of cpuset into buf.
+ * Call with manage_mutex held.  Writes path of cpuset into buf.
  * Returns 0 on success, -errno on error.
  */
 
@@ -484,11 +478,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
  * status of the /sbin/cpuset_release_agent task, so no sense holding
  * our caller up for that.
  *
- * When we had only one cpuset semaphore, we had to call this
+ * When we had only one cpuset mutex, we had to call this
  * without holding it, to avoid deadlock when call_usermodehelper()
  * allocated memory.  With two locks, we could now call this while
- * holding manage_sem, but we still don't, so as to minimize
- * the time manage_sem is held.
+ * holding manage_mutex, but we still don't, so as to minimize
+ * the time manage_mutex is held.
  */
 
 static void cpuset_release_agent(const char *pathbuf)
@@ -520,15 +514,15 @@ static void cpuset_release_agent(const char *pathbuf)
  * cs is notify_on_release() and now both the user count is zero and
  * the list of children is empty, prepare cpuset path in a kmalloc'd
  * buffer, to be returned via ppathbuf, so that the caller can invoke
- * cpuset_release_agent() with it later on, once manage_sem is dropped.
- * Call here with manage_sem held.
+ * cpuset_release_agent() with it later on, once manage_mutex is dropped.
+ * Call here with manage_mutex held.
  *
  * This check_for_release() routine is responsible for kmalloc'ing
  * pathbuf.  The above cpuset_release_agent() is responsible for
  * kfree'ing pathbuf.  The caller of these routines is responsible
  * for providing a pathbuf pointer, initialized to NULL, then
- * calling check_for_release() with manage_sem held and the address
- * of the pathbuf pointer, then dropping manage_sem, then calling
+ * calling check_for_release() with manage_mutex held and the address
+ * of the pathbuf pointer, then dropping manage_mutex, then calling
  * cpuset_release_agent() with pathbuf, as set by check_for_release().
  */
 
@@ -559,7 +553,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_map.
  *
- * Call with callback_sem held.
+ * Call with callback_mutex held.
  */
 
 static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -583,7 +577,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
  * One way or another, we guarantee to return some non-empty subset
  * of node_online_map.
  *
- * Call with callback_sem held.
+ * Call with callback_mutex held.
  */
 
 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -608,12 +602,12 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * current->cpuset if a task has its memory placement changed.
  * Do not call this routine if in_interrupt().
  *
- * Call without callback_sem or task_lock() held.  May be called
- * with or without manage_sem held.  Doesn't need task_lock to guard
+ * Call without callback_mutex or task_lock() held.  May be called
+ * with or without manage_mutex held.  Doesn't need task_lock to guard
  * against another task changing a non-NULL cpuset pointer to NULL,
  * as that is only done by a task on itself, and if the current task
  * is here, it is not simultaneously in the exit code NULL'ing its
- * cpuset pointer.  This routine also might acquire callback_sem and
+ * cpuset pointer.  This routine also might acquire callback_mutex and
  * current->mm->mmap_sem during call.
  *
  * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -658,13 +652,13 @@ void cpuset_update_task_memory_state(void)
 	}
 
 	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
-		down(&callback_sem);
+		mutex_lock(&callback_mutex);
 		task_lock(tsk);
 		cs = tsk->cpuset;	/* Maybe changed when task not locked */
 		guarantee_online_mems(cs, &tsk->mems_allowed);
 		tsk->cpuset_mems_generation = cs->mems_generation;
 		task_unlock(tsk);
-		up(&callback_sem);
+		mutex_unlock(&callback_mutex);
 		mpol_rebind_task(tsk, &tsk->mems_allowed);
 	}
 }
@@ -674,7 +668,7 @@ void cpuset_update_task_memory_state(void)
  *
  * One cpuset is a subset of another if all its allowed CPUs and
  * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set.  Call holding manage_sem.
+ * are only set if the other's are set.  Call holding manage_mutex.
  */
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -692,7 +686,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  * If we replaced the flag and mask values of the current cpuset
  * (cur) with those values in the trial cpuset (trial), would
  * our various subset and exclusive rules still be valid?  Presumes
- * manage_sem held.
+ * manage_mutex held.
  *
  * 'cur' is the address of an actual, in-use cpuset.  Operations
  * such as list traversal that depend on the actual address of the
@@ -746,7 +740,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *    exclusive child cpusets
  * Build these two partitions by calling partition_sched_domains
  *
- * Call with manage_sem held.  May nest a call to the
+ * Call with manage_mutex held.  May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
  */
 
@@ -792,7 +786,7 @@ static void update_cpu_domains(struct cpuset *cur)
 }
 
 /*
- * Call with manage_sem held.  May take callback_sem during call.
+ * Call with manage_mutex held.  May take callback_mutex during call.
  */
 
 static int update_cpumask(struct cpuset *cs, char *buf)
@@ -811,9 +805,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		return retval;
 	cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 	if (is_cpu_exclusive(cs) && !cpus_unchanged)
 		update_cpu_domains(cs);
 	return 0;
@@ -827,7 +821,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
  * the cpuset is marked 'memory_migrate', migrate the tasks
  * pages to the new memory.
  *
- * Call with manage_sem held.  May take callback_sem during call.
+ * Call with manage_mutex held.  May take callback_mutex during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
@@ -862,11 +856,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		goto done;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs.mems_allowed;
 	atomic_inc(&cpuset_mems_generation);
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */
 
@@ -922,7 +916,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	 * tasklist_lock.  Forks can happen again now - the mpol_copy()
 	 * cpuset_being_rebound check will catch such forks, and rebind
 	 * their vma mempolicies too.  Because we still hold the global
-	 * cpuset manage_sem, we know that no other rebind effort will
+	 * cpuset manage_mutex, we know that no other rebind effort will
 	 * be contending for the global variable cpuset_being_rebound.
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
@@ -948,7 +942,7 @@ done:
 }
 
 /*
- * Call with manage_sem held.
+ * Call with manage_mutex held.
  */
 
 static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -967,7 +961,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
- * Call with manage_sem held.
+ * Call with manage_mutex held.
  */
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -989,12 +983,12 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 		return err;
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	if (turning_on)
 		set_bit(bit, &cs->flags);
 	else
 		clear_bit(bit, &cs->flags);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	if (cpu_exclusive_changed)
                 update_cpu_domains(cs);
@@ -1104,7 +1098,7 @@ static int fmeter_getrate(struct fmeter *fmp)
  * writing the path of the old cpuset in 'ppathbuf' if it needs to be
  * notified on release.
  *
- * Call holding manage_sem.  May take callback_sem and task_lock of
+ * Call holding manage_mutex.  May take callback_mutex and task_lock of
  * the task 'pid' during call.
  */
 
@@ -1144,13 +1138,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 		get_task_struct(tsk);
 	}
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
 	if (!oldcs) {
 		task_unlock(tsk);
-		up(&callback_sem);
+		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
 		return -ESRCH;
 	}
@@ -1164,7 +1158,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
 
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	mm = get_task_mm(tsk);
 	if (mm) {
@@ -1221,7 +1215,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
 
-	down(&manage_sem);
+	mutex_lock(&manage_mutex);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -1264,7 +1258,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
@@ -1304,9 +1298,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	mask = cs->cpus_allowed;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -1315,9 +1309,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	mask = cs->mems_allowed;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -1598,7 +1592,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
  * Handle an open on 'tasks' file.  Prepare a buffer listing the
  * process id's of tasks currently attached to the cpuset being opened.
  *
- * Does not require any specific cpuset semaphores, and does not take any.
+ * Does not require any specific cpuset mutexes, and does not take any.
  */
 static int cpuset_tasks_open(struct inode *unused, struct file *file)
 {
@@ -1754,7 +1748,7 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
  *	name:		name of the new cpuset. Will be strcpy'ed.
  *	mode:		mode to set on new inode
  *
- *	Must be called with the semaphore on the parent inode held
+ *	Must be called with the mutex on the parent inode held
  */
 
 static long cpuset_create(struct cpuset *parent, const char *name, int mode)
@@ -1766,7 +1760,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	down(&manage_sem);
+	mutex_lock(&manage_mutex);
 	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (notify_on_release(parent))
@@ -1782,28 +1776,28 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 
 	cs->parent = parent;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	list_add(&cs->sibling, &cs->parent->children);
 	number_of_cpusets++;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	err = cpuset_create_dir(cs, name, mode);
 	if (err < 0)
 		goto err;
 
 	/*
-	 * Release manage_sem before cpuset_populate_dir() because it
+	 * Release manage_mutex before cpuset_populate_dir() because it
 	 * will down() this new directory's i_mutex and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 	kfree(cs);
 	return err;
 }
@@ -1825,18 +1819,18 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	/* the vfs holds both inode->i_mutex already */
 
-	down(&manage_sem);
+	mutex_lock(&manage_mutex);
 	cpuset_update_task_memory_state();
 	if (atomic_read(&cs->count) > 0) {
-		up(&manage_sem);
+		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		up(&manage_sem);
+		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
 	parent = cs->parent;
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	set_bit(CS_REMOVED, &cs->flags);
 	if (is_cpu_exclusive(cs))
 		update_cpu_domains(cs);
@@ -1848,10 +1842,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	cpuset_d_remove_dir(d);
 	dput(d);
 	number_of_cpusets--;
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 	if (list_empty(&parent->children))
 		check_for_release(parent, &pathbuf);
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 	cpuset_release_agent(pathbuf);
 	return 0;
 }
@@ -1960,19 +1954,19 @@ void cpuset_fork(struct task_struct *child)
  * Description: Detach cpuset from @tsk and release it.
  *
  * Note that cpusets marked notify_on_release force every task in
- * them to take the global manage_sem semaphore when exiting.
+ * them to take the global manage_mutex mutex when exiting.
  * This could impact scaling on very large systems.  Be reluctant to
  * use notify_on_release cpusets where very high task exit scaling
  * is required on large systems.
  *
  * Don't even think about derefencing 'cs' after the cpuset use count
- * goes to zero, except inside a critical section guarded by manage_sem
- * or callback_sem.   Otherwise a zero cpuset use count is a license to
+ * goes to zero, except inside a critical section guarded by manage_mutex
+ * or callback_mutex.   Otherwise a zero cpuset use count is a license to
  * any other task to nuke the cpuset immediately, via cpuset_rmdir().
  *
- * This routine has to take manage_sem, not callback_sem, because
- * it is holding that semaphore while calling check_for_release(),
- * which calls kmalloc(), so can't be called holding callback__sem().
+ * This routine has to take manage_mutex, not callback_mutex, because
+ * it is holding that mutex while calling check_for_release(),
+ * which calls kmalloc(), so can't be called holding callback_mutex().
  *
  * We don't need to task_lock() this reference to tsk->cpuset,
  * because tsk is already marked PF_EXITING, so attach_task() won't
@@ -2022,10 +2016,10 @@ void cpuset_exit(struct task_struct *tsk)
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
-		down(&manage_sem);
+		mutex_lock(&manage_mutex);
 		if (atomic_dec_and_test(&cs->count))
 			check_for_release(cs, &pathbuf);
-		up(&manage_sem);
+		mutex_unlock(&manage_mutex);
 		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
@@ -2046,11 +2040,11 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	task_lock(tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock(tsk);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	return mask;
 }
@@ -2074,11 +2068,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
 	nodemask_t mask;
 
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 	task_lock(tsk);
 	guarantee_online_mems(tsk->cpuset, &mask);
 	task_unlock(tsk);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 
 	return mask;
 }
@@ -2104,7 +2098,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 
 /*
  * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset.  Call holding callback_sem.
+ * ancestor to the specified cpuset.  Call holding callback_mutex.
  * If no ancestor is mem_exclusive (an unusual configuration), then
  * returns the root cpuset.
  */
@@ -2131,12 +2125,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest mem_exclusive ancestor cpuset.
  *
- * Scanning up parent cpusets requires callback_sem.  The __alloc_pages()
+ * Scanning up parent cpusets requires callback_mutex.  The __alloc_pages()
  * routine only calls here with __GFP_HARDWALL bit _not_ set if
  * it's a GFP_KERNEL allocation, and all nodes in the current tasks
  * mems_allowed came up empty on the first pass over the zonelist.
  * So only GFP_KERNEL allocations, if all nodes in the cpuset are
- * short of memory, might require taking the callback_sem semaphore.
+ * short of memory, might require taking the callback_mutex mutex.
  *
  * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
  * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -2171,31 +2165,31 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 		return 1;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 
 	task_lock(current);
 	cs = nearest_exclusive_ancestor(current->cpuset);
 	task_unlock(current);
 
 	allowed = node_isset(node, cs->mems_allowed);
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 	return allowed;
 }
 
 /**
  * cpuset_lock - lock out any changes to cpuset structures
  *
- * The out of memory (oom) code needs to lock down cpusets
+ * The out of memory (oom) code needs to mutex_lock cpusets
  * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset.  Expose callback_sem via this
+ * task in an overlapping cpuset.  Expose callback_mutex via this
  * cpuset_lock() routine, so the oom code can lock it, before
  * locking the task list.  The tasklist_lock is a spinlock, so
- * must be taken inside callback_sem.
+ * must be taken inside callback_mutex.
  */
 
 void cpuset_lock(void)
 {
-	down(&callback_sem);
+	mutex_lock(&callback_mutex);
 }
 
 /**
@@ -2206,7 +2200,7 @@ void cpuset_lock(void)
 
 void cpuset_unlock(void)
 {
-	up(&callback_sem);
+	mutex_unlock(&callback_mutex);
 }
 
 /**
@@ -2218,7 +2212,7 @@ void cpuset_unlock(void)
  * determine if task @p's memory usage might impact the memory
  * available to the current task.
  *
- * Call while holding callback_sem.
+ * Call while holding callback_mutex.
  **/
 
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2289,7 +2283,7 @@ void __cpuset_memory_pressure_bump(void)
  *  - Used for /proc/<pid>/cpuset.
  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
  *    doesn't really matter if tsk->cpuset changes after we read it,
- *    and we take manage_sem, keeping attach_task() from changing it
+ *    and we take manage_mutex, keeping attach_task() from changing it
  *    anyway.
  */
 
@@ -2305,7 +2299,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	down(&manage_sem);
+	mutex_lock(&manage_mutex);
 	cs = tsk->cpuset;
 	if (!cs) {
 		retval = -EINVAL;
@@ -2318,7 +2312,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out:
-	up(&manage_sem);
+	mutex_unlock(&manage_mutex);
 	kfree(buf);
 	return retval;
 }
-- 
cgit v1.2.3


From 9331b3157c835353dd28efcd80d23563ad226aee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:19 -0800
Subject: [PATCH] convert kernel/rcupdate.c:rcu_barrier_sema to mutex

Convert kernel/rcupdate's rcu_barrier_sema to mutex.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index fedf5e369755..af8a2a57e17d 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -47,6 +47,7 @@
 #include <linux/notifier.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
+#include <linux/mutex.h>
 
 /* Definition for rcupdate control block. */
 struct rcu_ctrlblk rcu_ctrlblk = {
@@ -75,7 +76,7 @@ static int rsinterval = 1000;
 #endif
 
 static atomic_t rcu_barrier_cpu_count;
-static struct semaphore rcu_barrier_sema;
+static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
 
 #ifdef CONFIG_SMP
@@ -207,13 +208,13 @@ static void rcu_barrier_func(void *notused)
 void rcu_barrier(void)
 {
 	BUG_ON(in_interrupt());
-	/* Take cpucontrol semaphore to protect against CPU hotplug */
-	down(&rcu_barrier_sema);
+	/* Take cpucontrol mutex to protect against CPU hotplug */
+	mutex_lock(&rcu_barrier_mutex);
 	init_completion(&rcu_barrier_completion);
 	atomic_set(&rcu_barrier_cpu_count, 0);
 	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
 	wait_for_completion(&rcu_barrier_completion);
-	up(&rcu_barrier_sema);
+	mutex_unlock(&rcu_barrier_mutex);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
@@ -549,7 +550,6 @@ static struct notifier_block __devinitdata rcu_nb = {
  */
 void __init rcu_init(void)
 {
-	sema_init(&rcu_barrier_sema, 1);
 	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	/* Register notifier for non-boot CPUs */
-- 
cgit v1.2.3


From 97d1f15b7ef52c1e9c28dc48b454024bb53a5fd2 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Thu, 23 Mar 2006 03:00:24 -0800
Subject: [PATCH] sem2mutex: kernel/

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kthread.c                |  7 ++++---
 kernel/module.c                 | 15 ++++++++-------
 kernel/posix-timers.c           |  1 +
 kernel/power/pm.c               | 21 +++++++++++----------
 kernel/profile.c                | 11 ++++++-----
 lib/reed_solomon/reed_solomon.c | 11 ++++++-----
 6 files changed, 36 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index e75950a1092c..6a5373868a98 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,6 +12,7 @@
 #include <linux/unistd.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <asm/semaphore.h>
 
 /*
@@ -41,7 +42,7 @@ struct kthread_stop_info
 
 /* Thread stopping is done by setthing this var: lock serializes
  * multiple kthread_stop calls. */
-static DECLARE_MUTEX(kthread_stop_lock);
+static DEFINE_MUTEX(kthread_stop_lock);
 static struct kthread_stop_info kthread_stop_info;
 
 int kthread_should_stop(void)
@@ -173,7 +174,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 {
 	int ret;
 
-	down(&kthread_stop_lock);
+	mutex_lock(&kthread_stop_lock);
 
 	/* It could exit after stop_info.k set, but before wake_up_process. */
 	get_task_struct(k);
@@ -194,7 +195,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 	wait_for_completion(&kthread_stop_info.done);
 	kthread_stop_info.k = NULL;
 	ret = kthread_stop_info.err;
-	up(&kthread_stop_lock);
+	mutex_unlock(&kthread_stop_lock);
 
 	return ret;
 }
diff --git a/kernel/module.c b/kernel/module.c
index 77764f22f021..de6312da6bb5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -39,6 +39,7 @@
 #include <linux/device.h>
 #include <linux/string.h>
 #include <linux/sched.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
@@ -63,15 +64,15 @@ static DEFINE_SPINLOCK(modlist_lock);
 static DECLARE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
-static DECLARE_MUTEX(notify_mutex);
+static DEFINE_MUTEX(notify_mutex);
 static struct notifier_block * module_notify_list;
 
 int register_module_notifier(struct notifier_block * nb)
 {
 	int err;
-	down(&notify_mutex);
+	mutex_lock(&notify_mutex);
 	err = notifier_chain_register(&module_notify_list, nb);
-	up(&notify_mutex);
+	mutex_unlock(&notify_mutex);
 	return err;
 }
 EXPORT_SYMBOL(register_module_notifier);
@@ -79,9 +80,9 @@ EXPORT_SYMBOL(register_module_notifier);
 int unregister_module_notifier(struct notifier_block * nb)
 {
 	int err;
-	down(&notify_mutex);
+	mutex_lock(&notify_mutex);
 	err = notifier_chain_unregister(&module_notify_list, nb);
-	up(&notify_mutex);
+	mutex_unlock(&notify_mutex);
 	return err;
 }
 EXPORT_SYMBOL(unregister_module_notifier);
@@ -1989,9 +1990,9 @@ sys_init_module(void __user *umod,
 	/* Drop lock so they can recurse */
 	up(&module_mutex);
 
-	down(&notify_mutex);
+	mutex_lock(&notify_mutex);
 	notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
-	up(&notify_mutex);
+	mutex_unlock(&notify_mutex);
 
 	/* Start the module */
 	if (mod->init != NULL)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fa895fc2ecf5..9944379360b5 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -35,6 +35,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 33c508e857dd..0f6908cce1dd 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -25,6 +25,7 @@
 #include <linux/pm.h>
 #include <linux/pm_legacy.h>
 #include <linux/interrupt.h>
+#include <linux/mutex.h>
 
 int pm_active;
 
@@ -40,7 +41,7 @@ int pm_active;
  *	until a resume but that will be fine.
  */
  
-static DECLARE_MUTEX(pm_devs_lock);
+static DEFINE_MUTEX(pm_devs_lock);
 static LIST_HEAD(pm_devs);
 
 /**
@@ -67,9 +68,9 @@ struct pm_dev *pm_register(pm_dev_t type,
 		dev->id = id;
 		dev->callback = callback;
 
-		down(&pm_devs_lock);
+		mutex_lock(&pm_devs_lock);
 		list_add(&dev->entry, &pm_devs);
-		up(&pm_devs_lock);
+		mutex_unlock(&pm_devs_lock);
 	}
 	return dev;
 }
@@ -85,9 +86,9 @@ struct pm_dev *pm_register(pm_dev_t type,
 void pm_unregister(struct pm_dev *dev)
 {
 	if (dev) {
-		down(&pm_devs_lock);
+		mutex_lock(&pm_devs_lock);
 		list_del(&dev->entry);
-		up(&pm_devs_lock);
+		mutex_unlock(&pm_devs_lock);
 
 		kfree(dev);
 	}
@@ -118,7 +119,7 @@ void pm_unregister_all(pm_callback callback)
 	if (!callback)
 		return;
 
-	down(&pm_devs_lock);
+	mutex_lock(&pm_devs_lock);
 	entry = pm_devs.next;
 	while (entry != &pm_devs) {
 		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -126,7 +127,7 @@ void pm_unregister_all(pm_callback callback)
 		if (dev->callback == callback)
 			__pm_unregister(dev);
 	}
-	up(&pm_devs_lock);
+	mutex_unlock(&pm_devs_lock);
 }
 
 /**
@@ -234,7 +235,7 @@ int pm_send_all(pm_request_t rqst, void *data)
 {
 	struct list_head *entry;
 	
-	down(&pm_devs_lock);
+	mutex_lock(&pm_devs_lock);
 	entry = pm_devs.next;
 	while (entry != &pm_devs) {
 		struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
@@ -246,13 +247,13 @@ int pm_send_all(pm_request_t rqst, void *data)
 				 */
 				if (rqst == PM_SUSPEND)
 					pm_undo_all(dev);
-				up(&pm_devs_lock);
+				mutex_unlock(&pm_devs_lock);
 				return status;
 			}
 		}
 		entry = entry->next;
 	}
-	up(&pm_devs_lock);
+	mutex_unlock(&pm_devs_lock);
 	return 0;
 }
 
diff --git a/kernel/profile.c b/kernel/profile.c
index f89248e6d704..ad81f799a9b4 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -23,6 +23,7 @@
 #include <linux/cpu.h>
 #include <linux/profile.h>
 #include <linux/highmem.h>
+#include <linux/mutex.h>
 #include <asm/sections.h>
 #include <asm/semaphore.h>
 
@@ -44,7 +45,7 @@ static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
 static DEFINE_PER_CPU(int, cpu_profile_flip);
-static DECLARE_MUTEX(profile_flip_mutex);
+static DEFINE_MUTEX(profile_flip_mutex);
 #endif /* CONFIG_SMP */
 
 static int __init profile_setup(char * str)
@@ -243,7 +244,7 @@ static void profile_flip_buffers(void)
 {
 	int i, j, cpu;
 
-	down(&profile_flip_mutex);
+	mutex_lock(&profile_flip_mutex);
 	j = per_cpu(cpu_profile_flip, get_cpu());
 	put_cpu();
 	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -259,14 +260,14 @@ static void profile_flip_buffers(void)
 			hits[i].hits = hits[i].pc = 0;
 		}
 	}
-	up(&profile_flip_mutex);
+	mutex_unlock(&profile_flip_mutex);
 }
 
 static void profile_discard_flip_buffers(void)
 {
 	int i, cpu;
 
-	down(&profile_flip_mutex);
+	mutex_lock(&profile_flip_mutex);
 	i = per_cpu(cpu_profile_flip, get_cpu());
 	put_cpu();
 	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
@@ -274,7 +275,7 @@ static void profile_discard_flip_buffers(void)
 		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
 		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
 	}
-	up(&profile_flip_mutex);
+	mutex_unlock(&profile_flip_mutex);
 }
 
 void profile_hit(int type, void *__pc)
diff --git a/lib/reed_solomon/reed_solomon.c b/lib/reed_solomon/reed_solomon.c
index f5fef948a415..f8ac9fa95de1 100644
--- a/lib/reed_solomon/reed_solomon.c
+++ b/lib/reed_solomon/reed_solomon.c
@@ -44,12 +44,13 @@
 #include <linux/module.h>
 #include <linux/rslib.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 #include <asm/semaphore.h>
 
 /* This list holds all currently allocated rs control structures */
 static LIST_HEAD (rslist);
 /* Protection for the list */
-static DECLARE_MUTEX(rslistlock);
+static DEFINE_MUTEX(rslistlock);
 
 /**
  * rs_init - Initialize a Reed-Solomon codec
@@ -161,7 +162,7 @@ errrs:
  */
 void free_rs(struct rs_control *rs)
 {
-	down(&rslistlock);
+	mutex_lock(&rslistlock);
 	rs->users--;
 	if(!rs->users) {
 		list_del(&rs->list);
@@ -170,7 +171,7 @@ void free_rs(struct rs_control *rs)
 		kfree(rs->genpoly);
 		kfree(rs);
 	}
-	up(&rslistlock);
+	mutex_unlock(&rslistlock);
 }
 
 /**
@@ -201,7 +202,7 @@ struct rs_control *init_rs(int symsize, int gfpoly, int fcr, int prim,
 	if (nroots < 0 || nroots >= (1<<symsize))
 		return NULL;
 
-	down(&rslistlock);
+	mutex_lock(&rslistlock);
 
 	/* Walk through the list and look for a matching entry */
 	list_for_each(tmp, &rslist) {
@@ -228,7 +229,7 @@ struct rs_control *init_rs(int symsize, int gfpoly, int fcr, int prim,
 		list_add(&rs->list, &rslist);
 	}
 out:
-	up(&rslistlock);
+	mutex_unlock(&rslistlock);
 	return rs;
 }
 
-- 
cgit v1.2.3


From 70522e121a521aa09bd0f4e62e1aa68708b798e1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:31 -0800
Subject: [PATCH] sem2mutex: tty

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/n_tty.c  | 10 +++++-----
 drivers/char/tty_io.c | 50 +++++++++++++++++++++++++-------------------------
 drivers/char/vt.c     | 14 +++++++-------
 include/linux/tty.h   |  8 ++++----
 kernel/exit.c         |  4 ++--
 kernel/sys.c          |  4 ++--
 6 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index ccad7ae94541..ede365d05387 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -132,7 +132,7 @@ static void put_tty_queue(unsigned char c, struct tty_struct *tty)
  *	We test the TTY_THROTTLED bit first so that it always
  *	indicates the current state. The decision about whether
  *	it is worth allowing more input has been taken by the caller.
- *	Can sleep, may be called under the atomic_read semaphore but
+ *	Can sleep, may be called under the atomic_read_lock mutex but
  *	this is not guaranteed.
  */
  
@@ -1132,7 +1132,7 @@ static inline int input_available_p(struct tty_struct *tty, int amt)
  *	buffer, and once to drain the space from the (physical) beginning of
  *	the buffer to head pointer.
  *
- *	Called under the tty->atomic_read sem and with TTY_DONT_FLIP set
+ *	Called under the tty->atomic_read_lock sem and with TTY_DONT_FLIP set
  *
  */
  
@@ -1262,11 +1262,11 @@ do_it_again:
 	 *	Internal serialization of reads.
 	 */
 	if (file->f_flags & O_NONBLOCK) {
-		if (down_trylock(&tty->atomic_read))
+		if (!mutex_trylock(&tty->atomic_read_lock))
 			return -EAGAIN;
 	}
 	else {
-		if (down_interruptible(&tty->atomic_read))
+		if (mutex_lock_interruptible(&tty->atomic_read_lock))
 			return -ERESTARTSYS;
 	}
 
@@ -1393,7 +1393,7 @@ do_it_again:
 			timeout = time;
 	}
 	clear_bit(TTY_DONT_FLIP, &tty->flags);
-	up(&tty->atomic_read);
+	mutex_unlock(&tty->atomic_read_lock);
 	remove_wait_queue(&tty->read_wait, &wait);
 
 	if (!waitqueue_active(&tty->read_wait))
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 53d3d066554e..76592ee1fb38 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -130,7 +130,7 @@ LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
 
 /* Semaphore to protect creating and releasing a tty. This is shared with
    vt.c for deeply disgusting hack reasons */
-DECLARE_MUTEX(tty_sem);
+DEFINE_MUTEX(tty_mutex);
 
 #ifdef CONFIG_UNIX98_PTYS
 extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
@@ -1188,11 +1188,11 @@ void disassociate_ctty(int on_exit)
 
 	lock_kernel();
 
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	tty = current->signal->tty;
 	if (tty) {
 		tty_pgrp = tty->pgrp;
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY)
 			tty_vhangup(tty);
 	} else {
@@ -1200,7 +1200,7 @@ void disassociate_ctty(int on_exit)
 			kill_pg(current->signal->tty_old_pgrp, SIGHUP, on_exit);
 			kill_pg(current->signal->tty_old_pgrp, SIGCONT, on_exit);
 		}
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		unlock_kernel();	
 		return;
 	}
@@ -1211,7 +1211,7 @@ void disassociate_ctty(int on_exit)
 	}
 
 	/* Must lock changes to tty_old_pgrp */
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	current->signal->tty_old_pgrp = 0;
 	tty->session = 0;
 	tty->pgrp = -1;
@@ -1222,7 +1222,7 @@ void disassociate_ctty(int on_exit)
 		p->signal->tty = NULL;
 	} while_each_task_pid(current->signal->session, PIDTYPE_SID, p);
 	read_unlock(&tasklist_lock);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	unlock_kernel();
 }
 
@@ -1306,7 +1306,7 @@ static inline ssize_t do_tty_write(
 	ssize_t ret = 0, written = 0;
 	unsigned int chunk;
 	
-	if (down_interruptible(&tty->atomic_write)) {
+	if (mutex_lock_interruptible(&tty->atomic_write_lock)) {
 		return -ERESTARTSYS;
 	}
 
@@ -1329,7 +1329,7 @@ static inline ssize_t do_tty_write(
 	if (count < chunk)
 		chunk = count;
 
-	/* write_buf/write_cnt is protected by the atomic_write semaphore */
+	/* write_buf/write_cnt is protected by the atomic_write_lock mutex */
 	if (tty->write_cnt < chunk) {
 		unsigned char *buf;
 
@@ -1338,7 +1338,7 @@ static inline ssize_t do_tty_write(
 
 		buf = kmalloc(chunk, GFP_KERNEL);
 		if (!buf) {
-			up(&tty->atomic_write);
+			mutex_unlock(&tty->atomic_write_lock);
 			return -ENOMEM;
 		}
 		kfree(tty->write_buf);
@@ -1374,7 +1374,7 @@ static inline ssize_t do_tty_write(
 		inode->i_mtime = current_fs_time(inode->i_sb);
 		ret = written;
 	}
-	up(&tty->atomic_write);
+	mutex_unlock(&tty->atomic_write_lock);
 	return ret;
 }
 
@@ -1442,8 +1442,8 @@ static inline void tty_line_name(struct tty_driver *driver, int index, char *p)
 
 /*
  * WSH 06/09/97: Rewritten to remove races and properly clean up after a
- * failed open.  The new code protects the open with a semaphore, so it's
- * really quite straightforward.  The semaphore locking can probably be
+ * failed open.  The new code protects the open with a mutex, so it's
+ * really quite straightforward.  The mutex locking can probably be
  * relaxed for the (most common) case of reopening a tty.
  */
 static int init_dev(struct tty_driver *driver, int idx,
@@ -1640,7 +1640,7 @@ fast_track:
 success:
 	*ret_tty = tty;
 	
-	/* All paths come through here to release the semaphore */
+	/* All paths come through here to release the mutex */
 end_init:
 	return retval;
 
@@ -1837,7 +1837,7 @@ static void release_dev(struct file * filp)
 		/* Guard against races with tty->count changes elsewhere and
 		   opens on /dev/tty */
 		   
-		down(&tty_sem);
+		mutex_lock(&tty_mutex);
 		tty_closing = tty->count <= 1;
 		o_tty_closing = o_tty &&
 			(o_tty->count <= (pty_master ? 1 : 0));
@@ -1868,7 +1868,7 @@ static void release_dev(struct file * filp)
 
 		printk(KERN_WARNING "release_dev: %s: read/write wait queue "
 				    "active!\n", tty_name(tty, buf));
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		schedule();
 	}	
 
@@ -1934,7 +1934,7 @@ static void release_dev(struct file * filp)
 		read_unlock(&tasklist_lock);
 	}
 
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 
 	/* check whether both sides are closing ... */
 	if (!tty_closing || (o_tty && !o_tty_closing))
@@ -2040,11 +2040,11 @@ retry_open:
 	index  = -1;
 	retval = 0;
 	
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 
 	if (device == MKDEV(TTYAUX_MAJOR,0)) {
 		if (!current->signal->tty) {
-			up(&tty_sem);
+			mutex_unlock(&tty_mutex);
 			return -ENXIO;
 		}
 		driver = current->signal->tty->driver;
@@ -2070,18 +2070,18 @@ retry_open:
 			noctty = 1;
 			goto got_driver;
 		}
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		return -ENODEV;
 	}
 
 	driver = get_tty_driver(device, &index);
 	if (!driver) {
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		return -ENODEV;
 	}
 got_driver:
 	retval = init_dev(driver, index, &tty);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	if (retval)
 		return retval;
 
@@ -2167,9 +2167,9 @@ static int ptmx_open(struct inode * inode, struct file * filp)
 	}
 	up(&allocated_ptys_lock);
 
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	retval = init_dev(ptm_driver, index, &tty);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	
 	if (retval)
 		goto out;
@@ -2915,8 +2915,8 @@ static void initialize_tty_struct(struct tty_struct *tty)
 	init_waitqueue_head(&tty->write_wait);
 	init_waitqueue_head(&tty->read_wait);
 	INIT_WORK(&tty->hangup_work, do_tty_hangup, tty);
-	sema_init(&tty->atomic_read, 1);
-	sema_init(&tty->atomic_write, 1);
+	mutex_init(&tty->atomic_read_lock);
+	mutex_init(&tty->atomic_write_lock);
 	spin_lock_init(&tty->read_lock);
 	INIT_LIST_HEAD(&tty->tty_files);
 	INIT_WORK(&tty->SAK_work, NULL, NULL);
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 86b31b87eb85..ca4844c527da 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2489,7 +2489,7 @@ static int con_open(struct tty_struct *tty, struct file *filp)
 }
 
 /*
- * We take tty_sem in here to prevent another thread from coming in via init_dev
+ * We take tty_mutex in here to prevent another thread from coming in via init_dev
  * and taking a ref against the tty while we're in the process of forgetting
  * about it and cleaning things up.
  *
@@ -2497,7 +2497,7 @@ static int con_open(struct tty_struct *tty, struct file *filp)
  */
 static void con_close(struct tty_struct *tty, struct file *filp)
 {
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	acquire_console_sem();
 	if (tty && tty->count == 1) {
 		struct vc_data *vc = tty->driver_data;
@@ -2507,15 +2507,15 @@ static void con_close(struct tty_struct *tty, struct file *filp)
 		tty->driver_data = NULL;
 		release_console_sem();
 		vcs_remove_devfs(tty);
-		up(&tty_sem);
+		mutex_unlock(&tty_mutex);
 		/*
-		 * tty_sem is released, but we still hold BKL, so there is
+		 * tty_mutex is released, but we still hold BKL, so there is
 		 * still exclusion against init_dev()
 		 */
 		return;
 	}
 	release_console_sem();
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 }
 
 static void vc_init(struct vc_data *vc, unsigned int rows,
@@ -2869,9 +2869,9 @@ void unblank_screen(void)
 }
 
 /*
- * We defer the timer blanking to work queue so it can take the console semaphore
+ * We defer the timer blanking to work queue so it can take the console mutex
  * (console operations can still happen at irq time, but only from printk which
- * has the console semaphore. Not perfect yet, but better than no locking
+ * has the console mutex. Not perfect yet, but better than no locking
  */
 static void blank_screen_t(unsigned long dummy)
 {
diff --git a/include/linux/tty.h b/include/linux/tty.h
index f45cd74e6f24..f13f49afe198 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -24,6 +24,7 @@
 #include <linux/tty_driver.h>
 #include <linux/tty_ldisc.h>
 #include <linux/screen_info.h>
+#include <linux/mutex.h>
 
 #include <asm/system.h>
 
@@ -231,8 +232,8 @@ struct tty_struct {
 	int canon_data;
 	unsigned long canon_head;
 	unsigned int canon_column;
-	struct semaphore atomic_read;
-	struct semaphore atomic_write;
+	struct mutex atomic_read_lock;
+	struct mutex atomic_write_lock;
 	unsigned char *write_buf;
 	int write_cnt;
 	spinlock_t read_lock;
@@ -319,8 +320,7 @@ extern void tty_ldisc_put(int);
 extern void tty_wakeup(struct tty_struct *tty);
 extern void tty_ldisc_flush(struct tty_struct *tty);
 
-struct semaphore;
-extern struct semaphore tty_sem;
+extern struct mutex tty_mutex;
 
 /* n_tty.c */
 extern struct tty_ldisc tty_ldisc_N_TTY;
diff --git a/kernel/exit.c b/kernel/exit.c
index d1e8d500a7e1..8037405e136e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -345,9 +345,9 @@ void daemonize(const char *name, ...)
 	exit_mm(current);
 
 	set_special_pids(1, 1);
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	current->signal->tty = NULL;
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
diff --git a/kernel/sys.c b/kernel/sys.c
index 4941b9b14b97..c0fcad9f826c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1227,7 +1227,7 @@ asmlinkage long sys_setsid(void)
 	struct pid *pid;
 	int err = -EPERM;
 
-	down(&tty_sem);
+	mutex_lock(&tty_mutex);
 	write_lock_irq(&tasklist_lock);
 
 	pid = find_pid(PIDTYPE_PGID, group_leader->pid);
@@ -1241,7 +1241,7 @@ asmlinkage long sys_setsid(void)
 	err = process_group(group_leader);
 out:
 	write_unlock_irq(&tasklist_lock);
-	up(&tty_sem);
+	mutex_unlock(&tty_mutex);
 	return err;
 }
 
-- 
cgit v1.2.3


From 7a7d1cf95408863a657035701606b13644c9f55e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:35 -0800
Subject: [PATCH] sem2mutex: kprobes

Semaphore to mutex conversion.

The conversion was generated via scripts, and the result was validated
automatically via a script as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    |  4 ++--
 arch/powerpc/kernel/kprobes.c |  4 ++--
 arch/x86_64/kernel/kprobes.c  |  4 ++--
 include/linux/kprobes.h       |  3 ++-
 kernel/kprobes.c              | 14 +++++++-------
 5 files changed, 15 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 694a13997637..7a59050242a7 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -84,9 +84,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 }
 
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 258039fb3016..cb1fe5878e8b 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -81,9 +81,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 8b866a8572cf..14f0ced613b6 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -222,9 +222,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	free_insn_slot(p->ainsn.insn);
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 }
 
 static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 669756bc20a2..778adc0fa640 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -36,6 +36,7 @@
 #include <linux/percpu.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/mutex.h>
 
 #ifdef CONFIG_KPROBES
 #include <asm/kprobes.h>
@@ -152,7 +153,7 @@ struct kretprobe_instance {
 };
 
 extern spinlock_t kretprobe_lock;
-extern struct semaphore kprobe_mutex;
+extern struct mutex kprobe_mutex;
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fef1af8a73ce..1fb9f753ef60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -48,7 +48,7 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
-DECLARE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
+DEFINE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
 DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
@@ -460,7 +460,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
 	}
 
 	p->nmissed = 0;
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	old_p = get_kprobe(p->addr);
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
@@ -477,7 +477,7 @@ static int __kprobes __register_kprobe(struct kprobe *p,
   	arch_arm_kprobe(p);
 
 out:
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 
 	if (ret && probed_mod)
 		module_put(probed_mod);
@@ -496,10 +496,10 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 	struct kprobe *old_p, *list_p;
 	int cleanup_p;
 
-	down(&kprobe_mutex);
+	mutex_lock(&kprobe_mutex);
 	old_p = get_kprobe(p->addr);
 	if (unlikely(!old_p)) {
-		up(&kprobe_mutex);
+		mutex_unlock(&kprobe_mutex);
 		return;
 	}
 	if (p != old_p) {
@@ -507,7 +507,7 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 			if (list_p == p)
 			/* kprobe p is a valid probe */
 				goto valid_p;
-		up(&kprobe_mutex);
+		mutex_unlock(&kprobe_mutex);
 		return;
 	}
 valid_p:
@@ -523,7 +523,7 @@ valid_p:
 		cleanup_p = 0;
 	}
 
-	up(&kprobe_mutex);
+	mutex_unlock(&kprobe_mutex);
 
 	synchronize_sched();
 	if (p->mod_refcounted &&
-- 
cgit v1.2.3


From 6389a385114ae358693f213266de6468ea116c77 Mon Sep 17 00:00:00 2001
From: Ashutosh Naik <ashutosh.naik@gmail.com>
Date: Thu, 23 Mar 2006 03:00:46 -0800
Subject: [PATCH] kernel/module.c Semaphore to Mutex Conversion for
 module_mutex

This patch converts the module_mutex semaphore to a mutex.

Signed-off-by: Ashutosh Naik <ashutosh.naik@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index de6312da6bb5..fb404299082e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -61,7 +61,7 @@
 static DEFINE_SPINLOCK(modlist_lock);
 
 /* List of modules, protected by module_mutex AND modlist_lock */
-static DECLARE_MUTEX(module_mutex);
+static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
 static DEFINE_MUTEX(notify_mutex);
@@ -602,7 +602,7 @@ static void free_module(struct module *mod);
 static void wait_for_zero_refcount(struct module *mod)
 {
 	/* Since we might sleep for some time, drop the semaphore first */
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 	for (;;) {
 		DEBUGP("Looking at refcount...\n");
 		set_current_state(TASK_UNINTERRUPTIBLE);
@@ -611,7 +611,7 @@ static void wait_for_zero_refcount(struct module *mod)
 		schedule();
 	}
 	current->state = TASK_RUNNING;
-	down(&module_mutex);
+	mutex_lock(&module_mutex);
 }
 
 asmlinkage long
@@ -628,7 +628,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		return -EFAULT;
 	name[MODULE_NAME_LEN-1] = '\0';
 
-	if (down_interruptible(&module_mutex) != 0)
+	if (mutex_lock_interruptible(&module_mutex) != 0)
 		return -EINTR;
 
 	mod = find_module(name);
@@ -677,14 +677,14 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 
 	/* Final destruction now noone is using it. */
 	if (mod->exit != NULL) {
-		up(&module_mutex);
+		mutex_unlock(&module_mutex);
 		mod->exit();
-		down(&module_mutex);
+		mutex_lock(&module_mutex);
 	}
 	free_module(mod);
 
  out:
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 	return ret;
 }
 
@@ -1973,13 +1973,13 @@ sys_init_module(void __user *umod,
 		return -EPERM;
 
 	/* Only one module load at a time, please */
-	if (down_interruptible(&module_mutex) != 0)
+	if (mutex_lock_interruptible(&module_mutex) != 0)
 		return -EINTR;
 
 	/* Do all the hard work */
 	mod = load_module(umod, len, uargs);
 	if (IS_ERR(mod)) {
-		up(&module_mutex);
+		mutex_unlock(&module_mutex);
 		return PTR_ERR(mod);
 	}
 
@@ -1988,7 +1988,7 @@ sys_init_module(void __user *umod,
 	stop_machine_run(__link_module, mod, NR_CPUS);
 
 	/* Drop lock so they can recurse */
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 
 	mutex_lock(&notify_mutex);
 	notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
@@ -2007,15 +2007,15 @@ sys_init_module(void __user *umod,
 			       mod->name);
 		else {
 			module_put(mod);
-			down(&module_mutex);
+			mutex_lock(&module_mutex);
 			free_module(mod);
-			up(&module_mutex);
+			mutex_unlock(&module_mutex);
 		}
 		return ret;
 	}
 
 	/* Now it's a first class citizen! */
-	down(&module_mutex);
+	mutex_lock(&module_mutex);
 	mod->state = MODULE_STATE_LIVE;
 	/* Drop initial reference. */
 	module_put(mod);
@@ -2023,7 +2023,7 @@ sys_init_module(void __user *umod,
 	mod->module_init = NULL;
 	mod->init_size = 0;
 	mod->init_text_size = 0;
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 
 	return 0;
 }
@@ -2113,7 +2113,7 @@ struct module *module_get_kallsym(unsigned int symnum,
 {
 	struct module *mod;
 
-	down(&module_mutex);
+	mutex_lock(&module_mutex);
 	list_for_each_entry(mod, &modules, list) {
 		if (symnum < mod->num_symtab) {
 			*value = mod->symtab[symnum].st_value;
@@ -2121,12 +2121,12 @@ struct module *module_get_kallsym(unsigned int symnum,
 			strncpy(namebuf,
 				mod->strtab + mod->symtab[symnum].st_name,
 				127);
-			up(&module_mutex);
+			mutex_unlock(&module_mutex);
 			return mod;
 		}
 		symnum -= mod->num_symtab;
 	}
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 	return NULL;
 }
 
@@ -2169,7 +2169,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	struct list_head *i;
 	loff_t n = 0;
 
-	down(&module_mutex);
+	mutex_lock(&module_mutex);
 	list_for_each(i, &modules) {
 		if (n++ == *pos)
 			break;
@@ -2190,7 +2190,7 @@ static void *m_next(struct seq_file *m, void *p, loff_t *pos)
 
 static void m_stop(struct seq_file *m, void *p)
 {
-	up(&module_mutex);
+	mutex_unlock(&module_mutex);
 }
 
 static int m_show(struct seq_file *m, void *p)
-- 
cgit v1.2.3


From a26fd335b481e0bd14f4e7d1f5e7bb1138b1731f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Mar 2006 03:00:49 -0800
Subject: [PATCH] sigprocmask: kill unneeded temp var

Cleanup, remove unneeded double copying of current->blocked.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index dfb09ba5c013..75f7341b0c39 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2101,10 +2101,11 @@ long do_no_restart_syscall(struct restart_block *param)
 int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
 {
 	int error;
-	sigset_t old_block;
 
 	spin_lock_irq(&current->sighand->siglock);
-	old_block = current->blocked;
+	if (oldset)
+		*oldset = current->blocked;
+
 	error = 0;
 	switch (how) {
 	case SIG_BLOCK:
@@ -2121,8 +2122,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
 	}
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
-	if (oldset)
-		*oldset = old_block;
+
 	return error;
 }
 
-- 
cgit v1.2.3


From 91368d73e4b60d577ad171e5bd315b564265fcdb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 23 Mar 2006 03:00:54 -0800
Subject: [PATCH] make bug messages more consistent

Consolidate all kernel bug printouts to begin with the "BUG: " string.
Makes it easier to find them in large bootup logs.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/nmi.c    | 2 +-
 arch/i386/mm/fault.c      | 4 ++--
 include/asm-generic/bug.h | 4 ++--
 kernel/sched.c            | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index be87c5e2ee95..1db34effdd8d 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -543,7 +543,7 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 			/*
 			 * die_nmi will return ONLY if NOTIFY_STOP happens..
 			 */
-			die_nmi(regs, "NMI Watchdog detected LOCKUP");
+			die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP");
 	} else {
 		last_irq_sums[cpu] = sum;
 		alert_counter[cpu] = 0;
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index bbb24af5d860..47a3b72ec7b6 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -518,9 +518,9 @@ no_context:
 	}
 #endif
 	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+		printk(KERN_ALERT "BUG: unable to handle kernel NULL pointer dereference");
 	else
-		printk(KERN_ALERT "Unable to handle kernel paging request");
+		printk(KERN_ALERT "BUG: unable to handle kernel paging request");
 	printk(" at virtual address %08lx\n",address);
 	printk(KERN_ALERT " printing eip:\n");
 	printk("%08lx\n", regs->eip);
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 400c2b41896e..1a565a9d2fa7 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -7,7 +7,7 @@
 #ifdef CONFIG_BUG
 #ifndef HAVE_ARCH_BUG
 #define BUG() do { \
-	printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+	printk("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __FUNCTION__); \
 	panic("BUG!"); \
 } while (0)
 #endif
@@ -19,7 +19,7 @@
 #ifndef HAVE_ARCH_WARN_ON
 #define WARN_ON(condition) do { \
 	if (unlikely((condition)!=0)) { \
-		printk("Badness in %s at %s:%d\n", __FUNCTION__, __FILE__, __LINE__); \
+		printk("BUG: warning at %s:%d/%s()\n", __FILE__, __LINE__, __FUNCTION__); \
 		dump_stack(); \
 	} \
 } while (0)
diff --git a/kernel/sched.c b/kernel/sched.c
index a5bd60453eae..7ffaabd64f89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2873,7 +2873,7 @@ asmlinkage void __sched schedule(void)
 	 */
 	if (likely(!current->exit_state)) {
 		if (unlikely(in_atomic())) {
-			printk(KERN_ERR "scheduling while atomic: "
+			printk(KERN_ERR "BUG: scheduling while atomic: "
 				"%s/0x%08x/%d\n",
 				current->comm, preempt_count(), current->pid);
 			dump_stack();
@@ -6074,7 +6074,7 @@ void __might_sleep(char *file, int line)
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
-		printk(KERN_ERR "Debug: sleeping function called from invalid"
+		printk(KERN_ERR "BUG: sleeping function called from invalid"
 				" context at %s:%d\n", file, line);
 		printk("in_atomic():%d, irqs_disabled():%d\n",
 			in_atomic(), irqs_disabled());
-- 
cgit v1.2.3


From dd287796d608fcdc3fe5e8fdb5bf762a8f1bc32a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 03:00:57 -0800
Subject: [PATCH] pause_on_oops command line option

Attempt to fix the problem wherein people's oops reports scroll off the screen
due to repeated oopsing or to oopses on other CPUs.

If this happens the user can reboot with the `pause_on_oops=<seconds>' option.
It will allow the first oopsing CPU to print an oops record just a single
time.  Second oopsing attempts, or oopses on other CPUs will cause those CPUs
to enter a tight loop until the specified number of seconds have elapsed.

The patch implements the infrastructure generically in the expectation that
architectures other than x86 will find it useful.

Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |  5 ++
 arch/i386/kernel/traps.c            |  3 ++
 arch/i386/mm/fault.c                | 39 +++++++++------
 include/linux/kernel.h              |  3 ++
 kernel/panic.c                      | 97 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 130 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 880be3a30d8d..7b7382d0f758 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1126,6 +1126,11 @@ running once the system is up.
 	pas16=		[HW,SCSI]
 			See header of drivers/scsi/pas16.c.
 
+	pause_on_oops=
+			Halt all CPUs after the first oops has been printed for
+			the specified number of seconds.  This is to be used if
+			your oopses keep scrolling off the screen.
+
 	pcbit=		[HW,ISDN]
 
 	pcd.		[PARIDE]
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 1b7ad4115d81..de5386b01d38 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -352,6 +352,8 @@ void die(const char * str, struct pt_regs * regs, long err)
 	static int die_counter;
 	unsigned long flags;
 
+	oops_enter();
+
 	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
 		spin_lock_irqsave(&die.lock, flags);
@@ -404,6 +406,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 		ssleep(5);
 		panic("Fatal exception");
 	}
+	oops_exit();
 	do_exit(SIGSEGV);
 }
 
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 47a3b72ec7b6..7f0fcf219a26 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -509,24 +509,31 @@ no_context:
 
 	bust_spinlocks(1);
 
-#ifdef CONFIG_X86_PAE
-	if (error_code & 16) {
-		pte_t *pte = lookup_address(address);
-
-		if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
-			printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
+	if (oops_may_print()) {
+	#ifdef CONFIG_X86_PAE
+		if (error_code & 16) {
+			pte_t *pte = lookup_address(address);
+
+			if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+				printk(KERN_CRIT "kernel tried to execute "
+					"NX-protected page - exploit attempt? "
+					"(uid: %d)\n", current->uid);
+		}
+	#endif
+		if (address < PAGE_SIZE)
+			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
+					"pointer dereference");
+		else
+			printk(KERN_ALERT "BUG: unable to handle kernel paging"
+					" request");
+		printk(" at virtual address %08lx\n",address);
+		printk(KERN_ALERT " printing eip:\n");
+		printk("%08lx\n", regs->eip);
 	}
-#endif
-	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "BUG: unable to handle kernel NULL pointer dereference");
-	else
-		printk(KERN_ALERT "BUG: unable to handle kernel paging request");
-	printk(" at virtual address %08lx\n",address);
-	printk(KERN_ALERT " printing eip:\n");
-	printk("%08lx\n", regs->eip);
 	page = read_cr3();
 	page = ((unsigned long *) __va(page))[address >> 22];
-	printk(KERN_ALERT "*pde = %08lx\n", page);
+	if (oops_may_print())
+		printk(KERN_ALERT "*pde = %08lx\n", page);
 	/*
 	 * We must not directly access the pte in the highpte
 	 * case, the page table might be allocated in highmem.
@@ -534,7 +541,7 @@ no_context:
 	 * it's allocated already.
 	 */
 #ifndef CONFIG_HIGHPTE
-	if (page & 1) {
+	if ((page & 1) && oops_may_print()) {
 		page &= PAGE_MASK;
 		address &= 0x003ff000;
 		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3b507bf05d09..bb6e7ddee2fd 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -91,6 +91,9 @@ extern struct notifier_block *panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
 	__attribute__ ((NORET_AND format (printf, 1, 2)));
+extern void oops_enter(void);
+extern void oops_exit(void);
+extern int oops_may_print(void);
 fastcall NORET_TYPE void do_exit(long error_code)
 	ATTRIB_NORET;
 NORET_TYPE void complete_and_exit(struct completion *, long)
diff --git a/kernel/panic.c b/kernel/panic.c
index 126dc43f1c74..acd95adddb93 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -20,10 +20,13 @@
 #include <linux/nmi.h>
 #include <linux/kexec.h>
 
-int panic_timeout;
 int panic_on_oops;
 int tainted;
+static int pause_on_oops;
+static int pause_on_oops_flag;
+static DEFINE_SPINLOCK(pause_on_oops_lock);
 
+int panic_timeout;
 EXPORT_SYMBOL(panic_timeout);
 
 struct notifier_block *panic_notifier_list;
@@ -174,3 +177,95 @@ void add_taint(unsigned flag)
 	tainted |= flag;
 }
 EXPORT_SYMBOL(add_taint);
+
+static int __init pause_on_oops_setup(char *str)
+{
+	pause_on_oops = simple_strtoul(str, NULL, 0);
+	return 1;
+}
+__setup("pause_on_oops=", pause_on_oops_setup);
+
+static void spin_msec(int msecs)
+{
+	int i;
+
+	for (i = 0; i < msecs; i++) {
+		touch_nmi_watchdog();
+		mdelay(1);
+	}
+}
+
+/*
+ * It just happens that oops_enter() and oops_exit() are identically
+ * implemented...
+ */
+static void do_oops_enter_exit(void)
+{
+	unsigned long flags;
+	static int spin_counter;
+
+	if (!pause_on_oops)
+		return;
+
+	spin_lock_irqsave(&pause_on_oops_lock, flags);
+	if (pause_on_oops_flag == 0) {
+		/* This CPU may now print the oops message */
+		pause_on_oops_flag = 1;
+	} else {
+		/* We need to stall this CPU */
+		if (!spin_counter) {
+			/* This CPU gets to do the counting */
+			spin_counter = pause_on_oops;
+			do {
+				spin_unlock(&pause_on_oops_lock);
+				spin_msec(MSEC_PER_SEC);
+				spin_lock(&pause_on_oops_lock);
+			} while (--spin_counter);
+			pause_on_oops_flag = 0;
+		} else {
+			/* This CPU waits for a different one */
+			while (spin_counter) {
+				spin_unlock(&pause_on_oops_lock);
+				spin_msec(1);
+				spin_lock(&pause_on_oops_lock);
+			}
+		}
+	}
+	spin_unlock_irqrestore(&pause_on_oops_lock, flags);
+}
+
+/*
+ * Return true if the calling CPU is allowed to print oops-related info.  This
+ * is a bit racy..
+ */
+int oops_may_print(void)
+{
+	return pause_on_oops_flag == 0;
+}
+
+/*
+ * Called when the architecture enters its oops handler, before it prints
+ * anything.  If this is the first CPU to oops, and it's oopsing the first time
+ * then let it proceed.
+ *
+ * This is all enabled by the pause_on_oops kernel boot option.  We do all this
+ * to ensure that oopses don't scroll off the screen.  It has the side-effect
+ * of preventing later-oopsing CPUs from mucking up the display, too.
+ *
+ * It turns out that the CPU which is allowed to print ends up pausing for the
+ * right duration, whereas all the other CPUs pause for twice as long: once in
+ * oops_enter(), once in oops_exit().
+ */
+void oops_enter(void)
+{
+	do_oops_enter_exit();
+}
+
+/*
+ * Called when the architecture exits its oops handler, after printing
+ * everything.
+ */
+void oops_exit(void)
+{
+	do_oops_enter_exit();
+}
-- 
cgit v1.2.3


From ee25e96fcd78837c9f192aa655ce12a88bfd63d4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Mar 2006 03:00:58 -0800
Subject: [PATCH] BUILD_LOCK_OPS: cleanup preempt_disable() usage

This patch changes the code from:

	preempt_disable();
	for (;;) {
		...
		preempt_disable();
	}
to:
	for (;;) {
		preempt_disable();
		...
	}

which seems more clean to me and saves a couple of bytes for
each function.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/spinlock.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0375fcd5921d..d1b810782bc4 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -179,16 +179,16 @@ EXPORT_SYMBOL(_write_lock);
 #define BUILD_LOCK_OPS(op, locktype)					\
 void __lockfunc _##op##_lock(locktype##_t *lock)			\
 {									\
-	preempt_disable();						\
 	for (;;) {							\
+		preempt_disable();					\
 		if (likely(_raw_##op##_trylock(lock)))			\
 			break;						\
 		preempt_enable();					\
+									\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
 		while (!op##_can_lock(lock) && (lock)->break_lock)	\
 			cpu_relax();					\
-		preempt_disable();					\
 	}								\
 	(lock)->break_lock = 0;						\
 }									\
@@ -199,19 +199,18 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)	\
 {									\
 	unsigned long flags;						\
 									\
-	preempt_disable();						\
 	for (;;) {							\
+		preempt_disable();					\
 		local_irq_save(flags);					\
 		if (likely(_raw_##op##_trylock(lock)))			\
 			break;						\
 		local_irq_restore(flags);				\
-									\
 		preempt_enable();					\
+									\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
 		while (!op##_can_lock(lock) && (lock)->break_lock)	\
 			cpu_relax();					\
-		preempt_disable();					\
 	}								\
 	(lock)->break_lock = 0;						\
 	return flags;							\
-- 
cgit v1.2.3


From 2178426d26661ed6e18a8d6ea0bc05c98d73600d Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Mar 2006 03:01:00 -0800
Subject: [PATCH] kernel/rcupdate.c: make two structs static

This patch makes two needlessly global structs static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rcupdate.h | 2 --
 kernel/rcupdate.c        | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index c2ec6c77874e..5673008b61e1 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -113,8 +113,6 @@ struct rcu_data {
 
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-extern struct rcu_ctrlblk rcu_ctrlblk;
-extern struct rcu_ctrlblk rcu_bh_ctrlblk;
 
 /*
  * Increment the quiescent state counter.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index af8a2a57e17d..6df1559b1c02 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -50,13 +50,13 @@
 #include <linux/mutex.h>
 
 /* Definition for rcupdate control block. */
-struct rcu_ctrlblk rcu_ctrlblk = {
+static struct rcu_ctrlblk rcu_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
 	.lock = SPIN_LOCK_UNLOCKED,
 	.cpumask = CPU_MASK_NONE,
 };
-struct rcu_ctrlblk rcu_bh_ctrlblk = {
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
 	.lock = SPIN_LOCK_UNLOCKED,
-- 
cgit v1.2.3


From b86ff981a8252d83d6a7719ae09f3a05307e3592 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 19:56:55 +0100
Subject: [PATCH] relay: migrate from relayfs to a generic relay API

Original patch from Paul Mundt, sysfs parts removed by me since they
were broken.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/Kconfig            |  12 -
 fs/Makefile           |   1 -
 fs/relayfs/Makefile   |   4 -
 fs/relayfs/buffers.c  | 190 -----------
 fs/relayfs/buffers.h  |  12 -
 fs/relayfs/inode.c    | 581 -------------------------------
 fs/relayfs/relay.c    | 482 --------------------------
 fs/relayfs/relay.h    |   8 -
 include/linux/relay.h | 281 +++++++++++++++
 init/Kconfig          |  11 +
 kernel/Makefile       |   1 +
 kernel/relay.c        | 919 ++++++++++++++++++++++++++++++++++++++++++++++++++
 12 files changed, 1212 insertions(+), 1290 deletions(-)
 delete mode 100644 fs/relayfs/Makefile
 delete mode 100644 fs/relayfs/buffers.c
 delete mode 100644 fs/relayfs/buffers.h
 delete mode 100644 fs/relayfs/inode.c
 delete mode 100644 fs/relayfs/relay.c
 delete mode 100644 fs/relayfs/relay.h
 create mode 100644 include/linux/relay.h
 create mode 100644 kernel/relay.c

(limited to 'kernel')

diff --git a/fs/Kconfig b/fs/Kconfig
index e9749b0eecd8..c8d0a209120c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -859,18 +859,6 @@ config RAMFS
 	  To compile this as a module, choose M here: the module will be called
 	  ramfs.
 
-config RELAYFS_FS
-	tristate "Relayfs file system support"
-	---help---
-	  Relayfs is a high-speed data relay filesystem designed to provide
-	  an efficient mechanism for tools and facilities to relay large
-	  amounts of data from kernel space to user space.
-
-	  To compile this code as a module, choose M here: the module will be
-	  called relayfs.
-
-	  If unsure, say N.
-
 config CONFIGFS_FS
 	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index 1db711319c80..080b3867be4d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -91,7 +91,6 @@ obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_UDF_FS)		+= udf/
-obj-$(CONFIG_RELAYFS_FS)	+= relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
diff --git a/fs/relayfs/Makefile b/fs/relayfs/Makefile
deleted file mode 100644
index e76e182cdb38..000000000000
--- a/fs/relayfs/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-obj-$(CONFIG_RELAYFS_FS) += relayfs.o
-
-relayfs-y := relay.o inode.o buffers.o
-
diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
deleted file mode 100644
index 10187812771e..000000000000
--- a/fs/relayfs/buffers.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * RelayFS buffer management code.
- *
- * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
- * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
- *
- * This file is released under the GPL.
- */
-
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-/*
- * close() vm_op implementation for relayfs file mapping.
- */
-static void relay_file_mmap_close(struct vm_area_struct *vma)
-{
-	struct rchan_buf *buf = vma->vm_private_data;
-	buf->chan->cb->buf_unmapped(buf, vma->vm_file);
-}
-
-/*
- * nopage() vm_op implementation for relayfs file mapping.
- */
-static struct page *relay_buf_nopage(struct vm_area_struct *vma,
-				     unsigned long address,
-				     int *type)
-{
-	struct page *page;
-	struct rchan_buf *buf = vma->vm_private_data;
-	unsigned long offset = address - vma->vm_start;
-
-	if (address > vma->vm_end)
-		return NOPAGE_SIGBUS; /* Disallow mremap */
-	if (!buf)
-		return NOPAGE_OOM;
-
-	page = vmalloc_to_page(buf->start + offset);
-	if (!page)
-		return NOPAGE_OOM;
-	get_page(page);
-
-	if (type)
-		*type = VM_FAULT_MINOR;
-
-	return page;
-}
-
-/*
- * vm_ops for relay file mappings.
- */
-static struct vm_operations_struct relay_file_mmap_ops = {
-	.nopage = relay_buf_nopage,
-	.close = relay_file_mmap_close,
-};
-
-/**
- *	relay_mmap_buf: - mmap channel buffer to process address space
- *	@buf: relay channel buffer
- *	@vma: vm_area_struct describing memory to be mapped
- *
- *	Returns 0 if ok, negative on error
- *
- *	Caller should already have grabbed mmap_sem.
- */
-int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
-{
-	unsigned long length = vma->vm_end - vma->vm_start;
-	struct file *filp = vma->vm_file;
-
-	if (!buf)
-		return -EBADF;
-
-	if (length != (unsigned long)buf->chan->alloc_size)
-		return -EINVAL;
-
-	vma->vm_ops = &relay_file_mmap_ops;
-	vma->vm_private_data = buf;
-	buf->chan->cb->buf_mapped(buf, filp);
-
-	return 0;
-}
-
-/**
- *	relay_alloc_buf - allocate a channel buffer
- *	@buf: the buffer struct
- *	@size: total size of the buffer
- *
- *	Returns a pointer to the resulting buffer, NULL if unsuccessful
- */
-static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
-{
-	void *mem;
-	unsigned int i, j, n_pages;
-
-	size = PAGE_ALIGN(size);
-	n_pages = size >> PAGE_SHIFT;
-
-	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
-	if (!buf->page_array)
-		return NULL;
-
-	for (i = 0; i < n_pages; i++) {
-		buf->page_array[i] = alloc_page(GFP_KERNEL);
-		if (unlikely(!buf->page_array[i]))
-			goto depopulate;
-	}
-	mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
-	if (!mem)
-		goto depopulate;
-
-	memset(mem, 0, size);
-	buf->page_count = n_pages;
-	return mem;
-
-depopulate:
-	for (j = 0; j < i; j++)
-		__free_page(buf->page_array[j]);
-	kfree(buf->page_array);
-	return NULL;
-}
-
-/**
- *	relay_create_buf - allocate and initialize a channel buffer
- *	@alloc_size: size of the buffer to allocate
- *	@n_subbufs: number of sub-buffers in the channel
- *
- *	Returns channel buffer if successful, NULL otherwise
- */
-struct rchan_buf *relay_create_buf(struct rchan *chan)
-{
-	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
-	if (!buf)
-		return NULL;
-
-	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
-	if (!buf->padding)
-		goto free_buf;
-
-	buf->start = relay_alloc_buf(buf, chan->alloc_size);
-	if (!buf->start)
-		goto free_buf;
-
-	buf->chan = chan;
-	kref_get(&buf->chan->kref);
-	return buf;
-
-free_buf:
-	kfree(buf->padding);
-	kfree(buf);
-	return NULL;
-}
-
-/**
- *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
- *	@buf: the buffer struct
- */
-void relay_destroy_buf(struct rchan_buf *buf)
-{
-	struct rchan *chan = buf->chan;
-	unsigned int i;
-
-	if (likely(buf->start)) {
-		vunmap(buf->start);
-		for (i = 0; i < buf->page_count; i++)
-			__free_page(buf->page_array[i]);
-		kfree(buf->page_array);
-	}
-	kfree(buf->padding);
-	kfree(buf);
-	kref_put(&chan->kref, relay_destroy_channel);
-}
-
-/**
- *	relay_remove_buf - remove a channel buffer
- *
- *	Removes the file from the relayfs fileystem, which also frees the
- *	rchan_buf_struct and the channel buffer.  Should only be called from
- *	kref_put().
- */
-void relay_remove_buf(struct kref *kref)
-{
-	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-	buf->chan->cb->remove_buf_file(buf->dentry);
-	relay_destroy_buf(buf);
-}
diff --git a/fs/relayfs/buffers.h b/fs/relayfs/buffers.h
deleted file mode 100644
index 37a12493f641..000000000000
--- a/fs/relayfs/buffers.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _BUFFERS_H
-#define _BUFFERS_H
-
-/* This inspired by rtai/shmem */
-#define FIX_SIZE(x) (((x) - 1) & PAGE_MASK) + PAGE_SIZE
-
-extern int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma);
-extern struct rchan_buf *relay_create_buf(struct rchan *chan);
-extern void relay_destroy_buf(struct rchan_buf *buf);
-extern void relay_remove_buf(struct kref *kref);
-
-#endif/* _BUFFERS_H */
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
deleted file mode 100644
index 383523011aad..000000000000
--- a/fs/relayfs/inode.c
+++ /dev/null
@@ -1,581 +0,0 @@
-/*
- * VFS-related code for RelayFS, a high-speed data relay filesystem.
- *
- * Copyright (C) 2003-2005 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
- * Copyright (C) 2003-2005 - Karim Yaghmour <karim@opersys.com>
- *
- * Based on ramfs, Copyright (C) 2002 - Linus Torvalds
- *
- * This file is released under the GPL.
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/backing-dev.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-#define RELAYFS_MAGIC			0xF0B4A981
-
-static struct vfsmount *		relayfs_mount;
-static int				relayfs_mount_count;
-
-static struct backing_dev_info		relayfs_backing_dev_info = {
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
-};
-
-static struct inode *relayfs_get_inode(struct super_block *sb,
-				       int mode,
- 				       struct file_operations *fops,
-				       void *data)
-{
-	struct inode *inode;
-
-	inode = new_inode(sb);
-	if (!inode)
-		return NULL;
-
-	inode->i_mode = mode;
-	inode->i_uid = 0;
-	inode->i_gid = 0;
-	inode->i_blksize = PAGE_CACHE_SIZE;
-	inode->i_blocks = 0;
-	inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-		inode->i_fop = fops;
-		if (data)
-			inode->u.generic_ip = data;
-		break;
-	case S_IFDIR:
-		inode->i_op = &simple_dir_inode_operations;
-		inode->i_fop = &simple_dir_operations;
-
-		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
-		break;
-	default:
-		break;
-	}
-
-	return inode;
-}
-
-/**
- *	relayfs_create_entry - create a relayfs directory or file
- *	@name: the name of the file to create
- *	@parent: parent directory
- *	@mode: mode
- *	@fops: file operations to use for the file
- *	@data: user-associated data for this file
- *
- *	Returns the new dentry, NULL on failure
- *
- *	Creates a file or directory with the specifed permissions.
- */
-static struct dentry *relayfs_create_entry(const char *name,
-					   struct dentry *parent,
-					   int mode,
-					   struct file_operations *fops,
-					   void *data)
-{
-	struct dentry *d;
-	struct inode *inode;
-	int error = 0;
-
-	BUG_ON(!name || !(S_ISREG(mode) || S_ISDIR(mode)));
-
-	error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
-	if (error) {
-		printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
-		return NULL;
-	}
-
-	if (!parent && relayfs_mount && relayfs_mount->mnt_sb)
-		parent = relayfs_mount->mnt_sb->s_root;
-
-	if (!parent) {
-		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-		return NULL;
-	}
-
-	parent = dget(parent);
-	mutex_lock(&parent->d_inode->i_mutex);
-	d = lookup_one_len(name, parent, strlen(name));
-	if (IS_ERR(d)) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	if (d->d_inode) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data);
-	if (!inode) {
-		d = NULL;
-		goto release_mount;
-	}
-
-	d_instantiate(d, inode);
-	dget(d);	/* Extra count - pin the dentry in core */
-
-	if (S_ISDIR(mode))
-		parent->d_inode->i_nlink++;
-
-	goto exit;
-
-release_mount:
-	simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-
-exit:
-	mutex_unlock(&parent->d_inode->i_mutex);
-	dput(parent);
-	return d;
-}
-
-/**
- *	relayfs_create_file - create a file in the relay filesystem
- *	@name: the name of the file to create
- *	@parent: parent directory
- *	@mode: mode, if not specied the default perms are used
- *	@fops: file operations to use for the file
- *	@data: user-associated data for this file
- *
- *	Returns file dentry if successful, NULL otherwise.
- *
- *	The file will be created user r on behalf of current user.
- */
-struct dentry *relayfs_create_file(const char *name,
-				   struct dentry *parent,
-				   int mode,
-				   struct file_operations *fops,
-				   void *data)
-{
-	BUG_ON(!fops);
-
-	if (!mode)
-		mode = S_IRUSR;
-	mode = (mode & S_IALLUGO) | S_IFREG;
-
-	return relayfs_create_entry(name, parent, mode, fops, data);
-}
-
-/**
- *	relayfs_create_dir - create a directory in the relay filesystem
- *	@name: the name of the directory to create
- *	@parent: parent directory, NULL if parent should be fs root
- *
- *	Returns directory dentry if successful, NULL otherwise.
- *
- *	The directory will be created world rwx on behalf of current user.
- */
-struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
-{
-	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	return relayfs_create_entry(name, parent, mode, NULL, NULL);
-}
-
-/**
- *	relayfs_remove - remove a file or directory in the relay filesystem
- *	@dentry: file or directory dentry
- *
- *	Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove(struct dentry *dentry)
-{
-	struct dentry *parent;
-	int error = 0;
-
-	if (!dentry)
-		return -EINVAL;
-	parent = dentry->d_parent;
-	if (!parent)
-		return -EINVAL;
-
-	parent = dget(parent);
-	mutex_lock(&parent->d_inode->i_mutex);
-	if (dentry->d_inode) {
-		if (S_ISDIR(dentry->d_inode->i_mode))
-			error = simple_rmdir(parent->d_inode, dentry);
-		else
-			error = simple_unlink(parent->d_inode, dentry);
-		if (!error)
-			d_delete(dentry);
-	}
-	if (!error)
-		dput(dentry);
-	mutex_unlock(&parent->d_inode->i_mutex);
-	dput(parent);
-
-	if (!error)
-		simple_release_fs(&relayfs_mount, &relayfs_mount_count);
-
-	return error;
-}
-
-/**
- *	relayfs_remove_file - remove a file from relay filesystem
- *	@dentry: directory dentry
- *
- *	Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove_file(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/**
- *	relayfs_remove_dir - remove a directory in the relay filesystem
- *	@dentry: directory dentry
- *
- *	Returns 0 if successful, negative otherwise.
- */
-int relayfs_remove_dir(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/**
- *	relay_file_open - open file op for relay files
- *	@inode: the inode
- *	@filp: the file
- *
- *	Increments the channel buffer refcount.
- */
-static int relay_file_open(struct inode *inode, struct file *filp)
-{
-	struct rchan_buf *buf = inode->u.generic_ip;
-	kref_get(&buf->kref);
-	filp->private_data = buf;
-
-	return 0;
-}
-
-/**
- *	relay_file_mmap - mmap file op for relay files
- *	@filp: the file
- *	@vma: the vma describing what to map
- *
- *	Calls upon relay_mmap_buf to map the file into user space.
- */
-static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-	struct rchan_buf *buf = filp->private_data;
-	return relay_mmap_buf(buf, vma);
-}
-
-/**
- *	relay_file_poll - poll file op for relay files
- *	@filp: the file
- *	@wait: poll table
- *
- *	Poll implemention.
- */
-static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
-{
-	unsigned int mask = 0;
-	struct rchan_buf *buf = filp->private_data;
-
-	if (buf->finalized)
-		return POLLERR;
-
-	if (filp->f_mode & FMODE_READ) {
-		poll_wait(filp, &buf->read_wait, wait);
-		if (!relay_buf_empty(buf))
-			mask |= POLLIN | POLLRDNORM;
-	}
-
-	return mask;
-}
-
-/**
- *	relay_file_release - release file op for relay files
- *	@inode: the inode
- *	@filp: the file
- *
- *	Decrements the channel refcount, as the filesystem is
- *	no longer using it.
- */
-static int relay_file_release(struct inode *inode, struct file *filp)
-{
-	struct rchan_buf *buf = filp->private_data;
-	kref_put(&buf->kref, relay_remove_buf);
-
-	return 0;
-}
-
-/**
- *	relay_file_read_consume - update the consumed count for the buffer
- */
-static void relay_file_read_consume(struct rchan_buf *buf,
-				    size_t read_pos,
-				    size_t bytes_consumed)
-{
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-	size_t read_subbuf;
-
-	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
-		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
-		buf->bytes_consumed = 0;
-	}
-
-	buf->bytes_consumed += bytes_consumed;
-	read_subbuf = read_pos / buf->chan->subbuf_size;
-	if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
-		if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
-		    (buf->offset == subbuf_size))
-			return;
-		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
-		buf->bytes_consumed = 0;
-	}
-}
-
-/**
- *	relay_file_read_avail - boolean, are there unconsumed bytes available?
- */
-static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
-{
-	size_t bytes_produced, bytes_consumed, write_offset;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-	size_t produced = buf->subbufs_produced % n_subbufs;
-	size_t consumed = buf->subbufs_consumed % n_subbufs;
-
-	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
-
-	if (consumed > produced) {
-		if ((produced > n_subbufs) &&
-		    (produced + n_subbufs - consumed <= n_subbufs))
-			produced += n_subbufs;
-	} else if (consumed == produced) {
-		if (buf->offset > subbuf_size) {
-			produced += n_subbufs;
-			if (buf->subbufs_produced == buf->subbufs_consumed)
-				consumed += n_subbufs;
-		}
-	}
-
-	if (buf->offset > subbuf_size)
-		bytes_produced = (produced - 1) * subbuf_size + write_offset;
-	else
-		bytes_produced = produced * subbuf_size + write_offset;
-	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
-
-	if (bytes_produced == bytes_consumed)
-		return 0;
-
-	relay_file_read_consume(buf, read_pos, 0);
-
-	return 1;
-}
-
-/**
- *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
- */
-static size_t relay_file_read_subbuf_avail(size_t read_pos,
-					   struct rchan_buf *buf)
-{
-	size_t padding, avail = 0;
-	size_t read_subbuf, read_offset, write_subbuf, write_offset;
-	size_t subbuf_size = buf->chan->subbuf_size;
-
-	write_subbuf = (buf->data - buf->start) / subbuf_size;
-	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
-	read_subbuf = read_pos / subbuf_size;
-	read_offset = read_pos % subbuf_size;
-	padding = buf->padding[read_subbuf];
-
-	if (read_subbuf == write_subbuf) {
-		if (read_offset + padding < write_offset)
-			avail = write_offset - (read_offset + padding);
-	} else
-		avail = (subbuf_size - padding) - read_offset;
-
-	return avail;
-}
-
-/**
- *	relay_file_read_start_pos - find the first available byte to read
- *
- *	If the read_pos is in the middle of padding, return the
- *	position of the first actually available byte, otherwise
- *	return the original value.
- */
-static size_t relay_file_read_start_pos(size_t read_pos,
-					struct rchan_buf *buf)
-{
-	size_t read_subbuf, padding, padding_start, padding_end;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-
-	read_subbuf = read_pos / subbuf_size;
-	padding = buf->padding[read_subbuf];
-	padding_start = (read_subbuf + 1) * subbuf_size - padding;
-	padding_end = (read_subbuf + 1) * subbuf_size;
-	if (read_pos >= padding_start && read_pos < padding_end) {
-		read_subbuf = (read_subbuf + 1) % n_subbufs;
-		read_pos = read_subbuf * subbuf_size;
-	}
-
-	return read_pos;
-}
-
-/**
- *	relay_file_read_end_pos - return the new read position
- */
-static size_t relay_file_read_end_pos(struct rchan_buf *buf,
-				      size_t read_pos,
-				      size_t count)
-{
-	size_t read_subbuf, padding, end_pos;
-	size_t subbuf_size = buf->chan->subbuf_size;
-	size_t n_subbufs = buf->chan->n_subbufs;
-
-	read_subbuf = read_pos / subbuf_size;
-	padding = buf->padding[read_subbuf];
-	if (read_pos % subbuf_size + count + padding == subbuf_size)
-		end_pos = (read_subbuf + 1) * subbuf_size;
-	else
-		end_pos = read_pos + count;
-	if (end_pos >= subbuf_size * n_subbufs)
-		end_pos = 0;
-
-	return end_pos;
-}
-
-/**
- *	relay_file_read - read file op for relay files
- *	@filp: the file
- *	@buffer: the userspace buffer
- *	@count: number of bytes to read
- *	@ppos: position to read from
- *
- *	Reads count bytes or the number of bytes available in the
- *	current sub-buffer being read, whichever is smaller.
- */
-static ssize_t relay_file_read(struct file *filp,
-			       char __user *buffer,
-			       size_t count,
-			       loff_t *ppos)
-{
-	struct rchan_buf *buf = filp->private_data;
-	struct inode *inode = filp->f_dentry->d_inode;
-	size_t read_start, avail;
-	ssize_t ret = 0;
-	void *from;
-
-	mutex_lock(&inode->i_mutex);
-	if(!relay_file_read_avail(buf, *ppos))
-		goto out;
-
-	read_start = relay_file_read_start_pos(*ppos, buf);
-	avail = relay_file_read_subbuf_avail(read_start, buf);
-	if (!avail)
-		goto out;
-
-	from = buf->start + read_start;
-	ret = count = min(count, avail);
-	if (copy_to_user(buffer, from, count)) {
-		ret = -EFAULT;
-		goto out;
-	}
-	relay_file_read_consume(buf, read_start, count);
-	*ppos = relay_file_read_end_pos(buf, read_start, count);
-out:
-	mutex_unlock(&inode->i_mutex);
-	return ret;
-}
-
-struct file_operations relay_file_operations = {
-	.open		= relay_file_open,
-	.poll		= relay_file_poll,
-	.mmap		= relay_file_mmap,
-	.read		= relay_file_read,
-	.llseek		= no_llseek,
-	.release	= relay_file_release,
-};
-
-static struct super_operations relayfs_ops = {
-	.statfs		= simple_statfs,
-	.drop_inode	= generic_delete_inode,
-};
-
-static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
-{
-	struct inode *inode;
-	struct dentry *root;
-	int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = RELAYFS_MAGIC;
-	sb->s_op = &relayfs_ops;
-	inode = relayfs_get_inode(sb, mode, NULL, NULL);
-
-	if (!inode)
-		return -ENOMEM;
-
-	root = d_alloc_root(inode);
-	if (!root) {
-		iput(inode);
-		return -ENOMEM;
-	}
-	sb->s_root = root;
-
-	return 0;
-}
-
-static struct super_block * relayfs_get_sb(struct file_system_type *fs_type,
-					   int flags, const char *dev_name,
-					   void *data)
-{
-	return get_sb_single(fs_type, flags, data, relayfs_fill_super);
-}
-
-static struct file_system_type relayfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "relayfs",
-	.get_sb		= relayfs_get_sb,
-	.kill_sb	= kill_litter_super,
-};
-
-static int __init init_relayfs_fs(void)
-{
-	return register_filesystem(&relayfs_fs_type);
-}
-
-static void __exit exit_relayfs_fs(void)
-{
-
-
-
-
-
-	unregister_filesystem(&relayfs_fs_type);
-}
-
-module_init(init_relayfs_fs)
-module_exit(exit_relayfs_fs)
-
-EXPORT_SYMBOL_GPL(relay_file_operations);
-EXPORT_SYMBOL_GPL(relayfs_create_dir);
-EXPORT_SYMBOL_GPL(relayfs_remove_dir);
-EXPORT_SYMBOL_GPL(relayfs_create_file);
-EXPORT_SYMBOL_GPL(relayfs_remove_file);
-
-MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
-MODULE_DESCRIPTION("Relay Filesystem");
-MODULE_LICENSE("GPL");
-
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
deleted file mode 100644
index abf3ceaace49..000000000000
--- a/fs/relayfs/relay.c
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * Public API and common code for RelayFS.
- *
- * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
- *
- * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
- * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
- *
- * This file is released under the GPL.
- */
-
-#include <linux/errno.h>
-#include <linux/stddef.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/relayfs_fs.h>
-#include "relay.h"
-#include "buffers.h"
-
-/**
- *	relay_buf_empty - boolean, is the channel buffer empty?
- *	@buf: channel buffer
- *
- *	Returns 1 if the buffer is empty, 0 otherwise.
- */
-int relay_buf_empty(struct rchan_buf *buf)
-{
-	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
-}
-
-/**
- *	relay_buf_full - boolean, is the channel buffer full?
- *	@buf: channel buffer
- *
- *	Returns 1 if the buffer is full, 0 otherwise.
- */
-int relay_buf_full(struct rchan_buf *buf)
-{
-	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
-	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
-}
-
-/*
- * High-level relayfs kernel API and associated functions.
- */
-
-/*
- * rchan_callback implementations defining default channel behavior.  Used
- * in place of corresponding NULL values in client callback struct.
- */
-
-/*
- * subbuf_start() default callback.  Does nothing.
- */
-static int subbuf_start_default_callback (struct rchan_buf *buf,
-					  void *subbuf,
-					  void *prev_subbuf,
-					  size_t prev_padding)
-{
-	if (relay_buf_full(buf))
-		return 0;
-
-	return 1;
-}
-
-/*
- * buf_mapped() default callback.  Does nothing.
- */
-static void buf_mapped_default_callback(struct rchan_buf *buf,
-					struct file *filp)
-{
-}
-
-/*
- * buf_unmapped() default callback.  Does nothing.
- */
-static void buf_unmapped_default_callback(struct rchan_buf *buf,
-					  struct file *filp)
-{
-}
-
-/*
- * create_buf_file_create() default callback.  Creates file to represent buf.
- */
-static struct dentry *create_buf_file_default_callback(const char *filename,
-						       struct dentry *parent,
-						       int mode,
-						       struct rchan_buf *buf,
-						       int *is_global)
-{
-	return relayfs_create_file(filename, parent, mode,
-				   &relay_file_operations, buf);
-}
-
-/*
- * remove_buf_file() default callback.  Removes file representing relay buffer.
- */
-static int remove_buf_file_default_callback(struct dentry *dentry)
-{
-	return relayfs_remove(dentry);
-}
-
-/* relay channel default callbacks */
-static struct rchan_callbacks default_channel_callbacks = {
-	.subbuf_start = subbuf_start_default_callback,
-	.buf_mapped = buf_mapped_default_callback,
-	.buf_unmapped = buf_unmapped_default_callback,
-	.create_buf_file = create_buf_file_default_callback,
-	.remove_buf_file = remove_buf_file_default_callback,
-};
-
-/**
- *	wakeup_readers - wake up readers waiting on a channel
- *	@private: the channel buffer
- *
- *	This is the work function used to defer reader waking.  The
- *	reason waking is deferred is that calling directly from write
- *	causes problems if you're writing from say the scheduler.
- */
-static void wakeup_readers(void *private)
-{
-	struct rchan_buf *buf = private;
-	wake_up_interruptible(&buf->read_wait);
-}
-
-/**
- *	__relay_reset - reset a channel buffer
- *	@buf: the channel buffer
- *	@init: 1 if this is a first-time initialization
- *
- *	See relay_reset for description of effect.
- */
-static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
-{
-	size_t i;
-
-	if (init) {
-		init_waitqueue_head(&buf->read_wait);
-		kref_init(&buf->kref);
-		INIT_WORK(&buf->wake_readers, NULL, NULL);
-	} else {
-		cancel_delayed_work(&buf->wake_readers);
-		flush_scheduled_work();
-	}
-
-	buf->subbufs_produced = 0;
-	buf->subbufs_consumed = 0;
-	buf->bytes_consumed = 0;
-	buf->finalized = 0;
-	buf->data = buf->start;
-	buf->offset = 0;
-
-	for (i = 0; i < buf->chan->n_subbufs; i++)
-		buf->padding[i] = 0;
-
-	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
-}
-
-/**
- *	relay_reset - reset the channel
- *	@chan: the channel
- *
- *	This has the effect of erasing all data from all channel buffers
- *	and restarting the channel in its initial state.  The buffers
- *	are not freed, so any mappings are still in effect.
- *
- *	NOTE: Care should be taken that the channel isn't actually
- *	being used by anything when this call is made.
- */
-void relay_reset(struct rchan *chan)
-{
-	unsigned int i;
-	struct rchan_buf *prev = NULL;
-
-	if (!chan)
-		return;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i] || chan->buf[i] == prev)
-			break;
-		__relay_reset(chan->buf[i], 0);
-		prev = chan->buf[i];
-	}
-}
-
-/**
- *	relay_open_buf - create a new channel buffer in relayfs
- *
- *	Internal - used by relay_open().
- */
-static struct rchan_buf *relay_open_buf(struct rchan *chan,
-					const char *filename,
-					struct dentry *parent,
-					int *is_global)
-{
-	struct rchan_buf *buf;
-	struct dentry *dentry;
-
-	if (*is_global)
-		return chan->buf[0];
-
- 	buf = relay_create_buf(chan);
- 	if (!buf)
- 		return NULL;
-
-	/* Create file in fs */
- 	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
- 					   buf, is_global);
- 	if (!dentry) {
- 		relay_destroy_buf(buf);
-		return NULL;
- 	}
-
-	buf->dentry = dentry;
-	__relay_reset(buf, 1);
-
-	return buf;
-}
-
-/**
- *	relay_close_buf - close a channel buffer
- *	@buf: channel buffer
- *
- *	Marks the buffer finalized and restores the default callbacks.
- *	The channel buffer and channel buffer data structure are then freed
- *	automatically when the last reference is given up.
- */
-static inline void relay_close_buf(struct rchan_buf *buf)
-{
-	buf->finalized = 1;
-	buf->chan->cb = &default_channel_callbacks;
-	cancel_delayed_work(&buf->wake_readers);
-	flush_scheduled_work();
-	kref_put(&buf->kref, relay_remove_buf);
-}
-
-static inline void setup_callbacks(struct rchan *chan,
-				   struct rchan_callbacks *cb)
-{
-	if (!cb) {
-		chan->cb = &default_channel_callbacks;
-		return;
-	}
-
-	if (!cb->subbuf_start)
-		cb->subbuf_start = subbuf_start_default_callback;
-	if (!cb->buf_mapped)
-		cb->buf_mapped = buf_mapped_default_callback;
-	if (!cb->buf_unmapped)
-		cb->buf_unmapped = buf_unmapped_default_callback;
-	if (!cb->create_buf_file)
-		cb->create_buf_file = create_buf_file_default_callback;
-	if (!cb->remove_buf_file)
-		cb->remove_buf_file = remove_buf_file_default_callback;
-	chan->cb = cb;
-}
-
-/**
- *	relay_open - create a new relayfs channel
- *	@base_filename: base name of files to create
- *	@parent: dentry of parent directory, NULL for root directory
- *	@subbuf_size: size of sub-buffers
- *	@n_subbufs: number of sub-buffers
- *	@cb: client callback functions
- *
- *	Returns channel pointer if successful, NULL otherwise.
- *
- *	Creates a channel buffer for each cpu using the sizes and
- *	attributes specified.  The created channel buffer files
- *	will be named base_filename0...base_filenameN-1.  File
- *	permissions will be S_IRUSR.
- */
-struct rchan *relay_open(const char *base_filename,
-			 struct dentry *parent,
-			 size_t subbuf_size,
-			 size_t n_subbufs,
-			 struct rchan_callbacks *cb)
-{
-	unsigned int i;
-	struct rchan *chan;
-	char *tmpname;
-	int is_global = 0;
-
-	if (!base_filename)
-		return NULL;
-
-	if (!(subbuf_size && n_subbufs))
-		return NULL;
-
-	chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
-	if (!chan)
-		return NULL;
-
-	chan->version = RELAYFS_CHANNEL_VERSION;
-	chan->n_subbufs = n_subbufs;
-	chan->subbuf_size = subbuf_size;
-	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
-	setup_callbacks(chan, cb);
-	kref_init(&chan->kref);
-
-	tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!tmpname)
-		goto free_chan;
-
-	for_each_online_cpu(i) {
-		sprintf(tmpname, "%s%d", base_filename, i);
-		chan->buf[i] = relay_open_buf(chan, tmpname, parent,
-					      &is_global);
-		chan->buf[i]->cpu = i;
-		if (!chan->buf[i])
-			goto free_bufs;
-	}
-
-	kfree(tmpname);
-	return chan;
-
-free_bufs:
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i])
-			break;
-		relay_close_buf(chan->buf[i]);
-		if (is_global)
-			break;
-	}
-	kfree(tmpname);
-
-free_chan:
-	kref_put(&chan->kref, relay_destroy_channel);
-	return NULL;
-}
-
-/**
- *	relay_switch_subbuf - switch to a new sub-buffer
- *	@buf: channel buffer
- *	@length: size of current event
- *
- *	Returns either the length passed in or 0 if full.
-
- *	Performs sub-buffer-switch tasks such as invoking callbacks,
- *	updating padding counts, waking up readers, etc.
- */
-size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
-{
-	void *old, *new;
-	size_t old_subbuf, new_subbuf;
-
-	if (unlikely(length > buf->chan->subbuf_size))
-		goto toobig;
-
-	if (buf->offset != buf->chan->subbuf_size + 1) {
-		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
-		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
-		buf->padding[old_subbuf] = buf->prev_padding;
-		buf->subbufs_produced++;
-		if (waitqueue_active(&buf->read_wait)) {
-			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
-			schedule_delayed_work(&buf->wake_readers, 1);
-		}
-	}
-
-	old = buf->data;
-	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
-	new = buf->start + new_subbuf * buf->chan->subbuf_size;
-	buf->offset = 0;
-	if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
-		buf->offset = buf->chan->subbuf_size + 1;
-		return 0;
-	}
-	buf->data = new;
-	buf->padding[new_subbuf] = 0;
-
-	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
-		goto toobig;
-
-	return length;
-
-toobig:
-	buf->chan->last_toobig = length;
-	return 0;
-}
-
-/**
- *	relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
- *	@chan: the channel
- *	@cpu: the cpu associated with the channel buffer to update
- *	@subbufs_consumed: number of sub-buffers to add to current buf's count
- *
- *	Adds to the channel buffer's consumed sub-buffer count.
- *	subbufs_consumed should be the number of sub-buffers newly consumed,
- *	not the total consumed.
- *
- *	NOTE: kernel clients don't need to call this function if the channel
- *	mode is 'overwrite'.
- */
-void relay_subbufs_consumed(struct rchan *chan,
-			    unsigned int cpu,
-			    size_t subbufs_consumed)
-{
-	struct rchan_buf *buf;
-
-	if (!chan)
-		return;
-
-	if (cpu >= NR_CPUS || !chan->buf[cpu])
-		return;
-
-	buf = chan->buf[cpu];
-	buf->subbufs_consumed += subbufs_consumed;
-	if (buf->subbufs_consumed > buf->subbufs_produced)
-		buf->subbufs_consumed = buf->subbufs_produced;
-}
-
-/**
- *	relay_destroy_channel - free the channel struct
- *
- *	Should only be called from kref_put().
- */
-void relay_destroy_channel(struct kref *kref)
-{
-	struct rchan *chan = container_of(kref, struct rchan, kref);
-	kfree(chan);
-}
-
-/**
- *	relay_close - close the channel
- *	@chan: the channel
- *
- *	Closes all channel buffers and frees the channel.
- */
-void relay_close(struct rchan *chan)
-{
-	unsigned int i;
-	struct rchan_buf *prev = NULL;
-
-	if (!chan)
-		return;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i] || chan->buf[i] == prev)
-			break;
-		relay_close_buf(chan->buf[i]);
-		prev = chan->buf[i];
-	}
-
-	if (chan->last_toobig)
-		printk(KERN_WARNING "relayfs: one or more items not logged "
-		       "[item size (%Zd) > sub-buffer size (%Zd)]\n",
-		       chan->last_toobig, chan->subbuf_size);
-
-	kref_put(&chan->kref, relay_destroy_channel);
-}
-
-/**
- *	relay_flush - close the channel
- *	@chan: the channel
- *
- *	Flushes all channel buffers i.e. forces buffer switch.
- */
-void relay_flush(struct rchan *chan)
-{
-	unsigned int i;
-	struct rchan_buf *prev = NULL;
-
-	if (!chan)
-		return;
-
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!chan->buf[i] || chan->buf[i] == prev)
-			break;
-		relay_switch_subbuf(chan->buf[i], 0);
-		prev = chan->buf[i];
-	}
-}
-
-EXPORT_SYMBOL_GPL(relay_open);
-EXPORT_SYMBOL_GPL(relay_close);
-EXPORT_SYMBOL_GPL(relay_flush);
-EXPORT_SYMBOL_GPL(relay_reset);
-EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
-EXPORT_SYMBOL_GPL(relay_switch_subbuf);
-EXPORT_SYMBOL_GPL(relay_buf_full);
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
deleted file mode 100644
index 0993d3e5753b..000000000000
--- a/fs/relayfs/relay.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _RELAY_H
-#define _RELAY_H
-
-extern int relayfs_remove(struct dentry *dentry);
-extern int relay_buf_empty(struct rchan_buf *buf);
-extern void relay_destroy_channel(struct kref *kref);
-
-#endif /* _RELAY_H */
diff --git a/include/linux/relay.h b/include/linux/relay.h
new file mode 100644
index 000000000000..4bcc1531d6a9
--- /dev/null
+++ b/include/linux/relay.h
@@ -0,0 +1,281 @@
+/*
+ * linux/include/linux/relay.h
+ *
+ * Copyright (C) 2002, 2003 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999, 2000, 2001, 2002 - Karim Yaghmour (karim@opersys.com)
+ *
+ * CONFIG_RELAY definitions and declarations
+ */
+
+#ifndef _LINUX_RELAY_H
+#define _LINUX_RELAY_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/kref.h>
+
+/* Needs a _much_ better name... */
+#define FIX_SIZE(x) ((((x) - 1) & PAGE_MASK) + PAGE_SIZE)
+
+/*
+ * Tracks changes to rchan/rchan_buf structs
+ */
+#define RELAYFS_CHANNEL_VERSION		6
+
+/*
+ * Per-cpu relay channel buffer
+ */
+struct rchan_buf
+{
+	void *start;			/* start of channel buffer */
+	void *data;			/* start of current sub-buffer */
+	size_t offset;			/* current offset into sub-buffer */
+	size_t subbufs_produced;	/* count of sub-buffers produced */
+	size_t subbufs_consumed;	/* count of sub-buffers consumed */
+	struct rchan *chan;		/* associated channel */
+	wait_queue_head_t read_wait;	/* reader wait queue */
+	struct work_struct wake_readers; /* reader wake-up work struct */
+	struct dentry *dentry;		/* channel file dentry */
+	struct kref kref;		/* channel buffer refcount */
+	struct page **page_array;	/* array of current buffer pages */
+	unsigned int page_count;	/* number of current buffer pages */
+	unsigned int finalized;		/* buffer has been finalized */
+	size_t *padding;		/* padding counts per sub-buffer */
+	size_t prev_padding;		/* temporary variable */
+	size_t bytes_consumed;		/* bytes consumed in cur read subbuf */
+	unsigned int cpu;		/* this buf's cpu */
+} ____cacheline_aligned;
+
+/*
+ * Relay channel data structure
+ */
+struct rchan
+{
+	u32 version;			/* the version of this struct */
+	size_t subbuf_size;		/* sub-buffer size */
+	size_t n_subbufs;		/* number of sub-buffers per buffer */
+	size_t alloc_size;		/* total buffer size allocated */
+	struct rchan_callbacks *cb;	/* client callbacks */
+	struct kref kref;		/* channel refcount */
+	void *private_data;		/* for user-defined data */
+	size_t last_toobig;		/* tried to log event > subbuf size */
+	struct rchan_buf *buf[NR_CPUS]; /* per-cpu channel buffers */
+};
+
+/*
+ * Relay channel client callbacks
+ */
+struct rchan_callbacks
+{
+	/*
+	 * subbuf_start - called on buffer-switch to a new sub-buffer
+	 * @buf: the channel buffer containing the new sub-buffer
+	 * @subbuf: the start of the new sub-buffer
+	 * @prev_subbuf: the start of the previous sub-buffer
+	 * @prev_padding: unused space at the end of previous sub-buffer
+	 *
+	 * The client should return 1 to continue logging, 0 to stop
+	 * logging.
+	 *
+	 * NOTE: subbuf_start will also be invoked when the buffer is
+	 *       created, so that the first sub-buffer can be initialized
+	 *       if necessary.  In this case, prev_subbuf will be NULL.
+	 *
+	 * NOTE: the client can reserve bytes at the beginning of the new
+	 *       sub-buffer by calling subbuf_start_reserve() in this callback.
+	 */
+	int (*subbuf_start) (struct rchan_buf *buf,
+			     void *subbuf,
+			     void *prev_subbuf,
+			     size_t prev_padding);
+
+	/*
+	 * buf_mapped - relay buffer mmap notification
+	 * @buf: the channel buffer
+	 * @filp: relay file pointer
+	 *
+	 * Called when a relay file is successfully mmapped
+	 */
+        void (*buf_mapped)(struct rchan_buf *buf,
+			   struct file *filp);
+
+	/*
+	 * buf_unmapped - relay buffer unmap notification
+	 * @buf: the channel buffer
+	 * @filp: relay file pointer
+	 *
+	 * Called when a relay file is successfully unmapped
+	 */
+        void (*buf_unmapped)(struct rchan_buf *buf,
+			     struct file *filp);
+	/*
+	 * create_buf_file - create file to represent a relay channel buffer
+	 * @filename: the name of the file to create
+	 * @parent: the parent of the file to create
+	 * @mode: the mode of the file to create
+	 * @buf: the channel buffer
+	 * @is_global: outparam - set non-zero if the buffer should be global
+	 *
+	 * Called during relay_open(), once for each per-cpu buffer,
+	 * to allow the client to create a file to be used to
+	 * represent the corresponding channel buffer.  If the file is
+	 * created outside of relay, the parent must also exist in
+	 * that filesystem.
+	 *
+	 * The callback should return the dentry of the file created
+	 * to represent the relay buffer.
+	 *
+	 * Setting the is_global outparam to a non-zero value will
+	 * cause relay_open() to create a single global buffer rather
+	 * than the default set of per-cpu buffers.
+	 *
+	 * See Documentation/filesystems/relayfs.txt for more info.
+	 */
+	struct dentry *(*create_buf_file)(const char *filename,
+					  struct dentry *parent,
+					  int mode,
+					  struct rchan_buf *buf,
+					  int *is_global);
+
+	/*
+	 * remove_buf_file - remove file representing a relay channel buffer
+	 * @dentry: the dentry of the file to remove
+	 *
+	 * Called during relay_close(), once for each per-cpu buffer,
+	 * to allow the client to remove a file used to represent a
+	 * channel buffer.
+	 *
+	 * The callback should return 0 if successful, negative if not.
+	 */
+	int (*remove_buf_file)(struct dentry *dentry);
+};
+
+/*
+ * CONFIG_RELAY kernel API, kernel/relay.c
+ */
+
+struct rchan *relay_open(const char *base_filename,
+			 struct dentry *parent,
+			 size_t subbuf_size,
+			 size_t n_subbufs,
+			 struct rchan_callbacks *cb);
+extern void relay_close(struct rchan *chan);
+extern void relay_flush(struct rchan *chan);
+extern void relay_subbufs_consumed(struct rchan *chan,
+				   unsigned int cpu,
+				   size_t consumed);
+extern void relay_reset(struct rchan *chan);
+extern int relay_buf_full(struct rchan_buf *buf);
+
+extern size_t relay_switch_subbuf(struct rchan_buf *buf,
+				  size_t length);
+
+/**
+ *	relay_write - write data into the channel
+ *	@chan: relay channel
+ *	@data: data to be written
+ *	@length: number of bytes to write
+ *
+ *	Writes data into the current cpu's channel buffer.
+ *
+ *	Protects the buffer by disabling interrupts.  Use this
+ *	if you might be logging from interrupt context.  Try
+ *	__relay_write() if you know you	won't be logging from
+ *	interrupt context.
+ */
+static inline void relay_write(struct rchan *chan,
+			       const void *data,
+			       size_t length)
+{
+	unsigned long flags;
+	struct rchan_buf *buf;
+
+	local_irq_save(flags);
+	buf = chan->buf[smp_processor_id()];
+	if (unlikely(buf->offset + length > chan->subbuf_size))
+		length = relay_switch_subbuf(buf, length);
+	memcpy(buf->data + buf->offset, data, length);
+	buf->offset += length;
+	local_irq_restore(flags);
+}
+
+/**
+ *	__relay_write - write data into the channel
+ *	@chan: relay channel
+ *	@data: data to be written
+ *	@length: number of bytes to write
+ *
+ *	Writes data into the current cpu's channel buffer.
+ *
+ *	Protects the buffer by disabling preemption.  Use
+ *	relay_write() if you might be logging from interrupt
+ *	context.
+ */
+static inline void __relay_write(struct rchan *chan,
+				 const void *data,
+				 size_t length)
+{
+	struct rchan_buf *buf;
+
+	buf = chan->buf[get_cpu()];
+	if (unlikely(buf->offset + length > buf->chan->subbuf_size))
+		length = relay_switch_subbuf(buf, length);
+	memcpy(buf->data + buf->offset, data, length);
+	buf->offset += length;
+	put_cpu();
+}
+
+/**
+ *	relay_reserve - reserve slot in channel buffer
+ *	@chan: relay channel
+ *	@length: number of bytes to reserve
+ *
+ *	Returns pointer to reserved slot, NULL if full.
+ *
+ *	Reserves a slot in the current cpu's channel buffer.
+ *	Does not protect the buffer at all - caller must provide
+ *	appropriate synchronization.
+ */
+static inline void *relay_reserve(struct rchan *chan, size_t length)
+{
+	void *reserved;
+	struct rchan_buf *buf = chan->buf[smp_processor_id()];
+
+	if (unlikely(buf->offset + length > buf->chan->subbuf_size)) {
+		length = relay_switch_subbuf(buf, length);
+		if (!length)
+			return NULL;
+	}
+	reserved = buf->data + buf->offset;
+	buf->offset += length;
+
+	return reserved;
+}
+
+/**
+ *	subbuf_start_reserve - reserve bytes at the start of a sub-buffer
+ *	@buf: relay channel buffer
+ *	@length: number of bytes to reserve
+ *
+ *	Helper function used to reserve bytes at the beginning of
+ *	a sub-buffer in the subbuf_start() callback.
+ */
+static inline void subbuf_start_reserve(struct rchan_buf *buf,
+					size_t length)
+{
+	BUG_ON(length >= buf->chan->subbuf_size - 1);
+	buf->offset = length;
+}
+
+/*
+ * exported relay file operations, kernel/relay.c
+ */
+extern struct file_operations relay_file_operations;
+
+#endif /* _LINUX_RELAY_H */
+
diff --git a/init/Kconfig b/init/Kconfig
index 38416a199def..1d19fd25204b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -214,6 +214,17 @@ config CPUSETS
 
 	  Say N if unsure.
 
+config RELAY
+	bool "Kernel->user space relay support (formerly relayfs)"
+	help
+	  This option enables support for relay interface support in
+	  certain file systems (such as debugfs).
+	  It is designed to provide an efficient mechanism for tools and
+	  facilities to relay large amounts of data from kernel space to
+	  user space.
+
+	  If unsure, say N.
+
 source "usr/Kconfig"
 
 config UID16
diff --git a/kernel/Makefile b/kernel/Makefile
index 4ae0fbde815d..aebd7a78984e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RELAY) += relay.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/relay.c b/kernel/relay.c
new file mode 100644
index 000000000000..9358e8eb8476
--- /dev/null
+++ b/kernel/relay.c
@@ -0,0 +1,919 @@
+/*
+ * Public API and common code for kernel->userspace relay file support.
+ *
+ * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * Moved to kernel/relay.c by Paul Mundt, 2006.
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/relay.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+/*
+ * close() vm_op implementation for relay file mapping.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = vma->vm_private_data;
+	buf->chan->cb->buf_unmapped(buf, vma->vm_file);
+}
+
+/*
+ * nopage() vm_op implementation for relay file mapping.
+ */
+static struct page *relay_buf_nopage(struct vm_area_struct *vma,
+				     unsigned long address,
+				     int *type)
+{
+	struct page *page;
+	struct rchan_buf *buf = vma->vm_private_data;
+	unsigned long offset = address - vma->vm_start;
+
+	if (address > vma->vm_end)
+		return NOPAGE_SIGBUS; /* Disallow mremap */
+	if (!buf)
+		return NOPAGE_OOM;
+
+	page = vmalloc_to_page(buf->start + offset);
+	if (!page)
+		return NOPAGE_OOM;
+	get_page(page);
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
+	return page;
+}
+
+/*
+ * vm_ops for relay file mappings.
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+	.nopage = relay_buf_nopage,
+	.close = relay_file_mmap_close,
+};
+
+/**
+ *	relay_mmap_buf: - mmap channel buffer to process address space
+ *	@buf: relay channel buffer
+ *	@vma: vm_area_struct describing memory to be mapped
+ *
+ *	Returns 0 if ok, negative on error
+ *
+ *	Caller should already have grabbed mmap_sem.
+ */
+int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+	unsigned long length = vma->vm_end - vma->vm_start;
+	struct file *filp = vma->vm_file;
+
+	if (!buf)
+		return -EBADF;
+
+	if (length != (unsigned long)buf->chan->alloc_size)
+		return -EINVAL;
+
+	vma->vm_ops = &relay_file_mmap_ops;
+	vma->vm_private_data = buf;
+	buf->chan->cb->buf_mapped(buf, filp);
+
+	return 0;
+}
+
+/**
+ *	relay_alloc_buf - allocate a channel buffer
+ *	@buf: the buffer struct
+ *	@size: total size of the buffer
+ *
+ *	Returns a pointer to the resulting buffer, NULL if unsuccessful
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
+{
+	void *mem;
+	unsigned int i, j, n_pages;
+
+	size = PAGE_ALIGN(size);
+	n_pages = size >> PAGE_SHIFT;
+
+	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!buf->page_array)
+		return NULL;
+
+	for (i = 0; i < n_pages; i++) {
+		buf->page_array[i] = alloc_page(GFP_KERNEL);
+		if (unlikely(!buf->page_array[i]))
+			goto depopulate;
+	}
+	mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
+	if (!mem)
+		goto depopulate;
+
+	memset(mem, 0, size);
+	buf->page_count = n_pages;
+	return mem;
+
+depopulate:
+	for (j = 0; j < i; j++)
+		__free_page(buf->page_array[j]);
+	kfree(buf->page_array);
+	return NULL;
+}
+
+/**
+ *	relay_create_buf - allocate and initialize a channel buffer
+ *	@alloc_size: size of the buffer to allocate
+ *	@n_subbufs: number of sub-buffers in the channel
+ *
+ *	Returns channel buffer if successful, NULL otherwise
+ */
+struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+	struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
+	if (!buf->padding)
+		goto free_buf;
+
+	buf->start = relay_alloc_buf(buf, chan->alloc_size);
+	if (!buf->start)
+		goto free_buf;
+
+	buf->chan = chan;
+	kref_get(&buf->chan->kref);
+	return buf;
+
+free_buf:
+	kfree(buf->padding);
+	kfree(buf);
+	return NULL;
+}
+
+/**
+ *	relay_destroy_channel - free the channel struct
+ *
+ *	Should only be called from kref_put().
+ */
+void relay_destroy_channel(struct kref *kref)
+{
+	struct rchan *chan = container_of(kref, struct rchan, kref);
+	kfree(chan);
+}
+
+/**
+ *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ *	@buf: the buffer struct
+ */
+void relay_destroy_buf(struct rchan_buf *buf)
+{
+	struct rchan *chan = buf->chan;
+	unsigned int i;
+
+	if (likely(buf->start)) {
+		vunmap(buf->start);
+		for (i = 0; i < buf->page_count; i++)
+			__free_page(buf->page_array[i]);
+		kfree(buf->page_array);
+	}
+	kfree(buf->padding);
+	kfree(buf);
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ *	relay_remove_buf - remove a channel buffer
+ *
+ *	Removes the file from the fileystem, which also frees the
+ *	rchan_buf_struct and the channel buffer.  Should only be called from
+ *	kref_put().
+ */
+void relay_remove_buf(struct kref *kref)
+{
+	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+	buf->chan->cb->remove_buf_file(buf->dentry);
+	relay_destroy_buf(buf);
+}
+
+/**
+ *	relay_buf_empty - boolean, is the channel buffer empty?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is empty, 0 otherwise.
+ */
+int relay_buf_empty(struct rchan_buf *buf)
+{
+	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+EXPORT_SYMBOL_GPL(relay_buf_empty);
+
+/**
+ *	relay_buf_full - boolean, is the channel buffer full?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is full, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(relay_buf_full);
+
+/*
+ * High-level relay kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback.  Does nothing.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+					  void *subbuf,
+					  void *prev_subbuf,
+					  size_t prev_padding)
+{
+	if (relay_buf_full(buf))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * buf_mapped() default callback.  Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+					struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback.  Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+					  struct file *filp)
+{
+}
+
+/*
+ * create_buf_file_create() default callback.  Does nothing.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+						       struct dentry *parent,
+						       int mode,
+						       struct rchan_buf *buf,
+						       int *is_global)
+{
+	return NULL;
+}
+
+/*
+ * remove_buf_file() default callback.  Does nothing.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+	return -EINVAL;
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+	.subbuf_start = subbuf_start_default_callback,
+	.buf_mapped = buf_mapped_default_callback,
+	.buf_unmapped = buf_unmapped_default_callback,
+	.create_buf_file = create_buf_file_default_callback,
+	.remove_buf_file = remove_buf_file_default_callback,
+};
+
+/**
+ *	wakeup_readers - wake up readers waiting on a channel
+ *	@private: the channel buffer
+ *
+ *	This is the work function used to defer reader waking.  The
+ *	reason waking is deferred is that calling directly from write
+ *	causes problems if you're writing from say the scheduler.
+ */
+static void wakeup_readers(void *private)
+{
+	struct rchan_buf *buf = private;
+	wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ *	__relay_reset - reset a channel buffer
+ *	@buf: the channel buffer
+ *	@init: 1 if this is a first-time initialization
+ *
+ *	See relay_reset for description of effect.
+ */
+static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+	size_t i;
+
+	if (init) {
+		init_waitqueue_head(&buf->read_wait);
+		kref_init(&buf->kref);
+		INIT_WORK(&buf->wake_readers, NULL, NULL);
+	} else {
+		cancel_delayed_work(&buf->wake_readers);
+		flush_scheduled_work();
+	}
+
+	buf->subbufs_produced = 0;
+	buf->subbufs_consumed = 0;
+	buf->bytes_consumed = 0;
+	buf->finalized = 0;
+	buf->data = buf->start;
+	buf->offset = 0;
+
+	for (i = 0; i < buf->chan->n_subbufs; i++)
+		buf->padding[i] = 0;
+
+	buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+
+/**
+ *	relay_reset - reset the channel
+ *	@chan: the channel
+ *
+ *	This has the effect of erasing all data from all channel buffers
+ *	and restarting the channel in its initial state.  The buffers
+ *	are not freed, so any mappings are still in effect.
+ *
+ *	NOTE: Care should be taken that the channel isn't actually
+ *	being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		__relay_reset(chan->buf[i], 0);
+		prev = chan->buf[i];
+	}
+}
+EXPORT_SYMBOL_GPL(relay_reset);
+
+/**
+ *	relay_open_buf - create a new relay channel buffer
+ *
+ *	Internal - used by relay_open().
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan,
+					const char *filename,
+					struct dentry *parent,
+					int *is_global)
+{
+	struct rchan_buf *buf;
+	struct dentry *dentry;
+
+	if (*is_global)
+		return chan->buf[0];
+
+	buf = relay_create_buf(chan);
+	if (!buf)
+		return NULL;
+
+	/* Create file in fs */
+	dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR,
+					   buf, is_global);
+	if (!dentry) {
+		relay_destroy_buf(buf);
+		return NULL;
+	}
+
+	buf->dentry = dentry;
+	__relay_reset(buf, 1);
+
+	return buf;
+}
+
+/**
+ *	relay_close_buf - close a channel buffer
+ *	@buf: channel buffer
+ *
+ *	Marks the buffer finalized and restores the default callbacks.
+ *	The channel buffer and channel buffer data structure are then freed
+ *	automatically when the last reference is given up.
+ */
+static inline void relay_close_buf(struct rchan_buf *buf)
+{
+	buf->finalized = 1;
+	cancel_delayed_work(&buf->wake_readers);
+	flush_scheduled_work();
+	kref_put(&buf->kref, relay_remove_buf);
+}
+
+static inline void setup_callbacks(struct rchan *chan,
+				   struct rchan_callbacks *cb)
+{
+	if (!cb) {
+		chan->cb = &default_channel_callbacks;
+		return;
+	}
+
+	if (!cb->subbuf_start)
+		cb->subbuf_start = subbuf_start_default_callback;
+	if (!cb->buf_mapped)
+		cb->buf_mapped = buf_mapped_default_callback;
+	if (!cb->buf_unmapped)
+		cb->buf_unmapped = buf_unmapped_default_callback;
+	if (!cb->create_buf_file)
+		cb->create_buf_file = create_buf_file_default_callback;
+	if (!cb->remove_buf_file)
+		cb->remove_buf_file = remove_buf_file_default_callback;
+	chan->cb = cb;
+}
+
+/**
+ *	relay_open - create a new relay channel
+ *	@base_filename: base name of files to create
+ *	@parent: dentry of parent directory, NULL for root directory
+ *	@subbuf_size: size of sub-buffers
+ *	@n_subbufs: number of sub-buffers
+ *	@cb: client callback functions
+ *
+ *	Returns channel pointer if successful, NULL otherwise.
+ *
+ *	Creates a channel buffer for each cpu using the sizes and
+ *	attributes specified.  The created channel buffer files
+ *	will be named base_filename0...base_filenameN-1.  File
+ *	permissions will be S_IRUSR.
+ */
+struct rchan *relay_open(const char *base_filename,
+			 struct dentry *parent,
+			 size_t subbuf_size,
+			 size_t n_subbufs,
+			 struct rchan_callbacks *cb)
+{
+	unsigned int i;
+	struct rchan *chan;
+	char *tmpname;
+	int is_global = 0;
+
+	if (!base_filename)
+		return NULL;
+
+	if (!(subbuf_size && n_subbufs))
+		return NULL;
+
+	chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
+	if (!chan)
+		return NULL;
+
+	chan->version = RELAYFS_CHANNEL_VERSION;
+	chan->n_subbufs = n_subbufs;
+	chan->subbuf_size = subbuf_size;
+	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+	setup_callbacks(chan, cb);
+	kref_init(&chan->kref);
+
+	tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!tmpname)
+		goto free_chan;
+
+	for_each_online_cpu(i) {
+		sprintf(tmpname, "%s%d", base_filename, i);
+		chan->buf[i] = relay_open_buf(chan, tmpname, parent,
+					      &is_global);
+		if (!chan->buf[i])
+			goto free_bufs;
+
+		chan->buf[i]->cpu = i;
+	}
+
+	kfree(tmpname);
+	return chan;
+
+free_bufs:
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i])
+			break;
+		relay_close_buf(chan->buf[i]);
+		if (is_global)
+			break;
+	}
+	kfree(tmpname);
+
+free_chan:
+	kref_put(&chan->kref, relay_destroy_channel);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(relay_open);
+
+/**
+ *	relay_switch_subbuf - switch to a new sub-buffer
+ *	@buf: channel buffer
+ *	@length: size of current event
+ *
+ *	Returns either the length passed in or 0 if full.
+ *
+ *	Performs sub-buffer-switch tasks such as invoking callbacks,
+ *	updating padding counts, waking up readers, etc.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+	void *old, *new;
+	size_t old_subbuf, new_subbuf;
+
+	if (unlikely(length > buf->chan->subbuf_size))
+		goto toobig;
+
+	if (buf->offset != buf->chan->subbuf_size + 1) {
+		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+		buf->padding[old_subbuf] = buf->prev_padding;
+		buf->subbufs_produced++;
+		if (waitqueue_active(&buf->read_wait)) {
+			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+			schedule_delayed_work(&buf->wake_readers, 1);
+		}
+	}
+
+	old = buf->data;
+	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+	new = buf->start + new_subbuf * buf->chan->subbuf_size;
+	buf->offset = 0;
+	if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+		buf->offset = buf->chan->subbuf_size + 1;
+		return 0;
+	}
+	buf->data = new;
+	buf->padding[new_subbuf] = 0;
+
+	if (unlikely(length + buf->offset > buf->chan->subbuf_size))
+		goto toobig;
+
+	return length;
+
+toobig:
+	buf->chan->last_toobig = length;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+
+/**
+ *	relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
+ *	@chan: the channel
+ *	@cpu: the cpu associated with the channel buffer to update
+ *	@subbufs_consumed: number of sub-buffers to add to current buf's count
+ *
+ *	Adds to the channel buffer's consumed sub-buffer count.
+ *	subbufs_consumed should be the number of sub-buffers newly consumed,
+ *	not the total consumed.
+ *
+ *	NOTE: kernel clients don't need to call this function if the channel
+ *	mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+			    unsigned int cpu,
+			    size_t subbufs_consumed)
+{
+	struct rchan_buf *buf;
+
+	if (!chan)
+		return;
+
+	if (cpu >= NR_CPUS || !chan->buf[cpu])
+		return;
+
+	buf = chan->buf[cpu];
+	buf->subbufs_consumed += subbufs_consumed;
+	if (buf->subbufs_consumed > buf->subbufs_produced)
+		buf->subbufs_consumed = buf->subbufs_produced;
+}
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+
+/**
+ *	relay_close - close the channel
+ *	@chan: the channel
+ *
+ *	Closes all channel buffers and frees the channel.
+ */
+void relay_close(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		relay_close_buf(chan->buf[i]);
+		prev = chan->buf[i];
+	}
+
+	if (chan->last_toobig)
+		printk(KERN_WARNING "relay: one or more items not logged "
+		       "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+		       chan->last_toobig, chan->subbuf_size);
+
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+EXPORT_SYMBOL_GPL(relay_close);
+
+/**
+ *	relay_flush - close the channel
+ *	@chan: the channel
+ *
+ *	Flushes all channel buffers i.e. forces buffer switch.
+ */
+void relay_flush(struct rchan *chan)
+{
+	unsigned int i;
+	struct rchan_buf *prev = NULL;
+
+	if (!chan)
+		return;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!chan->buf[i] || chan->buf[i] == prev)
+			break;
+		relay_switch_subbuf(chan->buf[i], 0);
+		prev = chan->buf[i];
+	}
+}
+EXPORT_SYMBOL_GPL(relay_flush);
+
+/**
+ *	relay_file_open - open file op for relay files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Increments the channel buffer refcount.
+ */
+static int relay_file_open(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = inode->u.generic_ip;
+	kref_get(&buf->kref);
+	filp->private_data = buf;
+
+	return 0;
+}
+
+/**
+ *	relay_file_mmap - mmap file op for relay files
+ *	@filp: the file
+ *	@vma: the vma describing what to map
+ *
+ *	Calls upon relay_mmap_buf to map the file into user space.
+ */
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = filp->private_data;
+	return relay_mmap_buf(buf, vma);
+}
+
+/**
+ *	relay_file_poll - poll file op for relay files
+ *	@filp: the file
+ *	@wait: poll table
+ *
+ *	Poll implemention.
+ */
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+{
+	unsigned int mask = 0;
+	struct rchan_buf *buf = filp->private_data;
+
+	if (buf->finalized)
+		return POLLERR;
+
+	if (filp->f_mode & FMODE_READ) {
+		poll_wait(filp, &buf->read_wait, wait);
+		if (!relay_buf_empty(buf))
+			mask |= POLLIN | POLLRDNORM;
+	}
+
+	return mask;
+}
+
+/**
+ *	relay_file_release - release file op for relay files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Decrements the channel refcount, as the filesystem is
+ *	no longer using it.
+ */
+static int relay_file_release(struct inode *inode, struct file *filp)
+{
+	struct rchan_buf *buf = filp->private_data;
+	kref_put(&buf->kref, relay_remove_buf);
+
+	return 0;
+}
+
+/**
+ *	relay_file_read_consume - update the consumed count for the buffer
+ */
+static void relay_file_read_consume(struct rchan_buf *buf,
+				    size_t read_pos,
+				    size_t bytes_consumed)
+{
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t read_subbuf;
+
+	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+
+	buf->bytes_consumed += bytes_consumed;
+	read_subbuf = read_pos / buf->chan->subbuf_size;
+	if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+		if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+		    (buf->offset == subbuf_size))
+			return;
+		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+		buf->bytes_consumed = 0;
+	}
+}
+
+/**
+ *	relay_file_read_avail - boolean, are there unconsumed bytes available?
+ */
+static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+	size_t bytes_produced, bytes_consumed, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+	size_t produced = buf->subbufs_produced % n_subbufs;
+	size_t consumed = buf->subbufs_consumed % n_subbufs;
+
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+
+	if (consumed > produced) {
+		if ((produced > n_subbufs) &&
+		    (produced + n_subbufs - consumed <= n_subbufs))
+			produced += n_subbufs;
+	} else if (consumed == produced) {
+		if (buf->offset > subbuf_size) {
+			produced += n_subbufs;
+			if (buf->subbufs_produced == buf->subbufs_consumed)
+				consumed += n_subbufs;
+		}
+	}
+
+	if (buf->offset > subbuf_size)
+		bytes_produced = (produced - 1) * subbuf_size + write_offset;
+	else
+		bytes_produced = produced * subbuf_size + write_offset;
+	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
+
+	if (bytes_produced == bytes_consumed)
+		return 0;
+
+	relay_file_read_consume(buf, read_pos, 0);
+
+	return 1;
+}
+
+/**
+ *	relay_file_read_subbuf_avail - return bytes available in sub-buffer
+ */
+static size_t relay_file_read_subbuf_avail(size_t read_pos,
+					   struct rchan_buf *buf)
+{
+	size_t padding, avail = 0;
+	size_t read_subbuf, read_offset, write_subbuf, write_offset;
+	size_t subbuf_size = buf->chan->subbuf_size;
+
+	write_subbuf = (buf->data - buf->start) / subbuf_size;
+	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+	read_subbuf = read_pos / subbuf_size;
+	read_offset = read_pos % subbuf_size;
+	padding = buf->padding[read_subbuf];
+
+	if (read_subbuf == write_subbuf) {
+		if (read_offset + padding < write_offset)
+			avail = write_offset - (read_offset + padding);
+	} else
+		avail = (subbuf_size - padding) - read_offset;
+
+	return avail;
+}
+
+/**
+ *	relay_file_read_start_pos - find the first available byte to read
+ *
+ *	If the read_pos is in the middle of padding, return the
+ *	position of the first actually available byte, otherwise
+ *	return the original value.
+ */
+static size_t relay_file_read_start_pos(size_t read_pos,
+					struct rchan_buf *buf)
+{
+	size_t read_subbuf, padding, padding_start, padding_end;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	padding_start = (read_subbuf + 1) * subbuf_size - padding;
+	padding_end = (read_subbuf + 1) * subbuf_size;
+	if (read_pos >= padding_start && read_pos < padding_end) {
+		read_subbuf = (read_subbuf + 1) % n_subbufs;
+		read_pos = read_subbuf * subbuf_size;
+	}
+
+	return read_pos;
+}
+
+/**
+ *	relay_file_read_end_pos - return the new read position
+ */
+static size_t relay_file_read_end_pos(struct rchan_buf *buf,
+				      size_t read_pos,
+				      size_t count)
+{
+	size_t read_subbuf, padding, end_pos;
+	size_t subbuf_size = buf->chan->subbuf_size;
+	size_t n_subbufs = buf->chan->n_subbufs;
+
+	read_subbuf = read_pos / subbuf_size;
+	padding = buf->padding[read_subbuf];
+	if (read_pos % subbuf_size + count + padding == subbuf_size)
+		end_pos = (read_subbuf + 1) * subbuf_size;
+	else
+		end_pos = read_pos + count;
+	if (end_pos >= subbuf_size * n_subbufs)
+		end_pos = 0;
+
+	return end_pos;
+}
+
+/**
+ *	relay_file_read - read file op for relay files
+ *	@filp: the file
+ *	@buffer: the userspace buffer
+ *	@count: number of bytes to read
+ *	@ppos: position to read from
+ *
+ *	Reads count bytes or the number of bytes available in the
+ *	current sub-buffer being read, whichever is smaller.
+ */
+static ssize_t relay_file_read(struct file *filp,
+			       char __user *buffer,
+			       size_t count,
+			       loff_t *ppos)
+{
+	struct rchan_buf *buf = filp->private_data;
+	struct inode *inode = filp->f_dentry->d_inode;
+	size_t read_start, avail;
+	ssize_t ret = 0;
+	void *from;
+
+	mutex_lock(&inode->i_mutex);
+	if(!relay_file_read_avail(buf, *ppos))
+		goto out;
+
+	read_start = relay_file_read_start_pos(*ppos, buf);
+	avail = relay_file_read_subbuf_avail(read_start, buf);
+	if (!avail)
+		goto out;
+
+	from = buf->start + read_start;
+	ret = count = min(count, avail);
+	if (copy_to_user(buffer, from, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	relay_file_read_consume(buf, read_start, count);
+	*ppos = relay_file_read_end_pos(buf, read_start, count);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
+struct file_operations relay_file_operations = {
+	.open		= relay_file_open,
+	.poll		= relay_file_poll,
+	.mmap		= relay_file_mmap,
+	.read		= relay_file_read,
+	.llseek		= no_llseek,
+	.release	= relay_file_release,
+};
+EXPORT_SYMBOL_GPL(relay_file_operations);
-- 
cgit v1.2.3


From 221415d76231d9012871e6e6abcbad906c46626a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 19:57:55 +0100
Subject: [PATCH] relay: add sendfile() support

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 kernel/relay.c | 144 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 115 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 9358e8eb8476..fefe2b2a7277 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,15 +95,16 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
  *	@buf: the buffer struct
  *	@size: total size of the buffer
  *
- *	Returns a pointer to the resulting buffer, NULL if unsuccessful
+ *	Returns a pointer to the resulting buffer, NULL if unsuccessful. The
+ *	passed in size will get page aligned, if it isn't already.
  */
-static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
+static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
 {
 	void *mem;
 	unsigned int i, j, n_pages;
 
-	size = PAGE_ALIGN(size);
-	n_pages = size >> PAGE_SHIFT;
+	*size = PAGE_ALIGN(*size);
+	n_pages = *size >> PAGE_SHIFT;
 
 	buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
 	if (!buf->page_array)
@@ -118,7 +119,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
 	if (!mem)
 		goto depopulate;
 
-	memset(mem, 0, size);
+	memset(mem, 0, *size);
 	buf->page_count = n_pages;
 	return mem;
 
@@ -146,7 +147,7 @@ struct rchan_buf *relay_create_buf(struct rchan *chan)
 	if (!buf->padding)
 		goto free_buf;
 
-	buf->start = relay_alloc_buf(buf, chan->alloc_size);
+	buf->start = relay_alloc_buf(buf, &chan->alloc_size);
 	if (!buf->start)
 		goto free_buf;
 
@@ -543,6 +544,9 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
 		buf->padding[old_subbuf] = buf->prev_padding;
 		buf->subbufs_produced++;
+		buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
+			buf->padding[old_subbuf];
+		smp_mb();
 		if (waitqueue_active(&buf->read_wait)) {
 			PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
 			schedule_delayed_work(&buf->wake_readers, 1);
@@ -757,37 +761,33 @@ static void relay_file_read_consume(struct rchan_buf *buf,
  */
 static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
 {
-	size_t bytes_produced, bytes_consumed, write_offset;
 	size_t subbuf_size = buf->chan->subbuf_size;
 	size_t n_subbufs = buf->chan->n_subbufs;
-	size_t produced = buf->subbufs_produced % n_subbufs;
-	size_t consumed = buf->subbufs_consumed % n_subbufs;
+	size_t produced = buf->subbufs_produced;
+	size_t consumed = buf->subbufs_consumed;
 
-	write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+	relay_file_read_consume(buf, read_pos, 0);
 
-	if (consumed > produced) {
-		if ((produced > n_subbufs) &&
-		    (produced + n_subbufs - consumed <= n_subbufs))
-			produced += n_subbufs;
-	} else if (consumed == produced) {
-		if (buf->offset > subbuf_size) {
-			produced += n_subbufs;
-			if (buf->subbufs_produced == buf->subbufs_consumed)
-				consumed += n_subbufs;
-		}
+	if (unlikely(buf->offset > subbuf_size)) {
+		if (produced == consumed)
+			return 0;
+		return 1;
 	}
 
-	if (buf->offset > subbuf_size)
-		bytes_produced = (produced - 1) * subbuf_size + write_offset;
-	else
-		bytes_produced = produced * subbuf_size + write_offset;
-	bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
-
-	if (bytes_produced == bytes_consumed)
+	if (unlikely(produced - consumed >= n_subbufs)) {
+		consumed = (produced / n_subbufs) * n_subbufs;
+		buf->subbufs_consumed = consumed;
+	}
+	
+	produced = (produced % n_subbufs) * subbuf_size + buf->offset;
+	consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
+
+	if (consumed > produced)
+		produced += n_subbufs * subbuf_size;
+	
+	if (consumed == produced)
 		return 0;
 
-	relay_file_read_consume(buf, read_pos, 0);
-
 	return 1;
 }
 
@@ -908,6 +908,91 @@ out:
 	return ret;
 }
 
+static ssize_t relay_file_sendsubbuf(struct file *filp, loff_t *ppos,
+				     size_t count, read_actor_t actor,
+				     void *target)
+{
+	struct rchan_buf *buf = filp->private_data;
+	read_descriptor_t desc;
+	size_t read_start, avail;
+	unsigned long pidx, poff;
+	unsigned int subbuf_pages;
+	ssize_t ret = 0;
+
+	if (!relay_file_read_avail(buf, *ppos))
+		return 0;
+
+	read_start = relay_file_read_start_pos(*ppos, buf);
+	avail = relay_file_read_subbuf_avail(read_start, buf);
+	if (!avail)
+		return 0;
+
+	count = min(count, avail);
+
+	desc.written = 0;
+	desc.count = count;
+	desc.arg.data = target;
+	desc.error = 0;
+
+	subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
+	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
+	poff = read_start & ~PAGE_MASK;
+	while (count) {
+		struct page *p = buf->page_array[pidx];
+		unsigned int len;
+
+		len = PAGE_SIZE - poff;
+		if (len > count)
+			len = count;
+
+		len = actor(&desc, p, poff, len);
+
+		if (desc.error) {
+			if (!ret)
+				ret = desc.error;
+			break;
+		}
+
+		count -= len;
+		ret += len;
+		poff = 0;
+		pidx = (pidx + 1) % subbuf_pages;
+	}
+
+	if (ret > 0) {
+		relay_file_read_consume(buf, read_start, ret);
+		*ppos = relay_file_read_end_pos(buf, read_start, ret);
+	}
+
+	return ret;
+}
+
+static ssize_t relay_file_sendfile(struct file *filp, loff_t *ppos,
+				   size_t count, read_actor_t actor,
+				   void *target)
+{
+	ssize_t sent = 0, ret = 0;
+
+	if (!count)
+		return 0;
+
+	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
+
+	do {
+		ret = relay_file_sendsubbuf(filp, ppos, count, actor, target);
+		if (ret < 0) {
+			if (!sent)
+				sent = ret;
+			break;
+		}
+		count -= ret;
+		sent += ret;
+	} while (count && ret);
+
+	mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
+	return sent;
+}
+
 struct file_operations relay_file_operations = {
 	.open		= relay_file_open,
 	.poll		= relay_file_poll,
@@ -915,5 +1000,6 @@ struct file_operations relay_file_operations = {
 	.read		= relay_file_read,
 	.llseek		= no_llseek,
 	.release	= relay_file_release,
+	.sendfile       = relay_file_sendfile,
 };
 EXPORT_SYMBOL_GPL(relay_file_operations);
-- 
cgit v1.2.3


From 6dac40a7ce2483a47b54af07afebeb84131c7228 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@us.ibm.com>
Date: Thu, 23 Mar 2006 19:58:45 +0100
Subject: [PATCH] relay: consolidate sendfile() and read() code

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 kernel/relay.c | 175 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 91 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index fefe2b2a7277..33345e73485c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -866,131 +866,138 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
 }
 
 /**
- *	relay_file_read - read file op for relay files
- *	@filp: the file
- *	@buffer: the userspace buffer
- *	@count: number of bytes to read
- *	@ppos: position to read from
- *
- *	Reads count bytes or the number of bytes available in the
- *	current sub-buffer being read, whichever is smaller.
+ *	subbuf_read_actor - read up to one subbuf's worth of data
  */
-static ssize_t relay_file_read(struct file *filp,
-			       char __user *buffer,
-			       size_t count,
-			       loff_t *ppos)
+static int subbuf_read_actor(size_t read_start,
+			     struct rchan_buf *buf,
+			     size_t avail,
+			     read_descriptor_t *desc,
+			     read_actor_t actor)
 {
-	struct rchan_buf *buf = filp->private_data;
-	struct inode *inode = filp->f_dentry->d_inode;
-	size_t read_start, avail;
-	ssize_t ret = 0;
 	void *from;
-
-	mutex_lock(&inode->i_mutex);
-	if(!relay_file_read_avail(buf, *ppos))
-		goto out;
-
-	read_start = relay_file_read_start_pos(*ppos, buf);
-	avail = relay_file_read_subbuf_avail(read_start, buf);
-	if (!avail)
-		goto out;
+	int ret = 0;
 
 	from = buf->start + read_start;
-	ret = count = min(count, avail);
-	if (copy_to_user(buffer, from, count)) {
-		ret = -EFAULT;
-		goto out;
+	ret = avail;
+	if (copy_to_user(desc->arg.data, from, avail)) {
+		desc->error = -EFAULT;
+		ret = 0;
 	}
-	relay_file_read_consume(buf, read_start, count);
-	*ppos = relay_file_read_end_pos(buf, read_start, count);
-out:
-	mutex_unlock(&inode->i_mutex);
+	desc->arg.data += ret;
+	desc->written += ret;
+	desc->count -= ret;
+
 	return ret;
 }
 
-static ssize_t relay_file_sendsubbuf(struct file *filp, loff_t *ppos,
-				     size_t count, read_actor_t actor,
-				     void *target)
+/**
+ *	subbuf_send_actor - send up to one subbuf's worth of data
+ */
+static int subbuf_send_actor(size_t read_start,
+			     struct rchan_buf *buf,
+			     size_t avail,
+			     read_descriptor_t *desc,
+			     read_actor_t actor)
 {
-	struct rchan_buf *buf = filp->private_data;
-	read_descriptor_t desc;
-	size_t read_start, avail;
 	unsigned long pidx, poff;
 	unsigned int subbuf_pages;
-	ssize_t ret = 0;
-
-	if (!relay_file_read_avail(buf, *ppos))
-		return 0;
-
-	read_start = relay_file_read_start_pos(*ppos, buf);
-	avail = relay_file_read_subbuf_avail(read_start, buf);
-	if (!avail)
-		return 0;
-
-	count = min(count, avail);
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
+	int ret = 0;
 
 	subbuf_pages = buf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
-	while (count) {
+	while (avail) {
 		struct page *p = buf->page_array[pidx];
 		unsigned int len;
 
 		len = PAGE_SIZE - poff;
-		if (len > count)
-			len = count;
+		if (len > avail)
+			len = avail;
 
-		len = actor(&desc, p, poff, len);
-
-		if (desc.error) {
-			if (!ret)
-				ret = desc.error;
+		len = actor(desc, p, poff, len);
+		if (desc->error)
 			break;
-		}
 
-		count -= len;
+		avail -= len;
 		ret += len;
 		poff = 0;
 		pidx = (pidx + 1) % subbuf_pages;
 	}
 
-	if (ret > 0) {
-		relay_file_read_consume(buf, read_start, ret);
-		*ppos = relay_file_read_end_pos(buf, read_start, ret);
-	}
-
 	return ret;
 }
 
-static ssize_t relay_file_sendfile(struct file *filp, loff_t *ppos,
-				   size_t count, read_actor_t actor,
-				   void *target)
+typedef int (*subbuf_actor_t) (size_t read_start,
+			       struct rchan_buf *buf,
+			       size_t avail,
+			       read_descriptor_t *desc,
+			       read_actor_t actor);
+
+/**
+ *	relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
+ */
+static inline ssize_t relay_file_read_subbufs(struct file *filp,
+					      loff_t *ppos,
+					      size_t count,
+					      subbuf_actor_t subbuf_actor,
+					      read_actor_t actor,
+					      void *target)
 {
-	ssize_t sent = 0, ret = 0;
+	struct rchan_buf *buf = filp->private_data;
+	size_t read_start, avail;
+	read_descriptor_t desc;
+	int ret;
 
 	if (!count)
 		return 0;
 
-	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
+	desc.written = 0;
+	desc.count = count;
+	desc.arg.data = target;
+	desc.error = 0;
 
+	mutex_lock(&filp->f_dentry->d_inode->i_mutex);
 	do {
-		ret = relay_file_sendsubbuf(filp, ppos, count, actor, target);
-		if (ret < 0) {
-			if (!sent)
-				sent = ret;
+		if (!relay_file_read_avail(buf, *ppos))
+			break;
+
+		read_start = relay_file_read_start_pos(*ppos, buf);
+		avail = relay_file_read_subbuf_avail(read_start, buf);
+		if (!avail)
 			break;
-		}
-		count -= ret;
-		sent += ret;
-	} while (count && ret);
 
+		avail = min(desc.count, avail);
+		ret = subbuf_actor(read_start, buf, avail, &desc, actor);
+		if (desc.error < 0)
+			break;
+
+		if (ret) {
+			relay_file_read_consume(buf, read_start, ret);
+			*ppos = relay_file_read_end_pos(buf, read_start, ret);
+		}
+	} while (desc.count && ret);
 	mutex_unlock(&filp->f_dentry->d_inode->i_mutex);
-	return sent;
+
+	return desc.written;
+}
+
+static ssize_t relay_file_read(struct file *filp,
+			       char __user *buffer,
+			       size_t count,
+			       loff_t *ppos)
+{
+	return relay_file_read_subbufs(filp, ppos, count, subbuf_read_actor,
+				       NULL, buffer);
+}
+
+static ssize_t relay_file_sendfile(struct file *filp,
+				   loff_t *ppos,
+				   size_t count,
+				   read_actor_t actor,
+				   void *target)
+{
+	return relay_file_read_subbufs(filp, ppos, count, subbuf_send_actor,
+				       actor, target);
 }
 
 struct file_operations relay_file_operations = {
-- 
cgit v1.2.3


From 2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 20:00:26 +0100
Subject: [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/Kconfig                |  12 +
 block/Makefile               |   2 +
 block/blktrace.c             | 538 +++++++++++++++++++++++++++++++++++++++++++
 block/elevator.c             |   4 +
 block/ioctl.c                |   6 +
 block/ll_rw_blk.c            |  44 +++-
 drivers/block/cciss.c        |   2 +
 drivers/md/dm.c              |  13 +-
 fs/bio.c                     |   4 +
 fs/compat_ioctl.c            |   1 +
 include/linux/blkdev.h       |   3 +
 include/linux/blktrace_api.h | 277 ++++++++++++++++++++++
 include/linux/compat_ioctl.h |   4 +
 include/linux/fs.h           |   4 +
 include/linux/sched.h        |   1 +
 kernel/fork.c                |   1 +
 mm/highmem.c                 |   3 +
 17 files changed, 916 insertions(+), 3 deletions(-)
 create mode 100644 block/blktrace.c
 create mode 100644 include/linux/blktrace_api.h

(limited to 'kernel')

diff --git a/block/Kconfig b/block/Kconfig
index 377f6dd20e17..96783645092d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -11,4 +11,16 @@ config LBD
 	  your machine, or if you want to have a raid or loopback device
 	  bigger than 2TB.  Otherwise say N.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	select RELAY
+	select DEBUG_FS
+	help
+	  Say Y here, if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the user space
+	  support tools needed), fetch the blktrace app from:
+
+	  git://brick.kernel.dk/data/git/blktrace.git
+
 source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index 7e4f93e2b44e..c05de0e0037f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
diff --git a/block/blktrace.c b/block/blktrace.c
new file mode 100644
index 000000000000..36f3a172275f
--- /dev/null
+++ b/block/blktrace.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <asm/uaccess.h>
+
+static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
+static unsigned int blktrace_seq __read_mostly = 1;
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+	struct blk_io_trace *t;
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
+	if (t) {
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->device = bt->dev;
+		t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
+		t->pid = tsk->pid;
+		t->cpu = smp_processor_id();
+		t->pdu_len = sizeof(tsk->comm);
+		memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
+		tsk->btrace_seq = blktrace_seq;
+	}
+}
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+			 pid_t pid)
+{
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return 1;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return 1;
+	if (bt->pid && pid != bt->pid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
+
+/*
+ * Bio action bits of interest
+ */
+static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };
+
+/*
+ * More could be added as needed, taking care to increment the decrementer
+ * to get correct indexing
+ */
+#define trace_barrier_bit(rw)	\
+	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
+#define trace_sync_bit(rw)	\
+	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct task_struct *tsk = current;
+	struct blk_io_trace *t;
+	unsigned long flags;
+	unsigned long *sequence;
+	pid_t pid;
+	int cpu;
+
+	if (unlikely(bt->trace_state != Blktrace_running))
+		return;
+
+	what |= ddir_act[rw & WRITE];
+	what |= bio_act[trace_barrier_bit(rw)];
+	what |= bio_act[trace_sync_bit(rw)];
+
+	pid = tsk->pid;
+	if (unlikely(act_log_check(bt, what, sector, pid)))
+		return;
+
+	/*
+	 * A word about the locking here - we disable interrupts to reserve
+	 * some space in the relay per-cpu buffer, to prevent an irq
+	 * from coming in and stepping on our toes. Once reserved, it's
+	 * enough to get preemption disabled to prevent read of this data
+	 * before we are through filling it. get_cpu()/put_cpu() does this
+	 * for us
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	if (t) {
+		cpu = smp_processor_id();
+		sequence = per_cpu_ptr(bt->sequence, cpu);
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->sequence = ++(*sequence);
+		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+		t->sector = sector;
+		t->bytes = bytes;
+		t->action = what;
+		t->pid = pid;
+		t->device = bt->dev;
+		t->cpu = cpu;
+		t->error = error;
+		t->pdu_len = pdu_len;
+
+		if (pdu_len)
+			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(__blk_add_trace);
+
+static struct dentry *blk_tree_root;
+static struct mutex blk_tree_mutex;
+static unsigned int root_users;
+
+static inline void blk_remove_root(void)
+{
+	if (blk_tree_root) {
+		debugfs_remove(blk_tree_root);
+		blk_tree_root = NULL;
+	}
+}
+
+static void blk_remove_tree(struct dentry *dir)
+{
+	mutex_lock(&blk_tree_mutex);
+	debugfs_remove(dir);
+	if (--root_users == 0)
+		blk_remove_root();
+	mutex_unlock(&blk_tree_mutex);
+}
+
+static struct dentry *blk_create_tree(const char *blk_name)
+{
+	struct dentry *dir = NULL;
+
+	mutex_lock(&blk_tree_mutex);
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			goto err;
+	}
+
+	dir = debugfs_create_dir(blk_name, blk_tree_root);
+	if (dir)
+		root_users++;
+	else
+		blk_remove_root();
+
+err:
+	mutex_unlock(&blk_tree_mutex);
+	return dir;
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	relay_close(bt->rchan);
+	debugfs_remove(bt->dropped_file);
+	blk_remove_tree(bt->dir);
+	free_percpu(bt->sequence);
+	kfree(bt);
+}
+
+static int blk_trace_remove(request_queue_t *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (!bt)
+		return -EINVAL;
+
+	if (bt->trace_state == Blktrace_setup ||
+	    bt->trace_state == Blktrace_stopped)
+		blk_trace_cleanup(bt);
+
+	return 0;
+}
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->u.generic_ip;
+
+	return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct blk_trace *bt = filp->private_data;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static struct file_operations blk_dropped_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_dropped_open,
+	.read =		blk_dropped_read,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+				     void *prev_subbuf, size_t prev_padding)
+{
+	struct blk_trace *bt;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	bt = buf->chan->private_data;
+	atomic_inc(&bt->dropped);
+	return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+						   struct dentry *parent,
+						   int mode,
+						   struct rchan_buf *buf,
+						   int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+					&relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+	.subbuf_start		= blk_subbuf_start_callback,
+	.create_buf_file	= blk_create_buf_file_callback,
+	.remove_buf_file	= blk_remove_buf_file_callback,
+};
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
+			   char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	struct blk_trace *old_bt, *bt = NULL;
+	struct dentry *dir = NULL;
+	char b[BDEVNAME_SIZE];
+	int ret, i;
+
+	if (copy_from_user(&buts, arg, sizeof(buts)))
+		return -EFAULT;
+
+	if (!buts.buf_size || !buts.buf_nr)
+		return -EINVAL;
+
+	strcpy(buts.name, bdevname(bdev, b));
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts.name); i++)
+		if (buts.name[i] == '/')
+			buts.name[i] = '_';
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->sequence = alloc_percpu(unsigned long);
+	if (!bt->sequence)
+		goto err;
+
+	ret = -ENOENT;
+	dir = blk_create_tree(buts.name);
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = bdev->bd_dev;
+	atomic_set(&bt->dropped, 0);
+
+	ret = -EIO;
+	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+	if (!bt->dropped_file)
+		goto err;
+
+	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
+	if (!bt->rchan)
+		goto err;
+	bt->rchan->private_data = bt;
+
+	bt->act_mask = buts.act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts.start_lba;
+	bt->end_lba = buts.end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts.pid;
+	bt->trace_state = Blktrace_setup;
+
+	ret = -EBUSY;
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt) {
+		(void) xchg(&q->blk_trace, old_bt);
+		goto err;
+	}
+
+	return 0;
+err:
+	if (dir)
+		blk_remove_tree(dir);
+	if (bt) {
+		if (bt->dropped_file)
+			debugfs_remove(bt->dropped_file);
+		if (bt->sequence)
+			free_percpu(bt->sequence);
+		if (bt->rchan)
+			relay_close(bt->rchan);
+		kfree(bt);
+	}
+	return ret;
+}
+
+static int blk_trace_startstop(request_queue_t *q, int start)
+{
+	struct blk_trace *bt;
+	int ret;
+
+	if ((bt = q->blk_trace) == NULL)
+		return -EINVAL;
+
+	/*
+	 * For starting a trace, we can transition from a setup or stopped
+	 * trace. For stopping a trace, the state must be running
+	 */
+	ret = -EINVAL;
+	if (start) {
+		if (bt->trace_state == Blktrace_setup ||
+		    bt->trace_state == Blktrace_stopped) {
+			blktrace_seq++;
+			smp_mb();
+			bt->trace_state = Blktrace_running;
+			ret = 0;
+		}
+	} else {
+		if (bt->trace_state == Blktrace_running) {
+			bt->trace_state = Blktrace_stopped;
+			relay_flush(bt->rchan);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev:	the block device
+ * @cmd: 	the ioctl cmd
+ * @arg:	the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	request_queue_t *q;
+	int ret, start = 0;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	switch (cmd) {
+	case BLKTRACESETUP:
+		ret = blk_trace_setup(q, bdev, arg);
+		break;
+	case BLKTRACESTART:
+		start = 1;
+	case BLKTRACESTOP:
+		ret = blk_trace_startstop(q, start);
+		break;
+	case BLKTRACETEARDOWN:
+		ret = blk_trace_remove(q);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q:    the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(request_queue_t *q)
+{
+	blk_trace_startstop(q, 0);
+	blk_trace_remove(q);
+}
+
+/*
+ * Average offset over two calls to sched_clock() with a gettimeofday()
+ * in the middle
+ */
+static void blk_check_time(unsigned long long *t)
+{
+	unsigned long long a, b;
+	struct timeval tv;
+
+	a = sched_clock();
+	do_gettimeofday(&tv);
+	b = sched_clock();
+
+	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
+	*t -= (a + b) / 2;
+}
+
+static void blk_trace_check_cpu_time(void *data)
+{
+	unsigned long long *t;
+	int cpu = get_cpu();
+
+	t = &per_cpu(blk_trace_cpu_offset, cpu);
+
+	/*
+	 * Just call it twice, hopefully the second call will be cache hot
+	 * and a little more precise
+	 */
+	blk_check_time(t);
+	blk_check_time(t);
+
+	put_cpu();
+}
+
+/*
+ * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
+ * timings
+ */
+static void blk_trace_calibrate_offsets(void)
+{
+	unsigned long flags;
+
+	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
+	local_irq_save(flags);
+	blk_trace_check_cpu_time(NULL);
+	local_irq_restore(flags);
+}
+
+static void blk_trace_set_ht_offsets(void)
+{
+#if defined(CONFIG_SCHED_SMT)
+	int cpu, i;
+
+	/*
+	 * now make sure HT siblings have the same time offset
+	 */
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		unsigned long long *cpu_off, *sibling_off;
+
+		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
+			if (i == cpu)
+				continue;
+
+			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
+			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
+			*sibling_off = *cpu_off;
+		}
+	}
+	preempt_enable();
+#endif
+}
+
+static __init int blk_trace_init(void)
+{
+	mutex_init(&blk_tree_mutex);
+	blk_trace_calibrate_offsets();
+	blk_trace_set_ht_offsets();
+
+	return 0;
+}
+
+module_init(blk_trace_init);
+
diff --git a/block/elevator.c b/block/elevator.c
index db3d0d8296a0..5e558c4689a4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
+#include <linux/blktrace_api.h>
 
 #include <asm/uaccess.h>
 
@@ -333,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where)
 	struct list_head *pos;
 	unsigned ordseq;
 
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+
 	rq->q = q;
 
 	switch (where) {
@@ -499,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q)
 			 * not be passed by new incoming requests
 			 */
 			rq->flags |= REQ_STARTED;
+			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
diff --git a/block/ioctl.c b/block/ioctl.c
index 35fdb7dc6512..9cfa2e1ecb24 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,6 +5,7 @@
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
 		return put_ulong(arg, bdev->bd_inode->i_size >> 9);
 	case BLKGETSIZE64:
 		return put_u64(arg, bdev->bd_inode->i_size);
+	case BLKTRACESTART:
+	case BLKTRACESTOP:
+	case BLKTRACESETUP:
+	case BLKTRACETEARDOWN:
+		return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
 	}
 	return -ENOIOCTLCMD;
 }
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 6c793b196aa9..062067fa7ead 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/blktrace_api.h>
 
 /*
  * for max sense size
@@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q)
 	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
 		return;
 
-	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
 	/*
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
-	if (q->unplug_fn)
+	if (q->unplug_fn) {
+		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+					q->rq.count[READ] + q->rq.count[WRITE]);
+
 		q->unplug_fn(q);
+	}
 }
 
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	q->unplug_fn(q);
 }
 
@@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data)
 {
 	request_queue_t *q = (request_queue_t *)data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	kblockd_schedule_work(&q->unplug_work);
 }
 
@@ -1753,6 +1766,9 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	if (q->blk_trace)
+		blk_trace_shutdown(q);
+
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2129,6 +2145,8 @@ rq_starved:
 	
 	rq_init(q, rq);
 	rq->rl = rl;
+
+	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
 out:
 	return rq;
 }
@@ -2157,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
 		if (!rq) {
 			struct io_context *ioc;
 
+			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
 			io_schedule();
@@ -2210,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
@@ -2844,6 +2866,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->back_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -2859,6 +2883,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->front_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
 			bio->bi_next = req->bio;
 			req->bio = bio;
 
@@ -2976,6 +3002,7 @@ void generic_make_request(struct bio *bio)
 	request_queue_t *q;
 	sector_t maxsector;
 	int ret, nr_sectors = bio_sectors(bio);
+	dev_t old_dev;
 
 	might_sleep();
 	/* Test device or partition size, when known. */
@@ -3002,6 +3029,8 @@ void generic_make_request(struct bio *bio)
 	 * NOTE: we don't repeat the blk_size check for each new device.
 	 * Stacking drivers are expected to know what they are doing.
 	 */
+	maxsector = -1;
+	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
 
@@ -3034,6 +3063,15 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
+		if (maxsector != -1)
+			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
+					    maxsector);
+
+		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+		maxsector = bio->bi_sector;
+		old_dev = bio->bi_bdev->bd_dev;
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 }
@@ -3153,6 +3191,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
+	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
 	 */
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e29b8926f80e..1f2890989b56 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -38,6 +38,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2331,6 +2332,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
 
 	cmd->rq->completion_data = cmd;
 	cmd->rq->errors = status;
+	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
 	blk_complete_request(cmd->rq);
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 26b08ee425c7..8c82373f7ff3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -17,6 +17,7 @@
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/blktrace_api.h>
 
 static const char *_name = DM_NAME;
 
@@ -334,6 +335,8 @@ static void dec_pending(struct dm_io *io, int error)
 			/* nudge anyone waiting on suspend queue */
 			wake_up(&io->md->wait);
 
+		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+
 		bio_endio(io->bio, io->bio->bi_size, io->error);
 		free_io(io->md, io);
 	}
@@ -392,6 +395,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		      struct target_io *tio)
 {
 	int r;
+	sector_t sector;
 
 	/*
 	 * Sanity checks.
@@ -407,10 +411,17 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	 * this io.
 	 */
 	atomic_inc(&tio->io->io_count);
+	sector = clone->bi_sector;
 	r = ti->type->map(ti, clone, &tio->info);
-	if (r > 0)
+	if (r > 0) {
 		/* the bio has been remapped so dispatch it */
+
+		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 
+				    tio->io->bio->bi_bdev->bd_dev, sector, 
+				    clone->bi_sector);
+
 		generic_make_request(clone);
+	}
 
 	else if (r < 0) {
 		/* error the io and bail out */
diff --git a/fs/bio.c b/fs/bio.c
index 8f1d2e815c96..0a8c59cb68f5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 256
@@ -1095,6 +1096,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	if (!bp)
 		return bp;
 
+	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+				bi->bi_sector + first_sectors);
+
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
 	atomic_set(&bp->cnt, 3);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c666769a875d..7c031f00fd79 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -72,6 +72,7 @@
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
 #include <linux/atalk.h>
+#include <linux/blktrace_api.h>
 
 #include <net/sock.h>          /* siocdevprivate_ioctl */
 #include <net/bluetooth/bluetooth.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56bb6a4e15f3..c179966f1a2f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -22,6 +22,7 @@ typedef struct request_queue request_queue_t;
 struct elevator_queue;
 typedef struct elevator_queue elevator_t;
 struct request_pm_state;
+struct blk_trace;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -416,6 +417,8 @@ struct request_queue
 	unsigned int		sg_reserved_size;
 	int			node;
 
+	struct blk_trace	*blk_trace;
+
 	/*
 	 * reserved for flush operations
 	 */
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
new file mode 100644
index 000000000000..b34d3e73d5ea
--- /dev/null
+++ b/include/linux/blktrace_api.h
@@ -0,0 +1,277 @@
+#ifndef BLKTRACE_H
+#define BLKTRACE_H
+
+#include <linux/config.h>
+#include <linux/blkdev.h>
+#include <linux/relay.h>
+
+/*
+ * Trace categories
+ */
+enum blktrace_cat {
+	BLK_TC_READ	= 1 << 0,	/* reads */
+	BLK_TC_WRITE	= 1 << 1,	/* writes */
+	BLK_TC_BARRIER	= 1 << 2,	/* barrier */
+	BLK_TC_SYNC	= 1 << 3,	/* barrier */
+	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
+	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
+	BLK_TC_ISSUE	= 1 << 6,	/* issue */
+	BLK_TC_COMPLETE	= 1 << 7,	/* completions */
+	BLK_TC_FS	= 1 << 8,	/* fs requests */
+	BLK_TC_PC	= 1 << 9,	/* pc requests */
+	BLK_TC_NOTIFY	= 1 << 10,	/* special message */
+
+	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
+};
+
+#define BLK_TC_SHIFT		(16)
+#define BLK_TC_ACT(act)		((act) << BLK_TC_SHIFT)
+
+/*
+ * Basic trace actions
+ */
+enum blktrace_act {
+	__BLK_TA_QUEUE = 1,		/* queued */
+	__BLK_TA_BACKMERGE,		/* back merged to existing rq */
+	__BLK_TA_FRONTMERGE,		/* front merge to existing rq */
+	__BLK_TA_GETRQ,			/* allocated new request */
+	__BLK_TA_SLEEPRQ,		/* sleeping on rq allocation */
+	__BLK_TA_REQUEUE,		/* request requeued */
+	__BLK_TA_ISSUE,			/* sent to driver */
+	__BLK_TA_COMPLETE,		/* completed by driver */
+	__BLK_TA_PLUG,			/* queue was plugged */
+	__BLK_TA_UNPLUG_IO,		/* queue was unplugged by io */
+	__BLK_TA_UNPLUG_TIMER,		/* queue was unplugged by timer */
+	__BLK_TA_INSERT,		/* insert request */
+	__BLK_TA_SPLIT,			/* bio was split */
+	__BLK_TA_BOUNCE,		/* bio was bounced */
+	__BLK_TA_REMAP,			/* bio was remapped */
+};
+
+/*
+ * Trace actions in full. Additionally, read or write is masked
+ */
+#define BLK_TA_QUEUE		(__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_BACKMERGE	(__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_FRONTMERGE	(__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_GETRQ		(__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_SLEEPRQ		(__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_REQUEUE		(__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
+#define BLK_TA_ISSUE		(__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
+#define BLK_TA_COMPLETE		(__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
+#define BLK_TA_PLUG		(__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_IO	(__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_TIMER	(__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_INSERT		(__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_SPLIT		(__BLK_TA_SPLIT)
+#define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
+#define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
+
+#define BLK_IO_TRACE_MAGIC	0x65617400
+#define BLK_IO_TRACE_VERSION	0x07
+
+/*
+ * The trace itself
+ */
+struct blk_io_trace {
+	u32 magic;		/* MAGIC << 8 | version */
+	u32 sequence;		/* event number */
+	u64 time;		/* in microseconds */
+	u64 sector;		/* disk offset */
+	u32 bytes;		/* transfer length */
+	u32 action;		/* what happened */
+	u32 pid;		/* who did it */
+	u32 device;		/* device number */
+	u32 cpu;		/* on what cpu did it happen */
+	u16 error;		/* completion error */
+	u16 pdu_len;		/* length of data after this trace */
+};
+
+/*
+ * The remap event
+ */
+struct blk_io_trace_remap {
+	u32 device;
+	u32 __pad;
+	u64 sector;
+};
+
+enum {
+	Blktrace_setup = 1,
+	Blktrace_running,
+	Blktrace_stopped,
+};
+
+struct blk_trace {
+	int trace_state;
+	struct rchan *rchan;
+	unsigned long *sequence;
+	u16 act_mask;
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+	u32 dev;
+	struct dentry *dir;
+	struct dentry *dropped_file;
+	atomic_t dropped;
+};
+
+/*
+ * User setup structure passed with BLKTRACESTART
+ */
+struct blk_user_trace_setup {
+	char name[BDEVNAME_SIZE];	/* output */
+	u16 act_mask;			/* input */
+	u32 buf_size;			/* input */
+	u32 buf_nr;			/* input */
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+};
+
+#if defined(CONFIG_BLK_DEV_IO_TRACE)
+extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
+extern void blk_trace_shutdown(request_queue_t *);
+extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->flags & 0x07;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ * @what:	the action
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static inline void blk_add_trace_generic(struct request_queue *q,
+					 struct bio *bio, int rw, u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(q, bio, what);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * @q:		queue the io is for
+ * @what:	the action
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
+					 struct bio *bio, unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+	u64 rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#else /* !CONFIG_BLK_DEV_IO_TRACE */
+#define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
+#define blk_trace_shutdown(q)			do { } while (0)
+#define blk_add_trace_rq(q, rq, what)		do { } while (0)
+#define blk_add_trace_bio(q, rq, what)		do { } while (0)
+#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
+#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
+#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#endif
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index ae7dfb790df3..efb518f16bb3 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -97,6 +97,10 @@ COMPATIBLE_IOCTL(BLKRRPART)
 COMPATIBLE_IOCTL(BLKFLSBUF)
 COMPATIBLE_IOCTL(BLKSECTSET)
 COMPATIBLE_IOCTL(BLKSSZGET)
+COMPATIBLE_IOCTL(BLKTRACESTART)
+COMPATIBLE_IOCTL(BLKTRACESTOP)
+COMPATIBLE_IOCTL(BLKTRACESETUP)
+COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
 ULONG_IOCTL(BLKRASET)
 ULONG_IOCTL(BLKFRASET)
 /* RAID */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f9c9dea636d0..9b34a1b03455 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -197,6 +197,10 @@ extern int dir_notify_enable;
 #define BLKBSZGET  _IOR(0x12,112,size_t)
 #define BLKBSZSET  _IOW(0x12,113,size_t)
 #define BLKGETSIZE64 _IOR(0x12,114,size_t)	/* return device size in bytes (u64 *arg) */
+#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
+#define BLKTRACESTART _IO(0x12,116)
+#define BLKTRACESTOP _IO(0x12,117)
+#define BLKTRACETEARDOWN _IO(0x12,118)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62e6314382f0..e60a91d5b369 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -706,6 +706,7 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned short ioprio;
+	unsigned int btrace_seq;
 
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
diff --git a/kernel/fork.c b/kernel/fork.c
index c79ae0b19a49..c21bae8c93b9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+	tsk->btrace_seq = 0;
 	return tsk;
 }
 
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa7..d0ea1eec6a9a 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 static mempool_t *page_pool, *isa_page_pool;
@@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 		pool = isa_page_pool;
 	}
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * slow path
 	 */
-- 
cgit v1.2.3


From 36f574135e36b86bb6ae794bf1d0fce3efa5601f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:15:47 -0800
Subject: [PATCH] free_uid() locking improvement

Reduce lock hold times in free_uid().

Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/user.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index d9deae43a9ab..2116642f42c6 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -105,15 +105,19 @@ void free_uid(struct user_struct *up)
 {
 	unsigned long flags;
 
+	if (!up)
+		return;
+
 	local_irq_save(flags);
-	if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+	if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
 		uid_hash_remove(up);
+		spin_unlock_irqrestore(&uidhash_lock, flags);
 		key_put(up->uid_keyring);
 		key_put(up->session_keyring);
 		kmem_cache_free(uid_cachep, up);
-		spin_unlock(&uidhash_lock);
+	} else {
+		local_irq_restore(flags);
 	}
-	local_irq_restore(flags);
 }
 
 struct user_struct * alloc_uid(uid_t uid)
-- 
cgit v1.2.3


From f6ef943813ac3085ece7252ea101d663581219f6 Mon Sep 17 00:00:00 2001
From: Bart Samwel <bart@samwel.tk>
Date: Fri, 24 Mar 2006 03:15:48 -0800
Subject: [PATCH] Represent dirty_*_centisecs as jiffies internally

Make that the internal values for:

/proc/sys/vm/dirty_writeback_centisecs
/proc/sys/vm/dirty_expire_centisecs

are stored as jiffies instead of centiseconds.  Let the sysctl interface do
the conversions with full precision using clock_t_to_jiffies, instead of
doing overflow-sensitive on-the-fly conversions every time the values are
used.

Cons: apparent precision loss if HZ is not a multiple of 100, because of
conversion back and forth.  This is a common problem for all sysctl values
that use proc_dointvec_userhz_jiffies.  (There is only one other in-tree
use, in net/core/neighbour.c.)

Signed-off-by: Bart Samwel <bart@samwel.tk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h |  4 ++--
 kernel/sysctl.c           | 10 +++++-----
 mm/page-writeback.c       | 24 ++++++++++++------------
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index beaef5c7a0ea..609565961494 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -88,8 +88,8 @@ void throttle_vm_writeout(void);
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
-extern int dirty_writeback_centisecs;
-extern int dirty_expire_centisecs;
+extern int dirty_writeback_interval;
+extern int dirty_expire_interval;
 extern int block_dump;
 extern int laptop_mode;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 32b48e8ee36e..817ba25517eb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -742,18 +742,18 @@ static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_DIRTY_WB_CS,
 		.procname	= "dirty_writeback_centisecs",
-		.data		= &dirty_writeback_centisecs,
-		.maxlen		= sizeof(dirty_writeback_centisecs),
+		.data		= &dirty_writeback_interval,
+		.maxlen		= sizeof(dirty_writeback_interval),
 		.mode		= 0644,
 		.proc_handler	= &dirty_writeback_centisecs_handler,
 	},
 	{
 		.ctl_name	= VM_DIRTY_EXPIRE_CS,
 		.procname	= "dirty_expire_centisecs",
-		.data		= &dirty_expire_centisecs,
-		.maxlen		= sizeof(dirty_expire_centisecs),
+		.data		= &dirty_expire_interval,
+		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_userhz_jiffies,
 	},
 	{
 		.ctl_name	= VM_NR_PDFLUSH_THREADS,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d2..e79107991d20 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
  * The interval between `kupdate'-style writebacks, in centiseconds
  * (hundredths of a second)
  */
-int dirty_writeback_centisecs = 5 * 100;
+int dirty_writeback_interval = 5 * HZ;
 
 /*
  * The longest number of centiseconds for which data is allowed to remain dirty
  */
-int dirty_expire_centisecs = 30 * 100;
+int dirty_expire_interval = 30 * HZ;
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -380,8 +380,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
  * just walks the superblock inode list, writing back any inodes which are
  * older than a specific point in time.
  *
- * Try to run once per dirty_writeback_centisecs.  But if a writeback event
- * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * Try to run once per dirty_writeback_interval.  But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
  * one-second gap.
  *
  * older_than_this takes precedence over nr_to_write.  So we'll only write back
@@ -406,9 +406,9 @@ static void wb_kupdate(unsigned long arg)
 	sync_supers();
 
 	get_writeback_state(&wbs);
-	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+	oldest_jif = jiffies - dirty_expire_interval;
 	start_jif = jiffies;
-	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
+	next_jif = start_jif + dirty_writeback_interval;
 	nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 	while (nr_to_write > 0) {
@@ -425,7 +425,7 @@ static void wb_kupdate(unsigned long arg)
 	}
 	if (time_before(next_jif, jiffies + HZ))
 		next_jif = jiffies + HZ;
-	if (dirty_writeback_centisecs)
+	if (dirty_writeback_interval)
 		mod_timer(&wb_timer, next_jif);
 }
 
@@ -435,11 +435,11 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_centisecs) {
+	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+	if (dirty_writeback_interval) {
 		mod_timer(&wb_timer,
-			jiffies + (dirty_writeback_centisecs * HZ) / 100);
-	} else {
+			jiffies + dirty_writeback_interval);
+		} else {
 		del_timer(&wb_timer);
 	}
 	return 0;
@@ -544,7 +544,7 @@ void __init page_writeback_init(void)
 		if (vm_dirty_ratio <= 0)
 			vm_dirty_ratio = 1;
 	}
-	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 }
-- 
cgit v1.2.3


From ed5b43f15a8e86e3ae939b98bc161ee973ecedf2 Mon Sep 17 00:00:00 2001
From: Bart Samwel <bart@samwel.tk>
Date: Fri, 24 Mar 2006 03:15:49 -0800
Subject: [PATCH] Represent laptop_mode as jiffies internally

Make that the internal value for /proc/sys/vm/laptop_mode is stored as
jiffies instead of seconds.  Let the sysctl interface do the conversions,
instead of doing on-the-fly conversions every time the value is used.

Add a description of the fact that laptop_mode doubles as a flag and a
timeout to the comment above the laptop_mode variable.

Signed-off-by: Bart Samwel <bart@samwel.tk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c     | 5 ++---
 mm/page-writeback.c | 5 +++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 817ba25517eb..d13426680d10 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -848,9 +848,8 @@ static ctl_table vm_table[] = {
 		.data		= &laptop_mode,
 		.maxlen		= sizeof(laptop_mode),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
 	},
 	{
 		.ctl_name	= VM_BLOCK_DUMP,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e79107991d20..c1052ee79f01 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -88,7 +88,8 @@ int dirty_expire_interval = 30 * HZ;
 int block_dump;
 
 /*
- * Flag that puts the machine in "laptop mode".
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
  */
 int laptop_mode;
 
@@ -468,7 +469,7 @@ static void laptop_timer_fn(unsigned long unused)
  */
 void laptop_io_completion(void)
 {
-	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
 }
 
 /*
-- 
cgit v1.2.3


From cba9f33d13a8ca3125b2a30abe2425ce562d8a83 Mon Sep 17 00:00:00 2001
From: Bart Samwel <bart@samwel.tk>
Date: Fri, 24 Mar 2006 03:15:50 -0800
Subject: [PATCH] Range checking in do_proc_dointvec_(userhz_)jiffies_conv

When (integer) sysctl values are in either seconds or centiseconds, but
represented internally as jiffies, the allowable value range is decreased.
This patch adds range checks to the conversion routines.

For values in seconds: maximum LONG_MAX / HZ.

For values in centiseconds: maximum (LONG_MAX / HZ) * USER_HZ.

(BTW, does anyone else feel that an interface in seconds should not be
accepting negative values?)

Signed-off-by: Bart Samwel <bart@samwel.tk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d13426680d10..e82726faeeff 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2053,6 +2053,8 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
 					 int write, void *data)
 {
 	if (write) {
+		if (*lvalp > LONG_MAX / HZ)
+			return 1;
 		*valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ);
 	} else {
 		int val = *valp;
@@ -2074,6 +2076,8 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
 						int write, void *data)
 {
 	if (write) {
+		if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ)
+			return 1;
 		*valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp);
 	} else {
 		int val = *valp;
-- 
cgit v1.2.3


From caa9ee771de3195ae85ac6f8cb550f53e9ecdd82 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 24 Mar 2006 03:15:50 -0800
Subject: [PATCH] rcu_process_callbacks: don't cli() while testing ->nxtlist

__rcu_process_callbacks() disables interrupts to protect itself from
call_rcu() which adds new entries to ->nxtlist.

However we can check "->nxtlist != NULL" with interrupts enabled, we can't
get "false positives" because call_rcu() can only change this condition
from 0 to 1.

Tested with rcutorture.ko.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Dipankar Sarma <dipankar@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 6df1559b1c02..13458bbaa1be 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -416,8 +416,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 		rdp->curtail = &rdp->curlist;
 	}
 
-	local_irq_disable();
 	if (rdp->nxtlist && !rdp->curlist) {
+		local_irq_disable();
 		rdp->curlist = rdp->nxtlist;
 		rdp->curtail = rdp->nxttail;
 		rdp->nxtlist = NULL;
@@ -442,9 +442,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 			rcu_start_batch(rcp);
 			spin_unlock(&rcp->lock);
 		}
-	} else {
-		local_irq_enable();
 	}
+
 	rcu_check_quiescent_state(rcp, rdp);
 	if (rdp->donelist)
 		rcu_do_batch(rdp);
-- 
cgit v1.2.3


From a4a6198b80cf82eb8160603c98da218d1bd5e104 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 24 Mar 2006 03:15:54 -0800
Subject: [PATCH] tvec_bases too large for per-cpu data

With internal Xen-enabled kernels we see the kernel's static per-cpu data
area exceed the limit of 32k on x86-64, and even native x86-64 kernels get
fairly close to that limit.  I generally question whether it is reasonable
to have data structures several kb in size allocated as per-cpu data when
the space there is rather limited.

The biggest arch-independent consumer is tvec_bases (over 4k on 32-bit
archs, over 8k on 64-bit ones), which now gets converted to use dynamically
allocated memory instead.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 45 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 2410c18dbeb1..4427e725ccdd 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -86,7 +86,8 @@ struct tvec_t_base_s {
 } ____cacheline_aligned_in_smp;
 
 typedef struct tvec_t_base_s tvec_base_t;
-static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
+static tvec_base_t boot_tvec_bases;
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
@@ -157,7 +158,7 @@ EXPORT_SYMBOL(__init_timer_base);
 void fastcall init_timer(struct timer_list *timer)
 {
 	timer->entry.next = NULL;
-	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
+	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base;
 }
 EXPORT_SYMBOL(init_timer);
 
@@ -218,7 +219,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 		ret = 1;
 	}
 
-	new_base = &__get_cpu_var(tvec_bases);
+	new_base = __get_cpu_var(tvec_bases);
 
 	if (base != &new_base->t_base) {
 		/*
@@ -258,7 +259,7 @@ EXPORT_SYMBOL(__mod_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
+	tvec_base_t *base = per_cpu(tvec_bases, cpu);
   	unsigned long flags;
 
   	BUG_ON(timer_pending(timer) || !timer->function);
@@ -504,7 +505,7 @@ unsigned long next_timer_interrupt(void)
 	}
 	hr_expires += jiffies;
 
-	base = &__get_cpu_var(tvec_bases);
+	base = __get_cpu_var(tvec_bases);
 	spin_lock(&base->t_base.lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
 	list = NULL;
@@ -901,7 +902,7 @@ EXPORT_SYMBOL(xtime_lock);
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	tvec_base_t *base = &__get_cpu_var(tvec_bases);
+	tvec_base_t *base = __get_cpu_var(tvec_bases);
 
  	hrtimer_run_queues();
 	if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1256,12 +1257,32 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
 	return 0;
 }
 
-static void __devinit init_timers_cpu(int cpu)
+static int __devinit init_timers_cpu(int cpu)
 {
 	int j;
 	tvec_base_t *base;
 
-	base = &per_cpu(tvec_bases, cpu);
+	base = per_cpu(tvec_bases, cpu);
+	if (!base) {
+		static char boot_done;
+
+		/*
+		 * Cannot do allocation in init_timers as that runs before the
+		 * allocator initializes (and would waste memory if there are
+		 * more possible CPUs than will ever be installed/brought up).
+		 */
+		if (boot_done) {
+			base = kmalloc_node(sizeof(*base), GFP_KERNEL,
+						cpu_to_node(cpu));
+			if (!base)
+				return -ENOMEM;
+			memset(base, 0, sizeof(*base));
+		} else {
+			base = &boot_tvec_bases;
+			boot_done = 1;
+		}
+		per_cpu(tvec_bases, cpu) = base;
+	}
 	spin_lock_init(&base->t_base.lock);
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
@@ -1273,6 +1294,7 @@ static void __devinit init_timers_cpu(int cpu)
 		INIT_LIST_HEAD(base->tv1.vec + j);
 
 	base->timer_jiffies = jiffies;
+	return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1295,8 +1317,8 @@ static void __devinit migrate_timers(int cpu)
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = &per_cpu(tvec_bases, cpu);
-	new_base = &get_cpu_var(tvec_bases);
+	old_base = per_cpu(tvec_bases, cpu);
+	new_base = get_cpu_var(tvec_bases);
 
 	local_irq_disable();
 	spin_lock(&new_base->t_base.lock);
@@ -1326,7 +1348,8 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
 	long cpu = (long)hcpu;
 	switch(action) {
 	case CPU_UP_PREPARE:
-		init_timers_cpu(cpu);
+		if (init_timers_cpu(cpu) < 0)
+			return NOTIFY_BAD;
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
-- 
cgit v1.2.3


From 95c3832272fc77ea3e31f6382f82ba17be985cc7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Fri, 24 Mar 2006 03:15:58 -0800
Subject: [PATCH] rcutorture: tag success/failure line with module parameters

A long-running rcutorture test can overflow dmesg, so that the line
containing the module parameters is lost.  Although it is usually possible
to retrieve this information from the log files, it is much better to just
tag it onto the final success/failure line so that it may be easily found.
This patch does just that.

Signed-off-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7712912dbc84..9a1fa8894b95 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -441,6 +441,16 @@ rcu_torture_shuffle(void *arg)
 	return 0;
 }
 
+static inline void
+rcu_torture_print_module_parms(char *tag)
+{
+	printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d "
+		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
+		"shuffle_interval = %d\n",
+		tag, nrealreaders, stat_interval, verbose, test_no_idle_hz,
+		shuffle_interval);
+}
+
 static void
 rcu_torture_cleanup(void)
 {
@@ -483,9 +493,10 @@ rcu_torture_cleanup(void)
 	rcu_barrier();
 
 	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
-	printk(KERN_ALERT TORTURE_FLAG
-	       "--- End of test: %s\n",
-	       atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE");
+	if (atomic_read(&n_rcu_torture_error))
+		rcu_torture_print_module_parms("End of test: FAILURE");
+	else
+		rcu_torture_print_module_parms("End of test: SUCCESS");
 }
 
 static int
@@ -501,11 +512,7 @@ rcu_torture_init(void)
 		nrealreaders = nreaders;
 	else
 		nrealreaders = 2 * num_online_cpus();
-	printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d "
-		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
-		"shuffle_interval = %d\n",
-		nrealreaders, stat_interval, verbose, test_no_idle_hz,
-		shuffle_interval);
+	rcu_torture_print_module_parms("Start of test");
 	fullstop = 0;
 
 	/* Set up the freelist. */
-- 
cgit v1.2.3


From 7b5b9ef0e17d52c188fe73ea78e884fe67079e6c Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:00 -0800
Subject: [PATCH] cpuset cleanup not not operators

Since the test_bit() bit operator is boolean (return 0 or 1), the double not
"!!" operations needed to convert a scalar (zero or not zero) to a boolean are
not needed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c86ee051b734..9f28e1f00185 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -114,27 +114,27 @@ typedef enum {
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
-	return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
 }
 
 static inline int is_mem_exclusive(const struct cpuset *cs)
 {
-	return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
 static inline int is_removed(const struct cpuset *cs)
 {
-	return !!test_bit(CS_REMOVED, &cs->flags);
+	return test_bit(CS_REMOVED, &cs->flags);
 }
 
 static inline int notify_on_release(const struct cpuset *cs)
 {
-	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
 }
 
 static inline int is_memory_migrate(const struct cpuset *cs)
 {
-	return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
 }
 
 /*
-- 
cgit v1.2.3


From 8a39cc60bfa5a72f32d975729a354daca124f6de Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:01 -0800
Subject: [PATCH] cpuset use combined atomic_inc_return calls

Replace pairs of calls to <atomic_inc, atomic_read>, with a single call
atomic_inc_return, saving a few bytes of source and kernel text.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9f28e1f00185..44d13c246e5c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -858,8 +858,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs.mems_allowed;
-	atomic_inc(&cpuset_mems_generation);
-	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
 	mutex_unlock(&callback_mutex);
 
 	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */
@@ -1770,8 +1769,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	atomic_set(&cs->count, 0);
 	INIT_LIST_HEAD(&cs->sibling);
 	INIT_LIST_HEAD(&cs->children);
-	atomic_inc(&cpuset_mems_generation);
-	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
 	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
@@ -1861,7 +1859,7 @@ int __init cpuset_init_early(void)
 	struct task_struct *tsk = current;
 
 	tsk->cpuset = &top_cpuset;
-	tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
+	tsk->cpuset->mems_generation = atomic_inc_return(&cpuset_mems_generation);
 	return 0;
 }
 
@@ -1880,8 +1878,7 @@ int __init cpuset_init(void)
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
 	fmeter_init(&top_cpuset.fmeter);
-	atomic_inc(&cpuset_mems_generation);
-	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
+	top_cpuset.mems_generation = atomic_inc_return(&cpuset_mems_generation);
 
 	init_task.cpuset = &top_cpuset;
 
-- 
cgit v1.2.3


From 825a46af5ac171f9f41f794a0a00165588ba1589 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:03 -0800
Subject: [PATCH] cpuset memory spread basic implementation

This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).

The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.

All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.

There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures.  They are called 'memory_spread_page' and 'memory_spread_slab'.

If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.

If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.

The implementation is simple.  Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset.  In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.

The cpuset_mem_spread_node() routine is also simple.  It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.

This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit.  Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.

A couple of Copyright year ranges are updated as well.  And a couple of email
addresses that can be found in the MAINTAINERS file are removed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt |  76 ++++++++++++++++++++++++++++++++-
 include/linux/cpuset.h    |  29 ++++++++++++-
 include/linux/sched.h     |   3 ++
 kernel/cpuset.c           | 104 +++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 203 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 30c41459953c..159e2a0c3e80 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -18,7 +18,8 @@ CONTENTS:
   1.4 What are exclusive cpusets ?
   1.5 What does notify_on_release do ?
   1.6 What is memory_pressure ?
-  1.7 How do I use cpusets ?
+  1.7 What is memory spread ?
+  1.8 How do I use cpusets ?
 2. Usage Examples and Syntax
   2.1 Basic Usage
   2.2 Adding/removing cpus
@@ -317,7 +318,78 @@ the tasks in the cpuset, in units of reclaims attempted per second,
 times 1000.
 
 
-1.7 How do I use cpusets ?
+1.7 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related in
+kernel data structures.  They are called 'memory_spread_page' and
+'memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the tasks NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the tasks NUMA mempolicy and be spread
+instead.    Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing tasks memory spread settings.  If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag
+files.  By default they contain "0", meaning that the feature is off
+for that cpuset.  If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'memory_spread_page' turns on a per-process flag
+PF_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset.  The page allocation calls for the page cache
+is modified to perform an inline check for this PF_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'memory_spread_cache' turns on the flag
+PF_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple.  It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current tasks mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the jobs cpuset in order to fit.  Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the jobs cpuset
+can become very uneven.
+
+
+1.8 How do I use cpusets ?
 --------------------------
 
 In order to minimize the impact of cpusets on critical kernel
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3bc606927116..9354722a9217 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -4,7 +4,7 @@
  *  cpuset interface
  *
  *  Copyright (C) 2003 BULL SA
- *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  */
 
@@ -51,6 +51,18 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
 
+extern int cpuset_mem_spread_node(void);
+
+static inline int cpuset_do_page_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_PAGE;
+}
+
+static inline int cpuset_do_slab_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_SLAB;
+}
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -99,6 +111,21 @@ static inline char *cpuset_task_status_allowed(struct task_struct *task,
 static inline void cpuset_lock(void) {}
 static inline void cpuset_unlock(void) {}
 
+static inline int cpuset_mem_spread_node(void)
+{
+	return 0;
+}
+
+static inline int cpuset_do_page_mem_spread(void)
+{
+	return 0;
+}
+
+static inline int cpuset_do_slab_mem_spread(void)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e60a91d5b369..b0e37cfa09f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -869,6 +869,7 @@ struct task_struct {
 	struct cpuset *cpuset;
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
+	int cpuset_mem_spread_rotor;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
@@ -929,6 +930,8 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 44d13c246e5c..38f18b33de6c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
  *  Processor and Memory placement constraints for sets of tasks.
  *
  *  Copyright (C) 2003 BULL SA.
- *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  *  Portions derived from Patrick Mochel's sysfs code.
  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
- *  Portions Copyright (c) 2004 Silicon Graphics, Inc.
  *
- *  2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ *  2003-10-10 Written by Simon Derr.
  *  2003-10-22 Updates by Stephen Hemminger.
- *  2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ *  2004 May-July Rework by Paul Jackson.
  *
  *  This file is subject to the terms and conditions of the GNU General Public
  *  License.  See the file COPYING in the main directory of the Linux
@@ -108,7 +107,9 @@ typedef enum {
 	CS_MEM_EXCLUSIVE,
 	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
-	CS_NOTIFY_ON_RELEASE
+	CS_NOTIFY_ON_RELEASE,
+	CS_SPREAD_PAGE,
+	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
@@ -137,6 +138,16 @@ static inline int is_memory_migrate(const struct cpuset *cs)
 	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
 }
 
+static inline int is_spread_page(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
 /*
  * Increment this atomic integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -657,6 +668,14 @@ void cpuset_update_task_memory_state(void)
 		cs = tsk->cpuset;	/* Maybe changed when task not locked */
 		guarantee_online_mems(cs, &tsk->mems_allowed);
 		tsk->cpuset_mems_generation = cs->mems_generation;
+		if (is_spread_page(cs))
+			tsk->flags |= PF_SPREAD_PAGE;
+		else
+			tsk->flags &= ~PF_SPREAD_PAGE;
+		if (is_spread_slab(cs))
+			tsk->flags |= PF_SPREAD_SLAB;
+		else
+			tsk->flags &= ~PF_SPREAD_SLAB;
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		mpol_rebind_task(tsk, &tsk->mems_allowed);
@@ -956,7 +975,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
+ *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
+ *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
@@ -1187,6 +1207,8 @@ typedef enum {
 	FILE_NOTIFY_ON_RELEASE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
+	FILE_SPREAD_PAGE,
+	FILE_SPREAD_SLAB,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -1246,6 +1268,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
+	case FILE_SPREAD_PAGE:
+		retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+		cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+		break;
+	case FILE_SPREAD_SLAB:
+		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+		cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1355,6 +1385,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_PRESSURE:
 		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
 		break;
+	case FILE_SPREAD_PAGE:
+		*s++ = is_spread_page(cs) ? '1' : '0';
+		break;
+	case FILE_SPREAD_SLAB:
+		*s++ = is_spread_slab(cs) ? '1' : '0';
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1718,6 +1754,16 @@ static struct cftype cft_memory_pressure = {
 	.private = FILE_MEMORY_PRESSURE,
 };
 
+static struct cftype cft_spread_page = {
+	.name = "memory_spread_page",
+	.private = FILE_SPREAD_PAGE,
+};
+
+static struct cftype cft_spread_slab = {
+	.name = "memory_spread_slab",
+	.private = FILE_SPREAD_SLAB,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1736,6 +1782,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1764,6 +1814,10 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	if (is_spread_page(parent))
+		set_bit(CS_SPREAD_PAGE, &cs->flags);
+	if (is_spread_slab(parent))
+		set_bit(CS_SPREAD_SLAB, &cs->flags);
 	cs->cpus_allowed = CPU_MASK_NONE;
 	cs->mems_allowed = NODE_MASK_NONE;
 	atomic_set(&cs->count, 0);
@@ -2200,6 +2254,44 @@ void cpuset_unlock(void)
 	mutex_unlock(&callback_mutex);
 }
 
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online.  So it
+ * should not be possible for the following code to return an
+ * offline node.  But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start.  The zonelist passed to
+ * __alloc_pages() will include all nodes.  If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+int cpuset_mem_spread_node(void)
+{
+	int node;
+
+	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+	if (node == MAX_NUMNODES)
+		node = first_node(current->mems_allowed);
+	current->cpuset_mem_spread_rotor = node;
+	return node;
+}
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
 /**
  * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
  * @p: pointer to task_struct of some other task.
-- 
cgit v1.2.3


From c61afb181c649754ea221f104e268cbacfc993e3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:08 -0800
Subject: [PATCH] cpuset memory spread slab cache optimizations

The hooks in the slab cache allocator code path for support of NUMA
mempolicies and cpuset memory spreading are in an important code path.  Many
systems will use neither feature.

This patch optimizes those hooks down to a single check of some bits in the
current tasks task_struct flags.  For non NUMA systems, this hook and related
code is already ifdef'd out.

The optimization is done by using another task flag, set if the task is using
a non-default NUMA mempolicy.  Taking this flag bit along with the
PF_SPREAD_PAGE and PF_SPREAD_SLAB flag bits added earlier in this 'cpuset
memory spreading' patch set, one can check for the combination of any of these
special case memory placement mechanisms with a single test of the current
tasks task_struct flags.

This patch also tightens up the code, to save a few bytes of kernel text
space, and moves some of it out of line.  Due to the nested inlines called
from multiple places, we were ending up with three copies of this code, which
once we get off the main code path (for local node allocation) seems a bit
wasteful of instruction memory.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |  5 +++++
 include/linux/sched.h     |  1 +
 kernel/fork.c             |  1 +
 mm/mempolicy.c            | 32 ++++++++++++++++++++++++++++++++
 mm/slab.c                 | 41 ++++++++++++++++++++++++++++-------------
 5 files changed, 67 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index bbd2221923c3..6a7621b2b12b 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -147,6 +147,7 @@ extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void mpol_fix_fork_child_flag(struct task_struct *p);
 #define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
 
 #ifdef CONFIG_CPUSET
@@ -248,6 +249,10 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
+static inline void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+}
+
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0e37cfa09f5..2cda439ece43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -932,6 +932,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/fork.c b/kernel/fork.c
index c21bae8c93b9..a02063903aaa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1021,6 +1021,7 @@ static task_t *copy_process(unsigned long clone_flags,
  		p->mempolicy = NULL;
  		goto bad_fork_cleanup_cpuset;
  	}
+	mpol_fix_fork_child_flag(p);
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22b..4f71cfd29c6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }
 
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+	if (p->mempolicy)
+		p->flags |= PF_MEMPOLICY;
+	else
+		p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+	mpol_fix_fork_child_flag(current);
+}
+
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
+	mpol_set_task_struct_flag();
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = first_node(new->v.nodes);
 	return 0;
diff --git a/mm/slab.c b/mm/slab.c
index de516658d3d8..f80b52388a12 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -899,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 
 #ifdef CONFIG_NUMA
 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -2808,19 +2809,11 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	struct array_cache *ac;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(current->mempolicy && !in_interrupt())) {
-		int nid = slab_node(current->mempolicy);
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
-	}
-	if (unlikely(cpuset_do_slab_mem_spread() &&
-					(cachep->flags & SLAB_MEM_SPREAD) &&
-					!in_interrupt())) {
-		int nid = cpuset_mem_spread_node();
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
+	if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB |
+							PF_MEMPOLICY))) {
+		objp = alternate_node_alloc(cachep, flags);
+		if (objp != NULL)
+			return objp;
 	}
 #endif
 
@@ -2855,6 +2848,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 }
 
 #ifdef CONFIG_NUMA
+/*
+ * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	int nid_alloc, nid_here;
+
+	if (in_interrupt())
+		return NULL;
+	nid_alloc = nid_here = numa_node_id();
+	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+		nid_alloc = cpuset_mem_spread_node();
+	else if (current->mempolicy)
+		nid_alloc = slab_node(current->mempolicy);
+	if (nid_alloc != nid_here)
+		return __cache_alloc_node(cachep, flags, nid_alloc);
+	return NULL;
+}
+
 /*
  * A interface to enable slab creation on nodeid
  */
-- 
cgit v1.2.3


From 8488bc359d674baf710992e4b641513ea5ebd212 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:10 -0800
Subject: [PATCH] cpuset: remove unnecessary NULL check

Remove a no longer needed test for NULL cpuset pointer, with a little
comment explaining why the test isn't needed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 38f18b33de6c..bc4131141230 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2023,7 +2023,7 @@ void cpuset_fork(struct task_struct *child)
  * because tsk is already marked PF_EXITING, so attach_task() won't
  * mess with it, or task is a failed fork, never visible to attach_task.
  *
- * Hack:
+ * the_top_cpuset_hack:
  *
  *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
  *
@@ -2062,7 +2062,7 @@ void cpuset_exit(struct task_struct *tsk)
 	struct cpuset *cs;
 
 	cs = tsk->cpuset;
-	tsk->cpuset = &top_cpuset;	/* Hack - see comment above */
+	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
@@ -2373,12 +2373,12 @@ void __cpuset_memory_pressure_bump(void)
  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
  *    doesn't really matter if tsk->cpuset changes after we read it,
  *    and we take manage_mutex, keeping attach_task() from changing it
- *    anyway.
+ *    anyway.  No need to check that tsk->cpuset != NULL, thanks to
+ *    the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
+ *    cpuset to top_cpuset.
  */
-
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
-	struct cpuset *cs;
 	struct task_struct *tsk;
 	char *buf;
 	int retval = 0;
@@ -2389,13 +2389,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 
 	tsk = m->private;
 	mutex_lock(&manage_mutex);
-	cs = tsk->cpuset;
-	if (!cs) {
-		retval = -EINVAL;
-		goto out;
-	}
-
-	retval = cpuset_path(cs, buf, PAGE_SIZE);
+	retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
 	if (retval < 0)
 		goto out;
 	seq_puts(m, buf);
-- 
cgit v1.2.3


From 151a44202d097ae8b1bbaa6d8d2f97df30e3cd1e Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:11 -0800
Subject: [PATCH] cpuset: don't need to mark cpuset_mems_generation atomic

Drop the atomic_t marking on the cpuset static global
cpuset_mems_generation.  Since all access to it is guarded by the global
manage_mutex, there is no need for further serialization of this value.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index bc4131141230..702928664f42 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -149,7 +149,7 @@ static inline int is_spread_slab(const struct cpuset *cs)
 }
 
 /*
- * Increment this atomic integer everytime any cpuset changes its
+ * Increment this integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
  * number, and avoid having to lock and reload mems_allowed unless
  * the cpuset they're using changes generation.
@@ -163,8 +163,11 @@ static inline int is_spread_slab(const struct cpuset *cs)
  * on every visit to __alloc_pages(), to efficiently check whether
  * its current->cpuset->mems_allowed has changed, requiring an update
  * of its current->mems_allowed.
+ *
+ * Since cpuset_mems_generation is guarded by manage_mutex,
+ * there is no need to mark it atomic.
  */
-static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
+static int cpuset_mems_generation;
 
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
@@ -877,7 +880,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs.mems_allowed;
-	cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */
@@ -1270,11 +1273,11 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 		break;
 	case FILE_SPREAD_PAGE:
 		retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
-		cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	case FILE_SPREAD_SLAB:
 		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
-		cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+		cs->mems_generation = cpuset_mems_generation++;
 		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
@@ -1823,7 +1826,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	atomic_set(&cs->count, 0);
 	INIT_LIST_HEAD(&cs->sibling);
 	INIT_LIST_HEAD(&cs->children);
-	cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
@@ -1913,7 +1916,7 @@ int __init cpuset_init_early(void)
 	struct task_struct *tsk = current;
 
 	tsk->cpuset = &top_cpuset;
-	tsk->cpuset->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+	tsk->cpuset->mems_generation = cpuset_mems_generation++;
 	return 0;
 }
 
@@ -1932,7 +1935,7 @@ int __init cpuset_init(void)
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
 	fmeter_init(&top_cpuset.fmeter);
-	top_cpuset.mems_generation = atomic_inc_return(&cpuset_mems_generation);
+	top_cpuset.mems_generation = cpuset_mems_generation++;
 
 	init_task.cpuset = &top_cpuset;
 
-- 
cgit v1.2.3


From 29afd49b72a9b2c26fa8c678bcf3976d0540446b Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:12 -0800
Subject: [PATCH] cpuset: remove useless local variable initialization

Remove a useless variable initialization in cpuset __cpuset_zone_allowed().
 The local variable 'allowed' is unconditionally set before use, later on
in the code, so does not need to be initialized.

Not that it seems to matter to the code generated any, as the compiler
optimizes out the superfluous assignment anyway.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 702928664f42..18aea1bd1284 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2205,7 +2205,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
-	int allowed = 1;		/* is allocation in zone z allowed? */
+	int allowed;			/* is allocation in zone z allowed? */
 
 	if (in_interrupt())
 		return 1;
-- 
cgit v1.2.3


From 2ea1c5392cc8ce249fb661db4f4cdfbbae373a89 Mon Sep 17 00:00:00 2001
From: "John Z. Bohach" <jzb@aexorsyst.com>
Date: Fri, 24 Mar 2006 03:18:19 -0800
Subject: [PATCH] console_setup() depends (wrongly?) on CONFIG_PRINTK

It appears that console_setup() code only gets compiled into the kernel if
CONFIG_PRINTK is enabled.  One detrimental side-effect of this is that
serial8250_console_setup() never gets invoked when CONFIG_PRINTK is not
set, resulting in baud rate not being read/parsed from command line (i.e.
console=ttyS0,115200n8 is ignored, at least the baud rate part...)

Attached patch moves console_setup() code from inside

#ifdef CONFIG_PRINTK

to outside (in printk.c), removing dependence on said config. option.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 76 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 13ced0f7828f..8cc19431e74b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -122,44 +122,6 @@ static char *log_buf = __log_buf;
 static int log_buf_len = __LOG_BUF_LEN;
 static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
 
-/*
- *	Setup a list of consoles. Called from init/main.c
- */
-static int __init console_setup(char *str)
-{
-	char name[sizeof(console_cmdline[0].name)];
-	char *s, *options;
-	int idx;
-
-	/*
-	 *	Decode str into name, index, options.
-	 */
-	if (str[0] >= '0' && str[0] <= '9') {
-		strcpy(name, "ttyS");
-		strncpy(name + 4, str, sizeof(name) - 5);
-	} else
-		strncpy(name, str, sizeof(name) - 1);
-	name[sizeof(name) - 1] = 0;
-	if ((options = strchr(str, ',')) != NULL)
-		*(options++) = 0;
-#ifdef __sparc__
-	if (!strcmp(str, "ttya"))
-		strcpy(name, "ttyS0");
-	if (!strcmp(str, "ttyb"))
-		strcpy(name, "ttyS1");
-#endif
-	for (s = name; *s; s++)
-		if ((*s >= '0' && *s <= '9') || *s == ',')
-			break;
-	idx = simple_strtoul(s, NULL, 10);
-	*s = 0;
-
-	add_preferred_console(name, idx, options);
-	return 1;
-}
-
-__setup("console=", console_setup);
-
 static int __init log_buf_len_setup(char *str)
 {
 	unsigned long size = memparse(str, &str);
@@ -659,6 +621,44 @@ static void call_console_drivers(unsigned long start, unsigned long end)
 
 #endif
 
+/*
+ * Set up a list of consoles.  Called from init/main.c
+ */
+static int __init console_setup(char *str)
+{
+	char name[sizeof(console_cmdline[0].name)];
+	char *s, *options;
+	int idx;
+
+	/*
+	 * Decode str into name, index, options.
+	 */
+	if (str[0] >= '0' && str[0] <= '9') {
+		strcpy(name, "ttyS");
+		strncpy(name + 4, str, sizeof(name) - 5);
+	} else {
+		strncpy(name, str, sizeof(name) - 1);
+	}
+	name[sizeof(name) - 1] = 0;
+	if ((options = strchr(str, ',')) != NULL)
+		*(options++) = 0;
+#ifdef __sparc__
+	if (!strcmp(str, "ttya"))
+		strcpy(name, "ttyS0");
+	if (!strcmp(str, "ttyb"))
+		strcpy(name, "ttyS1");
+#endif
+	for (s = name; *s; s++)
+		if ((*s >= '0' && *s <= '9') || *s == ',')
+			break;
+	idx = simple_strtoul(s, NULL, 10);
+	*s = 0;
+
+	add_preferred_console(name, idx, options);
+	return 1;
+}
+__setup("console=", console_setup);
+
 /**
  * add_preferred_console - add a device to the list of preferred consoles.
  * @name: device name
-- 
cgit v1.2.3


From ec9e16bacdba1da1ee15dd162384e22df5c87e09 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:34 -0800
Subject: [PATCH] sys_setrlimit() cleanup

- Whitespace cleanups

- Make that expression comprehensible.

There's a potential logic change here: we do the "is it_prof_expires equal to
zero" test after converting it to seconds, rather than doing the comparison
between raw cputime_t's.

But given that it's in units of seconds anyway, that shouldn't change
anything.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ulrich Weigand <uweigand@de.ibm.com>
Cc: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index c0fcad9f826c..9bdf94f3ae29 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1630,20 +1630,21 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
 asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 {
 	struct rlimit new_rlim, *old_rlim;
+	unsigned long it_prof_secs;
 	int retval;
 
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
-	if(copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+	if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
 		return -EFAULT;
-       if (new_rlim.rlim_cur > new_rlim.rlim_max)
-               return -EINVAL;
+	if (new_rlim.rlim_cur > new_rlim.rlim_max)
+		return -EINVAL;
 	old_rlim = current->signal->rlim + resource;
 	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
 	    !capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
-			return -EPERM;
+		return -EPERM;
 
 	retval = security_task_setrlimit(resource, &new_rlim);
 	if (retval)
@@ -1653,19 +1654,22 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 	*old_rlim = new_rlim;
 	task_unlock(current->group_leader);
 
-	if (resource == RLIMIT_CPU && new_rlim.rlim_cur != RLIM_INFINITY &&
-	    (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
-	     new_rlim.rlim_cur <= cputime_to_secs(
-		     current->signal->it_prof_expires))) {
+	if (resource != RLIMIT_CPU)
+		goto out;
+	if (new_rlim.rlim_cur == RLIM_INFINITY)
+		goto out;
+
+	it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
+	if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
 		cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur);
+
 		read_lock(&tasklist_lock);
 		spin_lock_irq(&current->sighand->siglock);
-		set_process_cpu_timer(current, CPUCLOCK_PROF,
-				      &cputime, NULL);
+		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
 		spin_unlock_irq(&current->sighand->siglock);
 		read_unlock(&tasklist_lock);
 	}
-
+out:
 	return 0;
 }
 
-- 
cgit v1.2.3


From e0661111e5441995f7a69dc4336c9f131cb9bc58 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:35 -0800
Subject: [PATCH] RLIMIT_CPU: fix handling of a zero limit

At present the kernel doesn't honour an attempt to set RLIMIT_CPU to zero
seconds.  But the spec says it should, and that's what 2.4.x does.

Fixing this for real would involve some complexity (such as adding a new
it-has-been-set flag to the task_struct, and testing that everwhere, instead
of overloading the value of it_prof_expires).

Given that a 2.4 kernel won't actually send the signal until one second has
expired anyway, let's just handle this case by treating the caller's
zero-seconds as one second.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ulrich Weigand <uweigand@de.ibm.com>
Cc: Cliff Wickman <cpw@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 9bdf94f3ae29..9e157e0240d4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1661,8 +1661,19 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 
 	it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
 	if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
-		cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur);
+		unsigned long rlim_cur = new_rlim.rlim_cur;
+		cputime_t cputime;
 
+		if (rlim_cur == 0) {
+			/*
+			 * The caller is asking for an immediate RLIMIT_CPU
+			 * expiry.  But we use the zero value to mean "it was
+			 * never set".  So let's cheat and make it one second
+			 * instead
+			 */
+			rlim_cur = 1;
+		}
+		cputime = secs_to_cputime(rlim_cur);
 		read_lock(&tasklist_lock);
 		spin_lock_irq(&current->sighand->siglock);
 		set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
-- 
cgit v1.2.3


From d3561f78fd379a7110e46c87964ba7aa4120235c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:36 -0800
Subject: [PATCH] RLIMIT_CPU: document wrong return value

Document the fact that setrlimit(RLIMIT_CPU) doesn't return error codes when
it should.  I don't think we can fix this without a 2.7.x..

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ulrich Weigand <uweigand@de.ibm.com>
Cc: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 9e157e0240d4..19d058be49d4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1656,6 +1656,13 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 
 	if (resource != RLIMIT_CPU)
 		goto out;
+
+	/*
+	 * RLIMIT_CPU handling.   Note that the kernel fails to return an error
+	 * code if it rejected the user's attempt to set RLIMIT_CPU.  This is a
+	 * very long-standing error, and fixing it now risks breakage of
+	 * applications, so we live with it
+	 */
 	if (new_rlim.rlim_cur == RLIM_INFINITY)
 		goto out;
 
-- 
cgit v1.2.3


From 6a4d11c2abc57ed7ca42041e5f68ae4f7f640a81 Mon Sep 17 00:00:00 2001
From: Sergey Vlasov <vsu@altlinux.ru>
Date: Fri, 24 Mar 2006 03:18:38 -0800
Subject: [PATCH] Fix module refcount leak in __set_personality()

If the change of personality does not lead to change of exec domain,
__set_personality() returned without releasing the module reference
acquired by lookup_exec_domain().

Signed-off-by: Sergey Vlasov <vsu@altlinux.ru>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exec_domain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 867d6dbeb574..c01cead2cfd6 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -140,6 +140,7 @@ __set_personality(u_long personality)
 	ep = lookup_exec_domain(personality);
 	if (ep == current_thread_info()->exec_domain) {
 		current->personality = personality;
+		module_put(ep->module);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 6687a97d4041f996f725902d2990e5de6ef5cbe5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 24 Mar 2006 03:18:41 -0800
Subject: [PATCH] timer-irq-driven soft-watchdog, cleanups

Make the softlockup detector purely timer-interrupt driven, removing
softirq-context (timer) dependencies.  This means that if the softlockup
watchdog triggers, it has truly observed a longer than 10 seconds
scheduling delay of a SCHED_FIFO prio 99 task.

(the patch also turns off the softlockup detector during the initial bootup
phase and does small style fixes)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  4 ++--
 kernel/softlockup.c   | 54 ++++++++++++++++++++++++++++-----------------------
 kernel/timer.c        |  2 +-
 3 files changed, 33 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2cda439ece43..e0054c1b9a09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -206,11 +206,11 @@ extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
-extern void softlockup_tick(struct pt_regs *regs);
+extern void softlockup_tick(void);
 extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 #else
-static inline void softlockup_tick(struct pt_regs *regs)
+static inline void softlockup_tick(void)
 {
 }
 static inline void spawn_softlockup_task(void)
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c67189a25d52..dd9524fa649a 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -1,12 +1,11 @@
 /*
  * Detect Soft Lockups
  *
- * started by Ingo Molnar, (C) 2005, Red Hat
+ * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc.
  *
  * this code detects soft lockups: incidents in where on a CPU
  * the kernel does not reschedule for 10 seconds or more.
  */
-
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/init.h>
@@ -17,13 +16,14 @@
 
 static DEFINE_SPINLOCK(print_lock);
 
-static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
-static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, touch_timestamp);
+static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
 static int did_panic = 0;
-static int softlock_panic(struct notifier_block *this, unsigned long event,
-				void *ptr)
+
+static int
+softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	did_panic = 1;
 
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
 
 void touch_softlockup_watchdog(void)
 {
-	per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+	per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
 }
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
@@ -44,25 +44,35 @@ EXPORT_SYMBOL(touch_softlockup_watchdog);
  * This callback runs from the timer interrupt, and checks
  * whether the watchdog thread has hung or not:
  */
-void softlockup_tick(struct pt_regs *regs)
+void softlockup_tick(void)
 {
 	int this_cpu = smp_processor_id();
-	unsigned long timestamp = per_cpu(timestamp, this_cpu);
+	unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
 
-	if (per_cpu(print_timestamp, this_cpu) == timestamp)
+	/* prevent double reports: */
+	if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
+		did_panic ||
+			!per_cpu(watchdog_task, this_cpu))
 		return;
 
-	/* Do not cause a second panic when there already was one */
-	if (did_panic)
+	/* do not print during early bootup: */
+	if (unlikely(system_state != SYSTEM_RUNNING)) {
+		touch_softlockup_watchdog();
 		return;
+	}
 
-	if (time_after(jiffies, timestamp + 10*HZ)) {
-		per_cpu(print_timestamp, this_cpu) = timestamp;
+	/* Wake up the high-prio watchdog task every second: */
+	if (time_after(jiffies, touch_timestamp + HZ))
+		wake_up_process(per_cpu(watchdog_task, this_cpu));
+
+	/* Warn about unreasonable 10+ seconds delays: */
+	if (time_after(jiffies, touch_timestamp + 10*HZ)) {
+		per_cpu(print_timestamp, this_cpu) = touch_timestamp;
 
 		spin_lock(&print_lock);
 		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
 			this_cpu);
-		show_regs(regs);
+		dump_stack();
 		spin_unlock(&print_lock);
 	}
 }
@@ -77,18 +87,16 @@ static int watchdog(void * __bind_cpu)
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	current->flags |= PF_NOFREEZE;
 
-	set_current_state(TASK_INTERRUPTIBLE);
-
 	/*
-	 * Run briefly once per second - if this gets delayed for
-	 * more than 10 seconds then the debug-printout triggers
-	 * in softlockup_tick():
+	 * Run briefly once per second to reset the softlockup timestamp.
+	 * If this gets delayed for more than 10 seconds then the
+	 * debug-printout triggers in softlockup_tick().
 	 */
 	while (!kthread_should_stop()) {
-		msleep_interruptible(1000);
+		set_current_state(TASK_INTERRUPTIBLE);
 		touch_softlockup_watchdog();
+		schedule();
 	}
-	__set_current_state(TASK_RUNNING);
 
 	return 0;
 }
@@ -114,7 +122,6 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		kthread_bind(p, hotcpu);
  		break;
 	case CPU_ONLINE:
-
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
@@ -146,4 +153,3 @@ __init void spawn_softlockup_task(void)
 
 	notifier_chain_register(&panic_notifier_list, &panic_block);
 }
-
diff --git a/kernel/timer.c b/kernel/timer.c
index 4427e725ccdd..17d956cebcb9 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -915,6 +915,7 @@ static void run_timer_softirq(struct softirq_action *h)
 void run_local_timers(void)
 {
 	raise_softirq(TIMER_SOFTIRQ);
+	softlockup_tick();
 }
 
 /*
@@ -945,7 +946,6 @@ void do_timer(struct pt_regs *regs)
 	/* prevent loading jiffies before storing new jiffies_64 value. */
 	barrier();
 	update_times();
-	softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
cgit v1.2.3


From 24277dda3a54aa5e6265487e1a3091e27f3c0c45 Mon Sep 17 00:00:00 2001
From: Davi Arnaut <davi.arnaut@gmail.com>
Date: Fri, 24 Mar 2006 03:18:43 -0800
Subject: [PATCH] strndup_user: convert module

Change hand-coded userspace string copying to strndup_user.

Signed-off-by: Davi Arnaut <davi.arnaut@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index fb404299082e..54623c714bba 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1572,7 +1572,6 @@ static struct module *load_module(void __user *umod,
 		exportindex, modindex, obsparmindex, infoindex, gplindex,
 		crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
 		gplfuturecrcindex;
-	long arglen;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1691,23 +1690,11 @@ static struct module *load_module(void __user *umod,
 	}
 
 	/* Now copy in args */
-	arglen = strlen_user(uargs);
-	if (!arglen) {
-		err = -EFAULT;
-		goto free_hdr;
-	}
-	args = kmalloc(arglen, GFP_KERNEL);
-	if (!args) {
-		err = -ENOMEM;
+	args = strndup_user(uargs, ~0UL >> 1);
+	if (IS_ERR(args)) {
+		err = PTR_ERR(args);
 		goto free_hdr;
 	}
-	if (copy_from_user(args, uargs, arglen) != 0) {
-		err = -EFAULT;
-		goto free_mod;
-	}
-
-	/* Userspace could have altered the string after the strlen_user() */
-	args[arglen - 1] = '\0';
 
 	if (find_module(mod->name)) {
 		err = -EEXIST;
-- 
cgit v1.2.3


From f125b56113be4956867cc9bd098bb99b1b9bb93f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:44 -0800
Subject: [PATCH] fix build error if CONFIG_SYSFS=n

uevent_seqnum and uevent_helper are only defined if CONFIG_HOTPLUG=y,
CONFIG_NET=n.

(I stole this back from Greg's tree - it makes allnoconfig work).

Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ksysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f2690ed74530..f119e098e67b 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -22,7 +22,7 @@ static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
 static struct subsys_attribute _name##_attr = \
 	__ATTR(_name, 0644, _name##_show, _name##_store)
 
-#ifdef CONFIG_HOTPLUG
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 /* current uevent sequence number */
 static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
 {
@@ -52,7 +52,7 @@ decl_subsys(kernel, NULL, NULL);
 EXPORT_SYMBOL_GPL(kernel_subsys);
 
 static struct attribute * kernel_attrs[] = {
-#ifdef CONFIG_HOTPLUG
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 	&uevent_seqnum_attr.attr,
 	&uevent_helper_attr.attr,
 #endif
-- 
cgit v1.2.3


From 6978c7052f2e22c6c40781cdd4eba5c4bce9a789 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Fri, 24 Mar 2006 18:45:21 +0100
Subject: BUG_ON() Conversion in kernel/cpu.c

this changes if() BUG(); constructs to BUG_ON() which is
cleaner, contains unlikely() and can better optimized away.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 kernel/cpu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index e882c6babf41..8be22bd80933 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -223,8 +223,7 @@ int __devinit cpu_up(unsigned int cpu)
 	ret = __cpu_up(cpu);
 	if (ret != 0)
 		goto out_notify;
-	if (!cpu_online(cpu))
-		BUG();
+	BUG_ON(!cpu_online(cpu));
 
 	/* Now call notifier in preparation. */
 	notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
-- 
cgit v1.2.3


From 185ae6d7a32721e9062030a9f2d24ed714fa45df Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:06:32 -0800
Subject: [PATCH] timer irq driven soft watchdog fix

I seem to have lost this hunk in yesterday's patch.  It brings the
coming-online CPU's softlockup timer up to date so we don't get false-positive
tripups during CPU hot-add.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softlockup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index dd9524fa649a..d9b3d5847ed8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -118,6 +118,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			printk("watchdog for %i failed\n", hotcpu);
 			return NOTIFY_BAD;
 		}
+  		per_cpu(touch_timestamp, hotcpu) = jiffies;
   		per_cpu(watchdog_task, hotcpu) = p;
 		kthread_bind(p, hotcpu);
  		break;
-- 
cgit v1.2.3


From c08b8a49100715b20e6f7c997e992428b5e06078 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Mar 2006 03:06:33 -0800
Subject: [PATCH] sys_alarm() unsigned signed conversion fixup

alarm() calls the kernel with an unsigend int timeout in seconds.  The
value is stored in the tv_sec field of a struct timeval to setup the
itimer.  The tv_sec field of struct timeval is of type long, which causes
the tv_sec value to be negative on 32 bit machines if seconds > INT_MAX.

Before the hrtimer merge (pre 2.6.16) such a negative value was converted
to the maximum jiffies timeout by the timeval_to_jiffies conversion.  It's
not clear whether this was intended or just happened to be done by the
timeval_to_jiffies code.

hrtimers expect a timeval in canonical form and treat a negative timeout as
already expired.  This breaks the legitimate usage of alarm() with a
timeout value > INT_MAX seconds.

For 32 bit machines it is therefor necessary to limit the internal seconds
value to avoid API breakage.  Instead of doing this in all implementations
of sys_alarm the duplicated sys_alarm code is moved into a common function
in itimer.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/ia32/sys_ia32.c   | 14 +-------------
 arch/mips/kernel/sysirix.c  | 22 +---------------------
 arch/x86_64/ia32/sys_ia32.c | 16 ++--------------
 include/linux/time.h        |  1 +
 kernel/itimer.c             | 37 +++++++++++++++++++++++++++++++++++++
 kernel/timer.c              | 14 +-------------
 6 files changed, 43 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 70dba1f0e2ee..13e739e4c84d 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -1166,19 +1166,7 @@ put_tv32 (struct compat_timeval __user *o, struct timeval *i)
 asmlinkage unsigned long
 sys32_alarm (unsigned int seconds)
 {
-	struct itimerval it_new, it_old;
-	unsigned int oldalarm;
-
-	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
-	it_new.it_value.tv_sec = seconds;
-	it_new.it_value.tv_usec = 0;
-	do_setitimer(ITIMER_REAL, &it_new, &it_old);
-	oldalarm = it_old.it_value.tv_sec;
-	/* ehhh.. We can't return 0 if we have an alarm pending.. */
-	/* And we'd better return too much than too little anyway */
-	if (it_old.it_value.tv_usec)
-		oldalarm++;
-	return oldalarm;
+	return alarm_setitimer(seconds);
 }
 
 /* Translations due to time_t size differences.  Which affects all
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index 0fc3730a294f..5407b784cd01 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -645,27 +645,7 @@ static inline void getitimer_real(struct itimerval *value)
 
 asmlinkage unsigned int irix_alarm(unsigned int seconds)
 {
-	struct itimerval it_new, it_old;
-	unsigned int oldalarm;
-
-	if (!seconds) {
-		getitimer_real(&it_old);
-		del_timer(&current->real_timer);
-	} else {
-		it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
-		it_new.it_value.tv_sec = seconds;
-		it_new.it_value.tv_usec = 0;
-		do_setitimer(ITIMER_REAL, &it_new, &it_old);
-	}
-	oldalarm = it_old.it_value.tv_sec;
-	/*
-	 * ehhh.. We can't return 0 if we have an alarm pending ...
-	 * And we'd better return too much than too little anyway
-	 */
-	if (it_old.it_value.tv_usec)
-		oldalarm++;
-
-	return oldalarm;
+	return alarm_setitimer(seconds);
 }
 
 asmlinkage int irix_pause(void)
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 2bc55af95419..2b2d029f477c 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -430,24 +430,12 @@ put_tv32(struct compat_timeval __user *o, struct timeval *i)
 	return err; 
 }
 
-extern int do_setitimer(int which, struct itimerval *, struct itimerval *);
+extern unsigned int alarm_setitimer(unsigned int seconds);
 
 asmlinkage long
 sys32_alarm(unsigned int seconds)
 {
-	struct itimerval it_new, it_old;
-	unsigned int oldalarm;
-
-	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
-	it_new.it_value.tv_sec = seconds;
-	it_new.it_value.tv_usec = 0;
-	do_setitimer(ITIMER_REAL, &it_new, &it_old);
-	oldalarm = it_old.it_value.tv_sec;
-	/* ehhh.. We can't return 0 if we have an alarm pending.. */
-	/* And we'd better return too much than too little anyway */
-	if (it_old.it_value.tv_usec)
-		oldalarm++;
-	return oldalarm;
+	return alarm_setitimer(seconds);
 }
 
 /* Translations due to time_t size differences.  Which affects all
diff --git a/include/linux/time.h b/include/linux/time.h
index d9cdba54b789..bf0e785e2e03 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -101,6 +101,7 @@ extern long do_utimes(int dfd, char __user *filename, struct timeval *times);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value,
 			struct itimerval *ovalue);
+extern unsigned int alarm_setitimer(unsigned int seconds);
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 379be2f8c84c..a2dc375927d8 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -226,6 +226,43 @@ again:
 	return 0;
 }
 
+/**
+ * alarm_setitimer - set alarm in seconds
+ *
+ * @seconds:	number of seconds until alarm
+ *		0 disables the alarm
+ *
+ * Returns the remaining time in seconds of a pending timer or 0 when
+ * the timer is not active.
+ *
+ * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid
+ * negative timeval settings which would cause immediate expiry.
+ */
+unsigned int alarm_setitimer(unsigned int seconds)
+{
+	struct itimerval it_new, it_old;
+
+#if BITS_PER_LONG < 64
+	if (seconds > INT_MAX)
+		seconds = INT_MAX;
+#endif
+	it_new.it_value.tv_sec = seconds;
+	it_new.it_value.tv_usec = 0;
+	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+
+	do_setitimer(ITIMER_REAL, &it_new, &it_old);
+
+	/*
+	 * We can't return 0 if we have an alarm pending ...  And we'd
+	 * better return too much than too little anyway
+	 */
+	if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) ||
+	      it_old.it_value.tv_usec >= 500000)
+		it_old.it_value.tv_sec++;
+
+	return it_old.it_value.tv_sec;
+}
+
 asmlinkage long sys_setitimer(int which,
 			      struct itimerval __user *value,
 			      struct itimerval __user *ovalue)
diff --git a/kernel/timer.c b/kernel/timer.c
index 17d956cebcb9..13fa72cac7d8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -956,19 +956,7 @@ void do_timer(struct pt_regs *regs)
  */
 asmlinkage unsigned long sys_alarm(unsigned int seconds)
 {
-	struct itimerval it_new, it_old;
-	unsigned int oldalarm;
-
-	it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
-	it_new.it_value.tv_sec = seconds;
-	it_new.it_value.tv_usec = 0;
-	do_setitimer(ITIMER_REAL, &it_new, &it_old);
-	oldalarm = it_old.it_value.tv_sec;
-	/* ehhh.. We can't return 0 if we have an alarm pending.. */
-	/* And we'd better return too much than too little anyway */
-	if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
-		oldalarm++;
-	return oldalarm;
+	return alarm_setitimer(seconds);
 }
 
 #endif
-- 
cgit v1.2.3


From 7d99b7d634d81bb372e03e4561c80430aa4cfac2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Mar 2006 03:06:35 -0800
Subject: [PATCH] Validate and sanitze itimer timeval from userspace

According to the specification the timevals must be validated and an
errorcode -EINVAL returned in case the timevals are not in canonical form.
This check was never done in Linux.

The pre 2.6.16 code converted invalid timevals silently.  Negative timeouts
were converted by the timeval_to_jiffies conversion to the maximum timeout.

hrtimers and the ktime_t operations expect timevals in canonical form.
Otherwise random results might happen on 32 bits machines due to the
optimized ktime_add/sub operations.  Negative timeouts are treated as
already expired.  This might break applications which work on pre 2.6.16.

To prevent random behaviour and API breakage the timevals are checked and
invalid timevals sanitized in a simliar way as the pre 2.6.16 code did.

Invalid timevals are reported with a per boot limited number of kernel
messages so applications which use this misfeature can be corrected.

After a grace period of one year the sanitizing should be replaced by a
correct validation check.  This is also documented in
Documentation/feature-removal-schedule.txt

The validation and sanitizing is done inside do_setitimer so all callers
(sys_setitimer, compat_sys_setitimer, osf_setitimer) are catched.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/feature-removal-schedule.txt | 12 ++++++
 kernel/itimer.c                            | 66 ++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 21272e4b4a5c..495858b236b6 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -176,6 +176,18 @@ Who:	Richard Knutsson <ricknu-0@student.ltu.se> and Greg Kroah-Hartman <gregkh@s
 
 ---------------------------
 
+What:	Usage of invalid timevals in setitimer
+When:	March 2007
+Why:	POSIX requires to validate timevals in the setitimer call. This
+	was never done by Linux. The invalid (e.g. negative timevals) were
+	silently converted to more or less random timeouts and intervals.
+	Until the removal a per boot limited number of warnings is printed
+	and the timevals are sanitized.
+
+Who:	Thomas Gleixner <tglx@linutronix.de>
+
+---------------------------
+
 What:	I2C interface of the it87 driver
 When:	January 2007
 Why:	The ISA interface is faster and should be always available. The I2C
diff --git a/kernel/itimer.c b/kernel/itimer.c
index a2dc375927d8..680e6b70c872 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -143,6 +143,60 @@ int it_real_fn(void *data)
 	return HRTIMER_NORESTART;
 }
 
+/*
+ * We do not care about correctness. We just sanitize the values so
+ * the ktime_t operations which expect normalized values do not
+ * break. This converts negative values to long timeouts similar to
+ * the code in kernel versions < 2.6.16
+ *
+ * Print a limited number of warning messages when an invalid timeval
+ * is detected.
+ */
+static void fixup_timeval(struct timeval *tv, int interval)
+{
+	static int warnlimit = 10;
+	unsigned long tmp;
+
+	if (warnlimit > 0) {
+		warnlimit--;
+		printk(KERN_WARNING
+		       "setitimer: %s (pid = %d) provided "
+		       "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
+		       current->comm, current->pid,
+		       interval ? "it_interval" : "it_value",
+		       tv->tv_sec, (long) tv->tv_usec);
+	}
+
+	tmp = tv->tv_usec;
+	if (tmp >= USEC_PER_SEC) {
+		tv->tv_usec = tmp % USEC_PER_SEC;
+		tv->tv_sec += tmp / USEC_PER_SEC;
+	}
+
+	tmp = tv->tv_sec;
+	if (tmp > LONG_MAX)
+		tv->tv_sec = LONG_MAX;
+}
+
+/*
+ * Returns true if the timeval is in canonical form
+ */
+#define timeval_valid(t) \
+	(((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
+
+/*
+ * Check for invalid timevals, sanitize them and print a limited
+ * number of warnings.
+ */
+static void check_itimerval(struct itimerval *value) {
+
+	if (unlikely(!timeval_valid(&value->it_value)))
+		fixup_timeval(&value->it_value, 0);
+
+	if (unlikely(!timeval_valid(&value->it_interval)))
+		fixup_timeval(&value->it_interval, 1);
+}
+
 int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 {
 	struct task_struct *tsk = current;
@@ -150,6 +204,18 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 	ktime_t expires;
 	cputime_t cval, cinterval, nval, ninterval;
 
+	/*
+	 * Validate the timevals in value.
+	 *
+	 * Note: Although the spec requires that invalid values shall
+	 * return -EINVAL, we just fixup the value and print a limited
+	 * number of warnings in order not to break users of this
+	 * historical misfeature.
+	 *
+	 * Scheduled for replacement in March 2007
+	 */
+	check_itimerval(value);
+
 	switch (which) {
 	case ITIMER_REAL:
 again:
-- 
cgit v1.2.3


From 8d3b33f67fdc0fb364a1ef6d8fbbea7c2e4e6c98 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 25 Mar 2006 03:07:05 -0800
Subject: [PATCH] Remove MODULE_PARM

MODULE_PARM was actually breaking: recent gcc version optimize them out as
unused.  It's time to replace the last users, which are generally in the
most unloved drivers anyway.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/networking/ray_cs.txt  |   2 +-
 Documentation/sound/oss/Introduction |   2 +-
 Documentation/sound/oss/cs46xx       |  16 +--
 arch/ppc/8xx_io/cs4218_tdm.c         |  10 +-
 drivers/block/ataflop.c              |   4 +-
 drivers/cdrom/cm206.c                |   8 +-
 drivers/cdrom/sbpcd.c                |  10 +-
 drivers/char/istallion.c             |   8 +-
 drivers/char/mxser.c                 |   8 +-
 drivers/char/riscom8.c               |   8 +-
 drivers/isdn/hardware/avm/b1dma.c    |   2 +-
 drivers/isdn/hardware/avm/b1isa.c    |   4 +-
 drivers/isdn/hardware/avm/c4.c       |   2 +-
 drivers/isdn/hardware/avm/t1isa.c    |   6 +-
 drivers/isdn/hysdn/hycapi.c          |   2 +-
 drivers/isdn/hysdn/hysdn_net.c       |   2 +-
 drivers/isdn/isdnloop/isdnloop.c     |   2 +-
 drivers/media/video/zr36120.c        |   8 +-
 drivers/mmc/au1xmmc.c                |   2 +-
 drivers/mtd/maps/pcmciamtd.c         |  14 +--
 drivers/net/atari_bionet.c           |   2 +-
 drivers/net/atari_pamsnet.c          |   2 +-
 drivers/net/atarilance.c             |   2 +-
 drivers/net/cassini.c                |  11 ++-
 drivers/net/chelsio/cxgb2.c          |   2 +-
 drivers/net/fec_8xx/fec_main.c       |   4 +-
 drivers/net/fs_enet/fs_enet-main.c   |   4 +-
 drivers/net/gt96100eth.c             |   4 +-
 drivers/net/hamradio/dmascc.c        |   2 +-
 drivers/net/hamradio/mkiss.c         |   2 +-
 drivers/net/irda/irport.c            |   4 +-
 drivers/net/lasi_82596.c             |   6 +-
 drivers/net/mac89x0.c                |   2 +-
 drivers/net/mace.c                   |   2 +-
 drivers/net/meth.c                   |   2 +-
 drivers/net/ne-h8300.c               |   6 +-
 drivers/net/ni5010.c                 |   4 +-
 drivers/net/sun3lance.c              |   2 +-
 drivers/s390/block/dasd.c            |   1 -
 drivers/s390/block/dasd_devmap.c     |   3 +
 drivers/scsi/arm/cumana_2.c          |   2 +-
 drivers/scsi/arm/eesox.c             |   2 +-
 drivers/scsi/arm/powertec.c          |   2 +-
 drivers/scsi/atari_scsi.c            |  10 +-
 drivers/video/pm3fb.c                |  18 ++--
 include/linux/module.h               |  19 ----
 include/video/pm3fb.h                |   3 -
 init/Kconfig                         |   9 --
 kernel/module.c                      | 183 +++--------------------------------
 kernel/rcutorture.c                  |  10 +-
 sound/oss/au1000.c                   |   2 +-
 sound/oss/au1550_ac97.c              |   2 +-
 sound/oss/dmasound/dmasound_core.c   |  10 +-
 sound/oss/ite8172.c                  |   4 +-
 sound/oss/swarm_cs4297a.c            |   4 +-
 sound/oss/waveartist.c               |   8 +-
 56 files changed, 146 insertions(+), 329 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/networking/ray_cs.txt b/Documentation/networking/ray_cs.txt
index 5427f8c7df95..145d27a52395 100644
--- a/Documentation/networking/ray_cs.txt
+++ b/Documentation/networking/ray_cs.txt
@@ -25,7 +25,7 @@ the essid= string parameter is available via the kernel command line.
 This will change after the method of sorting out parameters for all
 the PCMCIA drivers is agreed upon.  If you must have a built in driver
 with nondefault parameters, they can be edited in
-/usr/src/linux/drivers/net/pcmcia/ray_cs.c.  Searching for MODULE_PARM
+/usr/src/linux/drivers/net/pcmcia/ray_cs.c.  Searching for module_param
 will find them all.
 
 Information on card services is available at:
diff --git a/Documentation/sound/oss/Introduction b/Documentation/sound/oss/Introduction
index 15d4fb975ac0..f04ba6bb7395 100644
--- a/Documentation/sound/oss/Introduction
+++ b/Documentation/sound/oss/Introduction
@@ -69,7 +69,7 @@ are available, for example IRQ, address, DMA.
 
 Warning, the options for different cards sometime use different names 
 for the same or a similar feature (dma1= versus dma16=).  As a last 
-resort, inspect the code (search for MODULE_PARM).
+resort, inspect the code (search for module_param).
 
 Notes:
 
diff --git a/Documentation/sound/oss/cs46xx b/Documentation/sound/oss/cs46xx
index 88d6cf8b39f3..b54432709863 100644
--- a/Documentation/sound/oss/cs46xx
+++ b/Documentation/sound/oss/cs46xx
@@ -88,7 +88,7 @@ parameters.  for a copy email: twoller@crystal.cirrus.com
 
 MODULE_PARMS definitions
 ------------------------
-MODULE_PARM(defaultorder, "i");
+module_param(defaultorder, ulong, 0);
 defaultorder=N
 where N is a value from 1 to 12
 The buffer order determines the size of the dma buffer for the driver.
@@ -98,18 +98,18 @@ to not underrun the dma buffer as easily.  As default, use 32k (order=3)
 rather than 64k as some of the games work more responsively.
 (2^N) * PAGE_SIZE = allocated buffer size
 
-MODULE_PARM(cs_debuglevel, "i");
-MODULE_PARM(cs_debugmask, "i");
+module_param(cs_debuglevel, ulong, 0644);
+module_param(cs_debugmask, ulong, 0644);
 cs_debuglevel=N
 cs_debugmask=0xMMMMMMMM
 where N is a value from 0 (no debug printfs), to 9 (maximum)
 0xMMMMMMMM is a debug mask corresponding to the CS_xxx bits (see driver source).
 
-MODULE_PARM(hercules_egpio_disable, "i");
+module_param(hercules_egpio_disable, ulong, 0);
 hercules_egpio_disable=N
 where N is a 0 (enable egpio), or a 1 (disable egpio support)
 
-MODULE_PARM(initdelay, "i");
+module_param(initdelay, ulong, 0);
 initdelay=N
 This value is used to determine the millescond delay during the initialization
 code prior to powering up the PLL.  On laptops this value can be used to
@@ -118,19 +118,19 @@ system is booted under battery power then the mdelay()/udelay() functions fail t
 properly delay the required time.  Also, if the system is booted under AC power
 and then the power removed, the mdelay()/udelay() functions will not delay properly.
  
-MODULE_PARM(powerdown, "i");
+module_param(powerdown, ulong, 0);
 powerdown=N
 where N is 0 (disable any powerdown of the internal blocks) or 1 (enable powerdown)
 
 
-MODULE_PARM(external_amp, "i");
+module_param(external_amp, bool, 0);
 external_amp=1
 if N is set to 1, then force enabling the EAPD support in the primary AC97 codec.
 override the detection logic and force the external amp bit in the AC97 0x26 register
 to be reset (0).  EAPD should be 0 for powerup, and 1 for powerdown.  The VTB Santa Cruz
 card has inverted logic, so there is a special function for these cards.
 
-MODULE_PARM(thinkpad, "i");
+module_param(thinkpad, bool, 0);
 thinkpad=1
 if N is set to 1, then force enabling the clkrun functionality.
 Currently, when the part is being used, then clkrun is disabled for the entire system,
diff --git a/arch/ppc/8xx_io/cs4218_tdm.c b/arch/ppc/8xx_io/cs4218_tdm.c
index 49eb2a7e65c0..a892356d5c3b 100644
--- a/arch/ppc/8xx_io/cs4218_tdm.c
+++ b/arch/ppc/8xx_io/cs4218_tdm.c
@@ -126,11 +126,11 @@ static int numReadBufs = 4, readbufSize = 32;
 */
 static volatile cbd_t	*rx_base, *rx_cur, *tx_base, *tx_cur;
 
-MODULE_PARM(catchRadius, "i");
-MODULE_PARM(numBufs, "i");
-MODULE_PARM(bufSize, "i");
-MODULE_PARM(numreadBufs, "i");
-MODULE_PARM(readbufSize, "i");
+module_param(catchRadius, int, 0);
+module_param(numBufs, int, 0);
+module_param(bufSize, int, 0);
+module_param(numreadBufs, int, 0);
+module_param(readbufSize, int, 0);
 
 #define arraysize(x)	(sizeof(x)/sizeof(*(x)))
 #define le2be16(x)	(((x)<<8 & 0xff00) | ((x)>>8 & 0x00ff))
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index f8ce235ccfc3..c39650920bdf 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -271,7 +271,7 @@ unsigned char *DMABuffer;			  /* buffer for writes */
 static unsigned long PhysDMABuffer;   /* physical address */
 
 static int UseTrackbuffer = -1;		  /* Do track buffering? */
-MODULE_PARM(UseTrackbuffer, "i");
+module_param(UseTrackbuffer, int, 0);
 
 unsigned char *TrackBuffer;			  /* buffer for reads */
 static unsigned long PhysTrackBuffer; /* physical address */
@@ -296,7 +296,7 @@ static int MotorOn = 0, MotorOffTrys;
 static int IsFormatting = 0, FormatError;
 
 static int UserSteprate[FD_MAX_UNITS] = { -1, -1 };
-MODULE_PARM(UserSteprate, "1-" __MODULE_STRING(FD_MAX_UNITS) "i");
+module_param_array(UserSteprate, int, NULL, 0);
 
 /* Synchronization of FDC access. */
 static volatile int fdc_busy = 0;
diff --git a/drivers/cdrom/cm206.c b/drivers/cdrom/cm206.c
index fad27a87ce35..f43a988dd413 100644
--- a/drivers/cdrom/cm206.c
+++ b/drivers/cdrom/cm206.c
@@ -218,12 +218,12 @@ static int cm206_base = CM206_BASE;
 static int cm206_irq = CM206_IRQ;
 #ifdef MODULE
 static int cm206[2] = { 0, 0 };	/* for compatible `insmod' parameter passing */
+module_param_array(cm206, int, NULL, 0);	/* base,irq or irq,base */
 #endif
 
-MODULE_PARM(cm206_base, "i");	/* base */
-MODULE_PARM(cm206_irq, "i");	/* irq */
-MODULE_PARM(cm206, "1-2i");	/* base,irq or irq,base */
-MODULE_PARM(auto_probe, "i");	/* auto probe base and irq */
+module_param(cm206_base, int, 0);	/* base */
+module_param(cm206_irq, int, 0);	/* irq */
+module_param(auto_probe, bool, 0);	/* auto probe base and irq */
 MODULE_LICENSE("GPL");
 
 #define POLLOOP 100		/* milliseconds */
diff --git a/drivers/cdrom/sbpcd.c b/drivers/cdrom/sbpcd.c
index 4760f515f591..05c9e865ecaf 100644
--- a/drivers/cdrom/sbpcd.c
+++ b/drivers/cdrom/sbpcd.c
@@ -464,8 +464,13 @@ static int sbpcd[] =
 static  __cacheline_aligned DEFINE_SPINLOCK(sbpcd_lock);
 static struct request_queue *sbpcd_queue;
 
-MODULE_PARM(sbpcd, "2i");
-MODULE_PARM(max_drives, "i");
+/* You can only set the first pair, from old MODULE_PARM code.  */
+static int sbpcd_set(const char *val, struct kernel_param *kp)
+{
+	get_options((char *)val, 2, (int *)sbpcd);
+	return 0;
+}
+module_param_call(sbpcd, sbpcd_set, NULL, NULL, 0);
 
 #define NUM_PROBE  (sizeof(sbpcd) / sizeof(int))
 
@@ -553,6 +558,7 @@ static unsigned char msgnum;
 static char msgbuf[80];
 
 static int max_drives = MAX_DRIVES;
+module_param(max_drives, int, 0);
 #ifndef MODULE
 static unsigned char setup_done;
 static const char *str_sb_l = "soundblaster";
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index ede128356af2..e5247f85a446 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -378,13 +378,13 @@ MODULE_DESCRIPTION("Stallion Intelligent Multiport Serial Driver");
 MODULE_LICENSE("GPL");
 
 
-MODULE_PARM(board0, "1-3s");
+module_param_array(board0, charp, NULL, 0);
 MODULE_PARM_DESC(board0, "Board 0 config -> name[,ioaddr[,memaddr]");
-MODULE_PARM(board1, "1-3s");
+module_param_array(board1, charp, NULL, 0);
 MODULE_PARM_DESC(board1, "Board 1 config -> name[,ioaddr[,memaddr]");
-MODULE_PARM(board2, "1-3s");
+module_param_array(board2, charp, NULL, 0);
 MODULE_PARM_DESC(board2, "Board 2 config -> name[,ioaddr[,memaddr]");
-MODULE_PARM(board3, "1-3s");
+module_param_array(board3, charp, NULL, 0);
 MODULE_PARM_DESC(board3, "Board 3 config -> name[,ioaddr[,memaddr]");
 
 #endif
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index ea725a9964e2..0fb2fb9fb024 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -243,10 +243,10 @@ static int verbose = 0;
 
 MODULE_AUTHOR("Casper Yang");
 MODULE_DESCRIPTION("MOXA Smartio/Industio Family Multiport Board Device Driver");
-MODULE_PARM(ioaddr, "1-4i");
-MODULE_PARM(ttymajor, "i");
-MODULE_PARM(calloutmajor, "i");
-MODULE_PARM(verbose, "i");
+module_param_array(ioaddr, int, NULL, 0);
+module_param(ttymajor, int, 0);
+module_param(calloutmajor, int, 0);
+module_param(verbose, bool, 0);
 MODULE_LICENSE("GPL");
 
 struct mxser_log {
diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c
index 119e629656b7..657c0d88f48c 100644
--- a/drivers/char/riscom8.c
+++ b/drivers/char/riscom8.c
@@ -1743,10 +1743,10 @@ static int iobase;
 static int iobase1;
 static int iobase2;
 static int iobase3;
-MODULE_PARM(iobase, "i");
-MODULE_PARM(iobase1, "i");
-MODULE_PARM(iobase2, "i");
-MODULE_PARM(iobase3, "i");
+module_param(iobase, int, 0);
+module_param(iobase1, int, 0);
+module_param(iobase2, int, 0);
+module_param(iobase3, int, 0);
 
 MODULE_LICENSE("GPL");
 #endif /* MODULE */
diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c
index 91dd0551fc7c..4d64e5cbcdbf 100644
--- a/drivers/isdn/hardware/avm/b1dma.c
+++ b/drivers/isdn/hardware/avm/b1dma.c
@@ -39,7 +39,7 @@ MODULE_AUTHOR("Carsten Paeth");
 MODULE_LICENSE("GPL");
 
 static int suppress_pollack = 0;
-MODULE_PARM(suppress_pollack, "0-1i");
+module_param(suppress_pollack, bool, 0);
 
 /* ------------------------------------------------------------- */
 
diff --git a/drivers/isdn/hardware/avm/b1isa.c b/drivers/isdn/hardware/avm/b1isa.c
index 38bd4dfecbd1..80fb488848b8 100644
--- a/drivers/isdn/hardware/avm/b1isa.c
+++ b/drivers/isdn/hardware/avm/b1isa.c
@@ -169,8 +169,8 @@ static struct pci_dev isa_dev[MAX_CARDS];
 static int io[MAX_CARDS];
 static int irq[MAX_CARDS];
 
-MODULE_PARM(io, "1-" __MODULE_STRING(MAX_CARDS) "i");
-MODULE_PARM(irq, "1-" __MODULE_STRING(MAX_CARDS) "i");
+module_param_array(io, int, NULL, 0);
+module_param_array(irq, int, NULL, 0);
 MODULE_PARM_DESC(io, "I/O base address(es)");
 MODULE_PARM_DESC(irq, "IRQ number(s) (assigned)");
 
diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c
index 724aac2c1cca..f7253b2136ea 100644
--- a/drivers/isdn/hardware/avm/c4.c
+++ b/drivers/isdn/hardware/avm/c4.c
@@ -50,7 +50,7 @@ MODULE_DEVICE_TABLE(pci, c4_pci_tbl);
 MODULE_DESCRIPTION("CAPI4Linux: Driver for AVM C2/C4 cards");
 MODULE_AUTHOR("Carsten Paeth");
 MODULE_LICENSE("GPL");
-MODULE_PARM(suppress_pollack, "0-1i");
+module_param(suppress_pollack, bool, 0);
 
 /* ------------------------------------------------------------- */
 
diff --git a/drivers/isdn/hardware/avm/t1isa.c b/drivers/isdn/hardware/avm/t1isa.c
index 3b701d97bdf1..5a2f854d55b5 100644
--- a/drivers/isdn/hardware/avm/t1isa.c
+++ b/drivers/isdn/hardware/avm/t1isa.c
@@ -519,9 +519,9 @@ static int io[MAX_CARDS];
 static int irq[MAX_CARDS];
 static int cardnr[MAX_CARDS];
 
-MODULE_PARM(io, "1-" __MODULE_STRING(MAX_CARDS) "i");
-MODULE_PARM(irq, "1-" __MODULE_STRING(MAX_CARDS) "i");
-MODULE_PARM(cardnr, "1-" __MODULE_STRING(MAX_CARDS) "i");
+module_param_array(io, int, NULL, 0);
+module_param_array(irq, int, NULL, 0);
+module_param_array(cardnr, int, NULL, 0);
 MODULE_PARM_DESC(io, "I/O base address(es)");
 MODULE_PARM_DESC(irq, "IRQ number(s) (assigned)");
 MODULE_PARM_DESC(cardnr, "Card number(s) (as jumpered)");
diff --git a/drivers/isdn/hysdn/hycapi.c b/drivers/isdn/hysdn/hycapi.c
index 55fbea0b6ac8..6bac43cc91bd 100644
--- a/drivers/isdn/hysdn/hycapi.c
+++ b/drivers/isdn/hysdn/hycapi.c
@@ -31,7 +31,7 @@
 static char hycapi_revision[]="$Revision: 1.8.6.4 $";
 
 unsigned int hycapi_enable = 0xffffffff; 
-MODULE_PARM(hycapi_enable, "i");
+module_param(hycapi_enable, uint, 0);
 
 typedef struct _hycapi_appl {
 	unsigned int ctrl_mask;
diff --git a/drivers/isdn/hysdn/hysdn_net.c b/drivers/isdn/hysdn/hysdn_net.c
index 683647b1f2bc..d205249a1242 100644
--- a/drivers/isdn/hysdn/hysdn_net.c
+++ b/drivers/isdn/hysdn/hysdn_net.c
@@ -24,7 +24,7 @@
 #include "hysdn_defs.h"
 
 unsigned int hynet_enable = 0xffffffff; 
-MODULE_PARM(hynet_enable, "i");
+module_param(hynet_enable, uint, 0);
 
 /* store the actual version for log reporting */
 char *hysdn_net_revision = "$Revision: 1.8.6.4 $";
diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c
index 33d339700411..a67d31af797a 100644
--- a/drivers/isdn/isdnloop/isdnloop.c
+++ b/drivers/isdn/isdnloop/isdnloop.c
@@ -22,7 +22,7 @@ static char *isdnloop_id = "loop0";
 MODULE_DESCRIPTION("ISDN4Linux: Pseudo Driver that simulates an ISDN card");
 MODULE_AUTHOR("Fritz Elfert");
 MODULE_LICENSE("GPL");
-MODULE_PARM(isdnloop_id, "s");
+module_param(isdnloop_id, charp, 0);
 MODULE_PARM_DESC(isdnloop_id, "ID-String of first card");
 
 static int isdnloop_addcard(char *);
diff --git a/drivers/media/video/zr36120.c b/drivers/media/video/zr36120.c
index d4c633b8a7f5..417a999afee3 100644
--- a/drivers/media/video/zr36120.c
+++ b/drivers/media/video/zr36120.c
@@ -70,10 +70,10 @@ MODULE_AUTHOR("Pauline Middelink <middelin@polyware.nl>");
 MODULE_DESCRIPTION("Zoran ZR36120 based framegrabber");
 MODULE_LICENSE("GPL");
 
-MODULE_PARM(triton1,"i");
-MODULE_PARM(cardtype,"1-" __MODULE_STRING(ZORAN_MAX) "i");
-MODULE_PARM(video_nr,"i");
-MODULE_PARM(vbi_nr,"i");
+module_param(triton1, uint, 0);
+module_param_array(cardtype, uint, NULL, 0);
+module_param(video_nr, int, 0);
+module_param(vbi_nr, int, 0);
 
 static int zoran_cards;
 static struct zoran zorans[ZORAN_MAX];
diff --git a/drivers/mmc/au1xmmc.c b/drivers/mmc/au1xmmc.c
index 8d84b045bc83..85e89c77bdea 100644
--- a/drivers/mmc/au1xmmc.c
+++ b/drivers/mmc/au1xmmc.c
@@ -87,7 +87,7 @@ struct au1xmmc_host *au1xmmc_hosts[AU1XMMC_CONTROLLER_COUNT];
 static int dma = 1;
 
 #ifdef MODULE
-MODULE_PARM(dma, "i");
+module_param(dma, bool, 0);
 MODULE_PARM_DESC(dma, "Use DMA engine for data transfers (0 = disabled)");
 #endif
 
diff --git a/drivers/mtd/maps/pcmciamtd.c b/drivers/mtd/maps/pcmciamtd.c
index f0f8916da7ad..f988c817e196 100644
--- a/drivers/mtd/maps/pcmciamtd.c
+++ b/drivers/mtd/maps/pcmciamtd.c
@@ -28,7 +28,7 @@
 
 #ifdef CONFIG_MTD_DEBUG
 static int debug = CONFIG_MTD_DEBUG_VERBOSE;
-MODULE_PARM(debug, "i");
+module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "Set Debug Level 0=quiet, 5=noisy");
 #undef DEBUG
 #define DEBUG(n, format, arg...) \
@@ -89,17 +89,17 @@ static int mem_type;
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Simon Evans <spse@secret.org.uk>");
 MODULE_DESCRIPTION(DRIVER_DESC);
-MODULE_PARM(bankwidth, "i");
+module_param(bankwidth, int, 0);
 MODULE_PARM_DESC(bankwidth, "Set bankwidth (1=8 bit, 2=16 bit, default=2)");
-MODULE_PARM(mem_speed, "i");
+module_param(mem_speed, int, 0);
 MODULE_PARM_DESC(mem_speed, "Set memory access speed in ns");
-MODULE_PARM(force_size, "i");
+module_param(force_size, int, 0);
 MODULE_PARM_DESC(force_size, "Force size of card in MiB (1-64)");
-MODULE_PARM(setvpp, "i");
+module_param(setvpp, int, 0);
 MODULE_PARM_DESC(setvpp, "Set Vpp (0=Never, 1=On writes, 2=Always on, default=0)");
-MODULE_PARM(vpp, "i");
+module_param(vpp, int, 0);
 MODULE_PARM_DESC(vpp, "Vpp value in 1/10ths eg 33=3.3V 120=12V (Dangerous)");
-MODULE_PARM(mem_type, "i");
+module_param(mem_type, int, 0);
 MODULE_PARM_DESC(mem_type, "Set Memory type (0=Flash, 1=RAM, 2=ROM, default=0)");
 
 
diff --git a/drivers/net/atari_bionet.c b/drivers/net/atari_bionet.c
index 0095384ff454..5e5f80b99b9e 100644
--- a/drivers/net/atari_bionet.c
+++ b/drivers/net/atari_bionet.c
@@ -123,7 +123,7 @@ static char version[] =
  * Global variable 'bionet_debug'. Can be set at load time by 'insmod'
  */
 unsigned int bionet_debug = NET_DEBUG;
-MODULE_PARM(bionet_debug, "i");
+module_param(bionet_debug, int, 0);
 MODULE_PARM_DESC(bionet_debug, "bionet debug level (0-2)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/atari_pamsnet.c b/drivers/net/atari_pamsnet.c
index 8b997809f9de..d6039e62d832 100644
--- a/drivers/net/atari_pamsnet.c
+++ b/drivers/net/atari_pamsnet.c
@@ -119,7 +119,7 @@ static char *version =
  * Global variable 'pamsnet_debug'. Can be set at load time by 'insmod'
  */
 unsigned int pamsnet_debug = NET_DEBUG;
-MODULE_PARM(pamsnet_debug, "i");
+module_param(pamsnet_debug, int, 0);
 MODULE_PARM_DESC(pamsnet_debug, "pamsnet debug enable (0-1)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/atarilance.c b/drivers/net/atarilance.c
index e01b6a78ec63..442b2cbeb58a 100644
--- a/drivers/net/atarilance.c
+++ b/drivers/net/atarilance.c
@@ -78,7 +78,7 @@ static int lance_debug = LANCE_DEBUG;
 #else
 static int lance_debug = 1;
 #endif
-MODULE_PARM(lance_debug, "i");
+module_param(lance_debug, int, 0);
 MODULE_PARM_DESC(lance_debug, "atarilance debug level (0-3)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/cassini.c b/drivers/net/cassini.c
index 8f1573e658a5..ac48f7543500 100644
--- a/drivers/net/cassini.c
+++ b/drivers/net/cassini.c
@@ -192,12 +192,15 @@
 static char version[] __devinitdata =
 	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
 
+static int cassini_debug = -1;	/* -1 == use CAS_DEF_MSG_ENABLE as value */
+static int link_mode;
+
 MODULE_AUTHOR("Adrian Sun (asun@darksunrising.com)");
 MODULE_DESCRIPTION("Sun Cassini(+) ethernet driver");
 MODULE_LICENSE("GPL");
-MODULE_PARM(cassini_debug, "i");
+module_param(cassini_debug, int, 0);
 MODULE_PARM_DESC(cassini_debug, "Cassini bitmapped debugging message enable value");
-MODULE_PARM(link_mode, "i");
+module_param(link_mode, int, 0);
 MODULE_PARM_DESC(link_mode, "default link mode");
 
 /*
@@ -209,7 +212,7 @@ MODULE_PARM_DESC(link_mode, "default link mode");
  * Value in seconds, for user input.
  */
 static int linkdown_timeout = DEFAULT_LINKDOWN_TIMEOUT;
-MODULE_PARM(linkdown_timeout, "i");
+module_param(linkdown_timeout, int, 0);
 MODULE_PARM_DESC(linkdown_timeout,
 "min reset interval in sec. for PCS linkdown issue; disabled if not positive");
 
@@ -221,8 +224,6 @@ MODULE_PARM_DESC(linkdown_timeout,
 static int link_transition_timeout;
 
 
-static int cassini_debug = -1;	/* -1 == use CAS_DEF_MSG_ENABLE as value */
-static int link_mode;
 
 static u16 link_modes[] __devinitdata = {
 	BMCR_ANENABLE,			 /* 0 : autoneg */
diff --git a/drivers/net/chelsio/cxgb2.c b/drivers/net/chelsio/cxgb2.c
index 349ebe783ed6..7fe2638ae06d 100644
--- a/drivers/net/chelsio/cxgb2.c
+++ b/drivers/net/chelsio/cxgb2.c
@@ -124,7 +124,7 @@ MODULE_LICENSE("GPL");
 
 static int dflt_msg_enable = DFLT_MSG_ENABLE;
 
-MODULE_PARM(dflt_msg_enable, "i");
+module_param(dflt_msg_enable, int, 0);
 MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T1 message enable bitmap");
 
 
diff --git a/drivers/net/fec_8xx/fec_main.c b/drivers/net/fec_8xx/fec_main.c
index b4f3a9f8a535..7e4338097139 100644
--- a/drivers/net/fec_8xx/fec_main.c
+++ b/drivers/net/fec_8xx/fec_main.c
@@ -55,11 +55,11 @@ MODULE_AUTHOR("Pantelis Antoniou <panto@intracom.gr>");
 MODULE_DESCRIPTION("Motorola 8xx FEC ethernet driver");
 MODULE_LICENSE("GPL");
 
-MODULE_PARM(fec_8xx_debug, "i");
+int fec_8xx_debug = -1;		/* -1 == use FEC_8XX_DEF_MSG_ENABLE as value */
+module_param(fec_8xx_debug, int, 0);
 MODULE_PARM_DESC(fec_8xx_debug,
 		 "FEC 8xx bitmapped debugging message enable value");
 
-int fec_8xx_debug = -1;		/* -1 == use FEC_8XX_DEF_MSG_ENABLE as value */
 
 /*************************************************/
 
diff --git a/drivers/net/fs_enet/fs_enet-main.c b/drivers/net/fs_enet/fs_enet-main.c
index f5d49a110654..196298f33db8 100644
--- a/drivers/net/fs_enet/fs_enet-main.c
+++ b/drivers/net/fs_enet/fs_enet-main.c
@@ -58,11 +58,11 @@ MODULE_DESCRIPTION("Freescale Ethernet Driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_MODULE_VERSION);
 
-MODULE_PARM(fs_enet_debug, "i");
+int fs_enet_debug = -1;		/* -1 == use FS_ENET_DEF_MSG_ENABLE as value */
+module_param(fs_enet_debug, int, 0);
 MODULE_PARM_DESC(fs_enet_debug,
 		 "Freescale bitmapped debugging message enable value");
 
-int fs_enet_debug = -1;		/* -1 == use FS_ENET_DEF_MSG_ENABLE as value */
 
 static void fs_set_multicast_list(struct net_device *dev)
 {
diff --git a/drivers/net/gt96100eth.c b/drivers/net/gt96100eth.c
index 5958a6314723..2d2435404614 100644
--- a/drivers/net/gt96100eth.c
+++ b/drivers/net/gt96100eth.c
@@ -114,8 +114,8 @@ static int max_interrupt_work = 32;
 
 static char mac0[18] = "00.02.03.04.05.06";
 static char mac1[18] = "00.01.02.03.04.05";
-MODULE_PARM(mac0, "c18");
-MODULE_PARM(mac1, "c18");
+module_param_string(mac0, mac0, 18, 0);
+module_param_string(mac1, mac0, 18, 0);
 MODULE_PARM_DESC(mac0, "MAC address for GT96100 ethernet port 0");
 MODULE_PARM_DESC(mac1, "MAC address for GT96100 ethernet port 1");
 
diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c
index c8dc40214a08..79a8fbcf5f93 100644
--- a/drivers/net/hamradio/dmascc.c
+++ b/drivers/net/hamradio/dmascc.c
@@ -280,7 +280,7 @@ static unsigned long rand;
 
 MODULE_AUTHOR("Klaus Kudielka");
 MODULE_DESCRIPTION("Driver for high-speed SCC boards");
-MODULE_PARM(io, "1-" __MODULE_STRING(MAX_NUM_DEVS) "i");
+module_param_array(io, int, NULL, 0);
 MODULE_LICENSE("GPL");
 
 static void __exit dmascc_exit(void)
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index dc5e9d59deed..d81a8e1eeb8d 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -1012,7 +1012,7 @@ static void __exit mkiss_exit_driver(void)
 
 MODULE_AUTHOR("Ralf Baechle DL5RB <ralf@linux-mips.org>");
 MODULE_DESCRIPTION("KISS driver for AX.25 over TTYs");
-MODULE_PARM(crc_force, "i");
+module_param(crc_force, int, 0);
 MODULE_PARM_DESC(crc_force, "crc [0 = auto | 1 = none | 2 = flexnet | 3 = smack]");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_LDISC(N_AX25);
diff --git a/drivers/net/irda/irport.c b/drivers/net/irda/irport.c
index 6070195b87bd..98fa5319e5cc 100644
--- a/drivers/net/irda/irport.c
+++ b/drivers/net/irda/irport.c
@@ -1118,9 +1118,9 @@ static void __exit irport_cleanup(void)
  	}
 }
 
-MODULE_PARM(io, "1-4i");
+module_param_array(io, int, NULL, 0);
 MODULE_PARM_DESC(io, "Base I/O addresses");
-MODULE_PARM(irq, "1-4i");
+module_param_array(irq, int, NULL, 0);
 MODULE_PARM_DESC(irq, "IRQ lines");
 
 MODULE_AUTHOR("Dag Brattli <dagb@cs.uit.no>");
diff --git a/drivers/net/lasi_82596.c b/drivers/net/lasi_82596.c
index f7b7238d8352..957888de3d7e 100644
--- a/drivers/net/lasi_82596.c
+++ b/drivers/net/lasi_82596.c
@@ -177,7 +177,7 @@ static int i596_debug = (DEB_SERIOUS|DEB_PROBE);
 MODULE_AUTHOR("Richard Hirst");
 MODULE_DESCRIPTION("i82596 driver");
 MODULE_LICENSE("GPL");
-MODULE_PARM(i596_debug, "i");
+module_param(i596_debug, int, 0);
 MODULE_PARM_DESC(i596_debug, "lasi_82596 debug mask");
 
 /* Copy frames shorter than rx_copybreak, otherwise pass on up in
@@ -1520,9 +1520,9 @@ static void set_multicast_list(struct net_device *dev)
 	}
 }
 
-MODULE_PARM(debug, "i");
-MODULE_PARM_DESC(debug, "lasi_82596 debug mask");
 static int debug = -1;
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "lasi_82596 debug mask");
 
 static int num_drivers;
 static struct net_device *netdevs[MAX_DRIVERS];
diff --git a/drivers/net/mac89x0.c b/drivers/net/mac89x0.c
index f65b0db111b8..cd3c9a5a98b2 100644
--- a/drivers/net/mac89x0.c
+++ b/drivers/net/mac89x0.c
@@ -629,7 +629,7 @@ static int set_mac_address(struct net_device *dev, void *addr)
 static struct net_device *dev_cs89x0;
 static int debug;
 
-MODULE_PARM(debug, "i");
+module_param(debug, int, 0);
 MODULE_PARM_DESC(debug, "CS89[02]0 debug level (0-5)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/mace.c b/drivers/net/mace.c
index 2a5add257b8f..77792b286027 100644
--- a/drivers/net/mace.c
+++ b/drivers/net/mace.c
@@ -1042,7 +1042,7 @@ static void __exit mace_cleanup(void)
 
 MODULE_AUTHOR("Paul Mackerras");
 MODULE_DESCRIPTION("PowerMac MACE driver.");
-MODULE_PARM(port_aaui, "i");
+module_param(port_aaui, int, 0);
 MODULE_PARM_DESC(port_aaui, "MACE uses AAUI port (0-1)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/net/meth.c b/drivers/net/meth.c
index e23655f5049f..d644bf3a933c 100644
--- a/drivers/net/meth.c
+++ b/drivers/net/meth.c
@@ -62,7 +62,7 @@ MODULE_DESCRIPTION("SGI O2 Builtin Fast Ethernet driver");
 
 #ifdef HAVE_TX_TIMEOUT
 static int timeout = TX_TIMEOUT;
-MODULE_PARM(timeout, "i");
+module_param(timeout, int, 0);
 #endif
 
 /*
diff --git a/drivers/net/ne-h8300.c b/drivers/net/ne-h8300.c
index aaebd28a1920..7ea3d596ac3b 100644
--- a/drivers/net/ne-h8300.c
+++ b/drivers/net/ne-h8300.c
@@ -601,9 +601,9 @@ static int io[MAX_NE_CARDS];
 static int irq[MAX_NE_CARDS];
 static int bad[MAX_NE_CARDS];	/* 0xbad = bad sig or no reset ack */
 
-MODULE_PARM(io, "1-" __MODULE_STRING(MAX_NE_CARDS) "i");
-MODULE_PARM(irq, "1-" __MODULE_STRING(MAX_NE_CARDS) "i");
-MODULE_PARM(bad, "1-" __MODULE_STRING(MAX_NE_CARDS) "i");
+module_param_array(io, int, NULL, 0);
+module_param_array(irq, int, NULL, 0);
+module_param_array(bad, int, NULL, 0);
 MODULE_PARM_DESC(io, "I/O base address(es)");
 MODULE_PARM_DESC(irq, "IRQ number(s)");
 MODULE_DESCRIPTION("H8/300 NE2000 Ethernet driver");
diff --git a/drivers/net/ni5010.c b/drivers/net/ni5010.c
index 2ab01a5d1d22..a68bf474f6ed 100644
--- a/drivers/net/ni5010.c
+++ b/drivers/net/ni5010.c
@@ -766,8 +766,8 @@ static void ni5010_show_registers(struct net_device *dev)
 #ifdef MODULE
 static struct net_device *dev_ni5010;
 
-MODULE_PARM(io, "i");
-MODULE_PARM(irq, "i");
+module_param(io, int, 0);
+module_param(irq, int, 0);
 MODULE_PARM_DESC(io, "ni5010 I/O base address");
 MODULE_PARM_DESC(irq, "ni5010 IRQ number");
 
diff --git a/drivers/net/sun3lance.c b/drivers/net/sun3lance.c
index 01bdb2334058..d4c0002b43db 100644
--- a/drivers/net/sun3lance.c
+++ b/drivers/net/sun3lance.c
@@ -71,7 +71,7 @@ static int lance_debug = LANCE_DEBUG;
 #else
 static int lance_debug = 1;
 #endif
-MODULE_PARM(lance_debug, "i");
+module_param(lance_debug, int, 0);
 MODULE_PARM_DESC(lance_debug, "SUN3 Lance debug level (0-3)");
 MODULE_LICENSE("GPL");
 
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index dfe542b206cc..7967916bda18 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -43,7 +43,6 @@ MODULE_AUTHOR("Holger Smolinski <Holger.Smolinski@de.ibm.com>");
 MODULE_DESCRIPTION("Linux on S/390 DASD device driver,"
 		   " Copyright 2000 IBM Corporation");
 MODULE_SUPPORTED_DEVICE("dasd");
-MODULE_PARM(dasd, "1-" __MODULE_STRING(256) "s");
 MODULE_LICENSE("GPL");
 
 /*
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index f576f243cd93..2f720108a7e0 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -16,6 +16,7 @@
 #include <linux/config.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
+#include <linux/module.h>
 
 #include <asm/debug.h>
 #include <asm/uaccess.h>
@@ -69,6 +70,8 @@ int dasd_autodetect = 0;	/* is true, when autodetection is active */
  * strings when running as a module.
  */
 static char *dasd[256];
+module_param_array(dasd, charp, NULL, 0);
+
 /*
  * Single spinlock to protect devmap structures and lists.
  */
diff --git a/drivers/scsi/arm/cumana_2.c b/drivers/scsi/arm/cumana_2.c
index 583d2d8c8335..fad2109268bb 100644
--- a/drivers/scsi/arm/cumana_2.c
+++ b/drivers/scsi/arm/cumana_2.c
@@ -550,6 +550,6 @@ module_exit(cumanascsi2_exit);
 
 MODULE_AUTHOR("Russell King");
 MODULE_DESCRIPTION("Cumana SCSI-2 driver for Acorn machines");
-MODULE_PARM(term, "1-8i");
+module_param_array(term, int, NULL, 0);
 MODULE_PARM_DESC(term, "SCSI bus termination");
 MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/arm/eesox.c b/drivers/scsi/arm/eesox.c
index 3ffec7efc9d5..dcbb4b2b3fe0 100644
--- a/drivers/scsi/arm/eesox.c
+++ b/drivers/scsi/arm/eesox.c
@@ -674,6 +674,6 @@ module_exit(eesox_exit);
 
 MODULE_AUTHOR("Russell King");
 MODULE_DESCRIPTION("EESOX 'Fast' SCSI driver for Acorn machines");
-MODULE_PARM(term, "1-8i");
+module_param_array(term, int, NULL, 0);
 MODULE_PARM_DESC(term, "SCSI bus termination");
 MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/arm/powertec.c b/drivers/scsi/arm/powertec.c
index 3113bdcedb13..3d69f6c45a6b 100644
--- a/drivers/scsi/arm/powertec.c
+++ b/drivers/scsi/arm/powertec.c
@@ -466,6 +466,6 @@ module_exit(powertecscsi_exit);
 
 MODULE_AUTHOR("Russell King");
 MODULE_DESCRIPTION("Powertec SCSI driver");
-MODULE_PARM(term, "1-8i");
+module_param_array(term, int, NULL, 0);
 MODULE_PARM_DESC(term, "SCSI bus termination");
 MODULE_LICENSE("GPL");
diff --git a/drivers/scsi/atari_scsi.c b/drivers/scsi/atari_scsi.c
index f4c1ca7c1572..f677c5a32a68 100644
--- a/drivers/scsi/atari_scsi.c
+++ b/drivers/scsi/atari_scsi.c
@@ -239,17 +239,17 @@ static int atari_read_overruns = 0;
 #endif
 
 static int setup_can_queue = -1;
-MODULE_PARM(setup_can_queue, "i");
+module_param(setup_can_queue, int, 0);
 static int setup_cmd_per_lun = -1;
-MODULE_PARM(setup_cmd_per_lun, "i");
+module_param(setup_cmd_per_lun, int, 0);
 static int setup_sg_tablesize = -1;
-MODULE_PARM(setup_sg_tablesize, "i");
+module_param(setup_sg_tablesize, int, 0);
 #ifdef SUPPORT_TAGS
 static int setup_use_tagged_queuing = -1;
-MODULE_PARM(setup_use_tagged_queuing, "i");
+module_param(setup_use_tagged_queuing, int, 0);
 #endif
 static int setup_hostid = -1;
-MODULE_PARM(setup_hostid, "i");
+module_param(setup_hostid, int, 0);
 
 
 #if defined(CONFIG_TT_DMA_EMUL)
diff --git a/drivers/video/pm3fb.c b/drivers/video/pm3fb.c
index 0e78ddc81583..52c18a35fb41 100644
--- a/drivers/video/pm3fb.c
+++ b/drivers/video/pm3fb.c
@@ -3532,26 +3532,26 @@ int __init pm3fb_init(void)
 MODULE_AUTHOR("Romain Dolbeau");
 MODULE_DESCRIPTION("Permedia3 framebuffer device driver");
 static char *mode[PM3_MAX_BOARD];
-MODULE_PARM(mode,PM3_MAX_BOARD_MODULE_ARRAY_STRING);
+module_param_array(mode, charp, NULL, 0);
 MODULE_PARM_DESC(mode,"video mode");
-MODULE_PARM(disable,PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param_array(disable, short, NULL, 0);
 MODULE_PARM_DESC(disable,"disable board");
 static short off[PM3_MAX_BOARD];
-MODULE_PARM(off,PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param_array(off, short, NULL, 0);
 MODULE_PARM_DESC(off,"disable board");
 static char *pciid[PM3_MAX_BOARD];
-MODULE_PARM(pciid,PM3_MAX_BOARD_MODULE_ARRAY_STRING);
+module_param_array(pciid, charp, NULL, 0);
 MODULE_PARM_DESC(pciid,"board PCI Id");
-MODULE_PARM(noaccel,PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param_array(noaccel, short, NULL, 0);
 MODULE_PARM_DESC(noaccel,"disable accel");
 static char *font[PM3_MAX_BOARD];
-MODULE_PARM(font,PM3_MAX_BOARD_MODULE_ARRAY_STRING);
+module_param_array(font, charp, NULL, 0);
 MODULE_PARM_DESC(font,"choose font");
-MODULE_PARM(depth,PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param(depth, short, NULL, 0);
 MODULE_PARM_DESC(depth,"boot-time depth");
-MODULE_PARM(printtimings, "h");
+module_param(printtimings, short, NULL, 0);
 MODULE_PARM_DESC(printtimings, "print the memory timings of the card(s)");
-MODULE_PARM(forcesize, PM3_MAX_BOARD_MODULE_ARRAY_SHORT);
+module_param(forcesize, short, NULL, 0);
 MODULE_PARM_DESC(forcesize, "force specified memory size");
 /*
 MODULE_SUPPORTED_DEVICE("Permedia3 PCI boards")
diff --git a/include/linux/module.h b/include/linux/module.h
index 70bd843c71cb..e144309836af 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -554,25 +554,6 @@ static inline void module_remove_driver(struct device_driver *driver)
 
 /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */
 
-struct obsolete_modparm {
-	char name[64];
-	char type[64-sizeof(void *)];
-	void *addr;
-};
-
-static inline void MODULE_PARM_(void) { }
-#ifdef MODULE
-/* DEPRECATED: Do not use. */
-#define MODULE_PARM(var,type)						    \
-extern struct obsolete_modparm __parm_##var \
-__attribute__((section("__obsparm"))); \
-struct obsolete_modparm __parm_##var = \
-{ __stringify(var), type, &MODULE_PARM_ }; \
-__MODULE_PARM_TYPE(var, type);
-#else
-#define MODULE_PARM(var,type) static void __attribute__((__unused__)) *__parm_##var = &MODULE_PARM_;
-#endif
-
 #define __MODULE_STRING(x) __stringify(x)
 
 /* Use symbol_get and symbol_put instead.  You'll thank me. */
diff --git a/include/video/pm3fb.h b/include/video/pm3fb.h
index 6f4ea808cf74..ac021379ac40 100644
--- a/include/video/pm3fb.h
+++ b/include/video/pm3fb.h
@@ -1128,10 +1128,7 @@
 #endif
 
 /* max number of simultaneous board */
-/* warning : make sure module array def's are coherent with PM3_MAX_BOARD */
 #define PM3_MAX_BOARD 4
-#define PM3_MAX_BOARD_MODULE_ARRAY_SHORT "1-4h"
-#define PM3_MAX_BOARD_MODULE_ARRAY_STRING "1-4s"
 
 /* max size of options */
 #define PM3_OPTIONS_SIZE 256
diff --git a/init/Kconfig b/init/Kconfig
index 1d19fd25204b..05951c1d654e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -470,15 +470,6 @@ config MODULE_FORCE_UNLOAD
 	  rmmod).  This is mainly for kernel developers and desperate users.
 	  If unsure, say N.
 
-config OBSOLETE_MODPARM
-	bool
-	default y
-	depends on MODULES
-	help
-	  You need this option to use module parameters on modules which
-	  have not been converted to the new module parameter system yet.
-	  If unsure, say Y.
-
 config MODVERSIONS
 	bool "Module versioning support"
 	depends on MODULES
diff --git a/kernel/module.c b/kernel/module.c
index 54623c714bba..ddfe45ac2fd1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -233,24 +233,6 @@ static unsigned long __find_symbol(const char *name,
  	return 0;
 }
 
-/* Find a symbol in this elf symbol table */
-static unsigned long find_local_symbol(Elf_Shdr *sechdrs,
-				       unsigned int symindex,
-				       const char *strtab,
-				       const char *name)
-{
-	unsigned int i;
-	Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
-
-	/* Search (defined) internal symbols first. */
-	for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) {
-		if (sym[i].st_shndx != SHN_UNDEF
-		    && strcmp(name, strtab + sym[i].st_name) == 0)
-			return sym[i].st_value;
-	}
-	return 0;
-}
-
 /* Search for module by name: must hold module_mutex. */
 static struct module *find_module(const char *name)
 {
@@ -785,139 +767,6 @@ static struct module_attribute *modinfo_attrs[] = {
 	NULL,
 };
 
-#ifdef CONFIG_OBSOLETE_MODPARM
-/* Bounds checking done below */
-static int obsparm_copy_string(const char *val, struct kernel_param *kp)
-{
-	strcpy(kp->arg, val);
-	return 0;
-}
-
-static int set_obsolete(const char *val, struct kernel_param *kp)
-{
-	unsigned int min, max;
-	unsigned int size, maxsize;
-	int dummy;
-	char *endp;
-	const char *p;
-	struct obsolete_modparm *obsparm = kp->arg;
-
-	if (!val) {
-		printk(KERN_ERR "Parameter %s needs an argument\n", kp->name);
-		return -EINVAL;
-	}
-
-	/* type is: [min[-max]]{b,h,i,l,s} */
-	p = obsparm->type;
-	min = simple_strtol(p, &endp, 10);
-	if (endp == obsparm->type)
-		min = max = 1;
-	else if (*endp == '-') {
-		p = endp+1;
-		max = simple_strtol(p, &endp, 10);
-	} else
-		max = min;
-	switch (*endp) {
-	case 'b':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   1, param_set_byte, &dummy);
-	case 'h':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   sizeof(short), param_set_short, &dummy);
-	case 'i':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   sizeof(int), param_set_int, &dummy);
-	case 'l':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   sizeof(long), param_set_long, &dummy);
-	case 's':
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   sizeof(char *), param_set_charp, &dummy);
-
-	case 'c':
-		/* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars,
-		   and the decl is "char xxx[5][50];" */
-		p = endp+1;
-		maxsize = simple_strtol(p, &endp, 10);
-		/* We check lengths here (yes, this is a hack). */
-		p = val;
-		while (p[size = strcspn(p, ",")]) {
-			if (size >= maxsize) 
-				goto oversize;
-			p += size+1;
-		}
-		if (size >= maxsize) 
-			goto oversize;
-		return param_array(kp->name, val, min, max, obsparm->addr,
-				   maxsize, obsparm_copy_string, &dummy);
-	}
-	printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type);
-	return -EINVAL;
- oversize:
-	printk(KERN_ERR
-	       "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize);
-	return -EINVAL;
-}
-
-static int obsolete_params(const char *name,
-			   char *args,
-			   struct obsolete_modparm obsparm[],
-			   unsigned int num,
-			   Elf_Shdr *sechdrs,
-			   unsigned int symindex,
-			   const char *strtab)
-{
-	struct kernel_param *kp;
-	unsigned int i;
-	int ret;
-
-	kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL);
-	if (!kp)
-		return -ENOMEM;
-
-	for (i = 0; i < num; i++) {
-		char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)];
-
-		snprintf(sym_name, sizeof(sym_name), "%s%s",
-			 MODULE_SYMBOL_PREFIX, obsparm[i].name);
-
-		kp[i].name = obsparm[i].name;
-		kp[i].perm = 000;
-		kp[i].set = set_obsolete;
-		kp[i].get = NULL;
-		obsparm[i].addr
-			= (void *)find_local_symbol(sechdrs, symindex, strtab,
-						    sym_name);
-		if (!obsparm[i].addr) {
-			printk("%s: falsely claims to have parameter %s\n",
-			       name, obsparm[i].name);
-			ret = -EINVAL;
-			goto out;
-		}
-		kp[i].arg = &obsparm[i];
-	}
-
-	ret = parse_args(name, args, kp, num, NULL);
- out:
-	kfree(kp);
-	return ret;
-}
-#else
-static int obsolete_params(const char *name,
-			   char *args,
-			   struct obsolete_modparm obsparm[],
-			   unsigned int num,
-			   Elf_Shdr *sechdrs,
-			   unsigned int symindex,
-			   const char *strtab)
-{
-	if (num != 0)
-		printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
-		       name);
-	return 0;
-}
-#endif /* CONFIG_OBSOLETE_MODPARM */
-
 static const char vermagic[] = VERMAGIC_STRING;
 
 #ifdef CONFIG_MODVERSIONS
@@ -1874,27 +1723,17 @@ static struct module *load_module(void __user *umod,
 	set_fs(old_fs);
 
 	mod->args = args;
-	if (obsparmindex) {
-		err = obsolete_params(mod->name, mod->args,
-				      (struct obsolete_modparm *)
-				      sechdrs[obsparmindex].sh_addr,
-				      sechdrs[obsparmindex].sh_size
-				      / sizeof(struct obsolete_modparm),
-				      sechdrs, symindex,
-				      (char *)sechdrs[strindex].sh_addr);
-		if (setupindex)
-			printk(KERN_WARNING "%s: Ignoring new-style "
-			       "parameters in presence of obsolete ones\n",
-			       mod->name);
-	} else {
-		/* Size of section 0 is 0, so this works well if no params */
-		err = parse_args(mod->name, mod->args,
-				 (struct kernel_param *)
-				 sechdrs[setupindex].sh_addr,
-				 sechdrs[setupindex].sh_size
-				 / sizeof(struct kernel_param),
-				 NULL);
-	}
+	if (obsparmindex)
+		printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+		       mod->name);
+
+	/* Size of section 0 is 0, so this works well if no params */
+	err = parse_args(mod->name, mod->args,
+			 (struct kernel_param *)
+			 sechdrs[setupindex].sh_addr,
+			 sechdrs[setupindex].sh_size
+			 / sizeof(struct kernel_param),
+			 NULL);
 	if (err < 0)
 		goto arch_cleanup;
 
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9a1fa8894b95..b4b362b5baf5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -54,15 +54,15 @@ static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
 
-MODULE_PARM(nreaders, "i");
+module_param(nreaders, int, 0);
 MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-MODULE_PARM(stat_interval, "i");
+module_param(stat_interval, int, 0);
 MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
-MODULE_PARM(verbose, "i");
+module_param(verbose, bool, 0);
 MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-MODULE_PARM(test_no_idle_hz, "i");
+module_param(test_no_idle_hz, bool, 0);
 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
-MODULE_PARM(shuffle_interval, "i");
+module_param(shuffle_interval, int, 0);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
 #define TORTURE_FLAG "rcutorture: "
 #define PRINTK_STRING(s) \
diff --git a/sound/oss/au1000.c b/sound/oss/au1000.c
index fe54de25aafc..eacb0aef21e1 100644
--- a/sound/oss/au1000.c
+++ b/sound/oss/au1000.c
@@ -100,7 +100,7 @@
 
 /* Boot options */
 static int      vra = 0;	// 0 = no VRA, 1 = use VRA if codec supports it
-MODULE_PARM(vra, "i");
+module_param(vra, bool, 0);
 MODULE_PARM_DESC(vra, "if 1 use VRA if codec supports it");
 
 
diff --git a/sound/oss/au1550_ac97.c b/sound/oss/au1550_ac97.c
index 6a4956b8025d..c1168fae6be6 100644
--- a/sound/oss/au1550_ac97.c
+++ b/sound/oss/au1550_ac97.c
@@ -79,7 +79,7 @@
  * 0 = no VRA, 1 = use VRA if codec supports it
  */
 static int      vra = 1;
-MODULE_PARM(vra, "i");
+module_param(vra, bool, 0);
 MODULE_PARM_DESC(vra, "if 1 use VRA if codec supports it");
 
 static struct au1550_state {
diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c
index c9302a1e515b..87bd3100aef3 100644
--- a/sound/oss/dmasound/dmasound_core.c
+++ b/sound/oss/dmasound/dmasound_core.c
@@ -195,18 +195,18 @@
      */
 
 int dmasound_catchRadius = 0;
-MODULE_PARM(dmasound_catchRadius, "i");
+module_param(dmasound_catchRadius, int, 0);
 
 static unsigned int numWriteBufs = DEFAULT_N_BUFFERS;
-MODULE_PARM(numWriteBufs, "i");
+module_param(numWriteBufs, int, 0);
 static unsigned int writeBufSize = DEFAULT_BUFF_SIZE ;	/* in bytes */
-MODULE_PARM(writeBufSize, "i");
+module_param(writeBufSize, int, 0);
 
 #ifdef HAS_RECORD
 static unsigned int numReadBufs = DEFAULT_N_BUFFERS;
-MODULE_PARM(numReadBufs, "i");
+module_param(numReadBufs, int, 0);
 static unsigned int readBufSize = DEFAULT_BUFF_SIZE;	/* in bytes */
-MODULE_PARM(readBufSize, "i");
+module_param(readBufSize, int, 0);
 #endif
 
 MODULE_LICENSE("GPL");
diff --git a/sound/oss/ite8172.c b/sound/oss/ite8172.c
index ffcb910f5c3e..00ac1c95a429 100644
--- a/sound/oss/ite8172.c
+++ b/sound/oss/ite8172.c
@@ -1968,9 +1968,9 @@ static int i2s_fmt[NR_DEVICE];
 
 static unsigned int devindex;
 
-MODULE_PARM(spdif, "1-" __MODULE_STRING(NR_DEVICE) "i");
+module_param_array(spdif, int, NULL, 0);
 MODULE_PARM_DESC(spdif, "if 1 the S/PDIF digital output is enabled");
-MODULE_PARM(i2s_fmt, "1-" __MODULE_STRING(NR_DEVICE) "i");
+module_param_array(i2s_fmt, int, NULL, 0);
 MODULE_PARM_DESC(i2s_fmt, "the format of I2S");
 
 MODULE_AUTHOR("Monta Vista Software, stevel@mvista.com");
diff --git a/sound/oss/swarm_cs4297a.c b/sound/oss/swarm_cs4297a.c
index dce9016cbcfd..eb5ea32fd1b0 100644
--- a/sound/oss/swarm_cs4297a.c
+++ b/sound/oss/swarm_cs4297a.c
@@ -154,8 +154,8 @@ static void start_adc(struct cs4297a_state *s);
 #if CSDEBUG
 static unsigned long cs_debuglevel = 4;	// levels range from 1-9
 static unsigned long cs_debugmask = CS_INIT /*| CS_IOCTL*/;
-MODULE_PARM(cs_debuglevel, "i");
-MODULE_PARM(cs_debugmask, "i");
+module_param(cs_debuglevel, int, 0);
+module_param(cs_debugmask, int, 0);
 #endif
 #define CS_TRUE 	1
 #define CS_FALSE 	0
diff --git a/sound/oss/waveartist.c b/sound/oss/waveartist.c
index 99d04ad3ca13..afcb524a40eb 100644
--- a/sound/oss/waveartist.c
+++ b/sound/oss/waveartist.c
@@ -2028,8 +2028,8 @@ __setup("waveartist=", setup_waveartist);
 #endif
 
 MODULE_DESCRIPTION("Rockwell WaveArtist RWA-010 sound driver");
-MODULE_PARM(io, "i");		/* IO base */
-MODULE_PARM(irq, "i");		/* IRQ */
-MODULE_PARM(dma, "i");		/* DMA */
-MODULE_PARM(dma2, "i");		/* DMA2 */
+module_param(io, int, 0);		/* IO base */
+module_param(irq, int, 0);		/* IRQ */
+module_param(dma, int, 0);		/* DMA */
+module_param(dma2, int, 0);		/* DMA2 */
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 9871728b756646e0d758a966ba00f2c0ff812817 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 25 Mar 2006 03:07:06 -0800
Subject: [PATCH] kernel/params.c: make param_array() static

param_array() in kernel/params.c can now become static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/moduleparam.h |  7 -------
 kernel/params.c             | 12 ++++++------
 2 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index b5c98c43779e..7c0c2c198f1f 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -162,13 +162,6 @@ extern int param_array_get(char *buffer, struct kernel_param *kp);
 extern int param_set_copystring(const char *val, struct kernel_param *kp);
 extern int param_get_string(char *buffer, struct kernel_param *kp);
 
-int param_array(const char *name,
-		const char *val,
-		unsigned int min, unsigned int max,
-		void *elem, int elemsize,
-		int (*set)(const char *, struct kernel_param *kp),
-		int *num);
-
 /* for exporting parameters in /sys/parameters */
 
 struct module;
diff --git a/kernel/params.c b/kernel/params.c
index a29150582310..9de637a5c8bc 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
 }
 
 /* We cheat here and temporarily mangle the string. */
-int param_array(const char *name,
-		const char *val,
-		unsigned int min, unsigned int max,
-		void *elem, int elemsize,
-		int (*set)(const char *, struct kernel_param *kp),
-		int *num)
+static int param_array(const char *name,
+		       const char *val,
+		       unsigned int min, unsigned int max,
+		       void *elem, int elemsize,
+		       int (*set)(const char *, struct kernel_param *kp),
+		       int *num)
 {
 	int ret;
 	struct kernel_param kp;
-- 
cgit v1.2.3


From c777ac5594f772ac760e02c3ac71d067616b579d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:07:36 -0800
Subject: [PATCH] irq: uninline migration functions

Uninline some massive IRQ migration functions.  Put them in the new
kernel/irq/migration.c.

Cc: Andi Kleen <ak@muc.de>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/irq.h    | 49 ++---------------------------------------------
 kernel/irq/Makefile    |  3 +--
 kernel/irq/migration.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 49 deletions(-)
 create mode 100644 kernel/irq/migration.c

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 6c5d4c898ccb..ee2a82a572f7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -114,53 +114,8 @@ static inline void set_native_irq_info(int irq, cpumask_t mask)
 #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
 extern cpumask_t pending_irq_cpumask[NR_IRQS];
 
-static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
-{
-	irq_desc_t *desc = irq_desc + irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&desc->lock, flags);
-	desc->move_irq = 1;
-	pending_irq_cpumask[irq] = mask;
-	spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-static inline void
-move_native_irq(int irq)
-{
-	cpumask_t tmp;
-	irq_desc_t *desc = irq_descp(irq);
-
-	if (likely (!desc->move_irq))
-		return;
-
-	desc->move_irq = 0;
-
-	if (likely(cpus_empty(pending_irq_cpumask[irq])))
-		return;
-
-	if (!desc->handler->set_affinity)
-		return;
-
-	/* note - we hold the desc->lock */
-	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
-
-	/*
-	 * If there was a valid mask to work with, please
-	 * do the disable, re-program, enable sequence.
-	 * This is *not* particularly important for level triggered
-	 * but in a edge trigger case, we might be setting rte
-	 * when an active trigger is comming in. This could
-	 * cause some ioapics to mal-function.
-	 * Being paranoid i guess!
-	 */
-	if (unlikely(!cpus_empty(tmp))) {
-		desc->handler->disable(irq);
-		desc->handler->set_affinity(irq,tmp);
-		desc->handler->enable(irq);
-	}
-	cpus_clear(pending_irq_cpumask[irq]);
-}
+void set_pending_irq(unsigned int irq, cpumask_t mask);
+void move_native_irq(int irq);
 
 #ifdef CONFIG_PCI_MSI
 /*
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 49378738ff5e..2b33f852be3e 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,4 @@
 
-obj-y := handle.o manage.o spurious.o
+obj-y := handle.o manage.o spurious.o migration.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
-
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
new file mode 100644
index 000000000000..6bdd03c524c7
--- /dev/null
+++ b/kernel/irq/migration.c
@@ -0,0 +1,52 @@
+#include <linux/irq.h>
+
+#if defined(CONFIG_GENERIC_PENDING_IRQ)
+
+void set_pending_irq(unsigned int irq, cpumask_t mask)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->move_irq = 1;
+	pending_irq_cpumask[irq] = mask;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void move_native_irq(int irq)
+{
+	cpumask_t tmp;
+	irq_desc_t *desc = irq_descp(irq);
+
+	if (likely (!desc->move_irq))
+		return;
+
+	desc->move_irq = 0;
+
+	if (likely(cpus_empty(pending_irq_cpumask[irq])))
+		return;
+
+	if (!desc->handler->set_affinity)
+		return;
+
+	/* note - we hold the desc->lock */
+	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+
+	/*
+	 * If there was a valid mask to work with, please
+	 * do the disable, re-program, enable sequence.
+	 * This is *not* particularly important for level triggered
+	 * but in a edge trigger case, we might be setting rte
+	 * when an active trigger is comming in. This could
+	 * cause some ioapics to mal-function.
+	 * Being paranoid i guess!
+	 */
+	if (unlikely(!cpus_empty(tmp))) {
+		desc->handler->disable(irq);
+		desc->handler->set_affinity(irq,tmp);
+		desc->handler->enable(irq);
+	}
+	cpus_clear(pending_irq_cpumask[irq]);
+}
+
+#endif
-- 
cgit v1.2.3


From 501f2499b897ca4be68b1acc7a4bc8cf66f5fd24 Mon Sep 17 00:00:00 2001
From: Bryan Holty <lgeek@frontiernet.net>
Date: Sat, 25 Mar 2006 03:07:37 -0800
Subject: [PATCH] IRQ: prevent enabling of previously disabled interrupt

This fix prevents re-disabling and enabling of a previously disabled
interrupt.  On an SMP system with irq balancing enabled; If an interrupt is
disabled from within its own interrupt context with disable_irq_nosync and is
also earmarked for processor migration, the interrupt is blindly moved to the
other processor and enabled without regard for its current "enabled" state.
If there is an interrupt pending, it will unexpectedly invoke the irq handler
on the new irq owning processor (even though the irq was previously disabled)

The more intuitive fix would be to invoke disable_irq_nosync and
enable_irq, but since we already have the desc->lock from __do_IRQ, we
cannot call them directly.  Instead we can use the same logic to disable
and enable found in disable_irq_nosync and enable_irq, with regards to the
desc->depth.

This now prevents a disabled interrupt from being re-disabled, and more
importantly prevents a disabled interrupt from being incorrectly enabled on
a different processor.

Signed-off-by: Bryan Holty <lgeek@frontiernet.net>
Cc: Andi Kleen <ak@muc.de>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/migration.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 6bdd03c524c7..52a8655fa080 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,9 +18,17 @@ void move_native_irq(int irq)
 	cpumask_t tmp;
 	irq_desc_t *desc = irq_descp(irq);
 
-	if (likely (!desc->move_irq))
+	if (likely(!desc->move_irq))
 		return;
 
+	/*
+	 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
+	 */
+	if (CHECK_IRQ_PER_CPU(desc->status)) {
+		WARN_ON(1);
+		return;
+	}
+
 	desc->move_irq = 0;
 
 	if (likely(cpus_empty(pending_irq_cpumask[irq])))
@@ -29,7 +37,8 @@ void move_native_irq(int irq)
 	if (!desc->handler->set_affinity)
 		return;
 
-	/* note - we hold the desc->lock */
+	assert_spin_locked(&desc->lock);
+
 	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
 
 	/*
@@ -42,9 +51,13 @@ void move_native_irq(int irq)
 	 * Being paranoid i guess!
 	 */
 	if (unlikely(!cpus_empty(tmp))) {
-		desc->handler->disable(irq);
+		if (likely(!(desc->status & IRQ_DISABLED)))
+			desc->handler->disable(irq);
+
 		desc->handler->set_affinity(irq,tmp);
-		desc->handler->enable(irq);
+
+		if (likely(!(desc->status & IRQ_DISABLED)))
+			desc->handler->enable(irq);
 	}
 	cpus_clear(pending_irq_cpumask[irq]);
 }
-- 
cgit v1.2.3


From 12b5989be10011387a9da5dee82e5c0d6f9d02e7 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Sat, 25 Mar 2006 03:07:41 -0800
Subject: [PATCH] refactor capable() to one implementation, add __capable()
 helper

Move capable() to kernel/capability.c and eliminate duplicate
implementations.  Add __capable() function which can be used to check for
capabiilty of any process.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/capability.h |  3 ++-
 include/linux/security.h   | 22 ++++++++++++++++------
 kernel/capability.c        | 16 ++++++++++++++++
 kernel/sys.c               | 12 ------------
 security/security.c        | 23 -----------------------
 5 files changed, 34 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 5a23ce752629..6548b35ab9f6 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -357,7 +357,8 @@ static inline kernel_cap_t cap_invert(kernel_cap_t c)
 
 #define cap_is_fs_cap(c)     (CAP_TO_MASK(c) & CAP_FS_MASK)
 
-extern int capable(int cap);
+int capable(int cap);
+int __capable(struct task_struct *t, int cap);
 
 #endif /* __KERNEL__ */
 
diff --git a/include/linux/security.h b/include/linux/security.h
index b18eb8cfa639..3c19be35124b 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1040,6 +1040,11 @@ struct swap_info_struct;
  *	@effective contains the effective capability set.
  *	@inheritable contains the inheritable capability set.
  *	@permitted contains the permitted capability set.
+ * @capable:
+ *	Check whether the @tsk process has the @cap capability.
+ *	@tsk contains the task_struct for the process.
+ *	@cap contains the capability <include/linux/capability.h>.
+ *	Return 0 if the capability is granted for @tsk.
  * @acct:
  *	Check permission before enabling or disabling process accounting.  If
  *	accounting is being enabled, then @file refers to the open file used to
@@ -1053,11 +1058,6 @@ struct swap_info_struct;
  *	@table contains the ctl_table structure for the sysctl variable.
  *	@op contains the operation (001 = search, 002 = write, 004 = read).
  *	Return 0 if permission is granted.
- * @capable:
- *	Check whether the @tsk process has the @cap capability.
- *	@tsk contains the task_struct for the process.
- *	@cap contains the capability <include/linux/capability.h>.
- *	Return 0 if the capability is granted for @tsk.
  * @syslog:
  *	Check permission before accessing the kernel message ring or changing
  *	logging to the console.
@@ -1099,9 +1099,9 @@ struct security_operations {
 			    kernel_cap_t * effective,
 			    kernel_cap_t * inheritable,
 			    kernel_cap_t * permitted);
+	int (*capable) (struct task_struct * tsk, int cap);
 	int (*acct) (struct file * file);
 	int (*sysctl) (struct ctl_table * table, int op);
-	int (*capable) (struct task_struct * tsk, int cap);
 	int (*quotactl) (int cmds, int type, int id, struct super_block * sb);
 	int (*quota_on) (struct dentry * dentry);
 	int (*syslog) (int type);
@@ -1347,6 +1347,11 @@ static inline void security_capset_set (struct task_struct *target,
 	security_ops->capset_set (target, effective, inheritable, permitted);
 }
 
+static inline int security_capable(struct task_struct *tsk, int cap)
+{
+	return security_ops->capable(tsk, cap);
+}
+
 static inline int security_acct (struct file *file)
 {
 	return security_ops->acct (file);
@@ -2050,6 +2055,11 @@ static inline void security_capset_set (struct task_struct *target,
 	cap_capset_set (target, effective, inheritable, permitted);
 }
 
+static inline int security_capable(struct task_struct *tsk, int cap)
+{
+	return cap_capable(tsk, cap);
+}
+
 static inline int security_acct (struct file *file)
 {
 	return 0;
diff --git a/kernel/capability.c b/kernel/capability.c
index bfa3c92e16f2..1a4d8a40d3f9 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -233,3 +233,19 @@ out:
 
      return ret;
 }
+
+int __capable(struct task_struct *t, int cap)
+{
+	if (security_capable(t, cap) == 0) {
+		t->flags |= PF_SUPERPRIV;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__capable);
+
+int capable(int cap)
+{
+	return __capable(current, cap);
+}
+EXPORT_SYMBOL(capable);
diff --git a/kernel/sys.c b/kernel/sys.c
index 19d058be49d4..421009cedb51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -224,18 +224,6 @@ int unregister_reboot_notifier(struct notifier_block * nb)
 
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
-#ifndef CONFIG_SECURITY
-int capable(int cap)
-{
-        if (cap_raised(current->cap_effective, cap)) {
-	       current->flags |= PF_SUPERPRIV;
-	       return 1;
-        }
-        return 0;
-}
-EXPORT_SYMBOL(capable);
-#endif
-
 static int set_one_prio(struct task_struct *p, int niceval, int error)
 {
 	int no_nice;
diff --git a/security/security.c b/security/security.c
index f693e1f66b98..51ef509710b9 100644
--- a/security/security.c
+++ b/security/security.c
@@ -174,31 +174,8 @@ int mod_unreg_security(const char *name, struct security_operations *ops)
 	return security_ops->unregister_security(name, ops);
 }
 
-/**
- * capable - calls the currently loaded security module's capable() function with the specified capability
- * @cap: the requested capability level.
- *
- * This function calls the currently loaded security module's capable()
- * function with a pointer to the current task and the specified @cap value.
- *
- * This allows the security module to implement the capable function call
- * however it chooses to.
- */
-int capable(int cap)
-{
-	if (security_ops->capable(current, cap)) {
-		/* capability denied */
-		return 0;
-	}
-
-	/* capability granted */
-	current->flags |= PF_SUPERPRIV;
-	return 1;
-}
-
 EXPORT_SYMBOL_GPL(register_security);
 EXPORT_SYMBOL_GPL(unregister_security);
 EXPORT_SYMBOL_GPL(mod_reg_security);
 EXPORT_SYMBOL_GPL(mod_unreg_security);
-EXPORT_SYMBOL(capable);
 EXPORT_SYMBOL(security_ops);
-- 
cgit v1.2.3


From 05eeae208d08a05a6980cf2ff61f02843c0955fd Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:07:48 -0800
Subject: [PATCH] find_task_by_pid() needs tasklist_lock

A couple of places are forgetting to take it.

The kswapd case is probably unimportant.  keventd_create_kthread() was racy.

The whole thing is a bit flakey: you start a kernel thread, get its pid from
kernel_thread() then look up its task_struct.

a) It assumes that pid recycling takes a "long" time.

b) We get a task_struct but no reference was taken on it.  The owner of the
   kswapd and kthread task_struct*'s must assume that the new thread won't
   exit unexpectedly.  Because if it does, they're left holding dead memory
   and any attempt to control or stop that task will crash.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kthread.c | 2 ++
 mm/vmscan.c      | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6a5373868a98..c5f3c6613b6d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -115,7 +115,9 @@ static void keventd_create_kthread(void *_create)
 		create->result = ERR_PTR(pid);
 	} else {
 		wait_for_completion(&create->started);
+		read_lock(&tasklist_lock);
 		create->result = find_task_by_pid(pid);
+		read_unlock(&tasklist_lock);
 	}
 	complete(&create->done);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fd572bbdc9f5..78865c849f8f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1356,7 +1356,9 @@ static int __init kswapd_init(void)
 
 		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
 		BUG_ON(pid < 0);
+		read_lock(&tasklist_lock);
 		pgdat->kswapd = find_task_by_pid(pid);
+		read_unlock(&tasklist_lock);
 	}
 	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
-- 
cgit v1.2.3


From 231bed205879236357171e50bd8965e70797ecdc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@resalehost.networksolutions.com>
Date: Sat, 25 Mar 2006 03:08:00 -0800
Subject: [PATCH] No need to protect current->group_info in sys_getgroups(),
 in_group_p() and in_egroup_p()

While doing some benchmarks of an Apache/PHP SMP server, I noticed high
oprofile numbers in in_group_p() and _atomic_dec_and_lock().

rank  percent
  1     4.8911 % __link_path_walk
  2     4.8503 % __d_lookup
*3     4.2911 % _atomic_dec_and_lock
  4     3.9307 % __copy_to_user_ll
  5     4.9004 % sysenter_past_esp
*6     3.3248 % in_group_p

It appears that in_group_p() does an uncessary

get_group_info(current->group_info); /* atomic_inc() */
  ... /* access current->group_info */
put_group_info(current->group_info); /* _atomic_dec_and_lock */

It is not necessary to do this, because the current task holds a reference
on its own group_info, and this reference cannot change during the lookup.

This patch deletes the get_group_info()/put_group_info() pair from
sys_getgroups(), in_group_p() and in_egroup_p() functions.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Tim Hockin <thockin@hockin.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 421009cedb51..119fb0d9e24e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1421,7 +1421,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
 		return -EINVAL;
 
 	/* no need to grab task_lock here; it cannot change */
-	get_group_info(current->group_info);
 	i = current->group_info->ngroups;
 	if (gidsetsize) {
 		if (i > gidsetsize) {
@@ -1434,7 +1433,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
 		}
 	}
 out:
-	put_group_info(current->group_info);
 	return i;
 }
 
@@ -1475,9 +1473,7 @@ int in_group_p(gid_t grp)
 {
 	int retval = 1;
 	if (grp != current->fsgid) {
-		get_group_info(current->group_info);
 		retval = groups_search(current->group_info, grp);
-		put_group_info(current->group_info);
 	}
 	return retval;
 }
@@ -1488,9 +1484,7 @@ int in_egroup_p(gid_t grp)
 {
 	int retval = 1;
 	if (grp != current->egid) {
-		get_group_info(current->group_info);
 		retval = groups_search(current->group_info, grp);
-		put_group_info(current->group_info);
 	}
 	return retval;
 }
-- 
cgit v1.2.3


From 34f361ade2fb4a869f6a7714d01c04ce4cfa75d9 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Sat, 25 Mar 2006 03:08:18 -0800
Subject: [PATCH] Check if cpu can be onlined before calling smp_prepare_cpu()

- Moved check for online cpu out of smp_prepare_cpu()

- Moved default declaration of smp_prepare_cpu() to kernel/cpu.c

- Removed lock_cpu_hotplug() from smp_prepare_cpu() to around it, since
  its called from cpu_up() as well now.

- Removed clearing from cpu_present_map during cpu_offline as it breaks
  using cpu_up() directly during a subsequent online operation.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: "Li, Shaohua" <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smpboot.c | 33 ++++++++++++++++++---------------
 drivers/base/cpu.c         |  9 +--------
 include/linux/cpu.h        |  1 -
 kernel/power/smp.c         |  4 +---
 4 files changed, 20 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4c470e99a742..82371d83bfa9 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -1003,7 +1003,6 @@ void cpu_exit_clear(void)
 
 	cpu_clear(cpu, cpu_callout_map);
 	cpu_clear(cpu, cpu_callin_map);
-	cpu_clear(cpu, cpu_present_map);
 
 	cpu_clear(cpu, smp_commenced_mask);
 	unmap_cpu_to_logical_apicid(cpu);
@@ -1015,31 +1014,20 @@ struct warm_boot_cpu_info {
 	int cpu;
 };
 
-static void __devinit do_warm_boot_cpu(void *p)
+static void __cpuinit do_warm_boot_cpu(void *p)
 {
 	struct warm_boot_cpu_info *info = p;
 	do_boot_cpu(info->apicid, info->cpu);
 	complete(info->complete);
 }
 
-int __devinit smp_prepare_cpu(int cpu)
+static int __cpuinit __smp_prepare_cpu(int cpu)
 {
 	DECLARE_COMPLETION(done);
 	struct warm_boot_cpu_info info;
 	struct work_struct task;
 	int	apicid, ret;
 
-	lock_cpu_hotplug();
-
-	/*
-	 * On x86, CPU0 is never offlined.  Trying to bring up an
-	 * already-booted CPU will hang.  So check for that case.
-	 */
-	if (cpu_online(cpu)) {
-		ret = -EINVAL;
-		goto exit;
-	}
-
 	apicid = x86_cpu_to_apicid[cpu];
 	if (apicid == BAD_APICID) {
 		ret = -ENODEV;
@@ -1064,7 +1052,6 @@ int __devinit smp_prepare_cpu(int cpu)
 	zap_low_mappings();
 	ret = 0;
 exit:
-	unlock_cpu_hotplug();
 	return ret;
 }
 #endif
@@ -1392,6 +1379,22 @@ void __cpu_die(unsigned int cpu)
 
 int __devinit __cpu_up(unsigned int cpu)
 {
+#ifdef CONFIG_HOTPLUG_CPU
+	int ret=0;
+
+	/*
+	 * We do warm boot only on cpus that had booted earlier
+	 * Otherwise cold boot is all handled from smp_boot_cpus().
+	 * cpu_callin_map is set during AP kickstart process. Its reset
+	 * when a cpu is taken offline from cpu_exit_clear().
+	 */
+	if (!cpu_isset(cpu, cpu_callin_map))
+		ret = __smp_prepare_cpu(cpu);
+
+	if (ret)
+		return -EIO;
+#endif
+
 	/* In case one didn't come up */
 	if (!cpu_isset(cpu, cpu_callin_map)) {
 		printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 29f3d7504da1..dd712b24ec91 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -19,11 +19,6 @@ EXPORT_SYMBOL(cpu_sysdev_class);
 static struct sys_device *cpu_sys_devices[NR_CPUS];
 
 #ifdef CONFIG_HOTPLUG_CPU
-int __attribute__((weak)) smp_prepare_cpu (int cpu)
-{
-	return 0;
-}
-
 static ssize_t show_online(struct sys_device *dev, char *buf)
 {
 	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -44,9 +39,7 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
 			kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
 		break;
 	case '1':
-		ret = smp_prepare_cpu(cpu->sysdev.id);
-		if (!ret)
-			ret = cpu_up(cpu->sysdev.id);
+		ret = cpu_up(cpu->sysdev.id);
 		if (!ret)
 			kobject_uevent(&dev->kobj, KOBJ_ONLINE);
 		break;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d612b89dce33..08d50c53aab4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -74,7 +74,6 @@ extern int lock_cpu_hotplug_interruptible(void);
 	register_cpu_notifier(&fn##_nb);			\
 }
 int cpu_down(unsigned int cpu);
-extern int __attribute__((weak)) smp_prepare_cpu(int cpu);
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
 #else
 #define lock_cpu_hotplug()	do { } while (0)
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 911fc62b8225..5957312b2d68 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -49,9 +49,7 @@ void enable_nonboot_cpus(void)
 
 	printk("Thawing cpus ...\n");
 	for_each_cpu_mask(cpu, frozen_cpus) {
-		error = smp_prepare_cpu(cpu);
-		if (!error)
-			error = cpu_up(cpu);
+		error = cpu_up(cpu);
 		if (!error) {
 			printk("CPU%d is up\n", cpu);
 			continue;
-- 
cgit v1.2.3


From d74beb9f33a5f16d2965f11b275e401f225c949d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sat, 25 Mar 2006 03:08:19 -0800
Subject: [PATCH] Use unsigned int types for a faster bsearch

This patch avoids arithmetic on 'signed' types that are slower than
'unsigned'.  This saves space and cpu cycles.

size of kernel/sys.o before the patch (gcc-3.4.5)

    text    data     bss     dec     hex filename
   10924     252       4   11180    2bac kernel/sys.o

size of kernel/sys.o after the patch
    text    data     bss     dec     hex filename
   10903     252       4   11159    2b97 kernel/sys.o

I noticed that gcc-4.1.0 (from Fedora Core 5) even uses idiv instruction for
(a+b)/2 if a and b are signed.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 119fb0d9e24e..38bc73ede2ba 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1363,7 +1363,7 @@ static void groups_sort(struct group_info *group_info)
 /* a simple bsearch */
 int groups_search(struct group_info *group_info, gid_t grp)
 {
-	int left, right;
+	unsigned int left, right;
 
 	if (!group_info)
 		return 0;
@@ -1371,7 +1371,7 @@ int groups_search(struct group_info *group_info, gid_t grp)
 	left = 0;
 	right = group_info->ngroups;
 	while (left < right) {
-		int mid = (left+right)/2;
+		unsigned int mid = (left+right)/2;
 		int cmp = grp - GROUP_AT(group_info, mid);
 		if (cmp > 0)
 			left = mid + 1;
-- 
cgit v1.2.3


From f5163427453bc6ca2dd497eeb3e7a30d1c74b487 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Sat, 25 Mar 2006 03:08:23 -0800
Subject: [PATCH] Add SA_PERCPU_IRQ flag support

Add support for SA_PERCPU_IRQ (only mmtimer.c uses this at this stage).

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/manage.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 97d5559997d2..6edfcef291e8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -204,10 +204,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 	p = &desc->action;
 	if ((old = *p) != NULL) {
 		/* Can't share interrupts unless both agree to */
-		if (!(old->flags & new->flags & SA_SHIRQ)) {
-			spin_unlock_irqrestore(&desc->lock,flags);
-			return -EBUSY;
-		}
+		if (!(old->flags & new->flags & SA_SHIRQ))
+			goto mismatch;
+
+#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
+		/* All handlers must agree on per-cpuness */
+		if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
+			goto mismatch;
+#endif
 
 		/* add new interrupt at end of irq queue */
 		do {
@@ -218,7 +222,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 	}
 
 	*p = new;
-
+#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
+	if (new->flags & SA_PERCPU_IRQ)
+		desc->status |= IRQ_PER_CPU;
+#endif
 	if (!shared) {
 		desc->depth = 0;
 		desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
@@ -236,6 +243,12 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 	register_handler_proc(irq, new);
 
 	return 0;
+
+mismatch:
+	spin_unlock_irqrestore(&desc->lock, flags);
+	printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
+	dump_stack();
+	return -EBUSY;
 }
 
 /**
-- 
cgit v1.2.3


From 5ddcfa878d5b10b0ab94251a4229a8a9daaf93ed Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sat, 25 Mar 2006 03:08:28 -0800
Subject: [PATCH] remove pps support

This removes the support for pps.  It's completely unused within the kernel
and is basically in the way for further cleanups.  It should be easier to
readd proper support for it after the rest has been converted to NTP4
(where the pps mechanisms are quite different from NTP3 anyway).

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Cc: Adrian Bunk <bunk@stusta.de>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h | 41 -----------------------------------
 kernel/time.c         | 59 ++++++++++++++-------------------------------------
 kernel/timer.c        | 13 ++----------
 3 files changed, 18 insertions(+), 95 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index b7ca1204e42a..82dc9ae79d37 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -97,37 +97,10 @@
 
 #define MAXPHASE 512000L        /* max phase error (us) */
 #define MAXFREQ (512L << SHIFT_USEC)  /* max frequency error (ppm) */
-#define MAXTIME (200L << PPS_AVG) /* max PPS error (jitter) (200 us) */
 #define MINSEC 16L              /* min interval between updates (s) */
 #define MAXSEC 1200L            /* max interval between updates (s) */
 #define	NTP_PHASE_LIMIT	(MAXPHASE << 5)	/* beyond max. dispersion */
 
-/*
- * The following defines are used only if a pulse-per-second (PPS)
- * signal is available and connected via a modem control lead, such as
- * produced by the optional ppsclock feature incorporated in the Sun
- * asynch driver. They establish the design parameters of the frequency-
- * lock loop used to discipline the CPU clock oscillator to the PPS
- * signal.
- *
- * PPS_AVG is the averaging factor for the frequency loop, as well as
- * the time and frequency dispersion.
- *
- * PPS_SHIFT and PPS_SHIFTMAX specify the minimum and maximum
- * calibration intervals, respectively, in seconds as a power of two.
- *
- * PPS_VALID is the maximum interval before the PPS signal is considered
- * invalid and protocol updates used directly instead.
- *
- * MAXGLITCH is the maximum interval before a time offset of more than
- * MAXTIME is believed.
- */
-#define PPS_AVG 2		/* pps averaging constant (shift) */
-#define PPS_SHIFT 2		/* min interval duration (s) (shift) */
-#define PPS_SHIFTMAX 8		/* max interval duration (s) (shift) */
-#define PPS_VALID 120		/* pps signal watchdog max (s) */
-#define MAXGLITCH 30		/* pps signal glitch max (s) */
-
 /*
  * syscall interface - used (mainly by NTP daemon)
  * to discipline kernel clock oscillator
@@ -246,20 +219,6 @@ extern long time_reftime;	/* time at last adjustment (s) */
 extern long time_adjust;	/* The amount of adjtime left */
 extern long time_next_adjust;	/* Value for time_adjust at next tick */
 
-/* interface variables pps->timer interrupt */
-extern long pps_offset;		/* pps time offset (us) */
-extern long pps_jitter;		/* time dispersion (jitter) (us) */
-extern long pps_freq;		/* frequency offset (scaled ppm) */
-extern long pps_stabil;		/* frequency dispersion (scaled ppm) */
-extern long pps_valid;		/* pps signal watchdog counter */
-
-/* interface variables pps->adjtimex */
-extern int pps_shift;		/* interval duration (s) (shift) */
-extern long pps_jitcnt;		/* jitter limit exceeded */
-extern long pps_calcnt;		/* calibration intervals */
-extern long pps_errcnt;		/* calibration errors */
-extern long pps_stbcnt;		/* stability limit exceeded */
-
 /**
  * ntp_clear - Clears the NTP state variables
  *
diff --git a/kernel/time.c b/kernel/time.c
index 804539165d8b..e00a97b77241 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
 	return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
-long pps_offset;		/* pps time offset (us) */
-long pps_jitter = MAXTIME;	/* time dispersion (jitter) (us) */
-
-long pps_freq;			/* frequency offset (scaled ppm) */
-long pps_stabil = MAXFREQ;	/* frequency dispersion (scaled ppm) */
-
-long pps_valid = PPS_VALID;	/* pps signal watchdog counter */
-
-int pps_shift = PPS_SHIFT;	/* interval duration (s) (shift) */
-
-long pps_jitcnt;		/* jitter limit exceeded */
-long pps_calcnt;		/* calibration intervals */
-long pps_errcnt;		/* calibration errors */
-long pps_stbcnt;		/* stability limit exceeded */
-
-/* hook for a loadable hardpps kernel module */
-void (*hardpps_ptr)(struct timeval *);
-
 /* we call this to notify the arch when the clock is being
  * controlled.  If no such arch routine, do nothing.
  */
@@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc)
 		    result = -EINVAL;
 		    goto leave;
 		}
-		time_freq = txc->freq - pps_freq;
+		time_freq = txc->freq;
 	    }
 
 	    if (txc->modes & ADJ_MAXERROR) {
@@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc)
 		    if ((time_next_adjust = txc->offset) == 0)
 			 time_adjust = 0;
 		}
-		else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
-		    ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
-		            (STA_PPSTIME | STA_PPSSIGNAL) ?
-		            pps_offset : txc->offset;
+		else if (time_status & STA_PLL) {
+		    ltemp = txc->offset;
 
 		    /*
 		     * Scale the phase adjustment and
@@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc)
 		    }
 		    time_freq = min(time_freq, time_tolerance);
 		    time_freq = max(time_freq, -time_tolerance);
-		} /* STA_PLL || STA_PPSTIME */
+		} /* STA_PLL */
 	    } /* txc->modes & ADJ_OFFSET */
 	    if (txc->modes & ADJ_TICK) {
 		tick_usec = txc->tick;
 		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
 	    }
 	} /* txc->modes */
-leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
-	    || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
-		&& (time_status & STA_PPSSIGNAL) == 0)
-	    /* p. 24, (b) */
-	    || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
-		== (STA_PPSTIME|STA_PPSJITTER))
-	    /* p. 24, (c) */
-	    || ((time_status & STA_PPSFREQ) != 0
-		&& (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
-	    /* p. 24, (d) */
+leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 		result = TIME_ERROR;
 	
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
@@ -380,7 +351,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
 	else {
 	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
 	}
-	txc->freq	   = time_freq + pps_freq;
+	txc->freq	   = time_freq;
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
@@ -388,14 +359,16 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
 	txc->precision	   = time_precision;
 	txc->tolerance	   = time_tolerance;
 	txc->tick	   = tick_usec;
-	txc->ppsfreq	   = pps_freq;
-	txc->jitter	   = pps_jitter >> PPS_AVG;
-	txc->shift	   = pps_shift;
-	txc->stabil	   = pps_stabil;
-	txc->jitcnt	   = pps_jitcnt;
-	txc->calcnt	   = pps_calcnt;
-	txc->errcnt	   = pps_errcnt;
-	txc->stbcnt	   = pps_stbcnt;
+
+	/* PPS is not implemented, so these are zero */
+	txc->ppsfreq	   = 0;
+	txc->jitter	   = 0;
+	txc->shift	   = 0;
+	txc->stabil	   = 0;
+	txc->jitcnt	   = 0;
+	txc->calcnt	   = 0;
+	txc->errcnt	   = 0;
+	txc->stbcnt	   = 0;
 	write_sequnlock_irq(&xtime_lock);
 	do_gettimeofday(&txc->time);
 	notify_arch_cmos_timer();
diff --git a/kernel/timer.c b/kernel/timer.c
index 13fa72cac7d8..ab189dd187cb 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -697,18 +697,9 @@ static void second_overflow(void)
 
 	/*
 	 * Compute the frequency estimate and additional phase adjustment due
-	 * to frequency error for the next second. When the PPS signal is
-	 * engaged, gnaw on the watchdog counter and update the frequency
-	 * computed by the pll and the PPS signal.
+	 * to frequency error for the next second.
 	 */
-	pps_valid++;
-	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
-		pps_jitter = MAXTIME;
-		pps_stabil = MAXFREQ;
-		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
-				STA_PPSWANDER | STA_PPSERROR);
-	}
-	ltemp = time_freq + pps_freq;
+	ltemp = time_freq;
 	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
 
 #if HZ == 100
-- 
cgit v1.2.3


From 910dea7fdda22f0ee83d26d459e460c79ed94557 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Sun, 26 Mar 2006 18:29:26 +0200
Subject: BUG_ON() Conversion in kernel/fork.c

this changes if() BUG(); constructs to BUG_ON() which is
cleaner, contains unlikely() and can better optimized away.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 kernel/fork.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index a02063903aaa..d93ab2ba729c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -769,8 +769,7 @@ int unshare_files(void)
 	struct files_struct *files  = current->files;
 	int rc;
 
-	if(!files)
-		BUG();
+	BUG_ON(!files);
 
 	/* This can race but the race causes us to copy when we don't
 	   need to and drop the copy */
-- 
cgit v1.2.3


From cd7b24bb1891a10ee25168a912ff2304a9571d23 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 26 Mar 2006 01:36:54 -0800
Subject: [PATCH] warn if free_irq() is called from IRQ context

Warn if free_irq() is called in IRQ context - free_irq() can execute /proc
VFS work, which must not be done in IRQ context.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/manage.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6edfcef291e8..ac766ad573e8 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -271,6 +271,7 @@ void free_irq(unsigned int irq, void *dev_id)
 	struct irqaction **p;
 	unsigned long flags;
 
+	WARN_ON(in_interrupt());
 	if (irq >= NR_IRQS)
 		return;
 
-- 
cgit v1.2.3


From e655a250d5fc12b6dfe0d436180ba4a3bfffdc9f Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Sun, 26 Mar 2006 01:37:11 -0800
Subject: [PATCH] swswsup: return correct load_image error

If there's an error in load_image() we should return that without checking
snapshot_image_loaded.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swap.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 9177f3f73a6c..044b8e0c1025 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -454,10 +454,11 @@ static int load_image(struct swap_map_handle *handle,
 			nr_pages++;
 		}
 	} while (ret > 0);
-	if (!error)
+	if (!error) {
 		printk("\b\b\b\bdone\n");
-	if (!snapshot_image_loaded(snapshot))
-		error = -ENODATA;
+		if (!snapshot_image_loaded(snapshot))
+			error = -ENODATA;
+	}
 	return error;
 }
 
-- 
cgit v1.2.3


From 3158e9411a66fb98d495ac441c242264f31aaf3e Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Sun, 26 Mar 2006 01:37:29 -0800
Subject: [PATCH] consolidate sys32/compat_adjtimex

Create compat_sys_adjtimex and use it an all appropriate places.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Arnd Bergmann <arnd@arndb.de>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/osf_sys.c        |  1 -
 arch/ia64/ia32/sys_ia32.c          | 62 ------------------------------------
 arch/mips/kernel/linux32.c         | 61 ------------------------------------
 arch/mips/kernel/scall64-n32.S     |  2 +-
 arch/mips/kernel/scall64-o32.S     |  2 +-
 arch/parisc/kernel/sys_parisc32.c  | 29 -----------------
 arch/parisc/kernel/syscall_table.S |  2 +-
 arch/powerpc/kernel/sys_ppc32.c    | 60 -----------------------------------
 arch/s390/kernel/compat_linux.c    | 61 ------------------------------------
 arch/s390/kernel/compat_wrapper.S  |  8 ++---
 arch/s390/kernel/syscalls.S        |  2 +-
 arch/sparc64/kernel/sys_sparc32.c  | 61 ------------------------------------
 arch/sparc64/kernel/systbls.S      |  2 +-
 arch/x86_64/ia32/ia32entry.S       |  2 +-
 arch/x86_64/ia32/sys_ia32.c        | 64 --------------------------------------
 include/linux/compat.h             |  2 ++
 include/linux/timex.h              |  2 ++
 kernel/compat.c                    | 59 +++++++++++++++++++++++++++++++++++
 18 files changed, 73 insertions(+), 409 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 7fb14f42a125..31afe3d91ac6 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -821,7 +821,6 @@ osf_setsysinfo(unsigned long op, void __user *buffer, unsigned long nbytes,
    affects all sorts of things, like timeval and itimerval.  */
 
 extern struct timezone sys_tz;
-extern int do_adjtimex(struct timex *);
 
 struct timeval32
 {
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 1e1a2353aefd..5366b3b23d09 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -25,7 +25,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -2591,65 +2590,4 @@ sys32_setresgid(compat_gid_t rgid, compat_gid_t egid,
 	ssgid = (sgid == (compat_gid_t)-1) ? ((gid_t)-1) : ((gid_t)sgid);
 	return sys_setresgid(srgid, segid, ssgid);
 }
-
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long
-sys32_adjtimex(struct compat_timex *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if(get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if(put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
 #endif /* NOTYET */
diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c
index f33e779796f0..3f40c37a9ee6 100644
--- a/arch/mips/kernel/linux32.c
+++ b/arch/mips/kernel/linux32.c
@@ -30,7 +30,6 @@
 #include <linux/utime.h>
 #include <linux/utsname.h>
 #include <linux/personality.h>
-#include <linux/timex.h>
 #include <linux/dnotify.h>
 #include <linux/module.h>
 #include <linux/binfmts.h>
@@ -1157,66 +1156,6 @@ out:
 	return err;
 }
 
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage int sys32_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if (get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if (put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 asmlinkage int sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset,
 	s32 count)
 {
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 02c8267e45e7..05a2c0567dae 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -273,7 +273,7 @@ EXPORT(sysn32_call_table)
 	PTR	sys_pivot_root
 	PTR	sys32_sysctl
 	PTR	sys_prctl
-	PTR	sys32_adjtimex
+	PTR	compat_sys_adjtimex
 	PTR	compat_sys_setrlimit		/* 6155 */
 	PTR	sys_chroot
 	PTR	sys_sync
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index 797e0d874889..19c4ca481b02 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -328,7 +328,7 @@ sys_call_table:
 	PTR	sys_setdomainname
 	PTR	sys32_newuname
 	PTR	sys_ni_syscall			/* sys_modify_ldt */
-	PTR	sys32_adjtimex
+	PTR	compat_sys_adjtimex
 	PTR	sys_mprotect			/* 4125 */
 	PTR	compat_sys_sigprocmask
 	PTR	sys_ni_syscall			/* was creat_module */
diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c
index ca1d8dbbadf3..d286f68a3d3a 100644
--- a/arch/parisc/kernel/sys_parisc32.c
+++ b/arch/parisc/kernel/sys_parisc32.c
@@ -21,7 +21,6 @@
 #include <linux/times.h>
 #include <linux/utsname.h>
 #include <linux/time.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -567,34 +566,6 @@ asmlinkage int sys32_sendfile64(int out_fd, int in_fd, compat_loff_t __user *off
 }
 
 
-asmlinkage long sys32_adjtimex(struct compat_timex __user *txc_p32)
-{
-	struct timex txc;
-	struct compat_timex t32;
-	int ret;
-	extern int do_adjtimex(struct timex *txc);
-
-	if(copy_from_user(&t32, txc_p32, sizeof(struct compat_timex)))
-		return -EFAULT;
-#undef CP
-#define CP(x) txc.x = t32.x
-	CP(modes); CP(offset); CP(freq); CP(maxerror); CP(esterror);
-	CP(status); CP(constant); CP(precision); CP(tolerance);
-	CP(time.tv_sec); CP(time.tv_usec); CP(tick); CP(ppsfreq); CP(jitter);
-	CP(shift); CP(stabil); CP(jitcnt); CP(calcnt); CP(errcnt);
-	CP(stbcnt);
-	ret = do_adjtimex(&txc);
-#undef CP
-#define CP(x) t32.x = txc.x
-	CP(modes); CP(offset); CP(freq); CP(maxerror); CP(esterror);
-	CP(status); CP(constant); CP(precision); CP(tolerance);
-	CP(time.tv_sec); CP(time.tv_usec); CP(tick); CP(ppsfreq); CP(jitter);
-	CP(shift); CP(stabil); CP(jitcnt); CP(calcnt); CP(errcnt);
-	CP(stbcnt);
-	return copy_to_user(txc_p32, &t32, sizeof(struct compat_timex)) ? -EFAULT : ret;
-}
-
-
 struct sysinfo32 {
 	s32 uptime;
 	u32 loads[3];
diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S
index 71011eadb872..89b6c56ea0a8 100644
--- a/arch/parisc/kernel/syscall_table.S
+++ b/arch/parisc/kernel/syscall_table.S
@@ -207,7 +207,7 @@
 	/* struct sockaddr... */
 	ENTRY_SAME(recvfrom)
 	/* struct timex contains longs */
-	ENTRY_DIFF(adjtimex)
+	ENTRY_COMP(adjtimex)
 	ENTRY_SAME(mprotect)		/* 125 */
 	/* old_sigset_t forced to 32 bits.  Beware glibc sigset_t */
 	ENTRY_COMP(sigprocmask)
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index d9867f583c68..ec274e688816 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -24,7 +24,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -161,65 +160,6 @@ asmlinkage long compat_sys_sysfs(u32 option, u32 arg1, u32 arg2)
 	return sys_sysfs((int)option, arg1, arg2);
 }
 
-/* Handle adjtimex compatibility. */
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-	
-	memset(&txc, 0, sizeof(struct timex));
-
-	if(get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if(put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 asmlinkage long compat_sys_pause(void)
 {
 	current->state = TASK_INTERRUPTIBLE;
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 9809264f2f4d..5e14de37c17b 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -26,7 +26,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -705,66 +704,6 @@ asmlinkage long sys32_sendfile64(int out_fd, int in_fd,
 	return ret;
 }
 
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long sys32_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if(get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if(put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 #ifdef CONFIG_SYSCTL
 struct __sysctl_args32 {
 	u32 name;
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
index 50e80138e7ad..199da68bd7be 100644
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -551,10 +551,10 @@ sys32_newuname_wrapper:
 	llgtr	%r2,%r2			# struct new_utsname *
 	jg	s390x_newuname		# branch to system call
 
-	.globl  sys32_adjtimex_wrapper 
-sys32_adjtimex_wrapper:
-	llgtr	%r2,%r2			# struct timex_emu31 *
-	jg	sys32_adjtimex		# branch to system call
+	.globl  compat_sys_adjtimex_wrapper
+compat_sys_adjtimex_wrapper:
+	llgtr	%r2,%r2			# struct compat_timex *
+	jg	compat_sys_adjtimex	# branch to system call
 
 	.globl  sys32_mprotect_wrapper 
 sys32_mprotect_wrapper:
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 7c88d85c3597..2f56654da821 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -132,7 +132,7 @@ SYSCALL(sys_clone_glue,sys_clone_glue,sys32_clone_glue)		/* 120 */
 SYSCALL(sys_setdomainname,sys_setdomainname,sys32_setdomainname_wrapper)
 SYSCALL(sys_newuname,s390x_newuname,sys32_newuname_wrapper)
 NI_SYSCALL							/* modify_ldt for i386 */
-SYSCALL(sys_adjtimex,sys_adjtimex,sys32_adjtimex_wrapper)
+SYSCALL(sys_adjtimex,sys_adjtimex,compat_sys_adjtimex_wrapper)
 SYSCALL(sys_mprotect,sys_mprotect,sys32_mprotect_wrapper)	/* 125 */
 SYSCALL(sys_sigprocmask,sys_sigprocmask,compat_sys_sigprocmask_wrapper)
 NI_SYSCALL							/* old "create module" */
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index 48f02f201918..2e906bad56fa 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -19,7 +19,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -945,66 +944,6 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
 	return ret;
 }
 
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long sys32_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if (get_user(txc.modes, &utp->modes) ||
-	    __get_user(txc.offset, &utp->offset) ||
-	    __get_user(txc.freq, &utp->freq) ||
-	    __get_user(txc.maxerror, &utp->maxerror) ||
-	    __get_user(txc.esterror, &utp->esterror) ||
-	    __get_user(txc.status, &utp->status) ||
-	    __get_user(txc.constant, &utp->constant) ||
-	    __get_user(txc.precision, &utp->precision) ||
-	    __get_user(txc.tolerance, &utp->tolerance) ||
-	    __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	    __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	    __get_user(txc.tick, &utp->tick) ||
-	    __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	    __get_user(txc.jitter, &utp->jitter) ||
-	    __get_user(txc.shift, &utp->shift) ||
-	    __get_user(txc.stabil, &utp->stabil) ||
-	    __get_user(txc.jitcnt, &utp->jitcnt) ||
-	    __get_user(txc.calcnt, &utp->calcnt) ||
-	    __get_user(txc.errcnt, &utp->errcnt) ||
-	    __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if (put_user(txc.modes, &utp->modes) ||
-	    __put_user(txc.offset, &utp->offset) ||
-	    __put_user(txc.freq, &utp->freq) ||
-	    __put_user(txc.maxerror, &utp->maxerror) ||
-	    __put_user(txc.esterror, &utp->esterror) ||
-	    __put_user(txc.status, &utp->status) ||
-	    __put_user(txc.constant, &utp->constant) ||
-	    __put_user(txc.precision, &utp->precision) ||
-	    __put_user(txc.tolerance, &utp->tolerance) ||
-	    __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	    __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	    __put_user(txc.tick, &utp->tick) ||
-	    __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	    __put_user(txc.jitter, &utp->jitter) ||
-	    __put_user(txc.shift, &utp->shift) ||
-	    __put_user(txc.stabil, &utp->stabil) ||
-	    __put_user(txc.jitcnt, &utp->jitcnt) ||
-	    __put_user(txc.calcnt, &utp->calcnt) ||
-	    __put_user(txc.errcnt, &utp->errcnt) ||
-	    __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 /* This is just a version for 32-bit applications which does
  * not force O_LARGEFILE on.
  */
diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S
index c3adb7ac167d..3b250f2318fd 100644
--- a/arch/sparc64/kernel/systbls.S
+++ b/arch/sparc64/kernel/systbls.S
@@ -63,7 +63,7 @@ sys_call_table32:
 /*200*/	.word sys32_ssetmask, sys_sigsuspend, compat_sys_newlstat, sys_uselib, compat_sys_old_readdir
 	.word sys32_readahead, sys32_socketcall, sys32_syslog, sys32_lookup_dcookie, sys32_fadvise64
 /*210*/	.word sys32_fadvise64_64, sys32_tgkill, sys32_waitpid, sys_swapoff, sys32_sysinfo
-	.word sys32_ipc, sys32_sigreturn, sys_clone, sys32_ioprio_get, sys32_adjtimex
+	.word sys32_ipc, sys32_sigreturn, sys_clone, sys32_ioprio_get, compat_sys_adjtimex
 /*220*/	.word sys32_sigprocmask, sys_ni_syscall, sys32_delete_module, sys_ni_syscall, sys32_getpgid
 	.word sys32_bdflush, sys32_sysfs, sys_nis_syscall, sys32_setfsuid16, sys32_setfsgid16
 /*230*/	.word sys32_select, compat_sys_time, sys_nis_syscall, compat_sys_stime, compat_sys_statfs64
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 00dee176c08e..7549a4389fbf 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -501,7 +501,7 @@ ia32_sys_call_table:
 	.quad sys_setdomainname
 	.quad sys_uname
 	.quad sys_modify_ldt
-	.quad sys32_adjtimex
+	.quad compat_sys_adjtimex
 	.quad sys32_mprotect		/* 125 */
 	.quad compat_sys_sigprocmask
 	.quad quiet_ni_syscall		/* create_module */
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index b13121e451a8..f182b20858e2 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -30,7 +30,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -767,69 +766,6 @@ sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
 	return ret;
 }
 
-/* Handle adjtimex compatibility. */
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long
-sys32_adjtimex(struct compat_timex __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
-	   __get_user(txc.modes, &utp->modes) ||
-	   __get_user(txc.offset, &utp->offset) ||
-	   __get_user(txc.freq, &utp->freq) ||
-	   __get_user(txc.maxerror, &utp->maxerror) ||
-	   __get_user(txc.esterror, &utp->esterror) ||
-	   __get_user(txc.status, &utp->status) ||
-	   __get_user(txc.constant, &utp->constant) ||
-	   __get_user(txc.precision, &utp->precision) ||
-	   __get_user(txc.tolerance, &utp->tolerance) ||
-	   __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __get_user(txc.tick, &utp->tick) ||
-	   __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __get_user(txc.jitter, &utp->jitter) ||
-	   __get_user(txc.shift, &utp->shift) ||
-	   __get_user(txc.stabil, &utp->stabil) ||
-	   __get_user(txc.jitcnt, &utp->jitcnt) ||
-	   __get_user(txc.calcnt, &utp->calcnt) ||
-	   __get_user(txc.errcnt, &utp->errcnt) ||
-	   __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
-	   __put_user(txc.modes, &utp->modes) ||
-	   __put_user(txc.offset, &utp->offset) ||
-	   __put_user(txc.freq, &utp->freq) ||
-	   __put_user(txc.maxerror, &utp->maxerror) ||
-	   __put_user(txc.esterror, &utp->esterror) ||
-	   __put_user(txc.status, &utp->status) ||
-	   __put_user(txc.constant, &utp->constant) ||
-	   __put_user(txc.precision, &utp->precision) ||
-	   __put_user(txc.tolerance, &utp->tolerance) ||
-	   __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	   __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	   __put_user(txc.tick, &utp->tick) ||
-	   __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	   __put_user(txc.jitter, &utp->jitter) ||
-	   __put_user(txc.shift, &utp->shift) ||
-	   __put_user(txc.stabil, &utp->stabil) ||
-	   __put_user(txc.jitcnt, &utp->jitcnt) ||
-	   __put_user(txc.calcnt, &utp->calcnt) ||
-	   __put_user(txc.errcnt, &utp->errcnt) ||
-	   __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
 	unsigned long prot, unsigned long flags,
 	unsigned long fd, unsigned long pgoff)
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 859f95700d36..24d659cdbafe 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -207,5 +207,7 @@ static inline int compat_timespec_compare(struct compat_timespec *lhs,
 	return lhs->tv_nsec - rhs->tv_nsec;
 }
 
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
+
 #endif /* CONFIG_COMPAT */
 #endif /* _LINUX_COMPAT_H */
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 82dc9ae79d37..03914b7e41b1 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -307,6 +307,8 @@ time_interpolator_reset(void)
 /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */
 extern u64 current_tick_length(void);
 
+extern int do_adjtimex(struct timex *);
+
 #endif /* KERNEL */
 
 #endif /* LINUX_TIMEX_H */
diff --git a/kernel/compat.c b/kernel/compat.c
index 8c9cd88b6785..b9bdd1271f44 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 #include <linux/security.h>
+#include <linux/timex.h>
 
 #include <asm/uaccess.h>
 
@@ -898,3 +899,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
 	return -ERESTARTNOHAND;
 }
 #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
+
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+{
+	struct timex txc;
+	int ret;
+
+	memset(&txc, 0, sizeof(struct timex));
+
+	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+			__get_user(txc.modes, &utp->modes) ||
+			__get_user(txc.offset, &utp->offset) ||
+			__get_user(txc.freq, &utp->freq) ||
+			__get_user(txc.maxerror, &utp->maxerror) ||
+			__get_user(txc.esterror, &utp->esterror) ||
+			__get_user(txc.status, &utp->status) ||
+			__get_user(txc.constant, &utp->constant) ||
+			__get_user(txc.precision, &utp->precision) ||
+			__get_user(txc.tolerance, &utp->tolerance) ||
+			__get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+			__get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+			__get_user(txc.tick, &utp->tick) ||
+			__get_user(txc.ppsfreq, &utp->ppsfreq) ||
+			__get_user(txc.jitter, &utp->jitter) ||
+			__get_user(txc.shift, &utp->shift) ||
+			__get_user(txc.stabil, &utp->stabil) ||
+			__get_user(txc.jitcnt, &utp->jitcnt) ||
+			__get_user(txc.calcnt, &utp->calcnt) ||
+			__get_user(txc.errcnt, &utp->errcnt) ||
+			__get_user(txc.stbcnt, &utp->stbcnt))
+		return -EFAULT;
+
+	ret = do_adjtimex(&txc);
+
+	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+			__put_user(txc.modes, &utp->modes) ||
+			__put_user(txc.offset, &utp->offset) ||
+			__put_user(txc.freq, &utp->freq) ||
+			__put_user(txc.maxerror, &utp->maxerror) ||
+			__put_user(txc.esterror, &utp->esterror) ||
+			__put_user(txc.status, &utp->status) ||
+			__put_user(txc.constant, &utp->constant) ||
+			__put_user(txc.precision, &utp->precision) ||
+			__put_user(txc.tolerance, &utp->tolerance) ||
+			__put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+			__put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+			__put_user(txc.tick, &utp->tick) ||
+			__put_user(txc.ppsfreq, &utp->ppsfreq) ||
+			__put_user(txc.jitter, &utp->jitter) ||
+			__put_user(txc.shift, &utp->shift) ||
+			__put_user(txc.stabil, &utp->stabil) ||
+			__put_user(txc.jitcnt, &utp->jitcnt) ||
+			__put_user(txc.calcnt, &utp->calcnt) ||
+			__put_user(txc.errcnt, &utp->errcnt) ||
+			__put_user(txc.stbcnt, &utp->stbcnt))
+		ret = -EFAULT;
+
+	return ret;
+}
-- 
cgit v1.2.3


From 92127c7a45d4d167d9b015a5f9de6b41ed66f1d0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 26 Mar 2006 01:38:05 -0800
Subject: [PATCH] hrtimers: optimize softirq runqueues

The hrtimer softirq is called from the timer softirq every tick.  Retrieve the
current time from xtime and wall_to_monotonic instead of calling
base->get_time() for each timer base.  Store the time in the base structure
and provide a hook once clock source abstractions are in place and to keep the
code open for new base clocks.

Based on a patch from: Roman Zippel <zippel@linux-m68k.org>

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h | 20 ++++++++++++--------
 kernel/hrtimer.c        | 28 ++++++++++++++++++++++++++--
 2 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6401c31d6add..64e2754ca734 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -72,14 +72,16 @@ struct hrtimer {
 /**
  * struct hrtimer_base - the timer base for a specific clock
  *
- * @index:	clock type index for per_cpu support when moving a timer
- *		to a base on another cpu.
- * @lock:	lock protecting the base and associated timers
- * @active:	red black tree root node for the active timers
- * @first:	pointer to the timer node which expires first
- * @resolution:	the resolution of the clock, in nanoseconds
- * @get_time:	function to retrieve the current time of the clock
- * @curr_timer:	the timer which is executing a callback right now
+ * @index:		clock type index for per_cpu support when moving a timer
+ *			to a base on another cpu.
+ * @lock:		lock protecting the base and associated timers
+ * @active:		red black tree root node for the active timers
+ * @first:		pointer to the timer node which expires first
+ * @resolution:		the resolution of the clock, in nanoseconds
+ * @get_time:		function to retrieve the current time of the clock
+ * @get_sofirq_time:	function to retrieve the current time from the softirq
+ * @curr_timer:		the timer which is executing a callback right now
+ * @softirq_time:	the time when running the hrtimer queue in the softirq
  */
 struct hrtimer_base {
 	clockid_t		index;
@@ -88,7 +90,9 @@ struct hrtimer_base {
 	struct rb_node		*first;
 	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
+	ktime_t			(*get_softirq_time)(void);
 	struct hrtimer		*curr_timer;
+	ktime_t			softirq_time;
 };
 
 /*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 14bc9cfa6399..b728cc53452b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -122,6 +122,26 @@ void ktime_get_ts(struct timespec *ts)
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);
 
+/*
+ * Get the coarse grained time at the softirq based on xtime and
+ * wall_to_monotonic.
+ */
+static void hrtimer_get_softirq_time(struct hrtimer_base *base)
+{
+	ktime_t xtim, tomono;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		xtim = timespec_to_ktime(xtime);
+		tomono = timespec_to_ktime(wall_to_monotonic);
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	base[CLOCK_REALTIME].softirq_time = xtim;
+	base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono);
+}
+
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
@@ -586,9 +606,11 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
  */
 static inline void run_hrtimer_queue(struct hrtimer_base *base)
 {
-	ktime_t now = base->get_time();
 	struct rb_node *node;
 
+	if (base->get_softirq_time)
+		base->softirq_time = base->get_softirq_time();
+
 	spin_lock_irq(&base->lock);
 
 	while ((node = base->first)) {
@@ -598,7 +620,7 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 		void *data;
 
 		timer = rb_entry(node, struct hrtimer, node);
-		if (now.tv64 <= timer->expires.tv64)
+		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
 		fn = timer->function;
@@ -641,6 +663,8 @@ void hrtimer_run_queues(void)
 	struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
 	int i;
 
+	hrtimer_get_softirq_time(base);
+
 	for (i = 0; i < MAX_HRTIMER_BASES; i++)
 		run_hrtimer_queue(&base[i]);
 }
-- 
cgit v1.2.3


From 44f21475511bbc0135b52c66ad74dcc6a9026da3 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:06 -0800
Subject: [PATCH] hrtimers: pass current time to hrtimer_forward()

Pass current time to hrtimer_forward().  This allows to use the softirq time
in the timer base when the forward function is called from the timer callback.
 Other places pass current time with a call to timer->base->get_time().

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h |  3 ++-
 kernel/hrtimer.c        |  7 +++----
 kernel/itimer.c         |  3 ++-
 kernel/posix-timers.c   | 14 ++++++++++----
 4 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 64e2754ca734..84fc186324e6 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -130,7 +130,8 @@ static inline int hrtimer_active(const struct hrtimer *timer)
 }
 
 /* Forward a hrtimer so it expires after now: */
-extern unsigned long hrtimer_forward(struct hrtimer *timer, ktime_t interval);
+extern unsigned long
+hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
 
 /* Precise sleep: */
 extern long hrtimer_nanosleep(struct timespec *rqtp,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b728cc53452b..e989c9981a96 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -301,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  * hrtimer_forward - forward the timer expiry
  *
  * @timer:	hrtimer to forward
+ * @now:	forward past this time
  * @interval:	the interval to forward
  *
  * Forward the timer expiry so it will expire in the future.
  * Returns the number of overruns.
  */
 unsigned long
-hrtimer_forward(struct hrtimer *timer, ktime_t interval)
+hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 {
 	unsigned long orun = 1;
-	ktime_t delta, now;
-
-	now = timer->base->get_time();
+	ktime_t delta;
 
 	delta = ktime_sub(now, timer->expires);
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 680e6b70c872..af2ec6b4392c 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -136,7 +136,8 @@ int it_real_fn(void *data)
 
 	if (tsk->signal->it_real_incr.tv64 != 0) {
 		hrtimer_forward(&tsk->signal->real_timer,
-			       tsk->signal->it_real_incr);
+				tsk->signal->real_timer.base->softirq_time,
+				tsk->signal->it_real_incr);
 
 		return HRTIMER_RESTART;
 	}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9944379360b5..255657accf02 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -251,15 +251,18 @@ __initcall(init_posix_timers);
 
 static void schedule_next_timer(struct k_itimer *timr)
 {
+	struct hrtimer *timer = &timr->it.real.timer;
+
 	if (timr->it.real.interval.tv64 == 0)
 		return;
 
-	timr->it_overrun += hrtimer_forward(&timr->it.real.timer,
+	timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
 					    timr->it.real.interval);
+
 	timr->it_overrun_last = timr->it_overrun;
 	timr->it_overrun = -1;
 	++timr->it_requeue_pending;
-	hrtimer_restart(&timr->it.real.timer);
+	hrtimer_restart(timer);
 }
 
 /*
@@ -334,6 +337,7 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
 static int posix_timer_fn(void *data)
 {
 	struct k_itimer *timr = data;
+	struct hrtimer *timer = &timr->it.real.timer;
 	unsigned long flags;
 	int si_private = 0;
 	int ret = HRTIMER_NORESTART;
@@ -351,7 +355,8 @@ static int posix_timer_fn(void *data)
 		 */
 		if (timr->it.real.interval.tv64 != 0) {
 			timr->it_overrun +=
-				hrtimer_forward(&timr->it.real.timer,
+				hrtimer_forward(timer,
+						timer->base->softirq_time,
 						timr->it.real.interval);
 			ret = HRTIMER_RESTART;
 			++timr->it_requeue_pending;
@@ -623,7 +628,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 	if (timr->it_requeue_pending & REQUEUE_PENDING ||
 	    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
 		timr->it_overrun +=
-			hrtimer_forward(timer, timr->it.real.interval);
+			hrtimer_forward(timer, timer->base->get_time(),
+					timr->it.real.interval);
 		remaining = hrtimer_get_remaining(timer);
 	}
  calci:
-- 
cgit v1.2.3


From 3b98a5328171cebc867f70484b20bd34948cd7f6 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:07 -0800
Subject: [PATCH] hrtimers: posix-timer: cleanup common_timer_get()

Cleanup common_timer_get() a little.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-timers.c | 50 ++++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 255657accf02..7c5f44787c8c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -608,39 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
 static void
 common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 {
-	ktime_t remaining;
+	ktime_t now, remaining, iv;
 	struct hrtimer *timer = &timr->it.real.timer;
 
 	memset(cur_setting, 0, sizeof(struct itimerspec));
-	remaining = hrtimer_get_remaining(timer);
 
-	/* Time left ? or timer pending */
-	if (remaining.tv64 > 0 || hrtimer_active(timer))
-		goto calci;
+	iv = timr->it.real.interval;
+
 	/* interval timer ? */
-	if (timr->it.real.interval.tv64 == 0)
+	if (iv.tv64)
+		cur_setting->it_interval = ktime_to_timespec(iv);
+	else if (!hrtimer_active(timer) &&
+		 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
 		return;
+
+	now = timer->base->get_time();
+
 	/*
-	 * When a requeue is pending or this is a SIGEV_NONE timer
-	 * move the expiry time forward by intervals, so expiry is >
-	 * now.
+	 * When a requeue is pending or this is a SIGEV_NONE
+	 * timer move the expiry time forward by intervals, so
+	 * expiry is > now.
 	 */
-	if (timr->it_requeue_pending & REQUEUE_PENDING ||
-	    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
-		timr->it_overrun +=
-			hrtimer_forward(timer, timer->base->get_time(),
-					timr->it.real.interval);
-		remaining = hrtimer_get_remaining(timer);
-	}
- calci:
-	/* interval timer ? */
-	if (timr->it.real.interval.tv64 != 0)
-		cur_setting->it_interval =
-			ktime_to_timespec(timr->it.real.interval);
+	if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
+	    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+		timr->it_overrun += hrtimer_forward(timer, now, iv);
+
+	remaining = ktime_sub(timer->expires, now);
 	/* Return 0 only, when the timer is expired and not pending */
-	if (remaining.tv64 <= 0)
-		cur_setting->it_value.tv_nsec = 1;
-	else
+	if (remaining.tv64 <= 0) {
+		/*
+		 * A single shot SIGEV_NONE timer must return 0, when
+		 * it is expired !
+		 */
+		if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+			cur_setting->it_value.tv_nsec = 1;
+	} else
 		cur_setting->it_value = ktime_to_timespec(remaining);
 }
 
-- 
cgit v1.2.3


From 432569bb9d9d424d7ffe5b21f8205c55bdd1aaa8 Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:08 -0800
Subject: [PATCH] hrtimers: simplify nanosleep

nanosleep is the only user of the expired state, so let it manage this itself,
which makes the hrtimer code a bit simpler.  The remaining time is also only
calculated if requested.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h |   4 +-
 kernel/hrtimer.c        | 142 +++++++++++++++++++++---------------------------
 2 files changed, 63 insertions(+), 83 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 84fc186324e6..0e8f4762f6f8 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -38,9 +38,7 @@ enum hrtimer_restart {
  * Timer states:
  */
 enum hrtimer_state {
-	HRTIMER_INACTIVE,	/* Timer is inactive */
-	HRTIMER_EXPIRED,		/* Timer is expired */
-	HRTIMER_RUNNING,		/* Timer is running the callback function */
+	HRTIMER_INACTIVE,		/* Timer is inactive */
 	HRTIMER_PENDING,		/* Timer is pending */
 };
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e989c9981a96..59ec50c1e905 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -625,30 +625,20 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 		fn = timer->function;
 		data = timer->data;
 		set_curr_timer(base, timer);
-		timer->state = HRTIMER_RUNNING;
+		timer->state = HRTIMER_INACTIVE;
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
-		/*
-		 * fn == NULL is special case for the simplest timer
-		 * variant - wake up process and do not restart:
-		 */
-		if (!fn) {
-			wake_up_process(data);
-			restart = HRTIMER_NORESTART;
-		} else
-			restart = fn(data);
+		restart = fn(data);
 
 		spin_lock_irq(&base->lock);
 
 		/* Another CPU has added back the timer */
-		if (timer->state != HRTIMER_RUNNING)
+		if (timer->state != HRTIMER_INACTIVE)
 			continue;
 
-		if (restart == HRTIMER_RESTART)
+		if (restart != HRTIMER_NORESTART)
 			enqueue_hrtimer(timer, base);
-		else
-			timer->state = HRTIMER_EXPIRED;
 	}
 	set_curr_timer(base, NULL);
 	spin_unlock_irq(&base->lock);
@@ -672,79 +662,70 @@ void hrtimer_run_queues(void)
  * Sleep related functions:
  */
 
-/**
- * schedule_hrtimer - sleep until timeout
- *
- * @timer:	hrtimer variable initialized with the correct clock base
- * @mode:	timeout value is abs/rel
- *
- * Make the current task sleep until @timeout is
- * elapsed.
- *
- * You can set the task state as follows -
- *
- * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to
- * pass before the routine returns. The routine will return 0
- *
- * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
- * delivered to the current task. In this case the remaining time
- * will be returned
- *
- * The current task state is guaranteed to be TASK_RUNNING when this
- * routine returns.
- */
-static ktime_t __sched
-schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
-{
-	/* fn stays NULL, meaning single-shot wakeup: */
-	timer->data = current;
+struct sleep_hrtimer {
+	struct hrtimer timer;
+	struct task_struct *task;
+	int expired;
+};
 
-	hrtimer_start(timer, timer->expires, mode);
+static int nanosleep_wakeup(void *data)
+{
+	struct sleep_hrtimer *t = data;
 
-	schedule();
-	hrtimer_cancel(timer);
+	t->expired = 1;
+	wake_up_process(t->task);
 
-	/* Return the remaining time: */
-	if (timer->state != HRTIMER_EXPIRED)
-		return ktime_sub(timer->expires, timer->base->get_time());
-	else
-		return (ktime_t) {.tv64 = 0 };
+	return HRTIMER_NORESTART;
 }
 
-static inline ktime_t __sched
-schedule_hrtimer_interruptible(struct hrtimer *timer,
-			       const enum hrtimer_mode mode)
+static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
 {
-	set_current_state(TASK_INTERRUPTIBLE);
+	t->timer.function = nanosleep_wakeup;
+	t->timer.data = t;
+	t->task = current;
+	t->expired = 0;
+
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		hrtimer_start(&t->timer, t->timer.expires, mode);
+
+		schedule();
 
-	return schedule_hrtimer(timer, mode);
+		if (unlikely(!t->expired)) {
+			hrtimer_cancel(&t->timer);
+			mode = HRTIMER_ABS;
+		}
+	} while (!t->expired && !signal_pending(current));
+
+	return t->expired;
 }
 
 static long __sched nanosleep_restart(struct restart_block *restart)
 {
+	struct sleep_hrtimer t;
 	struct timespec __user *rmtp;
 	struct timespec tu;
-	void *rfn_save = restart->fn;
-	struct hrtimer timer;
-	ktime_t rem;
+	ktime_t time;
 
 	restart->fn = do_no_restart_syscall;
 
-	hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS);
-
-	timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+	hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
+	t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
 
-	rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);
-
-	if (rem.tv64 <= 0)
+	if (do_nanosleep(&t, HRTIMER_ABS))
 		return 0;
 
 	rmtp = (struct timespec __user *) restart->arg2;
-	tu = ktime_to_timespec(rem);
-	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
-		return -EFAULT;
+	if (rmtp) {
+		time = ktime_sub(t.timer.expires, t.timer.base->get_time());
+		if (time.tv64 <= 0)
+			return 0;
+		tu = ktime_to_timespec(time);
+		if (copy_to_user(rmtp, &tu, sizeof(tu)))
+			return -EFAULT;
+	}
 
-	restart->fn = rfn_save;
+	restart->fn = nanosleep_restart;
 
 	/* The other values in restart are already filled in */
 	return -ERESTART_RESTARTBLOCK;
@@ -754,33 +735,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
 	struct restart_block *restart;
-	struct hrtimer timer;
+	struct sleep_hrtimer t;
 	struct timespec tu;
 	ktime_t rem;
 
-	hrtimer_init(&timer, clockid, mode);
-
-	timer.expires = timespec_to_ktime(*rqtp);
-
-	rem = schedule_hrtimer_interruptible(&timer, mode);
-	if (rem.tv64 <= 0)
+	hrtimer_init(&t.timer, clockid, mode);
+	t.timer.expires = timespec_to_ktime(*rqtp);
+	if (do_nanosleep(&t, mode))
 		return 0;
 
 	/* Absolute timers do not update the rmtp value and restart: */
 	if (mode == HRTIMER_ABS)
 		return -ERESTARTNOHAND;
 
-	tu = ktime_to_timespec(rem);
-
-	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
-		return -EFAULT;
+	if (rmtp) {
+		rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
+		if (rem.tv64 <= 0)
+			return 0;
+		tu = ktime_to_timespec(rem);
+		if (copy_to_user(rmtp, &tu, sizeof(tu)))
+			return -EFAULT;
+	}
 
 	restart = &current_thread_info()->restart_block;
 	restart->fn = nanosleep_restart;
-	restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
-	restart->arg1 = timer.expires.tv64 >> 32;
+	restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
+	restart->arg1 = t.timer.expires.tv64 >> 32;
 	restart->arg2 = (unsigned long) rmtp;
-	restart->arg3 = (unsigned long) timer.base->index;
+	restart->arg3 = (unsigned long) t.timer.base->index;
 
 	return -ERESTART_RESTARTBLOCK;
 }
-- 
cgit v1.2.3


From b75f7a51ca75c977d7d77f735d7a7859194eb39e Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:09 -0800
Subject: [PATCH] hrtimers: remove state field

Remove the state field and encode this information in the rb_node similiar to
normal timer.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h | 11 ++---------
 kernel/hrtimer.c        | 14 +++++---------
 2 files changed, 7 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0e8f4762f6f8..f57cc7bd7008 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -34,13 +34,7 @@ enum hrtimer_restart {
 	HRTIMER_RESTART,
 };
 
-/*
- * Timer states:
- */
-enum hrtimer_state {
-	HRTIMER_INACTIVE,		/* Timer is inactive */
-	HRTIMER_PENDING,		/* Timer is pending */
-};
+#define HRTIMER_INACTIVE	((void *)1UL)
 
 struct hrtimer_base;
 
@@ -61,7 +55,6 @@ struct hrtimer_base;
 struct hrtimer {
 	struct rb_node		node;
 	ktime_t			expires;
-	enum hrtimer_state	state;
 	int			(*function)(void *);
 	void			*data;
 	struct hrtimer_base	*base;
@@ -124,7 +117,7 @@ extern ktime_t hrtimer_get_next_event(void);
 
 static inline int hrtimer_active(const struct hrtimer *timer)
 {
-	return timer->state == HRTIMER_PENDING;
+	return timer->node.rb_parent != HRTIMER_INACTIVE;
 }
 
 /* Forward a hrtimer so it expires after now: */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 59ec50c1e905..658d49feedb9 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -374,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 	rb_link_node(&timer->node, parent, link);
 	rb_insert_color(&timer->node, &base->active);
 
-	timer->state = HRTIMER_PENDING;
-
 	if (!base->first || timer->expires.tv64 <
 	    rb_entry(base->first, struct hrtimer, node)->expires.tv64)
 		base->first = &timer->node;
@@ -395,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 	if (base->first == &timer->node)
 		base->first = rb_next(&timer->node);
 	rb_erase(&timer->node, &base->active);
+	timer->node.rb_parent = HRTIMER_INACTIVE;
 }
 
 /*
@@ -405,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	if (hrtimer_active(timer)) {
 		__remove_hrtimer(timer, base);
-		timer->state = HRTIMER_INACTIVE;
 		return 1;
 	}
 	return 0;
@@ -579,6 +577,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 		clock_id = CLOCK_MONOTONIC;
 
 	timer->base = &bases[clock_id];
+	timer->node.rb_parent = HRTIMER_INACTIVE;
 }
 
 /**
@@ -625,7 +624,6 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 		fn = timer->function;
 		data = timer->data;
 		set_curr_timer(base, timer);
-		timer->state = HRTIMER_INACTIVE;
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
@@ -633,12 +631,10 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 
 		spin_lock_irq(&base->lock);
 
-		/* Another CPU has added back the timer */
-		if (timer->state != HRTIMER_INACTIVE)
-			continue;
-
-		if (restart != HRTIMER_NORESTART)
+		if (restart != HRTIMER_NORESTART) {
+			BUG_ON(hrtimer_active(timer));
 			enqueue_hrtimer(timer, base);
+		}
 	}
 	set_curr_timer(base, NULL);
 	spin_unlock_irq(&base->lock);
-- 
cgit v1.2.3


From df869b630d9d9131c10cf073fb61646048874b2f Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:11 -0800
Subject: [PATCH] hrtimers: remove nsec_t typedef

nsec_t predates ktime_t and has mostly been superseded by it.  In the few
places that are left it's better to make it explicit that we're dealing with
64 bit values here.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h | 18 ++++++------------
 kernel/hrtimer.c     |  4 ++--
 kernel/time.c        |  4 ++--
 3 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time.h b/include/linux/time.h
index bf0e785e2e03..0cd696cee998 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -73,12 +73,6 @@ extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
 #define timespec_valid(ts) \
 	(((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC))
 
-/*
- * 64-bit nanosec type. Large enough to span 292+ years in nanosecond
- * resolution. Ought to be enough for a while.
- */
-typedef s64 nsec_t;
-
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
 extern seqlock_t xtime_lock;
@@ -114,9 +108,9 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
  * Returns the scalar nanosecond representation of the timespec
  * parameter.
  */
-static inline nsec_t timespec_to_ns(const struct timespec *ts)
+static inline s64 timespec_to_ns(const struct timespec *ts)
 {
-	return ((nsec_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
+	return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
 }
 
 /**
@@ -126,9 +120,9 @@ static inline nsec_t timespec_to_ns(const struct timespec *ts)
  * Returns the scalar nanosecond representation of the timeval
  * parameter.
  */
-static inline nsec_t timeval_to_ns(const struct timeval *tv)
+static inline s64 timeval_to_ns(const struct timeval *tv)
 {
-	return ((nsec_t) tv->tv_sec * NSEC_PER_SEC) +
+	return ((s64) tv->tv_sec * NSEC_PER_SEC) +
 		tv->tv_usec * NSEC_PER_USEC;
 }
 
@@ -138,7 +132,7 @@ static inline nsec_t timeval_to_ns(const struct timeval *tv)
  *
  * Returns the timespec representation of the nsec parameter.
  */
-extern struct timespec ns_to_timespec(const nsec_t nsec);
+extern struct timespec ns_to_timespec(const s64 nsec);
 
 /**
  * ns_to_timeval - Convert nanoseconds to timeval
@@ -146,7 +140,7 @@ extern struct timespec ns_to_timespec(const nsec_t nsec);
  *
  * Returns the timeval representation of the nsec parameter.
  */
-extern struct timeval ns_to_timeval(const nsec_t nsec);
+extern struct timeval ns_to_timeval(const s64 nsec);
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 658d49feedb9..44108de4f028 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -266,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
 /*
  * Divide a ktime value by a nanosecond value
  */
-static unsigned long ktime_divns(const ktime_t kt, nsec_t div)
+static unsigned long ktime_divns(const ktime_t kt, s64 div)
 {
 	u64 dclc, inc, dns;
 	int sft = 0;
@@ -322,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 		interval.tv64 = timer->base->resolution.tv64;
 
 	if (unlikely(delta.tv64 >= interval.tv64)) {
-		nsec_t incr = ktime_to_ns(interval);
+		s64 incr = ktime_to_ns(interval);
 
 		orun = ktime_divns(delta, incr);
 		timer->expires = ktime_add_ns(timer->expires, incr * orun);
diff --git a/kernel/time.c b/kernel/time.c
index e00a97b77241..ff8e7019c4c4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -610,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
  *
  * Returns the timespec representation of the nsec parameter.
  */
-struct timespec ns_to_timespec(const nsec_t nsec)
+struct timespec ns_to_timespec(const s64 nsec)
 {
 	struct timespec ts;
 
@@ -630,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec)
  *
  * Returns the timeval representation of the nsec parameter.
  */
-struct timeval ns_to_timeval(const nsec_t nsec)
+struct timeval ns_to_timeval(const s64 nsec)
 {
 	struct timespec ts = ns_to_timespec(nsec);
 	struct timeval tv;
-- 
cgit v1.2.3


From 05cfb614ddbf3181540ce09d44d96486f8ba8d6a Mon Sep 17 00:00:00 2001
From: Roman Zippel <zippel@linux-m68k.org>
Date: Sun, 26 Mar 2006 01:38:12 -0800
Subject: [PATCH] hrtimers: remove data field

The nanosleep cleanup allows to remove the data field of hrtimer.  The
callback function can use container_of() to get it's own data.  Since the
hrtimer structure is anyway embedded in other structures, this adds no
overhead.

Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c               |  2 +-
 include/linux/hrtimer.h |  5 +----
 include/linux/sched.h   |  1 +
 include/linux/timer.h   |  3 ++-
 kernel/fork.c           |  2 +-
 kernel/hrtimer.c        | 12 +++++-------
 kernel/itimer.c         | 15 +++++++--------
 kernel/posix-timers.c   |  9 ++++-----
 8 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 995cba3c62b8..c7397c46ad6d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -632,7 +632,7 @@ static int de_thread(struct task_struct *tsk)
 		 * synchronize with any firing (by calling del_timer_sync)
 		 * before we can safely let the old group leader die.
 		 */
-		sig->real_timer.data = current;
+		sig->tsk = current;
 		spin_unlock_irq(lock);
 		if (hrtimer_cancel(&sig->real_timer))
 			hrtimer_restart(&sig->real_timer);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f57cc7bd7008..93830158348e 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -45,9 +45,7 @@ struct hrtimer_base;
  * @expires:	the absolute expiry time in the hrtimers internal
  *		representation. The time is related to the clock on
  *		which the timer is based.
- * @state:	state of the timer
  * @function:	timer expiry callback function
- * @data:	argument for the callback function
  * @base:	pointer to the timer base (per cpu and per clock)
  *
  * The hrtimer structure must be initialized by init_hrtimer_#CLOCKTYPE()
@@ -55,8 +53,7 @@ struct hrtimer_base;
 struct hrtimer {
 	struct rb_node		node;
 	ktime_t			expires;
-	int			(*function)(void *);
-	void			*data;
+	int			(*function)(struct hrtimer *);
 	struct hrtimer_base	*base;
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e0054c1b9a09..036d14d2bf90 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -402,6 +402,7 @@ struct signal_struct {
 
 	/* ITIMER_REAL timer for the process */
 	struct hrtimer real_timer;
+	struct task_struct *tsk;
 	ktime_t it_real_incr;
 
 	/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
diff --git a/include/linux/timer.h b/include/linux/timer.h
index ee5a09e806e8..b5caabca553c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -96,6 +96,7 @@ static inline void add_timer(struct timer_list *timer)
 
 extern void init_timers(void);
 extern void run_local_timers(void);
-extern int it_real_fn(void *);
+struct hrtimer;
+extern int it_real_fn(struct hrtimer *);
 
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a02063903aaa..4bd6486aa67d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -848,7 +848,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
 	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
-	sig->real_timer.data = tsk;
+	sig->tsk = tsk;
 
 	sig->it_virt_expires = cputime_zero;
 	sig->it_virt_incr = cputime_zero;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 44108de4f028..0237a556eb1f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -613,21 +613,19 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 
 	while ((node = base->first)) {
 		struct hrtimer *timer;
-		int (*fn)(void *);
+		int (*fn)(struct hrtimer *);
 		int restart;
-		void *data;
 
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 
 		fn = timer->function;
-		data = timer->data;
 		set_curr_timer(base, timer);
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
-		restart = fn(data);
+		restart = fn(timer);
 
 		spin_lock_irq(&base->lock);
 
@@ -664,9 +662,10 @@ struct sleep_hrtimer {
 	int expired;
 };
 
-static int nanosleep_wakeup(void *data)
+static int nanosleep_wakeup(struct hrtimer *timer)
 {
-	struct sleep_hrtimer *t = data;
+	struct sleep_hrtimer *t =
+		container_of(timer, struct sleep_hrtimer, timer);
 
 	t->expired = 1;
 	wake_up_process(t->task);
@@ -677,7 +676,6 @@ static int nanosleep_wakeup(void *data)
 static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
 {
 	t->timer.function = nanosleep_wakeup;
-	t->timer.data = t;
 	t->task = current;
 	t->expired = 0;
 
diff --git a/kernel/itimer.c b/kernel/itimer.c
index af2ec6b4392c..204ed7939e75 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -128,17 +128,16 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
 /*
  * The timer is automagically restarted, when interval != 0
  */
-int it_real_fn(void *data)
+int it_real_fn(struct hrtimer *timer)
 {
-	struct task_struct *tsk = (struct task_struct *) data;
+	struct signal_struct *sig =
+	    container_of(timer, struct signal_struct, real_timer);
 
-	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk);
-
-	if (tsk->signal->it_real_incr.tv64 != 0) {
-		hrtimer_forward(&tsk->signal->real_timer,
-				tsk->signal->real_timer.base->softirq_time,
-				tsk->signal->it_real_incr);
+	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
 
+	if (sig->it_real_incr.tv64 != 0) {
+		hrtimer_forward(timer, timer->base->softirq_time,
+				sig->it_real_incr);
 		return HRTIMER_RESTART;
 	}
 	return HRTIMER_NORESTART;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 7c5f44787c8c..ac6dc8744429 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int,
 			    struct itimerspec *, struct itimerspec *);
 static int common_timer_del(struct k_itimer *timer);
 
-static int posix_timer_fn(void *data);
+static int posix_timer_fn(struct hrtimer *data);
 
 static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
 
@@ -334,14 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
 
  * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
  */
-static int posix_timer_fn(void *data)
+static int posix_timer_fn(struct hrtimer *timer)
 {
-	struct k_itimer *timr = data;
-	struct hrtimer *timer = &timr->it.real.timer;
+	struct k_itimer *timr;
 	unsigned long flags;
 	int si_private = 0;
 	int ret = HRTIMER_NORESTART;
 
+	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
 
 	if (timr->it.real.interval.tv64 != 0)
@@ -725,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags,
 
 	mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
 	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
-	timr->it.real.timer.data = timr;
 	timr->it.real.timer.function = posix_timer_fn;
 
 	timer->expires = timespec_to_ktime(new_setting->it_value);
-- 
cgit v1.2.3


From c6fd91f0bdcd294a0ae0ba2b2a7f7456ef4b7144 Mon Sep 17 00:00:00 2001
From: bibo mao <bibo_mao@linux.intel.com>
Date: Sun, 26 Mar 2006 01:38:20 -0800
Subject: [PATCH] kretprobe instance recycled by parent process

When kretprobe probes the schedule() function, if the probed process exits
then schedule() will never return, so some kretprobe instances will never
be recycled.

In this patch the parent process will recycle retprobe instances of the
probed function and there will be no memory leak of kretprobe instances.

Signed-off-by: bibo mao <bibo.mao@intel.com>
Cc: Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp>
Cc: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/process.c    |  8 --------
 arch/ia64/kernel/process.c    |  8 --------
 arch/powerpc/kernel/process.c |  2 --
 arch/x86_64/kernel/process.c  |  9 +--------
 kernel/kprobes.c              | 10 +++++-----
 kernel/sched.c                |  9 ++++++++-
 6 files changed, 14 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 299e61674084..24b3e745478b 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -38,7 +38,6 @@
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
 #include <linux/random.h>
-#include <linux/kprobes.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -364,13 +363,6 @@ void exit_thread(void)
 	struct task_struct *tsk = current;
 	struct thread_struct *t = &tsk->thread;
 
-	/*
-	 * Remove function-return probe instances associated with this task
-	 * and put them back on the free list. Do not insert an exit probe for
-	 * this function, it will be disabled by kprobe_flush_task if you do.
-	 */
-	kprobe_flush_task(tsk);
-
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(NULL != t->io_bitmap_ptr)) {
 		int cpu = get_cpu();
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 309d59658e5f..355d57970ba3 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -30,7 +30,6 @@
 #include <linux/efi.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
-#include <linux/kprobes.h>
 
 #include <asm/cpu.h>
 #include <asm/delay.h>
@@ -738,13 +737,6 @@ void
 exit_thread (void)
 {
 
-	/*
-	 * Remove function-return probe instances associated with this task
-	 * and put them back on the free list. Do not insert an exit probe for
-	 * this function, it will be disabled by kprobe_flush_task if you do.
-	 */
-	kprobe_flush_task(current);
-
 	ia64_drop_fpu(current);
 #ifdef CONFIG_PERFMON
        /* if needed, stop monitoring and flush state to perfmon context */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1770a066c217..f698aa77127e 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -35,7 +35,6 @@
 #include <linux/mqueue.h>
 #include <linux/hardirq.h>
 #include <linux/utsname.h>
-#include <linux/kprobes.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -460,7 +459,6 @@ void show_regs(struct pt_regs * regs)
 
 void exit_thread(void)
 {
-	kprobe_flush_task(current);
 	discard_lazy_cpu_state();
 }
 
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 81111835722d..0370720515f1 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -35,8 +35,8 @@
 #include <linux/ptrace.h>
 #include <linux/utsname.h>
 #include <linux/random.h>
-#include <linux/kprobes.h>
 #include <linux/notifier.h>
+#include <linux/kprobes.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -353,13 +353,6 @@ void exit_thread(void)
 	struct task_struct *me = current;
 	struct thread_struct *t = &me->thread;
 
-	/*
-	 * Remove function-return probe instances associated with this task
-	 * and put them back on the free list. Do not insert an exit probe for
-	 * this function, it will be disabled by kprobe_flush_task if you do.
-	 */
-	kprobe_flush_task(me);
-
 	if (me->thread.io_bitmap_ptr) { 
 		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fb9f753ef60..1156eb0977d0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
 }
 
 /*
- * This function is called from exit_thread or flush_thread when task tk's
- * stack is being recycled so that we can recycle any function-return probe
- * instances associated with this task. These left over instances represent
- * probed functions that have been called but will never return.
+ * This function is called from finish_task_switch when task tk becomes dead,
+ * so that we can recycle any function-return probe instances associated
+ * with this task. These left over instances represent probed functions
+ * that have been called but will never return.
  */
 void __kprobes kprobe_flush_task(struct task_struct *tk)
 {
@@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
 	unsigned long flags = 0;
 
 	spin_lock_irqsave(&kretprobe_lock, flags);
-        head = kretprobe_inst_table_head(current);
+        head = kretprobe_inst_table_head(tk);
         hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
                 if (ri->task == tk)
                         recycle_rp_inst(ri);
diff --git a/kernel/sched.c b/kernel/sched.c
index 7ffaabd64f89..78acdefeccca 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -49,6 +49,7 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/acct.h>
+#include <linux/kprobes.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -1546,8 +1547,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
 	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
-	if (unlikely(prev_task_flags & PF_DEAD))
+	if (unlikely(prev_task_flags & PF_DEAD)) {
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+	 	 */
+		kprobe_flush_task(prev);
 		put_task_struct(prev);
+	}
 }
 
 /**
-- 
cgit v1.2.3


From 013d3868143387be99bb0373d49452eaa3c55d41 Mon Sep 17 00:00:00 2001
From: Martin Andersson <martin.andersson@control.lth.se>
Date: Mon, 27 Mar 2006 01:15:18 -0800
Subject: [PATCH] sched: fix task interactivity calculation

Is a truncation error in kernel/sched.c triggered when the nice value is
negative.  The affected code is used in the TASK_INTERACTIVE macro.

The code is:
#define SCALE(v1,v1_max,v2_max) \
	(v1) * (v2_max) / (v1_max)

which is used in this way:
SCALE(TASK_NICE(p), 40, MAX_BONUS)

Comments in the code says:
  * This part scales the interactivity limit depending on niceness.
  *
  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
  * Here are a few examples of different nice levels:
  *
  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
  *
  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
  *  priority range a task can explore, a value of '1' means the
  *  task is rated interactive.)

However, the current code does not scale it linearly and the result differs
from the given examples.  If the mathematical function "floor" is used when
the nice value is negative instead of the truncation one gets when using
integer division, the result conforms to the documentation.

Output of TASK_INTERACTIVE when using the kernel code:
nice    dynamic priorities
-20     1     1     1     1     1     1     1     1     1     0     0
-19     1     1     1     1     1     1     1     1     0     0     0
-18     1     1     1     1     1     1     1     1     0     0     0
-17     1     1     1     1     1     1     1     1     0     0     0
-16     1     1     1     1     1     1     1     1     0     0     0
-15     1     1     1     1     1     1     1     0     0     0     0
-14     1     1     1     1     1     1     1     0     0     0     0
-13     1     1     1     1     1     1     1     0     0     0     0
-12     1     1     1     1     1     1     1     0     0     0     0
-11     1     1     1     1     1     1     0     0     0     0     0
-10     1     1     1     1     1     1     0     0     0     0     0
  -9     1     1     1     1     1     1     0     0     0     0     0
  -8     1     1     1     1     1     1     0     0     0     0     0
  -7     1     1     1     1     1     0     0     0     0     0     0
  -6     1     1     1     1     1     0     0     0     0     0     0
  -5     1     1     1     1     1     0     0     0     0     0     0
  -4     1     1     1     1     1     0     0     0     0     0     0
  -3     1     1     1     1     0     0     0     0     0     0     0
  -2     1     1     1     1     0     0     0     0     0     0     0
  -1     1     1     1     1     0     0     0     0     0     0     0
  0      1     1     1     1     0     0     0     0     0     0     0
  1      1     1     1     1     0     0     0     0     0     0     0
  2      1     1     1     1     0     0     0     0     0     0     0
  3      1     1     1     1     0     0     0     0     0     0     0
  4      1     1     1     0     0     0     0     0     0     0     0
  5      1     1     1     0     0     0     0     0     0     0     0
  6      1     1     1     0     0     0     0     0     0     0     0
  7      1     1     1     0     0     0     0     0     0     0     0
  8      1     1     0     0     0     0     0     0     0     0     0
  9      1     1     0     0     0     0     0     0     0     0     0
10      1     1     0     0     0     0     0     0     0     0     0
11      1     1     0     0     0     0     0     0     0     0     0
12      1     0     0     0     0     0     0     0     0     0     0
13      1     0     0     0     0     0     0     0     0     0     0
14      1     0     0     0     0     0     0     0     0     0     0
15      1     0     0     0     0     0     0     0     0     0     0
16      0     0     0     0     0     0     0     0     0     0     0
17      0     0     0     0     0     0     0     0     0     0     0
18      0     0     0     0     0     0     0     0     0     0     0
19      0     0     0     0     0     0     0     0     0     0     0

Output of TASK_INTERACTIVE when using "floor"
nice    dynamic priorities
-20     1     1     1     1     1     1     1     1     1     0     0
-19     1     1     1     1     1     1     1     1     1     0     0
-18     1     1     1     1     1     1     1     1     1     0     0
-17     1     1     1     1     1     1     1     1     1     0     0
-16     1     1     1     1     1     1     1     1     0     0     0
-15     1     1     1     1     1     1     1     1     0     0     0
-14     1     1     1     1     1     1     1     1     0     0     0
-13     1     1     1     1     1     1     1     1     0     0     0
-12     1     1     1     1     1     1     1     0     0     0     0
-11     1     1     1     1     1     1     1     0     0     0     0
-10     1     1     1     1     1     1     1     0     0     0     0
  -9     1     1     1     1     1     1     1     0     0     0     0
  -8     1     1     1     1     1     1     0     0     0     0     0
  -7     1     1     1     1     1     1     0     0     0     0     0
  -6     1     1     1     1     1     1     0     0     0     0     0
  -5     1     1     1     1     1     1     0     0     0     0     0
  -4     1     1     1     1     1     0     0     0     0     0     0
  -3     1     1     1     1     1     0     0     0     0     0     0
  -2     1     1     1     1     1     0     0     0     0     0     0
  -1     1     1     1     1     1     0     0     0     0     0     0
   0     1     1     1     1     0     0     0     0     0     0     0
   1     1     1     1     1     0     0     0     0     0     0     0
   2     1     1     1     1     0     0     0     0     0     0     0
   3     1     1     1     1     0     0     0     0     0     0     0
   4     1     1     1     0     0     0     0     0     0     0     0
   5     1     1     1     0     0     0     0     0     0     0     0
   6     1     1     1     0     0     0     0     0     0     0     0
   7     1     1     1     0     0     0     0     0     0     0     0
   8     1     1     0     0     0     0     0     0     0     0     0
   9     1     1     0     0     0     0     0     0     0     0     0
  10     1     1     0     0     0     0     0     0     0     0     0
  11     1     1     0     0     0     0     0     0     0     0     0
  12     1     0     0     0     0     0     0     0     0     0     0
  13     1     0     0     0     0     0     0     0     0     0     0
  14     1     0     0     0     0     0     0     0     0     0     0
  15     1     0     0     0     0     0     0     0     0     0     0
  16     0     0     0     0     0     0     0     0     0     0     0
  17     0     0     0     0     0     0     0     0     0     0     0
  18     0     0     0     0     0     0     0     0     0     0     0
  19     0     0     0     0     0     0     0     0     0     0     0

Signed-off-by: Martin Andersson <martin.andersson@control.lth.se>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 78acdefeccca..dc599c85a88d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -145,7 +145,8 @@
 	(v1) * (v2_max) / (v1_max)
 
 #define DELTA(p) \
-	(SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
+	(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
+		INTERACTIVE_DELTA)
 
 #define TASK_INTERACTIVE(p) \
 	((p)->prio <= (p)->static_prio - DELTA(p))
-- 
cgit v1.2.3


From 77e4bfbcf071f795b54862455dce8902b3fc29c2 Mon Sep 17 00:00:00 2001
From: Andreas Mohr <andi@lisas.de>
Date: Mon, 27 Mar 2006 01:15:20 -0800
Subject: [PATCH] Small schedule() optimization

small schedule() microoptimization.

Signed-off-by: Andreas Mohr <andi@lisas.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dc599c85a88d..a96a05d23262 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2879,13 +2879,11 @@ asmlinkage void __sched schedule(void)
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
-	if (likely(!current->exit_state)) {
-		if (unlikely(in_atomic())) {
-			printk(KERN_ERR "BUG: scheduling while atomic: "
-				"%s/0x%08x/%d\n",
-				current->comm, preempt_count(), current->pid);
-			dump_stack();
-		}
+	if (unlikely(in_atomic() && !current->exit_state)) {
+		printk(KERN_ERR "BUG: scheduling while atomic: "
+			"%s/0x%08x/%d\n",
+			current->comm, preempt_count(), current->pid);
+		dump_stack();
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
-- 
cgit v1.2.3


From 1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Mon, 27 Mar 2006 01:15:22 -0800
Subject: [PATCH] sched: new sched domain for representing multi-core

Add a new sched domain for representing multi-core with shared caches
between cores.  Consider a dual package system, each package containing two
cores and with last level cache shared between cores with in a package.  If
there are two runnable processes, with this appended patch those two
processes will be scheduled on different packages.

On such systems, with this patch we have observed 8% perf improvement with
specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2
users).

This new domain will come into play only on multi-core systems with shared
caches.  On other systems, this sched domain will be removed by domain
degeneration code.  This new domain can be also used for implementing power
savings policy (see OLS 2005 CMP kernel scheduler paper for more details..
I will post another patch for power savings policy soon)

Most of the arch/* file changes are for cpu_coregroup_map() implementation.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig                      |  9 +++++
 arch/i386/kernel/cpu/common.c          | 10 +++--
 arch/i386/kernel/cpu/intel_cacheinfo.c | 22 +++++++++-
 arch/i386/kernel/smpboot.c             | 24 +++++++++++
 arch/x86_64/Kconfig                    |  9 +++++
 arch/x86_64/kernel/setup.c             |  3 +-
 arch/x86_64/kernel/smpboot.c           | 24 +++++++++++
 include/asm-i386/processor.h           |  5 +++
 include/asm-i386/topology.h            |  2 +
 include/asm-x86_64/processor.h         |  4 ++
 include/asm-x86_64/smp.h               |  1 +
 include/asm-x86_64/topology.h          |  2 +
 include/linux/topology.h               |  9 +++++
 kernel/sched.c                         | 73 +++++++++++++++++++++++++++++++---
 14 files changed, 186 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f7db71d0b913..f17bd1d2707e 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -231,6 +231,15 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 7e3d6b6a4e96..a06a49075f10 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
 void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
-	int junk;
+	int ebx;
 
 	if (have_cpuid_p()) {
 		/* Get vendor name */
@@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 		/* Intel-defined flags: level 0x00000001 */
 		if ( c->cpuid_level >= 0x00000001 ) {
 			u32 capability, excap;
-			cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
 			c->x86_capability[0] = capability;
 			c->x86_capability[4] = excap;
 			c->x86 = (tfms >> 8) & 15;
@@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 			if (c->x86 >= 0x6)
 				c->x86_model += ((tfms >> 16) & 0xF) << 4;
 			c->x86_mask = tfms & 15;
+#ifdef CONFIG_SMP
+			c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+#else
+			c->apicid = (ebx >> 24) & 0xFF;
+#endif
 		} else {
 			/* Have CPUID level 0 only - unheard of */
 			c->x86 = 4;
@@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
-	c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
 
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index ce61921369e5..7e7fd4e67dd0 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
 	unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
 	unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
+	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+	unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
+#endif
 
 	if (c->cpuid_level > 3) {
 		static int is_initialized;
@@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 					break;
 				    case 2:
 					new_l2 = this_leaf.size/1024;
+					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+					index_msb = get_count_order(num_threads_sharing);
+					l2_id = c->apicid >> index_msb;
 					break;
 				    case 3:
 					new_l3 = this_leaf.size/1024;
+					num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+					index_msb = get_count_order(num_threads_sharing);
+					l3_id = c->apicid >> index_msb;
 					break;
 				    default:
 					break;
@@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 		if (new_l1i)
 			l1i = new_l1i;
 
-		if (new_l2)
+		if (new_l2) {
 			l2 = new_l2;
+#ifdef CONFIG_SMP
+			cpu_llc_id[cpu] = l2_id;
+#endif
+		}
 
-		if (new_l3)
+		if (new_l3) {
 			l3 = new_l3;
+#ifdef CONFIG_SMP
+			cpu_llc_id[cpu] = l3_id;
+#endif
+		}
 
 		if ( trace )
 			printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 82371d83bfa9..a6969903f2d6 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 /* Core ID of each logical CPU */
 int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* representing HT siblings of each logical CPU */
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_sibling_map);
@@ -440,6 +443,18 @@ static void __devinit smp_callin(void)
 
 static int cpucount;
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+	/*
+	 * For perf, we return last level cache shared map.
+	 * TBD: when power saving sched policy is added, we will return
+	 *      cpu_core_map when power saving policy is enabled
+	 */
+	return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
@@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu)
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
 				cpu_set(cpu, cpu_core_map[i]);
+				cpu_set(i, c[cpu].llc_shared_map);
+				cpu_set(cpu, c[i].llc_shared_map);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
+	cpu_set(cpu, c[cpu].llc_shared_map);
+
 	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
 		c[cpu].booted_cores = 1;
@@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (cpu_llc_id[cpu] != BAD_APICID &&
+		    cpu_llc_id[cpu] == cpu_llc_id[i]) {
+			cpu_set(i, c[cpu].llc_shared_map);
+			cpu_set(cpu, c[i].llc_shared_map);
+		}
 		if (phys_proc_id[cpu] == phys_proc_id[i]) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 45efe0ca88f8..1cb4aa241c8c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -250,6 +250,15 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
 source "kernel/Kconfig.preempt"
 
 config NUMA
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a57eec8311a7..d1f3e9272c05 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 
 	cpuid(1, &eax, &ebx, &ecx, &edx);
 
-	c->apicid = phys_pkg_id(0);
 
 	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
 		return;
@@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 			c->x86_capability[2] = cpuid_edx(0x80860001);
 	}
 
+	c->apicid = phys_pkg_id(0);
+
 	/*
 	 * Vendor-specific initialization.  In this section we
 	 * canonicalize the feature flags, meaning if there are
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 66e98659d077..ea48fa638070 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 /* core ID of each logical CPU */
 u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 
+/* Last level cache ID of each logical CPU */
+u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID};
+
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map __read_mostly;
 
@@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
 	cpu_set(cpuid, cpu_callin_map);
 }
 
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+	/*
+	 * For perf, we return last level cache shared map.
+	 * TBD: when power saving sched policy is added, we will return
+	 *      cpu_core_map when power saving policy is enabled
+	 */
+	return c->llc_shared_map;
+}
+
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
@@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu)
 				cpu_set(cpu, cpu_sibling_map[i]);
 				cpu_set(i, cpu_core_map[cpu]);
 				cpu_set(cpu, cpu_core_map[i]);
+				cpu_set(i, c[cpu].llc_shared_map);
+				cpu_set(cpu, c[i].llc_shared_map);
 			}
 		}
 	} else {
 		cpu_set(cpu, cpu_sibling_map[cpu]);
 	}
 
+	cpu_set(cpu, c[cpu].llc_shared_map);
+
 	if (current_cpu_data.x86_max_cores == 1) {
 		cpu_core_map[cpu] = cpu_sibling_map[cpu];
 		c[cpu].booted_cores = 1;
@@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu)
 	}
 
 	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		if (cpu_llc_id[cpu] != BAD_APICID &&
+		    cpu_llc_id[cpu] == cpu_llc_id[i]) {
+			cpu_set(i, c[cpu].llc_shared_map);
+			cpu_set(cpu, c[i].llc_shared_map);
+		}
 		if (phys_proc_id[cpu] == phys_proc_id[i]) {
 			cpu_set(i, cpu_core_map[cpu]);
 			cpu_set(cpu, cpu_core_map[i]);
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index feca5d961e2b..af4bfd012475 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -20,6 +20,7 @@
 #include <linux/config.h>
 #include <linux/threads.h>
 #include <asm/percpu.h>
+#include <linux/cpumask.h>
 
 /* flag for disabling the tsc */
 extern int tsc_disable;
@@ -67,6 +68,9 @@ struct cpuinfo_x86 {
 	char	pad0;
 	int	x86_power;
 	unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
+#endif
 	unsigned char x86_max_cores;	/* cpuid returned max cores value */
 	unsigned char booted_cores;	/* number of cores as seen by OS */
 	unsigned char apicid;
@@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[];
 
 extern	int phys_proc_id[NR_CPUS];
 extern	int cpu_core_id[NR_CPUS];
+extern	int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;
 
 extern void identify_cpu(struct cpuinfo_x86 *);
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index aa958c6ee83e..b94e5eeef917 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,6 @@ extern unsigned long node_remap_size[];
 
 #endif /* CONFIG_NUMA */
 
+extern cpumask_t cpu_coregroup_map(int cpu);
+
 #endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 8c8d88c036ed..1aa2cee43344 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -20,6 +20,7 @@
 #include <asm/mmsegment.h>
 #include <asm/percpu.h>
 #include <linux/personality.h>
+#include <linux/cpumask.h>
 
 #define TF_MASK		0x00000100
 #define IF_MASK		0x00000200
@@ -65,6 +66,9 @@ struct cpuinfo_x86 {
         __u32   x86_power; 	
 	__u32   extended_cpuid_level;	/* Max extended CPUID function supported */
 	unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+	cpumask_t llc_shared_map;	/* cpus sharing the last level cache */
+#endif
 	__u8	apicid;
 	__u8	booted_cores;	/* number of cores as seen by OS */
 } ____cacheline_aligned;
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 9ccbb2cfd5c0..a4fdaeb5c397 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
 extern u8 phys_proc_id[NR_CPUS];
 extern u8 cpu_core_id[NR_CPUS];
+extern u8 cpu_llc_id[NR_CPUS];
 
 #define SMP_TRAMPOLINE_BASE 0x6000
 
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c642f5d9882d..9db54e9d17bb 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -68,4 +68,6 @@ extern int __node_distance(int, int);
 
 #include <asm-generic/topology.h>
 
+extern cpumask_t cpu_coregroup_map(int cpu);
+
 #endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e8eb0040ce3a..a305ae2e44b6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -164,6 +164,15 @@
 	.nr_balance_failed	= 0,			\
 }
 
+#ifdef CONFIG_SCHED_MC
+#ifndef SD_MC_INIT
+/* for now its same as SD_CPU_INIT.
+ * TBD: Tune Domain parameters!
+ */
+#define SD_MC_INIT   SD_CPU_INIT
+#endif
+#endif
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index a96a05d23262..8a8b71b5751b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+	return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+	return cpu;
+}
+#endif
+
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
 static int cpu_to_phys_group(int cpu)
 {
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+	cpumask_t mask = cpu_coregroup_map(cpu);
+	return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
 	return first_cpu(cpu_sibling_map[cpu]);
 #else
 	return cpu;
@@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		sd->parent = p;
 		sd->groups = &sched_group_phys[group];
 
+#ifdef CONFIG_SCHED_MC
+		p = sd;
+		sd = &per_cpu(core_domains, i);
+		group = cpu_to_core_group(i);
+		*sd = SD_MC_INIT;
+		sd->span = cpu_coregroup_map(i);
+		cpus_and(sd->span, sd->span, *cpu_map);
+		sd->parent = p;
+		sd->groups = &sched_group_core[group];
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
 		sd = &per_cpu(cpu_domains, i);
@@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
 	}
 #endif
 
+#ifdef CONFIG_SCHED_MC
+	/* Set up multi-core groups */
+	for_each_cpu_mask(i, *cpu_map) {
+		cpumask_t this_core_map = cpu_coregroup_map(i);
+		cpus_and(this_core_map, this_core_map, *cpu_map);
+		if (i != first_cpu(this_core_map))
+			continue;
+		init_sched_build_groups(sched_group_core, this_core_map,
+					&cpu_to_core_group);
+	}
+#endif
+
+
 	/* Set up physical groups */
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cpumask_t nodemask = node_to_cpumask(i);
@@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		power = SCHED_LOAD_SCALE;
 		sd->groups->cpu_power = power;
 #endif
+#ifdef CONFIG_SCHED_MC
+		sd = &per_cpu(core_domains, i);
+		power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+					    * SCHED_LOAD_SCALE / 10;
+		sd->groups->cpu_power = power;
+
+		sd = &per_cpu(phys_domains, i);
 
+ 		/*
+ 		 * This has to be < 2 * SCHED_LOAD_SCALE
+ 		 * Lets keep it SCHED_LOAD_SCALE, so that
+ 		 * while calculating NUMA group's cpu_power
+ 		 * we can simply do
+ 		 *  numa_group->cpu_power += phys_group->cpu_power;
+ 		 *
+ 		 * See "only add power once for each physical pkg"
+ 		 * comment below
+ 		 */
+ 		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
 		sd = &per_cpu(phys_domains, i);
 		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
 				(cpus_weight(sd->groups->cpumask)-1) / 10;
 		sd->groups->cpu_power = power;
+#endif
 
 #ifdef CONFIG_NUMA
 		sd = &per_cpu(allnodes_domains, i);
@@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map)
 next_sg:
 		for_each_cpu_mask(j, sg->cpumask) {
 			struct sched_domain *sd;
-			int power;
 
 			sd = &per_cpu(phys_domains, j);
 			if (j != first_cpu(sd->groups->cpumask)) {
@@ -5833,10 +5896,8 @@ next_sg:
 				 */
 				continue;
 			}
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
 
-			sg->cpu_power += power;
+			sg->cpu_power += sd->groups->cpu_power;
 		}
 		sg = sg->next;
 		if (sg != sched_group_nodes[i])
@@ -5849,6 +5910,8 @@ next_sg:
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+		sd = &per_cpu(core_domains, i);
 #else
 		sd = &per_cpu(phys_domains, i);
 #endif
-- 
cgit v1.2.3


From 0806903316d516a3b3851c51cea5c71724d9051d Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Mon, 27 Mar 2006 01:15:23 -0800
Subject: [PATCH] sched: fix group power for allnodes_domains

Current sched groups power calculation for allnodes_domains is wrong.  We
should really be using cumulative power of the physical packages in that
group (similar to the calculation in node_domains)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 62 +++++++++++++++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8a8b71b5751b..7854ee516b92 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5621,6 +5621,32 @@ static int cpu_to_allnodes_group(int cpu)
 {
 	return cpu_to_node(cpu);
 }
+static void init_numa_sched_groups_power(struct sched_group *group_head)
+{
+	struct sched_group *sg = group_head;
+	int j;
+
+	if (!sg)
+		return;
+next_sg:
+	for_each_cpu_mask(j, sg->cpumask) {
+		struct sched_domain *sd;
+
+		sd = &per_cpu(phys_domains, j);
+		if (j != first_cpu(sd->groups->cpumask)) {
+			/*
+			 * Only add "power" once for each
+			 * physical package.
+			 */
+			continue;
+		}
+
+		sg->cpu_power += sd->groups->cpu_power;
+	}
+	sg = sg->next;
+	if (sg != group_head)
+		goto next_sg;
+}
 #endif
 
 /*
@@ -5866,43 +5892,13 @@ void build_sched_domains(const cpumask_t *cpu_map)
 				(cpus_weight(sd->groups->cpumask)-1) / 10;
 		sd->groups->cpu_power = power;
 #endif
-
-#ifdef CONFIG_NUMA
-		sd = &per_cpu(allnodes_domains, i);
-		if (sd->groups) {
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-			sd->groups->cpu_power = power;
-		}
-#endif
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		struct sched_group *sg = sched_group_nodes[i];
-		int j;
-
-		if (sg == NULL)
-			continue;
-next_sg:
-		for_each_cpu_mask(j, sg->cpumask) {
-			struct sched_domain *sd;
+	for (i = 0; i < MAX_NUMNODES; i++)
+		init_numa_sched_groups_power(sched_group_nodes[i]);
 
-			sd = &per_cpu(phys_domains, j);
-			if (j != first_cpu(sd->groups->cpumask)) {
-				/*
-				 * Only add "power" once for each
-				 * physical package.
-				 */
-				continue;
-			}
-
-			sg->cpu_power += sd->groups->cpu_power;
-		}
-		sg = sg->next;
-		if (sg != sched_group_nodes[i])
-			goto next_sg;
-	}
+	init_numa_sched_groups_power(sched_group_allnodes);
 #endif
 
 	/* Attach the domains */
-- 
cgit v1.2.3


From 0771dfefc9e538f077d0b43b6dec19a5a67d0e70 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:22 -0800
Subject: [PATCH] lightweight robust futexes: core

Add the core infrastructure for robust futexes: structure definitions, the new
syscalls and the do_exit() based cleanup mechanism.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/futex.h   |  95 ++++++++++++++++++++++++++
 include/linux/sched.h   |   3 +
 include/linux/threads.h |   3 +-
 kernel/exit.c           |   3 +
 kernel/futex.c          | 172 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys_ni.c         |   4 ++
 6 files changed, 279 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 10f96c31971e..20face6b798d 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_FUTEX_H
 #define _LINUX_FUTEX_H
 
+#include <linux/sched.h>
+
 /* Second argument to futex syscall */
 
 
@@ -11,10 +13,103 @@
 #define FUTEX_CMP_REQUEUE	4
 #define FUTEX_WAKE_OP		5
 
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ */
+
+/*
+ * Per-lock list entry - embedded in user-space locks, somewhere close
+ * to the futex field. (Note: user-space uses a double-linked list to
+ * achieve O(1) list add and remove, but the kernel only needs to know
+ * about the forward link)
+ *
+ * NOTE: this structure is part of the syscall ABI, and must not be
+ * changed.
+ */
+struct robust_list {
+	struct robust_list __user *next;
+};
+
+/*
+ * Per-thread list head:
+ *
+ * NOTE: this structure is part of the syscall ABI, and must only be
+ * changed if the change is first communicated with the glibc folks.
+ * (When an incompatible change is done, we'll increase the structure
+ *  size, which glibc will detect)
+ */
+struct robust_list_head {
+	/*
+	 * The head of the list. Points back to itself if empty:
+	 */
+	struct robust_list list;
+
+	/*
+	 * This relative offset is set by user-space, it gives the kernel
+	 * the relative position of the futex field to examine. This way
+	 * we keep userspace flexible, to freely shape its data-structure,
+	 * without hardcoding any particular offset into the kernel:
+	 */
+	long futex_offset;
+
+	/*
+	 * The death of the thread may race with userspace setting
+	 * up a lock's links. So to handle this race, userspace first
+	 * sets this field to the address of the to-be-taken lock,
+	 * then does the lock acquire, and then adds itself to the
+	 * list, and then clears this field. Hence the kernel will
+	 * always have full knowledge of all locks that the thread
+	 * _might_ have taken. We check the owner TID in any case,
+	 * so only truly owned locks will be handled.
+	 */
+	struct robust_list __user *list_op_pending;
+};
+
+/*
+ * Are there any waiters for this robust futex:
+ */
+#define FUTEX_WAITERS		0x80000000
+
+/*
+ * The kernel signals via this bit that a thread holding a futex
+ * has exited without unlocking the futex. The kernel also does
+ * a FUTEX_WAKE on such futexes, after setting the bit, to wake
+ * up any possible waiters:
+ */
+#define FUTEX_OWNER_DIED	0x40000000
+
+/*
+ * Reserved bit:
+ */
+#define FUTEX_OWNER_PENDING	0x20000000
+
+/*
+ * The rest of the robust-futex field is for the TID:
+ */
+#define FUTEX_TID_MASK		0x1fffffff
+
+/*
+ * A limit of one million locks held per thread (!) ought to be enough
+ * for some time. This also protects against a deliberately circular
+ * list. Not worth introducing an rlimit for this:
+ */
+#define ROBUST_LIST_LIMIT	1048576
+
 long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
+extern int handle_futex_death(unsigned int *uaddr, struct task_struct *curr);
+
+#ifdef CONFIG_FUTEX
+extern void exit_robust_list(struct task_struct *curr);
+#else
+static inline void exit_robust_list(struct task_struct *curr)
+{
+}
+#endif
+
 #define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
 #define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
 #define FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 036d14d2bf90..fd4848f2d750 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,7 @@
 #include <linux/topology.h>
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
+#include <linux/futex.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
@@ -872,6 +873,8 @@ struct task_struct {
 	int cpuset_mems_generation;
 	int cpuset_mem_spread_rotor;
 #endif
+	struct robust_list_head __user *robust_list;
+
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
 };
diff --git a/include/linux/threads.h b/include/linux/threads.h
index b59738ac6197..e646bcdf2614 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -28,7 +28,8 @@
 #define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
 
 /*
- * A maximum of 4 million PIDs should be enough for a while:
+ * A maximum of 4 million PIDs should be enough for a while.
+ * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
  */
 #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
 	(sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
diff --git a/kernel/exit.c b/kernel/exit.c
index 8037405e136e..aecb48ca7370 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
+#include <linux/futex.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -852,6 +853,8 @@ fastcall NORET_TYPE void do_exit(long code)
 		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
+	if (unlikely(tsk->robust_list))
+		exit_robust_list(tsk);
 	exit_mm(tsk);
 
 	exit_sem(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f978032..feb724b2554e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
  *  Removed page pinning, fix privately mapped COW pages and other cleanups
  *  (C) Copyright 2003, 2004 Jamie Lokier
  *
+ *  Robust futex support started by Ingo Molnar
+ *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,174 @@ error:
 	goto out;
 }
 
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list - set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+asmlinkage long
+sys_set_robust_list(struct robust_list_head __user *head,
+		    size_t len)
+{
+	/*
+	 * The kernel knows only one size for now:
+	 */
+	if (unlikely(len != sizeof(*head)))
+		return -EINVAL;
+
+	current->robust_list = head;
+
+	return 0;
+}
+
+/**
+ * sys_get_robust_list - get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+asmlinkage long
+sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+		    size_t __user *len_ptr)
+{
+	struct robust_list_head *head;
+	unsigned long ret;
+
+	if (!pid)
+		head = current->robust_list;
+	else {
+		struct task_struct *p;
+
+		ret = -ESRCH;
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+		if (!p)
+			goto err_unlock;
+		ret = -EPERM;
+		if ((current->euid != p->euid) && (current->euid != p->uid) &&
+				!capable(CAP_SYS_PTRACE))
+			goto err_unlock;
+		head = p->robust_list;
+		read_unlock(&tasklist_lock);
+	}
+
+	if (put_user(sizeof(*head), len_ptr))
+		return -EFAULT;
+	return put_user(head, head_ptr);
+
+err_unlock:
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+int handle_futex_death(unsigned int *uaddr, struct task_struct *curr)
+{
+	unsigned int futex_val;
+
+repeat:
+	if (get_user(futex_val, uaddr))
+		return -1;
+
+	if ((futex_val & FUTEX_TID_MASK) == curr->pid) {
+		/*
+		 * Ok, this dying thread is truly holding a futex
+		 * of interest. Set the OWNER_DIED bit atomically
+		 * via cmpxchg, and if the value had FUTEX_WAITERS
+		 * set, wake up a waiter (if any). (We have to do a
+		 * futex_wake() even if OWNER_DIED is already set -
+		 * to handle the rare but possible case of recursive
+		 * thread-death.) The rest of the cleanup is done in
+		 * userspace.
+		 */
+		if (futex_atomic_cmpxchg_inuser(uaddr, futex_val,
+					 futex_val | FUTEX_OWNER_DIED) !=
+								   futex_val)
+			goto repeat;
+
+		if (futex_val & FUTEX_WAITERS)
+			futex_wake((unsigned long)uaddr, 1);
+	}
+	return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void exit_robust_list(struct task_struct *curr)
+{
+	struct robust_list_head __user *head = curr->robust_list;
+	struct robust_list __user *entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT;
+	unsigned long futex_offset;
+
+	/*
+	 * Fetch the list head (which was registered earlier, via
+	 * sys_set_robust_list()):
+	 */
+	if (get_user(entry, &head->list.next))
+		return;
+	/*
+	 * Fetch the relative futex offset:
+	 */
+	if (get_user(futex_offset, &head->futex_offset))
+		return;
+	/*
+	 * Fetch any possibly pending lock-add first, and handle it
+	 * if it exists:
+	 */
+	if (get_user(pending, &head->list_op_pending))
+		return;
+	if (pending)
+		handle_futex_death((void *)pending + futex_offset, curr);
+
+	while (entry != &head->list) {
+		/*
+		 * A pending lock might already be on the list, so
+		 * dont process it twice:
+		 */
+		if (entry != pending)
+			if (handle_futex_death((void *)entry + futex_offset,
+						curr))
+				return;
+
+		/*
+		 * Fetch the next entry in the list:
+		 */
+		if (get_user(entry, &entry->next))
+			return;
+		/*
+		 * Avoid excessively long or circular lists:
+		 */
+		if (!--limit)
+			break;
+
+		cond_resched();
+	}
+}
+
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 		unsigned long uaddr2, int val2, int val3)
 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6b1..d82864c4a617 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
+cond_syscall(sys_set_robust_list);
+cond_syscall(compat_sys_set_robust_list);
+cond_syscall(sys_get_robust_list);
+cond_syscall(compat_sys_get_robust_list);
 cond_syscall(sys_epoll_create);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);
-- 
cgit v1.2.3


From 34f192c6527f20c47ccec239e7d51a27691b93fc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:24 -0800
Subject: [PATCH] lightweight robust futexes: compat

32-bit syscall compatibility support.  (This patch also moves all futex
related compat functionality into kernel/futex_compat.c.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/compat.h |  18 +++++++
 include/linux/sched.h  |   3 ++
 kernel/Makefile        |   3 ++
 kernel/compat.c        |  23 --------
 kernel/exit.c          |   5 ++
 kernel/futex_compat.c  | 142 +++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 171 insertions(+), 23 deletions(-)
 create mode 100644 kernel/futex_compat.c

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 24d659cdbafe..6d3a654be1ae 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -147,6 +147,24 @@ typedef struct compat_sigevent {
 	} _sigev_un;
 } compat_sigevent_t;
 
+struct compat_robust_list {
+	compat_uptr_t			next;
+};
+
+struct compat_robust_list_head {
+	struct compat_robust_list	list;
+	compat_long_t			futex_offset;
+	compat_uptr_t			list_op_pending;
+};
+
+extern void compat_exit_robust_list(struct task_struct *curr);
+
+asmlinkage long
+compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
+			   compat_size_t len);
+asmlinkage long
+compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
+			   compat_size_t __user *len_ptr);
 
 long compat_sys_semctl(int first, int second, int third, void __user *uptr);
 long compat_sys_msgsnd(int first, int second, int third, void __user *uptr);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fd4848f2d750..20b4f0372e44 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -874,6 +874,9 @@ struct task_struct {
 	int cpuset_mem_spread_rotor;
 #endif
 	struct robust_list_head __user *robust_list;
+#ifdef CONFIG_COMPAT
+	struct compat_robust_list_head __user *compat_robust_list;
+#endif
 
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
diff --git a/kernel/Makefile b/kernel/Makefile
index ff1c11dc12cf..58908f9d156a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,9 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
+ifeq ($(CONFIG_COMPAT),y)
+obj-$(CONFIG_FUTEX) += futex_compat.o
+endif
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/compat.c b/kernel/compat.c
index b9bdd1271f44..c1601a84f8d8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -17,7 +17,6 @@
 #include <linux/time.h>
 #include <linux/signal.h>
 #include <linux/sched.h>	/* for MAX_SCHEDULE_TIMEOUT */
-#include <linux/futex.h>	/* for FUTEX_WAIT */
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 #include <linux/security.h>
@@ -239,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
 	return ret;
 }
 
-#ifdef CONFIG_FUTEX
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
-		struct compat_timespec __user *utime, u32 __user *uaddr2,
-		int val3)
-{
-	struct timespec t;
-	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
-	int val2 = 0;
-
-	if ((op == FUTEX_WAIT) && utime) {
-		if (get_compat_timespec(&t, utime))
-			return -EFAULT;
-		timeout = timespec_to_jiffies(&t) + 1;
-	}
-	if (op >= FUTEX_REQUEUE)
-		val2 = (int) (unsigned long) utime;
-
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
-}
-#endif
-
 asmlinkage long compat_sys_setrlimit(unsigned int resource,
 		struct compat_rlimit __user *rlim)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index aecb48ca7370..a8c7efc7a681 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -32,6 +32,7 @@
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
 #include <linux/futex.h>
+#include <linux/compat.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -855,6 +856,10 @@ fastcall NORET_TYPE void do_exit(long code)
 	}
 	if (unlikely(tsk->robust_list))
 		exit_robust_list(tsk);
+#ifdef CONFIG_COMPAT
+	if (unlikely(tsk->compat_robust_list))
+		compat_exit_robust_list(tsk);
+#endif
 	exit_mm(tsk);
 
 	exit_sem(tsk);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
new file mode 100644
index 000000000000..c153559ef289
--- /dev/null
+++ b/kernel/futex_compat.c
@@ -0,0 +1,142 @@
+/*
+ * linux/kernel/futex_compat.c
+ *
+ * Futex compatibililty routines.
+ *
+ * Copyright 2006, Red Hat, Inc., Ingo Molnar
+ */
+
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <linux/futex.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+	struct compat_robust_list_head __user *head = curr->compat_robust_list;
+	struct robust_list __user *entry, *pending;
+	compat_uptr_t uentry, upending;
+	unsigned int limit = ROBUST_LIST_LIMIT;
+	compat_long_t futex_offset;
+
+	/*
+	 * Fetch the list head (which was registered earlier, via
+	 * sys_set_robust_list()):
+	 */
+	if (get_user(uentry, &head->list.next))
+		return;
+	entry = compat_ptr(uentry);
+	/*
+	 * Fetch the relative futex offset:
+	 */
+	if (get_user(futex_offset, &head->futex_offset))
+		return;
+	/*
+	 * Fetch any possibly pending lock-add first, and handle it
+	 * if it exists:
+	 */
+	if (get_user(upending, &head->list_op_pending))
+		return;
+	pending = compat_ptr(upending);
+	if (upending)
+		handle_futex_death((void *)pending + futex_offset, curr);
+
+	while (compat_ptr(uentry) != &head->list) {
+		/*
+		 * A pending lock might already be on the list, so
+		 * dont process it twice:
+		 */
+		if (entry != pending)
+			if (handle_futex_death((void *)entry + futex_offset,
+						curr))
+				return;
+
+		/*
+		 * Fetch the next entry in the list:
+		 */
+		if (get_user(uentry, (compat_uptr_t *)&entry->next))
+			return;
+		entry = compat_ptr(uentry);
+		/*
+		 * Avoid excessively long or circular lists:
+		 */
+		if (!--limit)
+			break;
+
+		cond_resched();
+	}
+}
+
+asmlinkage long
+compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
+			   compat_size_t len)
+{
+	if (unlikely(len != sizeof(*head)))
+		return -EINVAL;
+
+	current->compat_robust_list = head;
+
+	return 0;
+}
+
+asmlinkage long
+compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr,
+			   compat_size_t __user *len_ptr)
+{
+	struct compat_robust_list_head *head;
+	unsigned long ret;
+
+	if (!pid)
+		head = current->compat_robust_list;
+	else {
+		struct task_struct *p;
+
+		ret = -ESRCH;
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+		if (!p)
+			goto err_unlock;
+		ret = -EPERM;
+		if ((current->euid != p->euid) && (current->euid != p->uid) &&
+				!capable(CAP_SYS_PTRACE))
+			goto err_unlock;
+		head = p->compat_robust_list;
+		read_unlock(&tasklist_lock);
+	}
+
+	if (put_user(sizeof(*head), len_ptr))
+		return -EFAULT;
+	return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
+		struct compat_timespec __user *utime, u32 __user *uaddr2,
+		int val3)
+{
+	struct timespec t;
+	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+	int val2 = 0;
+
+	if ((op == FUTEX_WAIT) && utime) {
+		if (get_compat_timespec(&t, utime))
+			return -EFAULT;
+		timeout = timespec_to_jiffies(&t) + 1;
+	}
+	if (op >= FUTEX_REQUEUE)
+		val2 = (int) (unsigned long) utime;
+
+	return do_futex((unsigned long)uaddr, op, val, timeout,
+			(unsigned long)uaddr2, val2, val3);
+}
-- 
cgit v1.2.3


From 8f17d3a5049d32392b79925c73a0cf99ce6d5af0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Mar 2006 01:16:27 -0800
Subject: [PATCH] lightweight robust futexes updates

- fix: initialize the robust list(s) to NULL in copy_process.

- doc update

- cleanup: rename _inuser to _inatomic

- __user cleanups and other small cleanups

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/robust-futex-ABI.txt |  2 --
 Documentation/robust-futexes.txt   |  2 +-
 include/asm-frv/futex.h            |  2 +-
 include/asm-generic/futex.h        |  2 +-
 include/asm-i386/futex.h           |  2 +-
 include/asm-mips/futex.h           |  2 +-
 include/asm-powerpc/futex.h        |  2 +-
 include/asm-x86_64/futex.h         |  2 +-
 include/linux/futex.h              |  2 +-
 kernel/fork.c                      |  5 ++++-
 kernel/futex.c                     | 20 +++++++++-----------
 kernel/futex_compat.c              |  7 +++----
 12 files changed, 24 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/robust-futex-ABI.txt
index def5d8735286..8529a17ffaa1 100644
--- a/Documentation/robust-futex-ABI.txt
+++ b/Documentation/robust-futex-ABI.txt
@@ -142,8 +142,6 @@ On insertion:
     of the 'lock word', to the linked list starting at 'head', and
  4) clear the 'list_op_pending' word.
 
-	XXX I am particularly unsure of the following -pj XXX
-
 On removal:
  1) set the 'list_op_pending' word to the address of the 'lock word'
     to be removed,
diff --git a/Documentation/robust-futexes.txt b/Documentation/robust-futexes.txt
index 7aecc67b1361..df82d75245a0 100644
--- a/Documentation/robust-futexes.txt
+++ b/Documentation/robust-futexes.txt
@@ -213,6 +213,6 @@ robust-mutex testcases.
 All other architectures should build just fine too - but they wont have
 the new syscalls yet.
 
-Architectures need to implement the new futex_atomic_cmpxchg_inuser()
+Architectures need to implement the new futex_atomic_cmpxchg_inatomic()
 inline function before writing up the syscalls (that function returns
 -ENOSYS right now).
diff --git a/include/asm-frv/futex.h b/include/asm-frv/futex.h
index 9a0e9026ba5e..08b3d1da3583 100644
--- a/include/asm-frv/futex.h
+++ b/include/asm-frv/futex.h
@@ -10,7 +10,7 @@
 extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr);
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h
index 514bd401cd7e..df893c160318 100644
--- a/include/asm-generic/futex.h
+++ b/include/asm-generic/futex.h
@@ -50,7 +50,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-i386/futex.h b/include/asm-i386/futex.h
index 41184a31885c..7b8ceefd010f 100644
--- a/include/asm-i386/futex.h
+++ b/include/asm-i386/futex.h
@@ -105,7 +105,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
diff --git a/include/asm-mips/futex.h b/include/asm-mips/futex.h
index c5fb2d6d918a..a554089991f2 100644
--- a/include/asm-mips/futex.h
+++ b/include/asm-mips/futex.h
@@ -100,7 +100,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-powerpc/futex.h b/include/asm-powerpc/futex.h
index 80ed9854e42b..f1b3c00bc1ce 100644
--- a/include/asm-powerpc/futex.h
+++ b/include/asm-powerpc/futex.h
@@ -82,7 +82,7 @@ static inline int futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	return -ENOSYS;
 }
diff --git a/include/asm-x86_64/futex.h b/include/asm-x86_64/futex.h
index 7d9eb1a84546..9804bf07b092 100644
--- a/include/asm-x86_64/futex.h
+++ b/include/asm-x86_64/futex.h
@@ -95,7 +95,7 @@ futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
 }
 
 static inline int
-futex_atomic_cmpxchg_inuser(int __user *uaddr, int oldval, int newval)
+futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 {
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 20face6b798d..55fff96ae859 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -100,7 +100,7 @@ long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
-extern int handle_futex_death(unsigned int *uaddr, struct task_struct *curr);
+extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
diff --git a/kernel/fork.c b/kernel/fork.c
index e0a2b449dea6..c49bd193b058 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,7 +1061,10 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * Clear TID on mm_release()?
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
-
+	p->robust_list = NULL;
+#ifdef CONFIG_COMPAT
+	p->compat_robust_list = NULL;
+#endif
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
diff --git a/kernel/futex.c b/kernel/futex.c
index feb724b2554e..9c9b2b6b22dd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -913,15 +913,15 @@ err_unlock:
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(unsigned int *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	unsigned int futex_val;
+	u32 uval;
 
-repeat:
-	if (get_user(futex_val, uaddr))
+retry:
+	if (get_user(uval, uaddr))
 		return -1;
 
-	if ((futex_val & FUTEX_TID_MASK) == curr->pid) {
+	if ((uval & FUTEX_TID_MASK) == curr->pid) {
 		/*
 		 * Ok, this dying thread is truly holding a futex
 		 * of interest. Set the OWNER_DIED bit atomically
@@ -932,12 +932,11 @@ repeat:
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		if (futex_atomic_cmpxchg_inuser(uaddr, futex_val,
-					 futex_val | FUTEX_OWNER_DIED) !=
-								   futex_val)
-			goto repeat;
+		if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
+					 uval | FUTEX_OWNER_DIED) != uval)
+			goto retry;
 
-		if (futex_val & FUTEX_WAITERS)
+		if (uval & FUTEX_WAITERS)
 			futex_wake((unsigned long)uaddr, 1);
 	}
 	return 0;
@@ -985,7 +984,6 @@ void exit_robust_list(struct task_struct *curr)
 			if (handle_futex_death((void *)entry + futex_offset,
 						curr))
 				return;
-
 		/*
 		 * Fetch the next entry in the list:
 		 */
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index c153559ef289..9c077cf9aa84 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -121,9 +121,9 @@ err_unlock:
 	return ret;
 }
 
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
+asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 		struct compat_timespec __user *utime, u32 __user *uaddr2,
-		int val3)
+		u32 val3)
 {
 	struct timespec t;
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -137,6 +137,5 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val,
 	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
-	return do_futex((unsigned long)uaddr, op, val, timeout,
-			(unsigned long)uaddr2, val2, val3);
+	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
 }
-- 
cgit v1.2.3


From e041c683412d5bf44dc2b109053e3b837b71742d Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 27 Mar 2006 01:16:30 -0800
Subject: [PATCH] Notifier chain update: API changes

The kernel's implementation of notifier chains is unsafe.  There is no
protection against entries being added to or removed from a chain while the
chain is in use.  The issues were discussed in this thread:

    http://marc.theaimsgroup.com/?l=linux-kernel&m=113018709002036&w=2

We noticed that notifier chains in the kernel fall into two basic usage
classes:

	"Blocking" chains are always called from a process context
	and the callout routines are allowed to sleep;

	"Atomic" chains can be called from an atomic context and
	the callout routines are not allowed to sleep.

We decided to codify this distinction and make it part of the API.  Therefore
this set of patches introduces three new, parallel APIs: one for blocking
notifiers, one for atomic notifiers, and one for "raw" notifiers (which is
really just the old API under a new name).  New kinds of data structures are
used for the heads of the chains, and new routines are defined for
registration, unregistration, and calling a chain.  The three APIs are
explained in include/linux/notifier.h and their implementation is in
kernel/sys.c.

With atomic and blocking chains, the implementation guarantees that the chain
links will not be corrupted and that chain callers will not get messed up by
entries being added or removed.  For raw chains the implementation provides no
guarantees at all; users of this API must provide their own protections.  (The
idea was that situations may come up where the assumptions of the atomic and
blocking APIs are not appropriate, so it should be possible for users to
handle these things in their own way.)

There are some limitations, which should not be too hard to live with.  For
atomic/blocking chains, registration and unregistration must always be done in
a process context since the chain is protected by a mutex/rwsem.  Also, a
callout routine for a non-raw chain must not try to register or unregister
entries on its own chain.  (This did happen in a couple of places and the code
had to be changed to avoid it.)

Since atomic chains may be called from within an NMI handler, they cannot use
spinlocks for synchronization.  Instead we use RCU.  The overhead falls almost
entirely in the unregister routine, which is okay since unregistration is much
less frequent that calling a chain.

Here is the list of chains that we adjusted and their classifications.  None
of them use the raw API, so for the moment it is only a placeholder.

  ATOMIC CHAINS
  -------------
arch/i386/kernel/traps.c:		i386die_chain
arch/ia64/kernel/traps.c:		ia64die_chain
arch/powerpc/kernel/traps.c:		powerpc_die_chain
arch/sparc64/kernel/traps.c:		sparc64die_chain
arch/x86_64/kernel/traps.c:		die_chain
drivers/char/ipmi/ipmi_si_intf.c:	xaction_notifier_list
kernel/panic.c:				panic_notifier_list
kernel/profile.c:			task_free_notifier
net/bluetooth/hci_core.c:		hci_notifier
net/ipv4/netfilter/ip_conntrack_core.c:	ip_conntrack_chain
net/ipv4/netfilter/ip_conntrack_core.c:	ip_conntrack_expect_chain
net/ipv6/addrconf.c:			inet6addr_chain
net/netfilter/nf_conntrack_core.c:	nf_conntrack_chain
net/netfilter/nf_conntrack_core.c:	nf_conntrack_expect_chain
net/netlink/af_netlink.c:		netlink_chain

  BLOCKING CHAINS
  ---------------
arch/powerpc/platforms/pseries/reconfig.c:	pSeries_reconfig_chain
arch/s390/kernel/process.c:		idle_chain
arch/x86_64/kernel/process.c		idle_notifier
drivers/base/memory.c:			memory_chain
drivers/cpufreq/cpufreq.c		cpufreq_policy_notifier_list
drivers/cpufreq/cpufreq.c		cpufreq_transition_notifier_list
drivers/macintosh/adb.c:		adb_client_list
drivers/macintosh/via-pmu.c		sleep_notifier_list
drivers/macintosh/via-pmu68k.c		sleep_notifier_list
drivers/macintosh/windfarm_core.c	wf_client_list
drivers/usb/core/notify.c		usb_notifier_list
drivers/video/fbmem.c			fb_notifier_list
kernel/cpu.c				cpu_chain
kernel/module.c				module_notify_list
kernel/profile.c			munmap_notifier
kernel/profile.c			task_exit_notifier
kernel/sys.c				reboot_notifier_list
net/core/dev.c				netdev_chain
net/decnet/dn_dev.c:			dnaddr_chain
net/ipv4/devinet.c:			inetaddr_chain

It's possible that some of these classifications are wrong.  If they are,
please let us know or submit a patch to fix them.  Note that any chain that
gets called very frequently should be atomic, because the rwsem read-locking
used for blocking chains is very likely to incur cache misses on SMP systems.
(However, if the chain's callout routines may sleep then the chain cannot be
atomic.)

The patch set was written by Alan Stern and Chandra Seetharaman, incorporating
material written by Keith Owens and suggestions from Paul McKenney and Andrew
Morton.

[jes@sgi.com: restructure the notifier chain initialization macros]
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/setup.c                   |   5 +-
 arch/arm/mach-omap1/board-netstar.c         |   2 +-
 arch/arm/mach-omap1/board-voiceblue.c       |   2 +-
 arch/i386/kernel/traps.c                    |  17 +-
 arch/ia64/kernel/traps.c                    |   6 +-
 arch/mips/lasat/setup.c                     |   3 +-
 arch/mips/sgi-ip22/ip22-reset.c             |   2 +-
 arch/mips/sgi-ip32/ip32-reset.c             |   2 +-
 arch/parisc/kernel/pdc_chassis.c            |   3 +-
 arch/powerpc/kernel/setup_64.c              |   3 +-
 arch/powerpc/kernel/traps.c                 |  16 +-
 arch/powerpc/platforms/pseries/reconfig.c   |  10 +-
 arch/ppc/platforms/prep_setup.c             |   2 +-
 arch/s390/kernel/process.c                  |  11 +-
 arch/sparc64/kernel/traps.c                 |  17 +-
 arch/um/drivers/mconsole_kern.c             |   3 +-
 arch/um/kernel/um_arch.c                    |   3 +-
 arch/x86_64/kernel/process.c                |  17 +-
 arch/x86_64/kernel/traps.c                  |  18 +-
 arch/xtensa/platform-iss/setup.c            |   2 +-
 drivers/base/memory.c                       |   8 +-
 drivers/char/ipmi/ipmi_msghandler.c         |   4 +-
 drivers/char/ipmi/ipmi_si_intf.c            |   7 +-
 drivers/char/ipmi/ipmi_watchdog.c           |   6 +-
 drivers/cpufreq/cpufreq.c                   |  61 +++---
 drivers/firmware/dcdbas.c                   |  19 +-
 drivers/macintosh/adb.c                     |  11 +-
 drivers/macintosh/adbhid.c                  |   3 +-
 drivers/macintosh/via-pmu.c                 |   2 +-
 drivers/macintosh/via-pmu68k.c              |   7 +-
 drivers/macintosh/windfarm_core.c           |   8 +-
 drivers/misc/ibmasm/heartbeat.c             |   5 +-
 drivers/net/bonding/bond_main.c             |   2 +-
 drivers/parisc/led.c                        |  14 +-
 drivers/parisc/power.c                      |   6 +-
 drivers/scsi/gdth.c                         |   9 +-
 drivers/usb/core/notify.c                   |  65 +-----
 drivers/video/fbmem.c                       |  31 +--
 include/asm-i386/kdebug.h                   |  10 +-
 include/asm-ia64/kdebug.h                   |   4 +-
 include/asm-powerpc/kdebug.h                |  12 +-
 include/asm-sparc64/kdebug.h                |  11 +-
 include/asm-x86_64/kdebug.h                 |  23 +-
 include/linux/adb.h                         |   2 +-
 include/linux/kernel.h                      |   2 +-
 include/linux/memory.h                      |   1 -
 include/linux/netfilter_ipv4/ip_conntrack.h |  17 +-
 include/linux/notifier.h                    |  96 +++++++-
 include/net/netfilter/nf_conntrack.h        |  17 +-
 kernel/cpu.c                                |  29 +--
 kernel/module.c                             |  20 +-
 kernel/panic.c                              |   4 +-
 kernel/profile.c                            |  53 ++---
 kernel/softlockup.c                         |   2 +-
 kernel/sys.c                                | 327 ++++++++++++++++++++++------
 net/bluetooth/hci_core.c                    |   8 +-
 net/core/dev.c                              |  42 ++--
 net/decnet/dn_dev.c                         |  10 +-
 net/ipv4/devinet.c                          |  16 +-
 net/ipv4/netfilter/ip_conntrack_core.c      |   6 +-
 net/ipv6/addrconf.c                         |  10 +-
 net/netfilter/nf_conntrack_core.c           |   6 +-
 net/netlink/af_netlink.c                    |   9 +-
 63 files changed, 677 insertions(+), 472 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 9402624453c2..dd8769670596 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -43,7 +43,7 @@
 #include <asm/setup.h>
 #include <asm/io.h>
 
-extern struct notifier_block *panic_notifier_list;
+extern struct atomic_notifier_head panic_notifier_list;
 static int alpha_panic_event(struct notifier_block *, unsigned long, void *);
 static struct notifier_block alpha_panic_block = {
 	alpha_panic_event,
@@ -500,7 +500,8 @@ setup_arch(char **cmdline_p)
 	}
 
 	/* Register a call for panic conditions. */
-	notifier_chain_register(&panic_notifier_list, &alpha_panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&alpha_panic_block);
 
 #ifdef CONFIG_ALPHA_GENERIC
 	/* Assume that we've booted from SRM if we haven't booted from MILO.
diff --git a/arch/arm/mach-omap1/board-netstar.c b/arch/arm/mach-omap1/board-netstar.c
index 60d5f8a3339c..7520e602d7a2 100644
--- a/arch/arm/mach-omap1/board-netstar.c
+++ b/arch/arm/mach-omap1/board-netstar.c
@@ -141,7 +141,7 @@ static int __init netstar_late_init(void)
 	/* TODO: Setup front panel switch here */
 
 	/* Setup panic notifier */
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	return 0;
 }
diff --git a/arch/arm/mach-omap1/board-voiceblue.c b/arch/arm/mach-omap1/board-voiceblue.c
index bfd5fdd1a875..52e4a9d69642 100644
--- a/arch/arm/mach-omap1/board-voiceblue.c
+++ b/arch/arm/mach-omap1/board-voiceblue.c
@@ -235,7 +235,7 @@ static struct notifier_block panic_block = {
 static int __init voiceblue_setup(void)
 {
 	/* Setup panic notifier */
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	return 0;
 }
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 4624f8ca2459..6b63a5aa1e46 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -92,22 +92,21 @@ asmlinkage void spurious_interrupt_bug(void);
 asmlinkage void machine_check(void);
 
 static int kstack_depth_to_print = 24;
-struct notifier_block *i386die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(i386die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-
 	vmalloc_sync_all();
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&i386die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return atomic_notifier_chain_register(&i386die_chain, nb);
 }
 EXPORT_SYMBOL(register_die_notifier);
 
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&i386die_chain, nb);
+}
+EXPORT_SYMBOL(unregister_die_notifier);
+
 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
 {
 	return	p > (void *)tinfo &&
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index dabd6c32641e..7c1ddc8ac443 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -30,19 +30,19 @@ extern spinlock_t timerlist_lock;
 fpswa_interface_t *fpswa_interface;
 EXPORT_SYMBOL(fpswa_interface);
 
-struct notifier_block *ia64die_chain;
+ATOMIC_NOTIFIER_HEAD(ia64die_chain);
 
 int
 register_die_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&ia64die_chain, nb);
+	return atomic_notifier_chain_register(&ia64die_chain, nb);
 }
 EXPORT_SYMBOL_GPL(register_die_notifier);
 
 int
 unregister_die_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&ia64die_chain, nb);
+	return atomic_notifier_chain_unregister(&ia64die_chain, nb);
 }
 EXPORT_SYMBOL_GPL(unregister_die_notifier);
 
diff --git a/arch/mips/lasat/setup.c b/arch/mips/lasat/setup.c
index 83eb08b7a072..e9e9a89c6741 100644
--- a/arch/mips/lasat/setup.c
+++ b/arch/mips/lasat/setup.c
@@ -165,7 +165,8 @@ void __init plat_setup(void)
 
 	/* Set up panic notifier */
 	for (i = 0; i < sizeof(lasat_panic_block) / sizeof(struct notifier_block); i++)
-		notifier_chain_register(&panic_notifier_list, &lasat_panic_block[i]);
+		atomic_notifier_chain_register(&panic_notifier_list,
+				&lasat_panic_block[i]);
 
 	lasat_reboot_setup();
 
diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c
index 92a3b3c15ed3..a9c58e067b53 100644
--- a/arch/mips/sgi-ip22/ip22-reset.c
+++ b/arch/mips/sgi-ip22/ip22-reset.c
@@ -238,7 +238,7 @@ static int __init reboot_setup(void)
 	request_irq(SGI_PANEL_IRQ, panel_int, 0, "Front Panel", NULL);
 	init_timer(&blink_timer);
 	blink_timer.function = blink_timeout;
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	return 0;
 }
diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c
index 0c948008b023..ab9d9cef089e 100644
--- a/arch/mips/sgi-ip32/ip32-reset.c
+++ b/arch/mips/sgi-ip32/ip32-reset.c
@@ -193,7 +193,7 @@ static __init int ip32_reboot_setup(void)
 
 	init_timer(&blink_timer);
 	blink_timer.function = blink_timeout;
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	request_irq(MACEISA_RTC_IRQ, ip32_rtc_int, 0, "rtc", NULL);
 
diff --git a/arch/parisc/kernel/pdc_chassis.c b/arch/parisc/kernel/pdc_chassis.c
index 2a01fe1bdc98..0cea6958f427 100644
--- a/arch/parisc/kernel/pdc_chassis.c
+++ b/arch/parisc/kernel/pdc_chassis.c
@@ -150,7 +150,8 @@ void __init parisc_pdc_chassis_init(void)
 
 		if (handle) {
 			/* initialize panic notifier chain */
-			notifier_chain_register(&panic_notifier_list, &pdc_chassis_panic_block);
+			atomic_notifier_chain_register(&panic_notifier_list,
+					&pdc_chassis_panic_block);
 
 			/* initialize reboot notifier chain */
 			register_reboot_notifier(&pdc_chassis_reboot_block);
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 2f3fdad35594..e20c1fae3423 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -579,7 +579,8 @@ void __init setup_arch(char **cmdline_p)
 	panic_timeout = 180;
 
 	if (ppc_md.panic)
-		notifier_chain_register(&panic_notifier_list, &ppc64_panic_block);
+		atomic_notifier_chain_register(&panic_notifier_list,
+				&ppc64_panic_block);
 
 	init_mm.start_code = PAGE_OFFSET;
 	init_mm.end_code = (unsigned long) _etext;
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 98660aedeeb7..9763faab6739 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -74,19 +74,19 @@ EXPORT_SYMBOL(__debugger_dabr_match);
 EXPORT_SYMBOL(__debugger_fault_handler);
 #endif
 
-struct notifier_block *powerpc_die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(powerpc_die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
+	return atomic_notifier_chain_register(&powerpc_die_chain, nb);
+}
+EXPORT_SYMBOL(register_die_notifier);
 
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&powerpc_die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&powerpc_die_chain, nb);
 }
+EXPORT_SYMBOL(unregister_die_notifier);
 
 /*
  * Trap & Exception support
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
index 86cfa6ecdcf3..5ad90676567a 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -94,16 +94,16 @@ static struct device_node *derive_parent(const char *path)
 	return parent;
 }
 
-static struct notifier_block *pSeries_reconfig_chain;
+static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
 
 int pSeries_reconfig_notifier_register(struct notifier_block *nb)
 {
-	return notifier_chain_register(&pSeries_reconfig_chain, nb);
+	return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb);
 }
 
 void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
 {
-	notifier_chain_unregister(&pSeries_reconfig_chain, nb);
+	blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb);
 }
 
 static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
@@ -131,7 +131,7 @@ static int pSeries_reconfig_add_node(const char *path, struct property *proplist
 		goto out_err;
 	}
 
-	err = notifier_call_chain(&pSeries_reconfig_chain,
+	err = blocking_notifier_call_chain(&pSeries_reconfig_chain,
 				  PSERIES_RECONFIG_ADD, np);
 	if (err == NOTIFY_BAD) {
 		printk(KERN_ERR "Failed to add device node %s\n", path);
@@ -171,7 +171,7 @@ static int pSeries_reconfig_remove_node(struct device_node *np)
 
 	remove_node_proc_entries(np);
 
-	notifier_call_chain(&pSeries_reconfig_chain,
+	blocking_notifier_call_chain(&pSeries_reconfig_chain,
 			    PSERIES_RECONFIG_REMOVE, np);
 	of_detach_node(np);
 
diff --git a/arch/ppc/platforms/prep_setup.c b/arch/ppc/platforms/prep_setup.c
index a0fc628ffb1e..d95c05d9824d 100644
--- a/arch/ppc/platforms/prep_setup.c
+++ b/arch/ppc/platforms/prep_setup.c
@@ -736,7 +736,7 @@ ibm_statusled_progress(char *s, unsigned short hex)
 		hex = 0xfff;
 		if (!notifier_installed) {
 			++notifier_installed;
-			notifier_chain_register(&panic_notifier_list,
+			atomic_notifier_chain_register(&panic_notifier_list,
 						&ibm_statusled_block);
 		}
 	}
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 99182a415fe7..4a0f5a1551ea 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -76,17 +76,17 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 /*
  * Need to know about CPUs going idle?
  */
-static struct notifier_block *idle_chain;
+static ATOMIC_NOTIFIER_HEAD(idle_chain);
 
 int register_idle_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&idle_chain, nb);
+	return atomic_notifier_chain_register(&idle_chain, nb);
 }
 EXPORT_SYMBOL(register_idle_notifier);
 
 int unregister_idle_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&idle_chain, nb);
+	return atomic_notifier_chain_unregister(&idle_chain, nb);
 }
 EXPORT_SYMBOL(unregister_idle_notifier);
 
@@ -95,7 +95,7 @@ void do_monitor_call(struct pt_regs *regs, long interruption_code)
 	/* disable monitor call class 0 */
 	__ctl_clear_bit(8, 15);
 
-	notifier_call_chain(&idle_chain, CPU_NOT_IDLE,
+	atomic_notifier_call_chain(&idle_chain, CPU_NOT_IDLE,
 			    (void *)(long) smp_processor_id());
 }
 
@@ -116,7 +116,8 @@ static void default_idle(void)
 		return;
 	}
 
-	rc = notifier_call_chain(&idle_chain, CPU_IDLE, (void *)(long) cpu);
+	rc = atomic_notifier_call_chain(&idle_chain,
+			CPU_IDLE, (void *)(long) cpu);
 	if (rc != NOTIFY_OK && rc != NOTIFY_DONE)
 		BUG();
 	if (rc != NOTIFY_OK) {
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index df612e4f75f9..ff090bb9734b 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -43,18 +43,19 @@
 #include <linux/kmod.h>
 #endif
 
-struct notifier_block *sparc64die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(sparc64die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&sparc64die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return atomic_notifier_chain_register(&sparc64die_chain, nb);
 }
+EXPORT_SYMBOL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&sparc64die_chain, nb);
+}
+EXPORT_SYMBOL(unregister_die_notifier);
 
 /* When an irrecoverable trap occurs at tl > 0, the trap entry
  * code logs the trap state registers at every level in the trap
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 54388d10bcf9..1488816588ea 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -762,7 +762,8 @@ static struct notifier_block panic_exit_notifier = {
 
 static int add_notifier(void)
 {
-	notifier_chain_register(&panic_notifier_list, &panic_exit_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&panic_exit_notifier);
 	return(0);
 }
 
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index bb1c87211ac1..7d51dd7201c3 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -477,7 +477,8 @@ static struct notifier_block panic_exit_notifier = {
 
 void __init setup_arch(char **cmdline_p)
 {
-	notifier_chain_register(&panic_notifier_list, &panic_exit_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&panic_exit_notifier);
 	paging_init();
         strlcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
  	*cmdline_p = command_line;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 0370720515f1..70dd8e5c6889 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -66,24 +66,17 @@ EXPORT_SYMBOL(boot_option_idle_override);
 void (*pm_idle)(void);
 static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
 
-static struct notifier_block *idle_notifier;
-static DEFINE_SPINLOCK(idle_notifier_lock);
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
 
 void idle_notifier_register(struct notifier_block *n)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&idle_notifier_lock, flags);
-	notifier_chain_register(&idle_notifier, n);
-	spin_unlock_irqrestore(&idle_notifier_lock, flags);
+	atomic_notifier_chain_register(&idle_notifier, n);
 }
 EXPORT_SYMBOL_GPL(idle_notifier_register);
 
 void idle_notifier_unregister(struct notifier_block *n)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&idle_notifier_lock, flags);
-	notifier_chain_unregister(&idle_notifier, n);
-	spin_unlock_irqrestore(&idle_notifier_lock, flags);
+	atomic_notifier_chain_unregister(&idle_notifier, n);
 }
 EXPORT_SYMBOL(idle_notifier_unregister);
 
@@ -93,13 +86,13 @@ static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
 void enter_idle(void)
 {
 	__get_cpu_var(idle_state) = CPU_IDLE;
-	notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
 }
 
 static void __exit_idle(void)
 {
 	__get_cpu_var(idle_state) = CPU_NOT_IDLE;
-	notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 }
 
 /* Called from interrupts to signify idle end */
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 7b148309c529..edaa9fe654dc 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -69,20 +69,20 @@ asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);
 
-struct notifier_block *die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-
 	vmalloc_sync_all();
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return atomic_notifier_chain_register(&die_chain, nb);
+}
+EXPORT_SYMBOL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&die_chain, nb);
 }
+EXPORT_SYMBOL(unregister_die_notifier);
 
 static inline void conditional_sti(struct pt_regs *regs)
 {
diff --git a/arch/xtensa/platform-iss/setup.c b/arch/xtensa/platform-iss/setup.c
index 2e6dcbf0cc04..23790a5610e2 100644
--- a/arch/xtensa/platform-iss/setup.c
+++ b/arch/xtensa/platform-iss/setup.c
@@ -108,5 +108,5 @@ static struct notifier_block iss_panic_block = {
 
 void __init platform_setup(char **p_cmdline)
 {
-	notifier_chain_register(&panic_notifier_list, &iss_panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &iss_panic_block);
 }
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 105a0d61eb1f..dd547af4681a 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -47,16 +47,16 @@ static struct kset_uevent_ops memory_uevent_ops = {
 	.uevent		= memory_uevent,
 };
 
-static struct notifier_block *memory_chain;
+static BLOCKING_NOTIFIER_HEAD(memory_chain);
 
 int register_memory_notifier(struct notifier_block *nb)
 {
-        return notifier_chain_register(&memory_chain, nb);
+        return blocking_notifier_chain_register(&memory_chain, nb);
 }
 
 void unregister_memory_notifier(struct notifier_block *nb)
 {
-        notifier_chain_unregister(&memory_chain, nb);
+        blocking_notifier_chain_unregister(&memory_chain, nb);
 }
 
 /*
@@ -140,7 +140,7 @@ static ssize_t show_mem_state(struct sys_device *dev, char *buf)
 
 static inline int memory_notify(unsigned long val, void *v)
 {
-	return notifier_call_chain(&memory_chain, val, v);
+	return blocking_notifier_call_chain(&memory_chain, val, v);
 }
 
 /*
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index b8fb87c6c29f..40eb005b9d77 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -3744,7 +3744,7 @@ static int ipmi_init_msghandler(void)
 	ipmi_timer.expires = jiffies + IPMI_TIMEOUT_JIFFIES;
 	add_timer(&ipmi_timer);
 
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 
 	initialized = 1;
 
@@ -3764,7 +3764,7 @@ static __exit void cleanup_ipmi(void)
 	if (!initialized)
 		return;
 
-	notifier_chain_unregister(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_unregister(&panic_notifier_list, &panic_block);
 
 	/* This can't be called if any interfaces exist, so no worry about
 	   shutting down the interfaces. */
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 12f858dc9994..35fbd4d8ed4b 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -237,10 +237,10 @@ struct smi_info
 
 static int try_smi_init(struct smi_info *smi);
 
-static struct notifier_block *xaction_notifier_list;
+static ATOMIC_NOTIFIER_HEAD(xaction_notifier_list);
 static int register_xaction_notifier(struct notifier_block * nb)
 {
-	return notifier_chain_register(&xaction_notifier_list, nb);
+	return atomic_notifier_chain_register(&xaction_notifier_list, nb);
 }
 
 static void si_restart_short_timer(struct smi_info *smi_info);
@@ -302,7 +302,8 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info)
 		do_gettimeofday(&t);
 		printk("**Start2: %d.%9.9d\n", t.tv_sec, t.tv_usec);
 #endif
-		err = notifier_call_chain(&xaction_notifier_list, 0, smi_info);
+		err = atomic_notifier_call_chain(&xaction_notifier_list,
+				0, smi_info);
 		if (err & NOTIFY_STOP_MASK) {
 			rv = SI_SM_CALL_WITHOUT_DELAY;
 			goto out;
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 616539310d9a..7ece9f3c8f70 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -1158,7 +1158,8 @@ static int __init ipmi_wdog_init(void)
 	}
 
 	register_reboot_notifier(&wdog_reboot_notifier);
-	notifier_chain_register(&panic_notifier_list, &wdog_panic_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&wdog_panic_notifier);
 
 	printk(KERN_INFO PFX "driver initialized\n");
 
@@ -1176,7 +1177,8 @@ static __exit void ipmi_unregister_watchdog(void)
 		release_nmi(&ipmi_nmi_handler);
 #endif
 
-	notifier_chain_unregister(&panic_notifier_list, &wdog_panic_notifier);
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+			&wdog_panic_notifier);
 	unregister_reboot_notifier(&wdog_reboot_notifier);
 
 	if (! watchdog_user)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index aed80e6aec6d..9b6ae7dc8b8a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -52,9 +52,8 @@ static void handle_update(void *data);
  * changes to devices when the CPU clock speed changes.
  * The mutex locks both lists.
  */
-static struct notifier_block *cpufreq_policy_notifier_list;
-static struct notifier_block *cpufreq_transition_notifier_list;
-static DECLARE_RWSEM (cpufreq_notifier_rwsem);
+static BLOCKING_NOTIFIER_HEAD(cpufreq_policy_notifier_list);
+static BLOCKING_NOTIFIER_HEAD(cpufreq_transition_notifier_list);
 
 
 static LIST_HEAD(cpufreq_governor_list);
@@ -247,8 +246,6 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
 	dprintk("notification %u of frequency transition to %u kHz\n",
 		state, freqs->new);
 
-	down_read(&cpufreq_notifier_rwsem);
-
 	policy = cpufreq_cpu_data[freqs->cpu];
 	switch (state) {
 
@@ -266,20 +263,19 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
 				freqs->old = policy->cur;
 			}
 		}
-		notifier_call_chain(&cpufreq_transition_notifier_list,
-					CPUFREQ_PRECHANGE, freqs);
+		blocking_notifier_call_chain(&cpufreq_transition_notifier_list,
+				CPUFREQ_PRECHANGE, freqs);
 		adjust_jiffies(CPUFREQ_PRECHANGE, freqs);
 		break;
 
 	case CPUFREQ_POSTCHANGE:
 		adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);
-		notifier_call_chain(&cpufreq_transition_notifier_list,
-					CPUFREQ_POSTCHANGE, freqs);
+		blocking_notifier_call_chain(&cpufreq_transition_notifier_list,
+				CPUFREQ_POSTCHANGE, freqs);
 		if (likely(policy) && likely(policy->cpu == freqs->cpu))
 			policy->cur = freqs->new;
 		break;
 	}
-	up_read(&cpufreq_notifier_rwsem);
 }
 EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
 
@@ -1007,7 +1003,7 @@ static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
 		freqs.old = cpu_policy->cur;
 		freqs.new = cur_freq;
 
-		notifier_call_chain(&cpufreq_transition_notifier_list,
+		blocking_notifier_call_chain(&cpufreq_transition_notifier_list,
 				    CPUFREQ_SUSPENDCHANGE, &freqs);
 		adjust_jiffies(CPUFREQ_SUSPENDCHANGE, &freqs);
 
@@ -1088,7 +1084,8 @@ static int cpufreq_resume(struct sys_device * sysdev)
 			freqs.old = cpu_policy->cur;
 			freqs.new = cur_freq;
 
-			notifier_call_chain(&cpufreq_transition_notifier_list,
+			blocking_notifier_call_chain(
+					&cpufreq_transition_notifier_list,
 					CPUFREQ_RESUMECHANGE, &freqs);
 			adjust_jiffies(CPUFREQ_RESUMECHANGE, &freqs);
 
@@ -1125,24 +1122,24 @@ static struct sysdev_driver cpufreq_sysdev_driver = {
  *      changes in cpufreq policy.
  *
  *	This function may sleep, and has the same return conditions as
- *	notifier_chain_register.
+ *	blocking_notifier_chain_register.
  */
 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list)
 {
 	int ret;
 
-	down_write(&cpufreq_notifier_rwsem);
 	switch (list) {
 	case CPUFREQ_TRANSITION_NOTIFIER:
-		ret = notifier_chain_register(&cpufreq_transition_notifier_list, nb);
+		ret = blocking_notifier_chain_register(
+				&cpufreq_transition_notifier_list, nb);
 		break;
 	case CPUFREQ_POLICY_NOTIFIER:
-		ret = notifier_chain_register(&cpufreq_policy_notifier_list, nb);
+		ret = blocking_notifier_chain_register(
+				&cpufreq_policy_notifier_list, nb);
 		break;
 	default:
 		ret = -EINVAL;
 	}
-	up_write(&cpufreq_notifier_rwsem);
 
 	return ret;
 }
@@ -1157,24 +1154,24 @@ EXPORT_SYMBOL(cpufreq_register_notifier);
  *	Remove a driver from the CPU frequency notifier list.
  *
  *	This function may sleep, and has the same return conditions as
- *	notifier_chain_unregister.
+ *	blocking_notifier_chain_unregister.
  */
 int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list)
 {
 	int ret;
 
-	down_write(&cpufreq_notifier_rwsem);
 	switch (list) {
 	case CPUFREQ_TRANSITION_NOTIFIER:
-		ret = notifier_chain_unregister(&cpufreq_transition_notifier_list, nb);
+		ret = blocking_notifier_chain_unregister(
+				&cpufreq_transition_notifier_list, nb);
 		break;
 	case CPUFREQ_POLICY_NOTIFIER:
-		ret = notifier_chain_unregister(&cpufreq_policy_notifier_list, nb);
+		ret = blocking_notifier_chain_unregister(
+				&cpufreq_policy_notifier_list, nb);
 		break;
 	default:
 		ret = -EINVAL;
 	}
-	up_write(&cpufreq_notifier_rwsem);
 
 	return ret;
 }
@@ -1346,29 +1343,23 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_poli
 	if (ret)
 		goto error_out;
 
-	down_read(&cpufreq_notifier_rwsem);
-
 	/* adjust if necessary - all reasons */
-	notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_ADJUST,
-			    policy);
+	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+			CPUFREQ_ADJUST, policy);
 
 	/* adjust if necessary - hardware incompatibility*/
-	notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_INCOMPATIBLE,
-			    policy);
+	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+			CPUFREQ_INCOMPATIBLE, policy);
 
 	/* verify the cpu speed can be set within this limit,
 	   which might be different to the first one */
 	ret = cpufreq_driver->verify(policy);
-	if (ret) {
-		up_read(&cpufreq_notifier_rwsem);
+	if (ret)
 		goto error_out;
-	}
 
 	/* notification of the new policy */
-	notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY,
-			    policy);
-
-	up_read(&cpufreq_notifier_rwsem);
+	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
+			CPUFREQ_NOTIFY, policy);
 
 	data->min = policy->min;
 	data->max = policy->max;
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index d6543fc4a923..339f405ff708 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -484,26 +484,15 @@ static void dcdbas_host_control(void)
 static int dcdbas_reboot_notify(struct notifier_block *nb, unsigned long code,
 				void *unused)
 {
-	static unsigned int notify_cnt = 0;
-
 	switch (code) {
 	case SYS_DOWN:
 	case SYS_HALT:
 	case SYS_POWER_OFF:
 		if (host_control_on_shutdown) {
 			/* firmware is going to perform host control action */
-			if (++notify_cnt == 2) {
-				printk(KERN_WARNING
-				       "Please wait for shutdown "
-				       "action to complete...\n");
-				dcdbas_host_control();
-			}
-			/*
-			 * register again and initiate the host control
-			 * action on the second notification to allow
-			 * everyone that registered to be notified
-			 */
-			register_reboot_notifier(nb);
+			printk(KERN_WARNING "Please wait for shutdown "
+			       "action to complete...\n");
+			dcdbas_host_control();
 		}
 		break;
 	}
@@ -514,7 +503,7 @@ static int dcdbas_reboot_notify(struct notifier_block *nb, unsigned long code,
 static struct notifier_block dcdbas_reboot_nb = {
 	.notifier_call = dcdbas_reboot_notify,
 	.next = NULL,
-	.priority = 0
+	.priority = INT_MIN
 };
 
 static DCDBAS_BIN_ATTR_RW(smi_data);
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index d2ead1776c16..34fcabac5fdb 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -80,7 +80,7 @@ static struct adb_driver *adb_driver_list[] = {
 static struct class *adb_dev_class;
 
 struct adb_driver *adb_controller;
-struct notifier_block *adb_client_list = NULL;
+BLOCKING_NOTIFIER_HEAD(adb_client_list);
 static int adb_got_sleep;
 static int adb_inited;
 static pid_t adb_probe_task_pid;
@@ -354,7 +354,8 @@ adb_notify_sleep(struct pmu_sleep_notifier *self, int when)
 		/* Stop autopoll */
 		if (adb_controller->autopoll)
 			adb_controller->autopoll(0);
-		ret = notifier_call_chain(&adb_client_list, ADB_MSG_POWERDOWN, NULL);
+		ret = blocking_notifier_call_chain(&adb_client_list,
+				ADB_MSG_POWERDOWN, NULL);
 		if (ret & NOTIFY_STOP_MASK) {
 			up(&adb_probe_mutex);
 			return PBOOK_SLEEP_REFUSE;
@@ -391,7 +392,8 @@ do_adb_reset_bus(void)
 	if (adb_controller->autopoll)
 		adb_controller->autopoll(0);
 
-	nret = notifier_call_chain(&adb_client_list, ADB_MSG_PRE_RESET, NULL);
+	nret = blocking_notifier_call_chain(&adb_client_list,
+			ADB_MSG_PRE_RESET, NULL);
 	if (nret & NOTIFY_STOP_MASK) {
 		if (adb_controller->autopoll)
 			adb_controller->autopoll(autopoll_devs);
@@ -426,7 +428,8 @@ do_adb_reset_bus(void)
 	}
 	up(&adb_handler_sem);
 
-	nret = notifier_call_chain(&adb_client_list, ADB_MSG_POST_RESET, NULL);
+	nret = blocking_notifier_call_chain(&adb_client_list,
+			ADB_MSG_POST_RESET, NULL);
 	if (nret & NOTIFY_STOP_MASK)
 		return -EBUSY;
 	
diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c
index c0b46bceb5df..f5779a73184d 100644
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -1214,7 +1214,8 @@ static int __init adbhid_init(void)
 
 	adbhid_probe();
 
-	notifier_chain_register(&adb_client_list, &adbhid_adb_notifier);
+	blocking_notifier_chain_register(&adb_client_list,
+			&adbhid_adb_notifier);
 
 	return 0;
 }
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 4f5f3abc9cb3..0b5ff553e39a 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -187,7 +187,7 @@ extern int disable_kernel_backlight;
 
 int __fake_sleep;
 int asleep;
-struct notifier_block *sleep_notifier_list;
+BLOCKING_NOTIFIER_HEAD(sleep_notifier_list);
 
 #ifdef CONFIG_ADB
 static int adb_dev_map = 0;
diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c
index f08e52f2107b..35b70323e7e3 100644
--- a/drivers/macintosh/via-pmu68k.c
+++ b/drivers/macintosh/via-pmu68k.c
@@ -102,7 +102,7 @@ static int pmu_kind = PMU_UNKNOWN;
 static int pmu_fully_inited = 0;
 
 int asleep;
-struct notifier_block *sleep_notifier_list;
+BLOCKING_NOTIFIER_HEAD(sleep_notifier_list);
 
 static int pmu_probe(void);
 static int pmu_init(void);
@@ -913,7 +913,8 @@ int powerbook_sleep(void)
 	struct adb_request sleep_req;
 
 	/* Notify device drivers */
-	ret = notifier_call_chain(&sleep_notifier_list, PBOOK_SLEEP, NULL);
+	ret = blocking_notifier_call_chain(&sleep_notifier_list,
+			PBOOK_SLEEP, NULL);
 	if (ret & NOTIFY_STOP_MASK)
 		return -EBUSY;
 
@@ -984,7 +985,7 @@ int powerbook_sleep(void)
 			enable_irq(i);
 
 	/* Notify drivers */
-	notifier_call_chain(&sleep_notifier_list, PBOOK_WAKE, NULL);
+	blocking_notifier_call_chain(&sleep_notifier_list, PBOOK_WAKE, NULL);
 
 	/* reenable ADB autopoll */
 	pmu_adb_autopoll(adb_dev_map);
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index 6c0ba04bc57a..ab3faa702d58 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -52,7 +52,7 @@
 static LIST_HEAD(wf_controls);
 static LIST_HEAD(wf_sensors);
 static DEFINE_MUTEX(wf_lock);
-static struct notifier_block *wf_client_list;
+static BLOCKING_NOTIFIER_HEAD(wf_client_list);
 static int wf_client_count;
 static unsigned int wf_overtemp;
 static unsigned int wf_overtemp_counter;
@@ -68,7 +68,7 @@ static struct platform_device wf_platform_device = {
 
 static inline void wf_notify(int event, void *param)
 {
-	notifier_call_chain(&wf_client_list, event, param);
+	blocking_notifier_call_chain(&wf_client_list, event, param);
 }
 
 int wf_critical_overtemp(void)
@@ -398,7 +398,7 @@ int wf_register_client(struct notifier_block *nb)
 	struct wf_sensor *sr;
 
 	mutex_lock(&wf_lock);
-	rc = notifier_chain_register(&wf_client_list, nb);
+	rc = blocking_notifier_chain_register(&wf_client_list, nb);
 	if (rc != 0)
 		goto bail;
 	wf_client_count++;
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(wf_register_client);
 int wf_unregister_client(struct notifier_block *nb)
 {
 	mutex_lock(&wf_lock);
-	notifier_chain_unregister(&wf_client_list, nb);
+	blocking_notifier_chain_unregister(&wf_client_list, nb);
 	wf_client_count++;
 	if (wf_client_count == 0)
 		wf_stop_thread();
diff --git a/drivers/misc/ibmasm/heartbeat.c b/drivers/misc/ibmasm/heartbeat.c
index f295401fac21..7fd7a43e38de 100644
--- a/drivers/misc/ibmasm/heartbeat.c
+++ b/drivers/misc/ibmasm/heartbeat.c
@@ -52,12 +52,13 @@ static struct notifier_block panic_notifier = { panic_happened, NULL, 1 };
 
 void ibmasm_register_panic_notifier(void)
 {
-	notifier_chain_register(&panic_notifier_list, &panic_notifier);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_notifier);
 }
 
 void ibmasm_unregister_panic_notifier(void)
 {
-	notifier_chain_unregister(&panic_notifier_list, &panic_notifier);
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+			&panic_notifier);
 }
 
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2d0ac169a86c..f13a539dc169 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3159,7 +3159,7 @@ static int bond_slave_netdev_event(unsigned long event, struct net_device *slave
  * bond_netdev_event: handle netdev notifier chain events.
  *
  * This function receives events for the netdev chain.  The caller (an
- * ioctl handler calling notifier_call_chain) holds the necessary
+ * ioctl handler calling blocking_notifier_call_chain) holds the necessary
  * locks for us to safely manipulate the slave devices (RTNL lock,
  * dev_probe_lock).
  */
diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c
index 3627a2d7f79f..298f2ddb2c17 100644
--- a/drivers/parisc/led.c
+++ b/drivers/parisc/led.c
@@ -499,11 +499,16 @@ static int led_halt(struct notifier_block *, unsigned long, void *);
 static struct notifier_block led_notifier = {
 	.notifier_call = led_halt,
 };
+static int notifier_disabled = 0;
 
 static int led_halt(struct notifier_block *nb, unsigned long event, void *buf) 
 {
 	char *txt;
-	
+
+	if (notifier_disabled)
+		return NOTIFY_OK;
+
+	notifier_disabled = 1;
 	switch (event) {
 	case SYS_RESTART:	txt = "SYSTEM RESTART";
 				break;
@@ -527,7 +532,6 @@ static int led_halt(struct notifier_block *nb, unsigned long event, void *buf)
 		if (led_func_ptr)
 			led_func_ptr(0xff); /* turn all LEDs ON */
 	
-	unregister_reboot_notifier(&led_notifier);
 	return NOTIFY_OK;
 }
 
@@ -758,6 +762,12 @@ not_found:
 	return 1;
 }
 
+static void __exit led_exit(void)
+{
+	unregister_reboot_notifier(&led_notifier);
+	return;
+}
+
 #ifdef CONFIG_PROC_FS
 module_init(led_create_procfs)
 #endif
diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c
index 54b2b7f20b96..0bcab83b4080 100644
--- a/drivers/parisc/power.c
+++ b/drivers/parisc/power.c
@@ -251,7 +251,8 @@ static int __init power_init(void)
 	}
 
 	/* Register a call for panic conditions. */
-	notifier_chain_register(&panic_notifier_list, &parisc_panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list,
+			&parisc_panic_block);
 
 	tasklet_enable(&power_tasklet);
 
@@ -264,7 +265,8 @@ static void __exit power_exit(void)
 		return;
 
 	tasklet_disable(&power_tasklet);
-	notifier_chain_unregister(&panic_notifier_list, &parisc_panic_block);
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+			&parisc_panic_block);
 	power_tasklet.func = NULL;
 	pdc_soft_power_button(0);
 }
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index 62e3cda859af..7f7013e80a88 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -671,7 +671,7 @@ static struct file_operations gdth_fops = {
 static struct notifier_block gdth_notifier = {
     gdth_halt, NULL, 0
 };
-
+static int notifier_disabled = 0;
 
 static void gdth_delay(int milliseconds)
 {
@@ -4595,13 +4595,13 @@ static int __init gdth_detect(struct scsi_host_template *shtp)
         add_timer(&gdth_timer);
 #endif
         major = register_chrdev(0,"gdth",&gdth_fops);
+        notifier_disabled = 0;
         register_reboot_notifier(&gdth_notifier);
     }
     gdth_polling = FALSE;
     return gdth_ctr_vcount;
 }
 
-
 static int gdth_release(struct Scsi_Host *shp)
 {
     int hanum;
@@ -5632,10 +5632,14 @@ static int gdth_halt(struct notifier_block *nb, ulong event, void *buf)
     char            cmnd[MAX_COMMAND_SIZE];   
 #endif
 
+    if (notifier_disabled)
+    	return NOTIFY_OK;
+
     TRACE2(("gdth_halt() event %d\n",(int)event));
     if (event != SYS_RESTART && event != SYS_HALT && event != SYS_POWER_OFF)
         return NOTIFY_DONE;
 
+    notifier_disabled = 1;
     printk("GDT-HA: Flushing all host drives .. ");
     for (hanum = 0; hanum < gdth_ctr_count; ++hanum) {
         gdth_flush(hanum);
@@ -5679,7 +5683,6 @@ static int gdth_halt(struct notifier_block *nb, ulong event, void *buf)
 #ifdef GDTH_STATISTICS
     del_timer(&gdth_timer);
 #endif
-    unregister_reboot_notifier(&gdth_notifier);
     return NOTIFY_OK;
 }
 
diff --git a/drivers/usb/core/notify.c b/drivers/usb/core/notify.c
index 4b55285de9a0..fe0ed54fa0ae 100644
--- a/drivers/usb/core/notify.c
+++ b/drivers/usb/core/notify.c
@@ -16,57 +16,7 @@
 #include <linux/mutex.h>
 #include "usb.h"
 
-
-static struct notifier_block *usb_notifier_list;
-static DEFINE_MUTEX(usb_notifier_lock);
-
-static void usb_notifier_chain_register(struct notifier_block **list,
-					struct notifier_block *n)
-{
-	mutex_lock(&usb_notifier_lock);
-	while (*list) {
-		if (n->priority > (*list)->priority)
-			break;
-		list = &((*list)->next);
-	}
-	n->next = *list;
-	*list = n;
-	mutex_unlock(&usb_notifier_lock);
-}
-
-static void usb_notifier_chain_unregister(struct notifier_block **nl,
-				   struct notifier_block *n)
-{
-	mutex_lock(&usb_notifier_lock);
-	while ((*nl)!=NULL) {
-		if ((*nl)==n) {
-			*nl = n->next;
-			goto exit;
-		}
-		nl=&((*nl)->next);
-	}
-exit:
-	mutex_unlock(&usb_notifier_lock);
-}
-
-static int usb_notifier_call_chain(struct notifier_block **n,
-				   unsigned long val, void *v)
-{
-	int ret=NOTIFY_DONE;
-	struct notifier_block *nb = *n;
-
-	mutex_lock(&usb_notifier_lock);
-	while (nb) {
-		ret = nb->notifier_call(nb,val,v);
-		if (ret&NOTIFY_STOP_MASK) {
-			goto exit;
-		}
-		nb = nb->next;
-	}
-exit:
-	mutex_unlock(&usb_notifier_lock);
-	return ret;
-}
+static BLOCKING_NOTIFIER_HEAD(usb_notifier_list);
 
 /**
  * usb_register_notify - register a notifier callback whenever a usb change happens
@@ -76,7 +26,7 @@ exit:
  */
 void usb_register_notify(struct notifier_block *nb)
 {
-	usb_notifier_chain_register(&usb_notifier_list, nb);
+	blocking_notifier_chain_register(&usb_notifier_list, nb);
 }
 EXPORT_SYMBOL_GPL(usb_register_notify);
 
@@ -89,27 +39,28 @@ EXPORT_SYMBOL_GPL(usb_register_notify);
  */
 void usb_unregister_notify(struct notifier_block *nb)
 {
-	usb_notifier_chain_unregister(&usb_notifier_list, nb);
+	blocking_notifier_chain_unregister(&usb_notifier_list, nb);
 }
 EXPORT_SYMBOL_GPL(usb_unregister_notify);
 
 
 void usb_notify_add_device(struct usb_device *udev)
 {
-	usb_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev);
+	blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev);
 }
 
 void usb_notify_remove_device(struct usb_device *udev)
 {
-	usb_notifier_call_chain(&usb_notifier_list, USB_DEVICE_REMOVE, udev);
+	blocking_notifier_call_chain(&usb_notifier_list,
+			USB_DEVICE_REMOVE, udev);
 }
 
 void usb_notify_add_bus(struct usb_bus *ubus)
 {
-	usb_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus);
+	blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus);
 }
 
 void usb_notify_remove_bus(struct usb_bus *ubus)
 {
-	usb_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus);
+	blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus);
 }
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 07d882b14396..b1a8dca76430 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -55,7 +55,7 @@
 
 #define FBPIXMAPSIZE	(1024 * 8)
 
-static struct notifier_block *fb_notifier_list;
+static BLOCKING_NOTIFIER_HEAD(fb_notifier_list);
 struct fb_info *registered_fb[FB_MAX];
 int num_registered_fb;
 
@@ -784,7 +784,7 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 
 		    event.info = info;
 		    event.data = &mode1;
-		    ret = notifier_call_chain(&fb_notifier_list,
+		    ret = blocking_notifier_call_chain(&fb_notifier_list,
 					      FB_EVENT_MODE_DELETE, &event);
 		}
 
@@ -830,8 +830,8 @@ fb_set_var(struct fb_info *info, struct fb_var_screeninfo *var)
 
 				info->flags &= ~FBINFO_MISC_USEREVENT;
 				event.info = info;
-				notifier_call_chain(&fb_notifier_list, evnt,
-						    &event);
+				blocking_notifier_call_chain(&fb_notifier_list,
+						evnt, &event);
 			}
 		}
 	}
@@ -854,7 +854,8 @@ fb_blank(struct fb_info *info, int blank)
 
 		event.info = info;
 		event.data = &blank;
-		notifier_call_chain(&fb_notifier_list, FB_EVENT_BLANK, &event);
+		blocking_notifier_call_chain(&fb_notifier_list,
+				FB_EVENT_BLANK, &event);
 	}
 
  	return ret;
@@ -925,7 +926,7 @@ fb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 		con2fb.framebuffer = -1;
 		event.info = info;
 		event.data = &con2fb;
-		notifier_call_chain(&fb_notifier_list,
+		blocking_notifier_call_chain(&fb_notifier_list,
 				    FB_EVENT_GET_CONSOLE_MAP, &event);
 		return copy_to_user(argp, &con2fb,
 				    sizeof(con2fb)) ? -EFAULT : 0;
@@ -944,7 +945,7 @@ fb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 		    return -EINVAL;
 		event.info = info;
 		event.data = &con2fb;
-		return notifier_call_chain(&fb_notifier_list,
+		return blocking_notifier_call_chain(&fb_notifier_list,
 					   FB_EVENT_SET_CONSOLE_MAP,
 					   &event);
 	case FBIOBLANK:
@@ -1324,7 +1325,7 @@ register_framebuffer(struct fb_info *fb_info)
 	devfs_mk_cdev(MKDEV(FB_MAJOR, i),
 			S_IFCHR | S_IRUGO | S_IWUGO, "fb/%d", i);
 	event.info = fb_info;
-	notifier_call_chain(&fb_notifier_list,
+	blocking_notifier_call_chain(&fb_notifier_list,
 			    FB_EVENT_FB_REGISTERED, &event);
 	return 0;
 }
@@ -1366,7 +1367,7 @@ unregister_framebuffer(struct fb_info *fb_info)
  */
 int fb_register_client(struct notifier_block *nb)
 {
-	return notifier_chain_register(&fb_notifier_list, nb);
+	return blocking_notifier_chain_register(&fb_notifier_list, nb);
 }
 
 /**
@@ -1375,7 +1376,7 @@ int fb_register_client(struct notifier_block *nb)
  */
 int fb_unregister_client(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&fb_notifier_list, nb);
+	return blocking_notifier_chain_unregister(&fb_notifier_list, nb);
 }
 
 /**
@@ -1393,11 +1394,13 @@ void fb_set_suspend(struct fb_info *info, int state)
 
 	event.info = info;
 	if (state) {
-		notifier_call_chain(&fb_notifier_list, FB_EVENT_SUSPEND, &event);
+		blocking_notifier_call_chain(&fb_notifier_list,
+				FB_EVENT_SUSPEND, &event);
 		info->state = FBINFO_STATE_SUSPENDED;
 	} else {
 		info->state = FBINFO_STATE_RUNNING;
-		notifier_call_chain(&fb_notifier_list, FB_EVENT_RESUME, &event);
+		blocking_notifier_call_chain(&fb_notifier_list,
+				FB_EVENT_RESUME, &event);
 	}
 }
 
@@ -1469,7 +1472,7 @@ int fb_new_modelist(struct fb_info *info)
 
 	if (!list_empty(&info->modelist)) {
 		event.info = info;
-		err = notifier_call_chain(&fb_notifier_list,
+		err = blocking_notifier_call_chain(&fb_notifier_list,
 					   FB_EVENT_NEW_MODELIST,
 					   &event);
 	}
@@ -1495,7 +1498,7 @@ int fb_con_duit(struct fb_info *info, int event, void *data)
 	evnt.info = info;
 	evnt.data = data;
 
-	return notifier_call_chain(&fb_notifier_list, event, &evnt);
+	return blocking_notifier_call_chain(&fb_notifier_list, event, &evnt);
 }
 EXPORT_SYMBOL(fb_con_duit);
 
diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h
index 316138e89910..96d0828ce096 100644
--- a/include/asm-i386/kdebug.h
+++ b/include/asm-i386/kdebug.h
@@ -17,11 +17,9 @@ struct die_args {
 	int signr;
 };
 
-/* Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_sched - then free.
-  */
-int register_die_notifier(struct notifier_block *nb);
-extern struct notifier_block *i386die_chain;
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
+extern struct atomic_notifier_head i386die_chain;
 
 
 /* Grossly misnamed. */
@@ -51,7 +49,7 @@ static inline int notify_die(enum die_val val, const char *str,
 		.trapnr = trap,
 		.signr = sig
 	};
-	return notifier_call_chain(&i386die_chain, val, &args);
+	return atomic_notifier_call_chain(&i386die_chain, val, &args);
 }
 
 #endif
diff --git a/include/asm-ia64/kdebug.h b/include/asm-ia64/kdebug.h
index 8b01a083dde6..218c458ab60c 100644
--- a/include/asm-ia64/kdebug.h
+++ b/include/asm-ia64/kdebug.h
@@ -40,7 +40,7 @@ struct die_args {
 
 extern int register_die_notifier(struct notifier_block *);
 extern int unregister_die_notifier(struct notifier_block *);
-extern struct notifier_block *ia64die_chain;
+extern struct atomic_notifier_head ia64die_chain;
 
 enum die_val {
 	DIE_BREAK = 1,
@@ -81,7 +81,7 @@ static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs,
 		.signr  = sig
 	};
 
-	return notifier_call_chain(&ia64die_chain, val, &args);
+	return atomic_notifier_call_chain(&ia64die_chain, val, &args);
 }
 
 #endif
diff --git a/include/asm-powerpc/kdebug.h b/include/asm-powerpc/kdebug.h
index 7c16265568e0..c01786ab5fa6 100644
--- a/include/asm-powerpc/kdebug.h
+++ b/include/asm-powerpc/kdebug.h
@@ -16,13 +16,9 @@ struct die_args {
 	int signr;
 };
 
-/*
-   Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_sched -
-   then free.
- */
-int register_die_notifier(struct notifier_block *nb);
-extern struct notifier_block *powerpc_die_chain;
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
+extern struct atomic_notifier_head powerpc_die_chain;
 
 /* Grossly misnamed. */
 enum die_val {
@@ -37,7 +33,7 @@ enum die_val {
 static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig)
 {
 	struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig };
-	return notifier_call_chain(&powerpc_die_chain, val, &args);
+	return atomic_notifier_call_chain(&powerpc_die_chain, val, &args);
 }
 
 #endif /* __KERNEL__ */
diff --git a/include/asm-sparc64/kdebug.h b/include/asm-sparc64/kdebug.h
index 6321f5a0198d..4040d127ac3e 100644
--- a/include/asm-sparc64/kdebug.h
+++ b/include/asm-sparc64/kdebug.h
@@ -15,12 +15,9 @@ struct die_args {
 	int signr;
 };
 
-/* Note - you should never unregister because that can race with NMIs.
- * If you really want to do it first unregister - then synchronize_sched
- * - then free.
- */
-int register_die_notifier(struct notifier_block *nb);
-extern struct notifier_block *sparc64die_chain;
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
+extern struct atomic_notifier_head sparc64die_chain;
 
 extern void bad_trap(struct pt_regs *, long);
 
@@ -46,7 +43,7 @@ static inline int notify_die(enum die_val val,char *str, struct pt_regs *regs,
 				 .trapnr	= trap,
 				 .signr		= sig };
 
-	return notifier_call_chain(&sparc64die_chain, val, &args);
+	return atomic_notifier_call_chain(&sparc64die_chain, val, &args);
 }
 
 #endif
diff --git a/include/asm-x86_64/kdebug.h b/include/asm-x86_64/kdebug.h
index b9ed4c0c8783..cf795631d9b4 100644
--- a/include/asm-x86_64/kdebug.h
+++ b/include/asm-x86_64/kdebug.h
@@ -5,21 +5,20 @@
 
 struct pt_regs;
 
-struct die_args { 
+struct die_args {
 	struct pt_regs *regs;
 	const char *str;
-	long err; 
+	long err;
 	int trapnr;
 	int signr;
-}; 
+};
+
+extern int register_die_notifier(struct notifier_block *);
+extern int unregister_die_notifier(struct notifier_block *);
+extern struct atomic_notifier_head die_chain;
 
-/* Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_sched - then free.
-  */
-int register_die_notifier(struct notifier_block *nb);
-extern struct notifier_block *die_chain;
 /* Grossly misnamed. */
-enum die_val { 
+enum die_val {
 	DIE_OOPS = 1,
 	DIE_INT3,
 	DIE_DEBUG,
@@ -33,8 +32,8 @@ enum die_val {
 	DIE_CALL,
 	DIE_NMI_IPI,
 	DIE_PAGE_FAULT,
-}; 
-	
+};
+
 static inline int notify_die(enum die_val val, const char *str,
 			struct pt_regs *regs, long err, int trap, int sig)
 {
@@ -45,7 +44,7 @@ static inline int notify_die(enum die_val val, const char *str,
 		.trapnr = trap,
 		.signr = sig
 	};
-	return notifier_call_chain(&die_chain, val, &args); 
+	return atomic_notifier_call_chain(&die_chain, val, &args);
 } 
 
 extern int printk_address(unsigned long address);
diff --git a/include/linux/adb.h b/include/linux/adb.h
index e9fdc63483c7..b7305b178279 100644
--- a/include/linux/adb.h
+++ b/include/linux/adb.h
@@ -85,7 +85,7 @@ enum adb_message {
     ADB_MSG_POST_RESET	/* Called after resetting the bus (re-do init & register) */
 };
 extern struct adb_driver *adb_controller;
-extern struct notifier_block *adb_client_list;
+extern struct blocking_notifier_head adb_client_list;
 
 int adb_request(struct adb_request *req, void (*done)(struct adb_request *),
 		int flags, int nbytes, ...);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 03d6cfaa5b8a..a3720f973ea5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -87,7 +87,7 @@ extern int cond_resched(void);
 		(__x < 0) ? -__x : __x;		\
 	})
 
-extern struct notifier_block *panic_notifier_list;
+extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
 	__attribute__ ((NORET_AND format (printf, 1, 2)));
diff --git a/include/linux/memory.h b/include/linux/memory.h
index e251dc43d0f5..8f04143ca363 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -77,7 +77,6 @@ extern int remove_memory_block(unsigned long, struct mem_section *, int);
 
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 
-struct notifier_block;
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index f32d75c4f4cf..d54d7b278e96 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -308,29 +308,30 @@ DECLARE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
 
 #define CONNTRACK_ECACHE(x)	(__get_cpu_var(ip_conntrack_ecache).x)
  
-extern struct notifier_block *ip_conntrack_chain;
-extern struct notifier_block *ip_conntrack_expect_chain;
+extern struct atomic_notifier_head ip_conntrack_chain;
+extern struct atomic_notifier_head ip_conntrack_expect_chain;
 
 static inline int ip_conntrack_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&ip_conntrack_chain, nb);
+	return atomic_notifier_chain_register(&ip_conntrack_chain, nb);
 }
 
 static inline int ip_conntrack_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&ip_conntrack_chain, nb);
+	return atomic_notifier_chain_unregister(&ip_conntrack_chain, nb);
 }
 
 static inline int 
 ip_conntrack_expect_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&ip_conntrack_expect_chain, nb);
+	return atomic_notifier_chain_register(&ip_conntrack_expect_chain, nb);
 }
 
 static inline int
 ip_conntrack_expect_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&ip_conntrack_expect_chain, nb);
+	return atomic_notifier_chain_unregister(&ip_conntrack_expect_chain,
+			nb);
 }
 
 extern void ip_ct_deliver_cached_events(const struct ip_conntrack *ct);
@@ -355,14 +356,14 @@ static inline void ip_conntrack_event(enum ip_conntrack_events event,
 				      struct ip_conntrack *ct)
 {
 	if (is_confirmed(ct) && !is_dying(ct))
-		notifier_call_chain(&ip_conntrack_chain, event, ct);
+		atomic_notifier_call_chain(&ip_conntrack_chain, event, ct);
 }
 
 static inline void 
 ip_conntrack_expect_event(enum ip_conntrack_expect_events event,
 			  struct ip_conntrack_expect *exp)
 {
-	notifier_call_chain(&ip_conntrack_expect_chain, event, exp);
+	atomic_notifier_call_chain(&ip_conntrack_expect_chain, event, exp);
 }
 #else /* CONFIG_IP_NF_CONNTRACK_EVENTS */
 static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, 
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 5937dd6053c3..51dbab9710c7 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -10,25 +10,107 @@
 #ifndef _LINUX_NOTIFIER_H
 #define _LINUX_NOTIFIER_H
 #include <linux/errno.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
 
-struct notifier_block
-{
-	int (*notifier_call)(struct notifier_block *self, unsigned long, void *);
+/*
+ * Notifier chains are of three types:
+ *
+ *	Atomic notifier chains: Chain callbacks run in interrupt/atomic
+ *		context. Callouts are not allowed to block.
+ *	Blocking notifier chains: Chain callbacks run in process context.
+ *		Callouts are allowed to block.
+ *	Raw notifier chains: There are no restrictions on callbacks,
+ *		registration, or unregistration.  All locking and protection
+ *		must be provided by the caller.
+ *
+ * atomic_notifier_chain_register() may be called from an atomic context,
+ * but blocking_notifier_chain_register() must be called from a process
+ * context.  Ditto for the corresponding _unregister() routines.
+ *
+ * atomic_notifier_chain_unregister() and blocking_notifier_chain_unregister()
+ * _must not_ be called from within the call chain.
+ */
+
+struct notifier_block {
+	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
 	struct notifier_block *next;
 	int priority;
 };
 
+struct atomic_notifier_head {
+	spinlock_t lock;
+	struct notifier_block *head;
+};
+
+struct blocking_notifier_head {
+	struct rw_semaphore rwsem;
+	struct notifier_block *head;
+};
+
+struct raw_notifier_head {
+	struct notifier_block *head;
+};
+
+#define ATOMIC_INIT_NOTIFIER_HEAD(name) do {	\
+		spin_lock_init(&(name)->lock);	\
+		(name)->head = NULL;		\
+	} while (0)
+#define BLOCKING_INIT_NOTIFIER_HEAD(name) do {	\
+		init_rwsem(&(name)->rwsem);	\
+		(name)->head = NULL;		\
+	} while (0)
+#define RAW_INIT_NOTIFIER_HEAD(name) do {	\
+		(name)->head = NULL;		\
+	} while (0)
+
+#define ATOMIC_NOTIFIER_INIT(name) {				\
+		.lock = SPIN_LOCK_UNLOCKED,			\
+		.head = NULL }
+#define BLOCKING_NOTIFIER_INIT(name) {				\
+		.rwsem = __RWSEM_INITIALIZER((name).rwsem),	\
+		.head = NULL }
+#define RAW_NOTIFIER_INIT(name)	{				\
+		.head = NULL }
+
+#define ATOMIC_NOTIFIER_HEAD(name)				\
+	struct atomic_notifier_head name =			\
+		ATOMIC_NOTIFIER_INIT(name)
+#define BLOCKING_NOTIFIER_HEAD(name)				\
+	struct blocking_notifier_head name =			\
+		BLOCKING_NOTIFIER_INIT(name)
+#define RAW_NOTIFIER_HEAD(name)					\
+	struct raw_notifier_head name =				\
+		RAW_NOTIFIER_INIT(name)
 
 #ifdef __KERNEL__
 
-extern int notifier_chain_register(struct notifier_block **list, struct notifier_block *n);
-extern int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n);
-extern int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v);
+extern int atomic_notifier_chain_register(struct atomic_notifier_head *,
+		struct notifier_block *);
+extern int blocking_notifier_chain_register(struct blocking_notifier_head *,
+		struct notifier_block *);
+extern int raw_notifier_chain_register(struct raw_notifier_head *,
+		struct notifier_block *);
+
+extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *,
+		struct notifier_block *);
+extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *,
+		struct notifier_block *);
+extern int raw_notifier_chain_unregister(struct raw_notifier_head *,
+		struct notifier_block *);
+
+extern int atomic_notifier_call_chain(struct atomic_notifier_head *,
+		unsigned long val, void *v);
+extern int blocking_notifier_call_chain(struct blocking_notifier_head *,
+		unsigned long val, void *v);
+extern int raw_notifier_call_chain(struct raw_notifier_head *,
+		unsigned long val, void *v);
 
 #define NOTIFY_DONE		0x0000		/* Don't care */
 #define NOTIFY_OK		0x0001		/* Suits me */
 #define NOTIFY_STOP_MASK	0x8000		/* Don't call further */
-#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)	/* Bad/Veto action	*/
+#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)
+						/* Bad/Veto action */
 /*
  * Clean way to return from the notifier and stop further calls.
  */
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index b6f0905a4ee2..916013ca4a5c 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -300,29 +300,30 @@ DECLARE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
 
 #define CONNTRACK_ECACHE(x)	(__get_cpu_var(nf_conntrack_ecache).x)
 
-extern struct notifier_block *nf_conntrack_chain;
-extern struct notifier_block *nf_conntrack_expect_chain;
+extern struct atomic_notifier_head nf_conntrack_chain;
+extern struct atomic_notifier_head nf_conntrack_expect_chain;
 
 static inline int nf_conntrack_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&nf_conntrack_chain, nb);
+	return atomic_notifier_chain_register(&nf_conntrack_chain, nb);
 }
 
 static inline int nf_conntrack_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&nf_conntrack_chain, nb);
+	return atomic_notifier_chain_unregister(&nf_conntrack_chain, nb);
 }
 
 static inline int
 nf_conntrack_expect_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&nf_conntrack_expect_chain, nb);
+	return atomic_notifier_chain_register(&nf_conntrack_expect_chain, nb);
 }
 
 static inline int
 nf_conntrack_expect_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&nf_conntrack_expect_chain, nb);
+	return atomic_notifier_chain_unregister(&nf_conntrack_expect_chain,
+			nb);
 }
 
 extern void nf_ct_deliver_cached_events(const struct nf_conn *ct);
@@ -347,14 +348,14 @@ static inline void nf_conntrack_event(enum ip_conntrack_events event,
 				      struct nf_conn *ct)
 {
 	if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct))
-		notifier_call_chain(&nf_conntrack_chain, event, ct);
+		atomic_notifier_call_chain(&nf_conntrack_chain, event, ct);
 }
 
 static inline void
 nf_conntrack_expect_event(enum ip_conntrack_expect_events event,
 			  struct nf_conntrack_expect *exp)
 {
-	notifier_call_chain(&nf_conntrack_expect_chain, event, exp);
+	atomic_notifier_call_chain(&nf_conntrack_expect_chain, event, exp);
 }
 #else /* CONFIG_NF_CONNTRACK_EVENTS */
 static inline void nf_conntrack_event_cache(enum ip_conntrack_events event,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8be22bd80933..fe2b8d0bfe4c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
 /* This protects CPUs going up and down... */
 static DECLARE_MUTEX(cpucontrol);
 
-static struct notifier_block *cpu_chain;
+static BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
 static struct task_struct *lock_cpu_hotplug_owner;
@@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
 /* Need to know about CPUs going up/down? */
 int register_cpu_notifier(struct notifier_block *nb)
 {
-	int ret;
-
-	if ((ret = lock_cpu_hotplug_interruptible()) != 0)
-		return ret;
-	ret = notifier_chain_register(&cpu_chain, nb);
-	unlock_cpu_hotplug();
-	return ret;
+	return blocking_notifier_chain_register(&cpu_chain, nb);
 }
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
 {
-	lock_cpu_hotplug();
-	notifier_chain_unregister(&cpu_chain, nb);
-	unlock_cpu_hotplug();
+	blocking_notifier_chain_unregister(&cpu_chain, nb);
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
@@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu)
 		goto out;
 	}
 
-	err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
+	err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu)
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);
 	if (IS_ERR(p)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
-		if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
+		if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
 				(void *)(long)cpu) == NOTIFY_BAD)
 			BUG();
 
@@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu)
 	put_cpu();
 
 	/* CPU is completely dead: tell everyone.  Too late to complain. */
-	if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu)
-	    == NOTIFY_BAD)
+	if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD,
+			(void *)(long)cpu) == NOTIFY_BAD)
 		BUG();
 
 	check_for_tasks(cpu);
@@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
 		goto out;
 	}
 
-	ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+	ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
 		printk("%s: attempt to bring up CPU %u failed\n",
 				__FUNCTION__, cpu);
@@ -226,11 +218,12 @@ int __devinit cpu_up(unsigned int cpu)
 	BUG_ON(!cpu_online(cpu));
 
 	/* Now call notifier in preparation. */
-	notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
+	blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
 
 out_notify:
 	if (ret != 0)
-		notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
+		blocking_notifier_call_chain(&cpu_chain,
+				CPU_UP_CANCELED, hcpu);
 out:
 	unlock_cpu_hotplug();
 	return ret;
diff --git a/kernel/module.c b/kernel/module.c
index ddfe45ac2fd1..4fafd58038a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -64,26 +64,17 @@ static DEFINE_SPINLOCK(modlist_lock);
 static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
-static DEFINE_MUTEX(notify_mutex);
-static struct notifier_block * module_notify_list;
+static BLOCKING_NOTIFIER_HEAD(module_notify_list);
 
 int register_module_notifier(struct notifier_block * nb)
 {
-	int err;
-	mutex_lock(&notify_mutex);
-	err = notifier_chain_register(&module_notify_list, nb);
-	mutex_unlock(&notify_mutex);
-	return err;
+	return blocking_notifier_chain_register(&module_notify_list, nb);
 }
 EXPORT_SYMBOL(register_module_notifier);
 
 int unregister_module_notifier(struct notifier_block * nb)
 {
-	int err;
-	mutex_lock(&notify_mutex);
-	err = notifier_chain_unregister(&module_notify_list, nb);
-	mutex_unlock(&notify_mutex);
-	return err;
+	return blocking_notifier_chain_unregister(&module_notify_list, nb);
 }
 EXPORT_SYMBOL(unregister_module_notifier);
 
@@ -1816,9 +1807,8 @@ sys_init_module(void __user *umod,
 	/* Drop lock so they can recurse */
 	mutex_unlock(&module_mutex);
 
-	mutex_lock(&notify_mutex);
-	notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
-	mutex_unlock(&notify_mutex);
+	blocking_notifier_call_chain(&module_notify_list,
+			MODULE_STATE_COMING, mod);
 
 	/* Start the module */
 	if (mod->init != NULL)
diff --git a/kernel/panic.c b/kernel/panic.c
index acd95adddb93..f895c7c01d5b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -29,7 +29,7 @@ static DEFINE_SPINLOCK(pause_on_oops_lock);
 int panic_timeout;
 EXPORT_SYMBOL(panic_timeout);
 
-struct notifier_block *panic_notifier_list;
+ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
 EXPORT_SYMBOL(panic_notifier_list);
 
@@ -97,7 +97,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	smp_send_stop();
 #endif
 
-	notifier_call_chain(&panic_notifier_list, 0, buf);
+	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
 	if (!panic_blink)
 		panic_blink = no_blink;
diff --git a/kernel/profile.c b/kernel/profile.c
index ad81f799a9b4..5a730fdb1a2c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -87,72 +87,52 @@ void __init profile_init(void)
  
 #ifdef CONFIG_PROFILING
  
-static DECLARE_RWSEM(profile_rwsem);
-static DEFINE_RWLOCK(handoff_lock);
-static struct notifier_block * task_exit_notifier;
-static struct notifier_block * task_free_notifier;
-static struct notifier_block * munmap_notifier;
+static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
  
 void profile_task_exit(struct task_struct * task)
 {
-	down_read(&profile_rwsem);
-	notifier_call_chain(&task_exit_notifier, 0, task);
-	up_read(&profile_rwsem);
+	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
 }
  
 int profile_handoff_task(struct task_struct * task)
 {
 	int ret;
-	read_lock(&handoff_lock);
-	ret = notifier_call_chain(&task_free_notifier, 0, task);
-	read_unlock(&handoff_lock);
+	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
 	return (ret == NOTIFY_OK) ? 1 : 0;
 }
 
 void profile_munmap(unsigned long addr)
 {
-	down_read(&profile_rwsem);
-	notifier_call_chain(&munmap_notifier, 0, (void *)addr);
-	up_read(&profile_rwsem);
+	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
 }
 
 int task_handoff_register(struct notifier_block * n)
 {
-	int err = -EINVAL;
-
-	write_lock(&handoff_lock);
-	err = notifier_chain_register(&task_free_notifier, n);
-	write_unlock(&handoff_lock);
-	return err;
+	return atomic_notifier_chain_register(&task_free_notifier, n);
 }
 
 int task_handoff_unregister(struct notifier_block * n)
 {
-	int err = -EINVAL;
-
-	write_lock(&handoff_lock);
-	err = notifier_chain_unregister(&task_free_notifier, n);
-	write_unlock(&handoff_lock);
-	return err;
+	return atomic_notifier_chain_unregister(&task_free_notifier, n);
 }
 
 int profile_event_register(enum profile_type type, struct notifier_block * n)
 {
 	int err = -EINVAL;
  
-	down_write(&profile_rwsem);
- 
 	switch (type) {
 		case PROFILE_TASK_EXIT:
-			err = notifier_chain_register(&task_exit_notifier, n);
+			err = blocking_notifier_chain_register(
+					&task_exit_notifier, n);
 			break;
 		case PROFILE_MUNMAP:
-			err = notifier_chain_register(&munmap_notifier, n);
+			err = blocking_notifier_chain_register(
+					&munmap_notifier, n);
 			break;
 	}
  
-	up_write(&profile_rwsem);
- 
 	return err;
 }
 
@@ -161,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n)
 {
 	int err = -EINVAL;
  
-	down_write(&profile_rwsem);
- 
 	switch (type) {
 		case PROFILE_TASK_EXIT:
-			err = notifier_chain_unregister(&task_exit_notifier, n);
+			err = blocking_notifier_chain_unregister(
+					&task_exit_notifier, n);
 			break;
 		case PROFILE_MUNMAP:
-			err = notifier_chain_unregister(&munmap_notifier, n);
+			err = blocking_notifier_chain_unregister(
+					&munmap_notifier, n);
 			break;
 	}
 
-	up_write(&profile_rwsem);
 	return err;
 }
 
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9b3d5847ed8..ced91e1ff564 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -152,5 +152,5 @@ __init void spawn_softlockup_task(void)
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
-	notifier_chain_register(&panic_notifier_list, &panic_block);
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index 38bc73ede2ba..c93d37f71aef 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -95,99 +95,304 @@ int cad_pid = 1;
  *	and the like. 
  */
 
-static struct notifier_block *reboot_notifier_list;
-static DEFINE_RWLOCK(notifier_lock);
+static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list);
+
+/*
+ *	Notifier chain core routines.  The exported routines below
+ *	are layered on top of these, with appropriate locking added.
+ */
+
+static int notifier_chain_register(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	while ((*nl) != NULL) {
+		if (n->priority > (*nl)->priority)
+			break;
+		nl = &((*nl)->next);
+	}
+	n->next = *nl;
+	rcu_assign_pointer(*nl, n);
+	return 0;
+}
+
+static int notifier_chain_unregister(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	while ((*nl) != NULL) {
+		if ((*nl) == n) {
+			rcu_assign_pointer(*nl, n->next);
+			return 0;
+		}
+		nl = &((*nl)->next);
+	}
+	return -ENOENT;
+}
+
+static int __kprobes notifier_call_chain(struct notifier_block **nl,
+		unsigned long val, void *v)
+{
+	int ret = NOTIFY_DONE;
+	struct notifier_block *nb;
+
+	nb = rcu_dereference(*nl);
+	while (nb) {
+		ret = nb->notifier_call(nb, val, v);
+		if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+			break;
+		nb = rcu_dereference(nb->next);
+	}
+	return ret;
+}
+
+/*
+ *	Atomic notifier chain routines.  Registration and unregistration
+ *	use a mutex, and call_chain is synchronized by RCU (no locks).
+ */
 
 /**
- *	notifier_chain_register	- Add notifier to a notifier chain
- *	@list: Pointer to root list pointer
+ *	atomic_notifier_chain_register - Add notifier to an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
  *	@n: New entry in notifier chain
  *
- *	Adds a notifier to a notifier chain.
+ *	Adds a notifier to an atomic notifier chain.
  *
  *	Currently always returns zero.
  */
+
+int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+		struct notifier_block *n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_chain_register(&nh->head, n);
+	spin_unlock_irqrestore(&nh->lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
+
+/**
+ *	atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from an atomic notifier chain.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
+		struct notifier_block *n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_chain_unregister(&nh->head, n);
+	spin_unlock_irqrestore(&nh->lock, flags);
+	synchronize_rcu();
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
+
+/**
+ *	atomic_notifier_call_chain - Call functions in an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in an atomic context, so they must not block.
+ *	This routine uses RCU to synchronize with changes to the chain.
+ *
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then atomic_notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise the return value is the return value
+ *	of the last notifier function called.
+ */
  
-int notifier_chain_register(struct notifier_block **list, struct notifier_block *n)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+		unsigned long val, void *v)
 {
-	write_lock(&notifier_lock);
-	while(*list)
-	{
-		if(n->priority > (*list)->priority)
-			break;
-		list= &((*list)->next);
-	}
-	n->next = *list;
-	*list=n;
-	write_unlock(&notifier_lock);
-	return 0;
+	int ret;
+
+	rcu_read_lock();
+	ret = notifier_call_chain(&nh->head, val, v);
+	rcu_read_unlock();
+	return ret;
 }
 
-EXPORT_SYMBOL(notifier_chain_register);
+EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+
+/*
+ *	Blocking notifier chain routines.  All access to the chain is
+ *	synchronized by an rwsem.
+ */
 
 /**
- *	notifier_chain_unregister - Remove notifier from a notifier chain
- *	@nl: Pointer to root list pointer
+ *	blocking_notifier_chain_register - Add notifier to a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
  *	@n: New entry in notifier chain
  *
- *	Removes a notifier from a notifier chain.
+ *	Adds a notifier to a blocking notifier chain.
+ *	Must be called in process context.
  *
- *	Returns zero on success, or %-ENOENT on failure.
+ *	Currently always returns zero.
  */
  
-int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n)
+int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
+		struct notifier_block *n)
 {
-	write_lock(&notifier_lock);
-	while((*nl)!=NULL)
-	{
-		if((*nl)==n)
-		{
-			*nl=n->next;
-			write_unlock(&notifier_lock);
-			return 0;
-		}
-		nl=&((*nl)->next);
-	}
-	write_unlock(&notifier_lock);
-	return -ENOENT;
+	int ret;
+
+	/*
+	 * This code gets used during boot-up, when task switching is
+	 * not yet working and interrupts must remain disabled.  At
+	 * such times we must not call down_write().
+	 */
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return notifier_chain_register(&nh->head, n);
+
+	down_write(&nh->rwsem);
+	ret = notifier_chain_register(&nh->head, n);
+	up_write(&nh->rwsem);
+	return ret;
 }
 
-EXPORT_SYMBOL(notifier_chain_unregister);
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
 
 /**
- *	notifier_call_chain - Call functions in a notifier chain
- *	@n: Pointer to root pointer of notifier chain
+ *	blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from a blocking notifier chain.
+ *	Must be called from process context.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
+		struct notifier_block *n)
+{
+	int ret;
+
+	/*
+	 * This code gets used during boot-up, when task switching is
+	 * not yet working and interrupts must remain disabled.  At
+	 * such times we must not call down_write().
+	 */
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return notifier_chain_unregister(&nh->head, n);
+
+	down_write(&nh->rwsem);
+	ret = notifier_chain_unregister(&nh->head, n);
+	up_write(&nh->rwsem);
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
+
+/**
+ *	blocking_notifier_call_chain - Call functions in a blocking notifier chain
+ *	@nh: Pointer to head of the blocking notifier chain
  *	@val: Value passed unmodified to notifier function
  *	@v: Pointer passed unmodified to notifier function
  *
- *	Calls each function in a notifier chain in turn.
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in a process context, so they are allowed to block.
  *
- *	If the return value of the notifier can be and'd
- *	with %NOTIFY_STOP_MASK, then notifier_call_chain
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then blocking_notifier_call_chain
  *	will return immediately, with the return value of
  *	the notifier function which halted execution.
- *	Otherwise, the return value is the return value
+ *	Otherwise the return value is the return value
  *	of the last notifier function called.
  */
  
-int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
+int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
+		unsigned long val, void *v)
 {
-	int ret=NOTIFY_DONE;
-	struct notifier_block *nb = *n;
+	int ret;
 
-	while(nb)
-	{
-		ret=nb->notifier_call(nb,val,v);
-		if(ret&NOTIFY_STOP_MASK)
-		{
-			return ret;
-		}
-		nb=nb->next;
-	}
+	down_read(&nh->rwsem);
+	ret = notifier_call_chain(&nh->head, val, v);
+	up_read(&nh->rwsem);
 	return ret;
 }
 
-EXPORT_SYMBOL(notifier_call_chain);
+EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
+
+/*
+ *	Raw notifier chain routines.  There is no protection;
+ *	the caller must provide it.  Use at your own risk!
+ */
+
+/**
+ *	raw_notifier_chain_register - Add notifier to a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to a raw notifier chain.
+ *	All locking must be provided by the caller.
+ *
+ *	Currently always returns zero.
+ */
+
+int raw_notifier_chain_register(struct raw_notifier_head *nh,
+		struct notifier_block *n)
+{
+	return notifier_chain_register(&nh->head, n);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_chain_register);
+
+/**
+ *	raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from a raw notifier chain.
+ *	All locking must be provided by the caller.
+ *
+ *	Returns zero on success or %-ENOENT on failure.
+ */
+int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
+		struct notifier_block *n)
+{
+	return notifier_chain_unregister(&nh->head, n);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
+
+/**
+ *	raw_notifier_call_chain - Call functions in a raw notifier chain
+ *	@nh: Pointer to head of the raw notifier chain
+ *	@val: Value passed unmodified to notifier function
+ *	@v: Pointer passed unmodified to notifier function
+ *
+ *	Calls each function in a notifier chain in turn.  The functions
+ *	run in an undefined context.
+ *	All locking must be provided by the caller.
+ *
+ *	If the return value of the notifier can be and'ed
+ *	with %NOTIFY_STOP_MASK then raw_notifier_call_chain
+ *	will return immediately, with the return value of
+ *	the notifier function which halted execution.
+ *	Otherwise the return value is the return value
+ *	of the last notifier function called.
+ */
+
+int raw_notifier_call_chain(struct raw_notifier_head *nh,
+		unsigned long val, void *v)
+{
+	return notifier_call_chain(&nh->head, val, v);
+}
+
+EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
 
 /**
  *	register_reboot_notifier - Register function to be called at reboot time
@@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain);
  *	Registers a function with the list of functions
  *	to be called at reboot time.
  *
- *	Currently always returns zero, as notifier_chain_register
+ *	Currently always returns zero, as blocking_notifier_chain_register
  *	always returns zero.
  */
  
 int register_reboot_notifier(struct notifier_block * nb)
 {
-	return notifier_chain_register(&reboot_notifier_list, nb);
+	return blocking_notifier_chain_register(&reboot_notifier_list, nb);
 }
 
 EXPORT_SYMBOL(register_reboot_notifier);
@@ -219,7 +424,7 @@ EXPORT_SYMBOL(register_reboot_notifier);
  
 int unregister_reboot_notifier(struct notifier_block * nb)
 {
-	return notifier_chain_unregister(&reboot_notifier_list, nb);
+	return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
 }
 
 EXPORT_SYMBOL(unregister_reboot_notifier);
@@ -380,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart);
 
 void kernel_restart_prepare(char *cmd)
 {
-	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
 	device_shutdown();
 }
@@ -430,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec);
 
 void kernel_shutdown_prepare(enum system_states state)
 {
-	notifier_call_chain(&reboot_notifier_list,
+	blocking_notifier_call_chain(&reboot_notifier_list,
 		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
 	system_state = state;
 	device_shutdown();
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 9106354c781e..a49a6975092d 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -73,23 +73,23 @@ DEFINE_RWLOCK(hci_cb_list_lock);
 struct hci_proto *hci_proto[HCI_MAX_PROTO];
 
 /* HCI notifiers list */
-static struct notifier_block *hci_notifier;
+static ATOMIC_NOTIFIER_HEAD(hci_notifier);
 
 /* ---- HCI notifications ---- */
 
 int hci_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&hci_notifier, nb);
+	return atomic_notifier_chain_register(&hci_notifier, nb);
 }
 
 int hci_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&hci_notifier, nb);
+	return atomic_notifier_chain_unregister(&hci_notifier, nb);
 }
 
 static void hci_notify(struct hci_dev *hdev, int event)
 {
-	notifier_call_chain(&hci_notifier, event, hdev);
+	atomic_notifier_call_chain(&hci_notifier, event, hdev);
 }
 
 /* ---- HCI requests ---- */
diff --git a/net/core/dev.c b/net/core/dev.c
index 8e1dc3051222..a3ab11f34153 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -193,7 +193,7 @@ static inline struct hlist_head *dev_index_hash(int ifindex)
  *	Our notifier list
  */
 
-static struct notifier_block *netdev_chain;
+static BLOCKING_NOTIFIER_HEAD(netdev_chain);
 
 /*
  *	Device drivers call our routines to queue packets here. We empty the
@@ -736,7 +736,8 @@ int dev_change_name(struct net_device *dev, char *newname)
 	if (!err) {
 		hlist_del(&dev->name_hlist);
 		hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
-		notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGENAME, dev);
 	}
 
 	return err;
@@ -750,7 +751,7 @@ int dev_change_name(struct net_device *dev, char *newname)
  */
 void netdev_features_change(struct net_device *dev)
 {
-	notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
 }
 EXPORT_SYMBOL(netdev_features_change);
 
@@ -765,7 +766,8 @@ EXPORT_SYMBOL(netdev_features_change);
 void netdev_state_change(struct net_device *dev)
 {
 	if (dev->flags & IFF_UP) {
-		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGE, dev);
 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 	}
 }
@@ -862,7 +864,7 @@ int dev_open(struct net_device *dev)
 		/*
 		 *	... and announce new interface.
 		 */
-		notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
+		blocking_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 	}
 	return ret;
 }
@@ -885,7 +887,7 @@ int dev_close(struct net_device *dev)
 	 *	Tell people we are going down, so that they can
 	 *	prepare to death, when device is still operating.
 	 */
-	notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
 
 	dev_deactivate(dev);
 
@@ -922,7 +924,7 @@ int dev_close(struct net_device *dev)
 	/*
 	 * Tell people we are down
 	 */
-	notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 
 	return 0;
 }
@@ -953,7 +955,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
 	int err;
 
 	rtnl_lock();
-	err = notifier_chain_register(&netdev_chain, nb);
+	err = blocking_notifier_chain_register(&netdev_chain, nb);
 	if (!err) {
 		for (dev = dev_base; dev; dev = dev->next) {
 			nb->notifier_call(nb, NETDEV_REGISTER, dev);
@@ -981,7 +983,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 	int err;
 
 	rtnl_lock();
-	err = notifier_chain_unregister(&netdev_chain, nb);
+	err = blocking_notifier_chain_unregister(&netdev_chain, nb);
 	rtnl_unlock();
 	return err;
 }
@@ -992,12 +994,12 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
  *      @v:   pointer passed unmodified to notifier function
  *
  *	Call all network notifier blocks.  Parameters and return value
- *	are as for notifier_call_chain().
+ *	are as for blocking_notifier_call_chain().
  */
 
 int call_netdevice_notifiers(unsigned long val, void *v)
 {
-	return notifier_call_chain(&netdev_chain, val, v);
+	return blocking_notifier_call_chain(&netdev_chain, val, v);
 }
 
 /* When > 0 there are consumers of rx skb time stamps */
@@ -2242,7 +2244,8 @@ int dev_change_flags(struct net_device *dev, unsigned flags)
 	if (dev->flags & IFF_UP &&
 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
 					  IFF_VOLATILE)))
-		notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGE, dev);
 
 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
@@ -2286,8 +2289,8 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 	else
 		dev->mtu = new_mtu;
 	if (!err && dev->flags & IFF_UP)
-		notifier_call_chain(&netdev_chain,
-				    NETDEV_CHANGEMTU, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGEMTU, dev);
 	return err;
 }
 
@@ -2303,7 +2306,8 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 		return -ENODEV;
 	err = dev->set_mac_address(dev, sa);
 	if (!err)
-		notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev);
+		blocking_notifier_call_chain(&netdev_chain,
+				NETDEV_CHANGEADDR, dev);
 	return err;
 }
 
@@ -2359,7 +2363,7 @@ static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
 				return -EINVAL;
 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
-			notifier_call_chain(&netdev_chain,
+			blocking_notifier_call_chain(&netdev_chain,
 					    NETDEV_CHANGEADDR, dev);
 			return 0;
 
@@ -2813,7 +2817,7 @@ int register_netdevice(struct net_device *dev)
 	write_unlock_bh(&dev_base_lock);
 
 	/* Notify protocols, that a new device appeared. */
-	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
 
 	/* Finish registration after unlock */
 	net_set_todo(dev);
@@ -2892,7 +2896,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
 			rtnl_lock();
 
 			/* Rebroadcast unregister notification */
-			notifier_call_chain(&netdev_chain,
+			blocking_notifier_call_chain(&netdev_chain,
 					    NETDEV_UNREGISTER, dev);
 
 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
@@ -3148,7 +3152,7 @@ int unregister_netdevice(struct net_device *dev)
 	/* Notify protocols, that we are about to destroy
 	   this device. They should clean all the things.
 	*/
-	notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
+	blocking_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
 	
 	/*
 	 *	Flush the multicast chain
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index cc7b9d9255ef..d2ae9893ca17 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -68,7 +68,7 @@ __le16 decnet_address = 0;
 
 static DEFINE_RWLOCK(dndev_lock);
 static struct net_device *decnet_default_device;
-static struct notifier_block *dnaddr_chain;
+static BLOCKING_NOTIFIER_HEAD(dnaddr_chain);
 
 static struct dn_dev *dn_dev_create(struct net_device *dev, int *err);
 static void dn_dev_delete(struct net_device *dev);
@@ -446,7 +446,7 @@ static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr **ifap, int de
 	}
 
 	rtmsg_ifa(RTM_DELADDR, ifa1);
-	notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1);
+	blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1);
 	if (destroy) {
 		dn_dev_free_ifa(ifa1);
 
@@ -481,7 +481,7 @@ static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa)
 	dn_db->ifa_list = ifa;
 
 	rtmsg_ifa(RTM_NEWADDR, ifa);
-	notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa);
+	blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa);
 
 	return 0;
 }
@@ -1285,12 +1285,12 @@ void dn_dev_devices_on(void)
 
 int register_dnaddr_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&dnaddr_chain, nb);
+	return blocking_notifier_chain_register(&dnaddr_chain, nb);
 }
 
 int unregister_dnaddr_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&dnaddr_chain, nb);
+	return blocking_notifier_chain_unregister(&dnaddr_chain, nb);
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 44fdf1413e2c..81c2f7885292 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -81,7 +81,7 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
 
 static void rtmsg_ifa(int event, struct in_ifaddr *);
 
-static struct notifier_block *inetaddr_chain;
+static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
 static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			 int destroy);
 #ifdef CONFIG_SYSCTL
@@ -267,7 +267,8 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 				*ifap1 = ifa->ifa_next;
 
 				rtmsg_ifa(RTM_DELADDR, ifa);
-				notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
+				blocking_notifier_call_chain(&inetaddr_chain,
+						NETDEV_DOWN, ifa);
 				inet_free_ifa(ifa);
 			} else {
 				promote = ifa;
@@ -291,7 +292,7 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	   So that, this order is correct.
 	 */
 	rtmsg_ifa(RTM_DELADDR, ifa1);
-	notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
 
 	if (promote) {
 
@@ -303,7 +304,8 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 
 		promote->ifa_flags &= ~IFA_F_SECONDARY;
 		rtmsg_ifa(RTM_NEWADDR, promote);
-		notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote);
+		blocking_notifier_call_chain(&inetaddr_chain,
+				NETDEV_UP, promote);
 		for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) {
 			if (ifa1->ifa_mask != ifa->ifa_mask ||
 			    !inet_ifa_match(ifa1->ifa_address, ifa))
@@ -366,7 +368,7 @@ static int inet_insert_ifa(struct in_ifaddr *ifa)
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
 	rtmsg_ifa(RTM_NEWADDR, ifa);
-	notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+	blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 
 	return 0;
 }
@@ -938,12 +940,12 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop
 
 int register_inetaddr_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&inetaddr_chain, nb);
+	return blocking_notifier_chain_register(&inetaddr_chain, nb);
 }
 
 int unregister_inetaddr_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&inetaddr_chain, nb);
+	return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
 }
 
 /* Rename ifa_labels for a device name change. Make some effort to preserve existing
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 9e34034729a6..ceaabc18202b 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -80,8 +80,8 @@ static int ip_conntrack_vmalloc;
 static unsigned int ip_conntrack_next_id;
 static unsigned int ip_conntrack_expect_next_id;
 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-struct notifier_block *ip_conntrack_chain;
-struct notifier_block *ip_conntrack_expect_chain;
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
 
 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
 
@@ -92,7 +92,7 @@ __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
 {
 	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
 	if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
-		notifier_call_chain(&ip_conntrack_chain, ecache->events,
+		atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
 				    ecache->ct);
 	ecache->events = 0;
 	ip_conntrack_put(ecache->ct);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 01c62a0d3742..445006ee4522 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -143,7 +143,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
 				struct prefix_info *pinfo);
 static int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev);
 
-static struct notifier_block *inet6addr_chain;
+static ATOMIC_NOTIFIER_HEAD(inet6addr_chain);
 
 struct ipv6_devconf ipv6_devconf = {
 	.forwarding		= 0,
@@ -593,7 +593,7 @@ out2:
 	read_unlock_bh(&addrconf_lock);
 
 	if (likely(err == 0))
-		notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa);
+		atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa);
 	else {
 		kfree(ifa);
 		ifa = ERR_PTR(err);
@@ -688,7 +688,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 
 	ipv6_ifa_notify(RTM_DELADDR, ifp);
 
-	notifier_call_chain(&inet6addr_chain,NETDEV_DOWN,ifp);
+	atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp);
 
 	addrconf_del_timer(ifp);
 
@@ -3767,12 +3767,12 @@ static void addrconf_sysctl_unregister(struct ipv6_devconf *p)
 
 int register_inet6addr_notifier(struct notifier_block *nb)
 {
-        return notifier_chain_register(&inet6addr_chain, nb);
+        return atomic_notifier_chain_register(&inet6addr_chain, nb);
 }
 
 int unregister_inet6addr_notifier(struct notifier_block *nb)
 {
-        return notifier_chain_unregister(&inet6addr_chain,nb);
+        return atomic_notifier_chain_unregister(&inet6addr_chain,nb);
 }
 
 /*
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0ae281d9bfc3..56389c83557c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -90,8 +90,8 @@ static int nf_conntrack_vmalloc;
 static unsigned int nf_conntrack_next_id;
 static unsigned int nf_conntrack_expect_next_id;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
-struct notifier_block *nf_conntrack_chain;
-struct notifier_block *nf_conntrack_expect_chain;
+ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
+ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
 
 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
 
@@ -103,7 +103,7 @@ __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
 	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
 	if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
 	    && ecache->events)
-		notifier_call_chain(&nf_conntrack_chain, ecache->events,
+		atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
 				    ecache->ct);
 
 	ecache->events = 0;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index d00a9034cb5f..2a233ffcf618 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -123,7 +123,7 @@ static void netlink_destroy_callback(struct netlink_callback *cb);
 static DEFINE_RWLOCK(nl_table_lock);
 static atomic_t nl_table_users = ATOMIC_INIT(0);
 
-static struct notifier_block *netlink_chain;
+static ATOMIC_NOTIFIER_HEAD(netlink_chain);
 
 static u32 netlink_group_mask(u32 group)
 {
@@ -469,7 +469,8 @@ static int netlink_release(struct socket *sock)
 						.protocol = sk->sk_protocol,
 						.pid = nlk->pid,
 					  };
-		notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n);
+		atomic_notifier_call_chain(&netlink_chain,
+				NETLINK_URELEASE, &n);
 	}	
 
 	if (nlk->module)
@@ -1695,12 +1696,12 @@ static struct file_operations netlink_seq_fops = {
 
 int netlink_register_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_register(&netlink_chain, nb);
+	return atomic_notifier_chain_register(&netlink_chain, nb);
 }
 
 int netlink_unregister_notifier(struct notifier_block *nb)
 {
-	return notifier_chain_unregister(&netlink_chain, nb);
+	return atomic_notifier_chain_unregister(&netlink_chain, nb);
 }
                 
 static const struct proto_ops netlink_ops = {
-- 
cgit v1.2.3


From f83ca9fe3ee390755f18b4a7780c25ce593b484a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 28 Mar 2006 01:56:20 -0800
Subject: [PATCH] symversion warning fix

gcc-4.2:

kernel/module.c: In function '__find_symbol':
kernel/module.c:158: warning: the address of '__start___kcrctab', will always evaluate as 'true'
kernel/module.c:165: warning: the address of '__start___kcrctab_gpl', will always evaluate as 'true'
kernel/module.c:182: warning: the address of '__start___kcrctab_gpl_future', will always evaluate as 'true'

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 4fafd58038a0..bd088a7c1499 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -127,7 +127,7 @@ extern const unsigned long __start___kcrctab_gpl_future[];
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
 #else
-#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL)
+#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
 
 /* lookup symbol in given range of kernel_symbols */
-- 
cgit v1.2.3


From b9e20a920092eb3840424f85c78852c0433df00d Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Tue, 28 Mar 2006 01:56:24 -0800
Subject: [PATCH] Change dash2underscore() return value to char

Since dash2underscore() just operates and returns chars, I guess its safe
to change the return value to a char.  With my .config, this reduces its
size by 5 bytes.

   text    data     bss     dec     hex filename
   4155     152       0    4307    10d3 params.o.orig
   4150     152       0    4302    10ce params.o

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 9de637a5c8bc..af43ecdc8d9b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,7 +31,7 @@
 #define DEBUGP(fmt, a...)
 #endif
 
-static inline int dash2underscore(char c)
+static inline char dash2underscore(char c)
 {
 	if (c == '-')
 		return '_';
-- 
cgit v1.2.3


From 0a945022778f100115d0cb6234eb28fc1b15ccaf Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 28 Mar 2006 01:56:37 -0800
Subject: [PATCH] for_each_possible_cpu: fixes for generic part

replaces for_each_cpu with for_each_possible_cpu().

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/ll_rw_blk.c            | 2 +-
 fs/file.c                    | 2 +-
 fs/proc/proc_misc.c          | 2 +-
 include/asm-generic/percpu.h | 2 +-
 include/linux/genhd.h        | 4 ++--
 include/linux/kernel_stat.h  | 2 +-
 init/main.c                  | 2 +-
 kernel/rcutorture.c          | 4 ++--
 kernel/sched.c               | 8 ++++----
 mm/slab.c                    | 4 ++--
 mm/swap.c                    | 2 +-
 11 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 82469db25100..5a19e2eb5711 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3514,7 +3514,7 @@ int __init blk_dev_init(void)
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
diff --git a/fs/file.c b/fs/file.c
index bbc743314730..55f4e7022563 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -373,6 +373,6 @@ static void __devinit fdtable_defer_list_init(int cpu)
 void __init files_defer_init(void)
 {
 	int i;
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		fdtable_defer_list_init(i);
 }
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 1e9ea37d457e..1edce0c34bfd 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -534,7 +534,7 @@ static int show_stat(struct seq_file *p, void *v)
 	if (wall_to_monotonic.tv_nsec)
 		--jif;
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		int j;
 
 		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 78cf45547e31..c0caf433a7d7 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -19,7 +19,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
 	unsigned int __i;					\
-	for_each_cpu(__i)					\
+	for_each_possible_cpu(__i)				\
 		memcpy((pcpudst)+__per_cpu_offset[__i],		\
 		       (src), (size));				\
 } while (0)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 3c1b0294a742..10a27f29d692 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -152,14 +152,14 @@ struct disk_attribute {
 ({									\
 	typeof(gendiskp->dkstats->field) res = 0;			\
 	int i;								\
-	for_each_cpu(i)							\
+	for_each_possible_cpu(i)					\
 		res += per_cpu_ptr(gendiskp->dkstats, i)->field;	\
 	res;								\
 })
 
 static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)	{
 	int i;
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		memset(per_cpu_ptr(gendiskp->dkstats, i), value,
 				sizeof (struct disk_stats));
 }		
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index a484572c302e..b46249082cca 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -46,7 +46,7 @@ static inline int kstat_irqs(int irq)
 {
 	int cpu, sum = 0;
 
-	for_each_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		sum += kstat_cpu(cpu).irqs[irq];
 
 	return sum;
diff --git a/init/main.c b/init/main.c
index 64466ea1984c..4a2f0898dda1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -341,7 +341,7 @@ static void __init setup_per_cpu_areas(void)
 #endif
 	ptr = alloc_bootmem(size * nr_possible_cpus);
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 		ptr += size;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b4b362b5baf5..8154e7589d12 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
 	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
 	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
 
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
 			pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
 			batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -535,7 +535,7 @@ rcu_torture_init(void)
 	atomic_set(&n_rcu_torture_error, 0);
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
 		atomic_set(&rcu_torture_wcount[i], 0);
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
 			per_cpu(rcu_torture_count, cpu)[i] = 0;
 			per_cpu(rcu_torture_batch, cpu)[i] = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 7854ee516b92..a9ecac398bb9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1625,7 +1625,7 @@ unsigned long nr_uninterruptible(void)
 {
 	unsigned long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_uninterruptible;
 
 	/*
@@ -1642,7 +1642,7 @@ unsigned long long nr_context_switches(void)
 {
 	unsigned long long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_switches;
 
 	return sum;
@@ -1652,7 +1652,7 @@ unsigned long nr_iowait(void)
 {
 	unsigned long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += atomic_read(&cpu_rq(i)->nr_iowait);
 
 	return sum;
@@ -6080,7 +6080,7 @@ void __init sched_init(void)
 	runqueue_t *rq;
 	int i, j, k;
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		prio_array_t *array;
 
 		rq = cpu_rq(i);
diff --git a/mm/slab.c b/mm/slab.c
index 681837499d7d..4cbf8bb13557 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3311,7 +3311,7 @@ void *__alloc_percpu(size_t size)
 	 * and we have no way of figuring out how to fix the array
 	 * that we have allocated then....
 	 */
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		int node = cpu_to_node(i);
 
 		if (node_online(node))
@@ -3398,7 +3398,7 @@ void free_percpu(const void *objp)
 	/*
 	 * We allocate for all cpus so we cannot use for online cpu here.
 	 */
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 	    kfree(p->ptrs[i]);
 	kfree(p);
 }
diff --git a/mm/swap.c b/mm/swap.c
index 91b7e2026f69..88895c249bc9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -512,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc)
 
 	spin_lock(&fbc->lock);
 	ret = fbc->count;
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		long *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
-- 
cgit v1.2.3


From ec7e15d6486e9d1da1c2f344b670b8388ba7019b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 28 Mar 2006 01:56:55 -0800
Subject: [PATCH] compat_sys_futex() warning fix

kernel/futex_compat.c: In function `compat_sys_futex':
kernel/futex_compat.c:140: warning: passing arg 1 of `do_futex' makes integer from pointer without a cast
kernel/futex_compat.c:140: warning: passing arg 5 of `do_futex' makes integer from pointer without a cast

Not sure what Ingo was thinking of here.  Put the casts back in.

Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex_compat.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 9c077cf9aa84..54274fc85321 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -137,5 +137,6 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	if (op >= FUTEX_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
-	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
+	return do_futex((unsigned long)uaddr, op, val, timeout,
+			(unsigned long)uaddr2, val2, val3);
 }
-- 
cgit v1.2.3


From fef23e7fbb11a0a78cd61935f7056bc2b237995a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 28 Mar 2006 16:10:58 -0800
Subject: [PATCH] exec: allow init to exec from any thread.

After looking at the problem of init calling exec some more I figured out
an easy way to make the code work.

The actual symptom without out this patch is that all threads will die
except pid == 1, and the thread calling exec.  The thread calling exec will
wait forever for pid == 1 to die.

Since pid == 1 does not install a handler for SIGKILL it will never die.

This modifies the tests for init from current->pid == 1 to the equivalent
current == child_reaper.  And then it causes exec in the ugly case to
modify child_reaper.

The only weird symptom is that you wind up with an init process that
doesn't have the oldest start time on the box.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c       | 13 ++++++++++++-
 kernel/exit.c   |  2 +-
 kernel/signal.c |  2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index c7397c46ad6d..d0ecea0781f7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -660,12 +660,23 @@ static int de_thread(struct task_struct *tsk)
 		struct dentry *proc_dentry1, *proc_dentry2;
 		unsigned long ptrace;
 
+		leader = current->group_leader;
+		/*
+		 * If our leader is the child_reaper become
+		 * the child_reaper and resend SIGKILL signal.
+		 */
+		if (unlikely(leader == child_reaper)) {
+			write_lock(&tasklist_lock);
+			child_reaper = current;
+			zap_other_threads(current);
+			write_unlock(&tasklist_lock);
+		}
+
 		/*
 		 * Wait for the thread group leader to be a zombie.
 		 * It should already be zombie at this point, most
 		 * of the time.
 		 */
-		leader = current->group_leader;
 		while (leader->exit_state != EXIT_ZOMBIE)
 			yield();
 
diff --git a/kernel/exit.c b/kernel/exit.c
index a8c7efc7a681..223a8802b665 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -807,7 +807,7 @@ fastcall NORET_TYPE void do_exit(long code)
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
-	if (unlikely(tsk->pid == 1))
+	if (unlikely(tsk == child_reaper))
 		panic("Attempted to kill init!");
 
 	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 75f7341b0c39..dc8f91bf9f89 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1990,7 +1990,7 @@ relock:
 			continue;
 
 		/* Init gets no signals it doesn't want.  */
-		if (current->pid == 1)
+		if (current == child_reaper)
 			continue;
 
 		if (sig_kernel_stop(signr)) {
-- 
cgit v1.2.3


From d73d65293e3e2de7e916a89c8da30be0948afab7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 28 Mar 2006 16:11:03 -0800
Subject: [PATCH] pidhash: kill switch_exec_pids

switch_exec_pids is only called from de_thread by way of exec, and it is
only called when we are exec'ing from a non thread group leader.

Currently switch_exec_pids gives the leader the pid of the thread and
unhashes and rehashes all of the process groups.  The leader is already in
the EXIT_DEAD state so no one cares about it's pids.  The only concern for
the leader is that __unhash_process called from release_task will function
correctly.  If we don't touch the leader at all we know that
__unhash_process will work fine so there is no need to touch the leader.

For the task becomming the thread group leader, we just need to give it the
pid of the old thread group leader, add it to the task list, and attach it
to the session and the process group of the thread group.

Currently de_thread is also adding the task to the task list which is just
silly.

Currently the only leader of __detach_pid besides detach_pid is
switch_exec_pids because of the ugly extra work that was being
performed.

So this patch removes switch_exec_pids because it is doing too much, it is
creating an unnecessary special case in pid.c, duing work duplicated in
de_thread, and generally obscuring what it is going on.

The necessary work is added to de_thread, and it seems to be a little
clearer there what is going on.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Kirill Korotaev <dev@sw.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c           | 14 +++++++++++---
 include/linux/pid.h |  1 -
 kernel/pid.c        | 30 ------------------------------
 3 files changed, 11 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index dd194923c52c..db0769447d33 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -708,7 +708,17 @@ static int de_thread(struct task_struct *tsk)
 		remove_parent(current);
 		remove_parent(leader);
 
-		switch_exec_pids(leader, current);
+
+		/* Become a process group leader with the old leader's pid.
+		 * Note: The old leader also uses thispid until release_task
+		 *       is called.  Odd but simple and correct.
+		 */
+		detach_pid(current, PIDTYPE_PID);
+		current->pid = leader->pid;
+		attach_pid(current, PIDTYPE_PID,  current->pid);
+		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
+		attach_pid(current, PIDTYPE_SID,  current->signal->session);
+		list_add_tail(&current->tasks, &init_task.tasks);
 
 		current->parent = current->real_parent = leader->real_parent;
 		leader->parent = leader->real_parent = child_reaper;
@@ -722,8 +732,6 @@ static int de_thread(struct task_struct *tsk)
 			__ptrace_link(current, parent);
 		}
 
-		list_del(&current->tasks);
-		list_add_tail(&current->tasks, &init_task.tasks);
 		current->exit_signal = SIGCHLD;
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 5b2fcb19d2da..099e70ecf7c7 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -38,7 +38,6 @@ extern struct pid *FASTCALL(find_pid(enum pid_type, int));
 
 extern int alloc_pidmap(void);
 extern void FASTCALL(free_pidmap(int));
-extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread);
 
 #define do_each_task_pid(who, type, task)				\
 	if ((task = find_task_by_pid_type(type, who))) {		\
diff --git a/kernel/pid.c b/kernel/pid.c
index 1acc07246991..7781d9999058 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,36 +217,6 @@ task_t *find_task_by_pid_type(int type, int nr)
 
 EXPORT_SYMBOL(find_task_by_pid_type);
 
-/*
- * This function switches the PIDs if a non-leader thread calls
- * sys_execve() - this must be done without releasing the PID.
- * (which a detach_pid() would eventually do.)
- */
-void switch_exec_pids(task_t *leader, task_t *thread)
-{
-	__detach_pid(leader, PIDTYPE_PID);
-	__detach_pid(leader, PIDTYPE_TGID);
-	__detach_pid(leader, PIDTYPE_PGID);
-	__detach_pid(leader, PIDTYPE_SID);
-
-	__detach_pid(thread, PIDTYPE_PID);
-	__detach_pid(thread, PIDTYPE_TGID);
-
-	leader->pid = leader->tgid = thread->pid;
-	thread->pid = thread->tgid;
-
-	attach_pid(thread, PIDTYPE_PID, thread->pid);
-	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
-	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
-	list_add_tail(&thread->tasks, &init_task.tasks);
-
-	attach_pid(leader, PIDTYPE_PID, leader->pid);
-	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
-	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
-}
-
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
-- 
cgit v1.2.3


From d799f03597cabc6112acb518fc8ab4487aa4f953 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:04 -0800
Subject: [PATCH] choose_new_parent: remove unused arg, sanitize exit_state
 check

'child_reaper' arg is not used in choose_new_parent().

"->exit_state >= EXIT_ZOMBIE" check is a leftover, was
valid when EXIT_ZOMBIE lived in ->state var.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 223a8802b665..e04a59405e99 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -538,13 +538,13 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+static inline void choose_new_parent(task_t *p, task_t *reaper)
 {
 	/*
 	 * Make sure we're not reparenting to ourselves and that
 	 * the parent is not a zombie.
 	 */
-	BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE);
+	BUG_ON(p == reaper || reaper->exit_state);
 	p->real_parent = reaper;
 }
 
@@ -645,7 +645,7 @@ static void forget_original_parent(struct task_struct * father,
 
 		if (father == p->real_parent) {
 			/* reparent with a reaper, real father it's us */
-			choose_new_parent(p, reaper, child_reaper);
+			choose_new_parent(p, reaper);
 			reparent_thread(p, father, 0);
 		} else {
 			/* reparent ptraced task to its real parent */
@@ -666,7 +666,7 @@ static void forget_original_parent(struct task_struct * father,
 	}
 	list_for_each_safe(_p, _n, &father->ptrace_children) {
 		p = list_entry(_p,struct task_struct,ptrace_list);
-		choose_new_parent(p, reaper, child_reaper);
+		choose_new_parent(p, reaper);
 		reparent_thread(p, father, 1);
 	}
 }
-- 
cgit v1.2.3


From 8fafabd86f1b75ed3cc6a6ffbe6c3e53e3d8457d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:05 -0800
Subject: [PATCH] remove add_parent()'s parent argument

add_parent(p, parent) is always called with parent == p->parent, and it makes
no sense to do it differently.  This patch removes this argument.

No changes in affected .o files.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/kernel/irixsig.c | 4 ++--
 fs/exec.c                  | 4 ++--
 include/linux/sched.h      | 4 ++--
 kernel/exit.c              | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/kernel/irixsig.c b/arch/mips/kernel/irixsig.c
index 08273a2a501d..8150f071f80a 100644
--- a/arch/mips/kernel/irixsig.c
+++ b/arch/mips/kernel/irixsig.c
@@ -603,7 +603,7 @@ repeat:
 			/* move to end of parent's list to avoid starvation */
 			write_lock_irq(&tasklist_lock);
 			remove_parent(p);
-			add_parent(p, p->parent);
+			add_parent(p);
 			write_unlock_irq(&tasklist_lock);
 			retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 			if (retval)
@@ -643,7 +643,7 @@ repeat:
 				write_lock_irq(&tasklist_lock);
 				remove_parent(p);
 				p->parent = p->real_parent;
-				add_parent(p, p->parent);
+				add_parent(p);
 				do_notify_parent(p, SIGCHLD);
 				write_unlock_irq(&tasklist_lock);
 			} else
diff --git a/fs/exec.c b/fs/exec.c
index db0769447d33..9046ad2b0614 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -725,8 +725,8 @@ static int de_thread(struct task_struct *tsk)
 		current->group_leader = current;
 		leader->group_leader = leader;
 
-		add_parent(current, current->parent);
-		add_parent(leader, leader->parent);
+		add_parent(current);
+		add_parent(leader);
 		if (ptrace) {
 			current->ptrace = ptrace;
 			__ptrace_link(current, parent);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5f5ab98bbb6b..b4b14c32b28a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1184,7 +1184,7 @@ extern void wait_task_inactive(task_t * p);
 #endif
 
 #define remove_parent(p)	list_del_init(&(p)->sibling)
-#define add_parent(p, parent)	list_add_tail(&(p)->sibling,&(parent)->children)
+#define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
 
 #define REMOVE_LINKS(p) do {					\
 	if (thread_group_leader(p))				\
@@ -1195,7 +1195,7 @@ extern void wait_task_inactive(task_t * p);
 #define SET_LINKS(p) do {					\
 	if (thread_group_leader(p))				\
 		list_add_tail(&(p)->tasks,&init_task.tasks);	\
-	add_parent(p, (p)->parent);				\
+	add_parent(p);						\
 	} while (0)
 
 #define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
diff --git a/kernel/exit.c b/kernel/exit.c
index e04a59405e99..df26c33037d2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1281,7 +1281,7 @@ bail_ref:
 
 	/* move to end of parent's list to avoid starvation */
 	remove_parent(p);
-	add_parent(p, p->parent);
+	add_parent(p);
 
 	write_unlock_irq(&tasklist_lock);
 
-- 
cgit v1.2.3


From 9b678ece42893b53aae5ed7cb8d7cb261cacb72c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:05 -0800
Subject: [PATCH] don't use REMOVE_LINKS/SET_LINKS for reparenting

There are places where kernel uses REMOVE_LINKS/SET_LINKS while changing
process's ->parent.  Use add_parent/remove_parent instead, they don't abuse
of global process list.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c   | 4 ++--
 kernel/ptrace.c | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index df26c33037d2..5b5e8b67680e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -238,10 +238,10 @@ static void reparent_to_init(void)
 
 	ptrace_unlink(current);
 	/* Reparent to init */
-	REMOVE_LINKS(current);
+	remove_parent(current);
 	current->parent = child_reaper;
 	current->real_parent = child_reaper;
-	SET_LINKS(current);
+	add_parent(current);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d95a72c9279d..86a7f6c60cb2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -35,9 +35,9 @@ void __ptrace_link(task_t *child, task_t *new_parent)
 	if (child->parent == new_parent)
 		return;
 	list_add(&child->ptrace_list, &child->parent->ptrace_children);
-	REMOVE_LINKS(child);
+	remove_parent(child);
 	child->parent = new_parent;
-	SET_LINKS(child);
+	add_parent(child);
 }
  
 /*
@@ -77,9 +77,9 @@ void __ptrace_unlink(task_t *child)
 	child->ptrace = 0;
 	if (!list_empty(&child->ptrace_list)) {
 		list_del_init(&child->ptrace_list);
-		REMOVE_LINKS(child);
+		remove_parent(child);
 		child->parent = child->real_parent;
-		SET_LINKS(child);
+		add_parent(child);
 	}
 
 	ptrace_untrace(child);
-- 
cgit v1.2.3


From c97d98931ac52ef110b62d9b75c6a6f2bfbc1898 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:06 -0800
Subject: [PATCH] kill SET_LINKS/REMOVE_LINKS

Both SET_LINKS() and SET_LINKS/REMOVE_LINKS() have exactly one caller, and
these callers already check thread_group_leader().

This patch kills theese macros, they mix two different things: setting
process's parent and registering it in init_task.tasks list.  Callers are
updated to do these actions by hand.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 12 ------------
 kernel/exit.c         |  4 +++-
 kernel/fork.c         |  4 +++-
 3 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4b14c32b28a..1f16fb1fea22 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1186,18 +1186,6 @@ extern void wait_task_inactive(task_t * p);
 #define remove_parent(p)	list_del_init(&(p)->sibling)
 #define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
 
-#define REMOVE_LINKS(p) do {					\
-	if (thread_group_leader(p))				\
-		list_del_init(&(p)->tasks);			\
-	remove_parent(p);					\
-	} while (0)
-
-#define SET_LINKS(p) do {					\
-	if (thread_group_leader(p))				\
-		list_add_tail(&(p)->tasks,&init_task.tasks);	\
-	add_parent(p);						\
-	} while (0)
-
 #define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
 #define prev_task(p)	list_entry((p)->tasks.prev, struct task_struct, tasks)
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b5e8b67680e..f436a6bd3fb7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,11 +54,13 @@ static void __unhash_process(struct task_struct *p)
 	if (thread_group_leader(p)) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
+
+		list_del_init(&p->tasks);
 		if (p->pid)
 			__get_cpu_var(process_counts)--;
 	}
 
-	REMOVE_LINKS(p);
+	remove_parent(p);
 }
 
 void release_task(struct task_struct * p)
diff --git a/kernel/fork.c b/kernel/fork.c
index c49bd193b058..74c67629ee62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1181,7 +1181,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	 */
 	p->ioprio = current->ioprio;
 
-	SET_LINKS(p);
+	add_parent(p);
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
 
@@ -1191,6 +1191,8 @@ static task_t *copy_process(unsigned long clone_flags,
 		p->signal->session = current->signal->session;
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
 		attach_pid(p, PIDTYPE_SID, p->signal->session);
+
+		list_add_tail(&p->tasks, &init_task.tasks);
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	}
-- 
cgit v1.2.3


From 73b9ebfe126a4a886ee46cbab637374d7024668a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:07 -0800
Subject: [PATCH] pidhash: don't count idle threads

fork_idle() does unhash_process() just after copy_process().  Contrary,
boot_cpu's idle thread explicitely registers itself for each pid_type with nr
= 0.

copy_process() already checks p->pid != 0 before process_counts++, I think we
can just skip attach_pid() calls and job control inits for idle threads and
kill unhash_process().  We don't need to cleanup ->proc_dentry in fork_idle()
because with this patch idle threads are never hashed in
kernel/pid.c:pid_hash[].

We don't need to hash pid == 0 in pidmap_init().  free_pidmap() is never
called with pid == 0 arg, so it will never be reused.  So it is still possible
to use pid == 0 in any PIDTYPE_xxx namespace from kernel/pid.c's POV.

However with this patch we don't hash pid == 0 for PIDTYPE_PID case.  We still
have have PIDTYPE_PGID/PIDTYPE_SID entries with pid == 0: /sbin/init and
kernel threads which don't call daemonize().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/um/kernel/smp.c  |  1 -
 include/linux/sched.h |  2 --
 kernel/exit.c         | 18 +-----------------
 kernel/fork.c         | 35 ++++++++++++++++++-----------------
 kernel/pid.c          | 10 +---------
 5 files changed, 20 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index c8d8d0ac1a7f..511116aebaf7 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -143,7 +143,6 @@ void smp_prepare_cpus(unsigned int maxcpus)
 		idle = idle_thread(cpu);
 
 		init_idle(idle, cpu);
-		unhash_process(idle);
 
 		waittime = 200000000;
 		while (waittime-- && !cpu_isset(cpu, cpu_callin_map))
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1f16fb1fea22..ddc0df7f8bf5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1214,8 +1214,6 @@ static inline int thread_group_empty(task_t *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
-extern void unhash_process(struct task_struct *p);
-
 /*
  * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
diff --git a/kernel/exit.c b/kernel/exit.c
index f436a6bd3fb7..a94e1c31131b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,8 +56,7 @@ static void __unhash_process(struct task_struct *p)
 		detach_pid(p, PIDTYPE_SID);
 
 		list_del_init(&p->tasks);
-		if (p->pid)
-			__get_cpu_var(process_counts)--;
+		__get_cpu_var(process_counts)--;
 	}
 
 	remove_parent(p);
@@ -118,21 +117,6 @@ repeat:
 		goto repeat;
 }
 
-/* we are using it only for SMP init */
-
-void unhash_process(struct task_struct *p)
-{
-	struct dentry *proc_dentry;
-
-	spin_lock(&p->proc_lock);
-	proc_dentry = proc_pid_unhash(p);
-	write_lock_irq(&tasklist_lock);
-	__unhash_process(p);
-	write_unlock_irq(&tasklist_lock);
-	spin_unlock(&p->proc_lock);
-	proc_pid_flush(proc_dentry);
-}
-
 /*
  * This checks not only the pgrp, but falls back on the pid if no
  * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
diff --git a/kernel/fork.c b/kernel/fork.c
index 74c67629ee62..0c32e28cdc5f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1181,25 +1181,26 @@ static task_t *copy_process(unsigned long clone_flags,
 	 */
 	p->ioprio = current->ioprio;
 
-	add_parent(p);
-	if (unlikely(p->ptrace & PT_PTRACED))
-		__ptrace_link(p, current->parent);
-
-	if (thread_group_leader(p)) {
-		p->signal->tty = current->signal->tty;
-		p->signal->pgrp = process_group(current);
-		p->signal->session = current->signal->session;
-		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->signal->session);
-
-		list_add_tail(&p->tasks, &init_task.tasks);
-		if (p->pid)
+	if (likely(p->pid)) {
+		add_parent(p);
+		if (unlikely(p->ptrace & PT_PTRACED))
+			__ptrace_link(p, current->parent);
+
+		if (thread_group_leader(p)) {
+			p->signal->tty = current->signal->tty;
+			p->signal->pgrp = process_group(current);
+			p->signal->session = current->signal->session;
+			attach_pid(p, PIDTYPE_PGID, process_group(p));
+			attach_pid(p, PIDTYPE_SID, p->signal->session);
+
+			list_add_tail(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
+		}
+		attach_pid(p, PIDTYPE_TGID, p->tgid);
+		attach_pid(p, PIDTYPE_PID, p->pid);
+		nr_threads++;
 	}
-	attach_pid(p, PIDTYPE_TGID, p->tgid);
-	attach_pid(p, PIDTYPE_PID, p->pid);
 
-	nr_threads++;
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
@@ -1263,7 +1264,7 @@ task_t * __devinit fork_idle(int cpu)
 	if (!task)
 		return ERR_PTR(-ENOMEM);
 	init_idle(task, cpu);
-	unhash_process(task);
+
 	return task;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 7781d9999058..a9f2dfd006d2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -247,16 +247,8 @@ void __init pidhash_init(void)
 
 void __init pidmap_init(void)
 {
-	int i;
-
 	pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
+	/* Reserve PID 0. We never call free_pidmap(0) */
 	set_bit(0, pidmap_array->page);
 	atomic_dec(&pidmap_array->nr_free);
-
-	/*
-	 * Allocate PID 0, and hash it via all PID types:
-	 */
-
-	for (i = 0; i < PIDTYPE_MAX; i++)
-		attach_pid(current, i, 0);
 }
-- 
cgit v1.2.3


From 6ac781b11ade6e3451f6b460991c8b2b87e58280 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:09 -0800
Subject: [PATCH] reparent_thread: use remove_parent/add_parent

Use remove_parent/add_parent instead of open coding.

No changes in kernel/exit.o

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index a94e1c31131b..98eec590ecbd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -555,9 +555,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
 		 * anyway, so let go of it.
 		 */
 		p->ptrace = 0;
-		list_del_init(&p->sibling);
+		remove_parent(p);
 		p->parent = p->real_parent;
-		list_add_tail(&p->sibling, &p->parent->children);
+		add_parent(p);
 
 		/* If we'd notified the old parent about this child's death,
 		 * also notify the new parent.
-- 
cgit v1.2.3


From 8292d633add73d40eda1d26089e2fc758944ac7c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:10 -0800
Subject: [PATCH] wait_for_helper: trivial style cleanup

Use NULL instead of (... *)0

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kmod.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 51a892063aaa..20a997c73c3d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -170,7 +170,7 @@ static int wait_for_helper(void *data)
 	sa.sa.sa_handler = SIG_IGN;
 	sa.sa.sa_flags = 0;
 	siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
-	do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
+	do_sigaction(SIGCHLD, &sa, NULL);
 	allow_signal(SIGCHLD);
 
 	pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
-- 
cgit v1.2.3


From 1f09f9749cdde4e69f95d62d96d2e03f50b3353c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:11 -0800
Subject: [PATCH] release_task: replace open-coded ptrace_unlink()

Use ptrace_unlink() instead of open-coding.  No changes in kernel/exit.o

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 98eec590ecbd..77c35efad88c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,13 +68,12 @@ void release_task(struct task_struct * p)
 	task_t *leader;
 	struct dentry *proc_dentry;
 
-repeat: 
+repeat:
 	atomic_dec(&p->user->processes);
 	spin_lock(&p->proc_lock);
 	proc_dentry = proc_pid_unhash(p);
 	write_lock_irq(&tasklist_lock);
-	if (unlikely(p->ptrace))
-		__ptrace_unlink(p);
+	ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
 	/*
-- 
cgit v1.2.3


From aa1757f90bea3f598b6e5d04d922a6a60200f1da Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:12 -0800
Subject: [PATCH] convert sighand_cache to use SLAB_DESTROY_BY_RCU

This patch borrows a clever Hugh's 'struct anon_vma' trick.

Without tasklist_lock held we can't trust task->sighand until we locked it
and re-checked that it is still the same.

But this means we don't need to defer 'kmem_cache_free(sighand)'.  We can
return the memory to slab immediately, all we need is to be sure that
sighand->siglock can't dissapear inside rcu protected section.

To do so we need to initialize ->siglock inside ctor function,
SLAB_DESTROY_BY_RCU does the rest.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             |  3 +--
 include/linux/sched.h |  8 --------
 kernel/fork.c         | 21 +++++++++++----------
 kernel/signal.c       |  2 +-
 4 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 9046ad2b0614..950ebd43cdc3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ no_thread_group:
 		/*
 		 * Move our state over to newsighand and switch it in.
 		 */
-		spin_lock_init(&newsighand->siglock);
 		atomic_set(&newsighand->count, 1);
 		memcpy(newsighand->action, oldsighand->action,
 		       sizeof(newsighand->action));
@@ -785,7 +784,7 @@ no_thread_group:
 		write_unlock_irq(&tasklist_lock);
 
 		if (atomic_dec_and_test(&oldsighand->count))
-			sighand_free(oldsighand);
+			kmem_cache_free(sighand_cachep, oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(current));
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ddc0df7f8bf5..bbcfc873bd98 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -355,16 +355,8 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
-	struct rcu_head		rcu;
 };
 
-extern void sighand_free_cb(struct rcu_head *rhp);
-
-static inline void sighand_free(struct sighand_struct *sp)
-{
-	call_rcu(&sp->rcu, sighand_free_cb);
-}
-
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
diff --git a/kernel/fork.c b/kernel/fork.c
index 0c32e28cdc5f..33ffb5bf0dbc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -786,14 +786,6 @@ int unshare_files(void)
 
 EXPORT_SYMBOL(unshare_files);
 
-void sighand_free_cb(struct rcu_head *rhp)
-{
-	struct sighand_struct *sp;
-
-	sp = container_of(rhp, struct sighand_struct, rcu);
-	kmem_cache_free(sighand_cachep, sp);
-}
-
 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct sighand_struct *sig;
@@ -806,7 +798,6 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	rcu_assign_pointer(tsk->sighand, sig);
 	if (!sig)
 		return -ENOMEM;
-	spin_lock_init(&sig->siglock);
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
 	return 0;
@@ -1356,11 +1347,21 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
+static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct sighand_struct *sighand = data;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+					SLAB_CTOR_CONSTRUCTOR)
+		spin_lock_init(&sighand->siglock);
+}
+
 void __init proc_caches_init(void)
 {
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
+			sighand_ctor, NULL);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
diff --git a/kernel/signal.c b/kernel/signal.c
index dc8f91bf9f89..b0b1ca9daa33 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -330,7 +330,7 @@ void __exit_sighand(struct task_struct *tsk)
 	/* Ok, we're done with the signal handlers */
 	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
-		sighand_free(sighand);
+		kmem_cache_free(sighand_cachep, sighand);
 }
 
 void exit_sighand(struct task_struct *tsk)
-- 
cgit v1.2.3


From f63ee72e0fb82e504a0489490babc7612c7cd6c2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:13 -0800
Subject: [PATCH] introduce lock_task_sighand() helper

Add lock_task_sighand() helper and converts group_send_sig_info() to use
it.  Hopefully we will have more users soon.

This patch also removes '!sighand->count' and '!p->usage' checks, I think
they both are bogus, racy and unneeded (but probably it makes sense to
restore them as BUG_ON()s).

->sighand is cleared and it's ->count is decremented in release_task() with
sighand->siglock held, so it is a bug to have '!p->usage || !->count' after
we already locked and verified it is the same.  On the other hand, an
already dead task without ->sighand can have a non-zero ->usage due to
ptrace, for example.

If we read the stale value of ->sighand we must see the change after
spin_lock(), because that change was done while holding that same old
->sighand.siglock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  9 +++++++++
 kernel/signal.c       | 38 ++++++++++++++++++++++++--------------
 2 files changed, 33 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index bbcfc873bd98..ca1fd31aae97 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1225,6 +1225,15 @@ static inline void task_unlock(struct task_struct *p)
 	spin_unlock(&p->alloc_lock);
 }
 
+extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+							unsigned long *flags);
+
+static inline void unlock_task_sighand(struct task_struct *tsk,
+						unsigned long *flags)
+{
+	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
+}
+
 #ifndef __HAVE_THREAD_FUNCTIONS
 
 #define task_thread_info(task) (task)->thread_info
diff --git a/kernel/signal.c b/kernel/signal.c
index b0b1ca9daa33..819fa49aa70a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1120,27 +1120,37 @@ void zap_other_threads(struct task_struct *p)
 /*
  * Must be called under rcu_read_lock() or with tasklist_lock read-held.
  */
+struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
+{
+	struct sighand_struct *sighand;
+
+	for (;;) {
+		sighand = rcu_dereference(tsk->sighand);
+		if (unlikely(sighand == NULL))
+			break;
+
+		spin_lock_irqsave(&sighand->siglock, *flags);
+		if (likely(sighand == tsk->sighand))
+			break;
+		spin_unlock_irqrestore(&sighand->siglock, *flags);
+	}
+
+	return sighand;
+}
+
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	unsigned long flags;
-	struct sighand_struct *sp;
 	int ret;
 
-retry:
 	ret = check_kill_permission(sig, info, p);
-	if (!ret && sig && (sp = rcu_dereference(p->sighand))) {
-		spin_lock_irqsave(&sp->siglock, flags);
-		if (p->sighand != sp) {
-			spin_unlock_irqrestore(&sp->siglock, flags);
-			goto retry;
-		}
-		if ((atomic_read(&sp->count) == 0) ||
-				(atomic_read(&p->usage) == 0)) {
-			spin_unlock_irqrestore(&sp->siglock, flags);
-			return -ESRCH;
+
+	if (!ret && sig) {
+		ret = -ESRCH;
+		if (lock_task_sighand(p, &flags)) {
+			ret = __group_send_sig_info(sig, info, p);
+			unlock_task_sighand(p, &flags);
 		}
-		ret = __group_send_sig_info(sig, info, p);
-		spin_unlock_irqrestore(&sp->siglock, flags);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From a9e88e84b5245da0a1dadb6ccca70ae84e93ccf6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:14 -0800
Subject: [PATCH] introduce sig_needs_tasklist() helper

In my opinion this patch cleans up the code.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 819fa49aa70a..c5b65aa4c2bc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -147,6 +147,9 @@ static kmem_cache_t *sigqueue_cachep;
 #define sig_kernel_stop(sig) \
 		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_STOP_MASK))
 
+#define sig_needs_tasklist(sig) \
+		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_STOP_MASK | M(SIGCONT)))
+
 #define sig_user_defined(t, signr) \
 	(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) &&	\
 	 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
@@ -1199,7 +1202,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	struct task_struct *p;
 
 	rcu_read_lock();
-	if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) {
+	if (unlikely(sig_needs_tasklist(sig))) {
 		read_lock(&tasklist_lock);
 		acquired_tasklist_lock = 1;
 	}
-- 
cgit v1.2.3


From 7001510d0cbf51ad202dd2d0744f54104285cbb9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:14 -0800
Subject: [PATCH] copy_process: cleanup bad_fork_cleanup_sighand

The only caller of exit_sighand(tsk) is copy_process's error path.  We can
call __exit_sighand() directly and kill exit_sighand().

This 'tsk' was not yet registered in pid_hash[] or init_task.tasks, it has no
external references, nobody can see it, and

	IF (clone_flags & CLONE_SIGHAND)
		At least 'current' has a reference to ->sighand, this
		means atomic_dec_and_test(sighand->count) can't be true.

	ELSE
		Nobody can see this ->sighand, this means we can free it
		without any locking.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  1 -
 kernel/fork.c         |  3 ++-
 kernel/signal.c       | 14 --------------
 3 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ca1fd31aae97..69c2a1e1529e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,6 @@ extern void exit_thread(void);
 extern void exit_files(struct task_struct *);
 extern void exit_signal(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
-extern void exit_sighand(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 33ffb5bf0dbc..8a46ad52be8f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1208,7 +1208,8 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	exit_signal(p);
 bad_fork_cleanup_sighand:
-	exit_sighand(p);
+	if (p->sighand)
+		__exit_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index c5b65aa4c2bc..1d7f4463c32d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -336,20 +336,6 @@ void __exit_sighand(struct task_struct *tsk)
 		kmem_cache_free(sighand_cachep, sighand);
 }
 
-void exit_sighand(struct task_struct *tsk)
-{
-	write_lock_irq(&tasklist_lock);
-	rcu_read_lock();
-	if (tsk->sighand != NULL) {
-		struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
-		spin_lock(&sighand->siglock);
-		__exit_sighand(tsk);
-		spin_unlock(&sighand->siglock);
-	}
-	rcu_read_unlock();
-	write_unlock_irq(&tasklist_lock);
-}
-
 /*
  * This function expects the tasklist_lock write-locked.
  */
-- 
cgit v1.2.3


From 6b3934ef52712ece50605dfc72e55d00c580831a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:16 -0800
Subject: [PATCH] copy_process: cleanup bad_fork_cleanup_signal

__exit_signal() does important cleanups atomically under ->siglock.  It is
also called from copy_process's error path.  This is not good, for example we
can't move __unhash_process() under ->siglock for that reason.

We should not mix these 2 paths, just look at ugly 'if (p->sighand)' under
'bad_fork_cleanup_sighand:' label.  For copy_process() case it is sufficient
to just backout copy_signal(), nothing more.

Again, nobody can see this task yet.  For CLONE_THREAD case we just decrement
signal->count, otherwise nobody can see this ->signal and we can free it
lockless.

This patch assumes it is safe to do exit_thread_group_keys() without
tasklist_lock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  2 +-
 include/linux/slab.h  |  1 -
 kernel/fork.c         | 23 +++++++++++++++++++----
 kernel/signal.c       | 15 +--------------
 4 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 69c2a1e1529e..7dd430b697aa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1149,7 +1149,7 @@ extern void flush_thread(void);
 extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
-extern void exit_signal(struct task_struct *);
+extern void __cleanup_signal(struct signal_struct *);
 extern void __exit_signal(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 15e1d9736b1b..3af03b19c983 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -210,7 +210,6 @@ extern kmem_cache_t	*names_cachep;
 extern kmem_cache_t	*files_cachep;
 extern kmem_cache_t	*filp_cachep;
 extern kmem_cache_t	*fs_cachep;
-extern kmem_cache_t	*signal_cachep;
 extern kmem_cache_t	*sighand_cachep;
 extern kmem_cache_t	*bio_cachep;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a46ad52be8f..0aff28cdbadd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep;
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-kmem_cache_t *signal_cachep;
+static kmem_cache_t *signal_cachep;
 
 /* SLAB cache for sighand_struct structures (tsk->sighand) */
 kmem_cache_t *sighand_cachep;
@@ -872,6 +872,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	return 0;
 }
 
+void __cleanup_signal(struct signal_struct *sig)
+{
+	exit_thread_group_keys(sig);
+	kmem_cache_free(signal_cachep, sig);
+}
+
+static inline void cleanup_signal(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+
+	atomic_dec(&sig->live);
+
+	if (atomic_dec_and_test(&sig->count))
+		__cleanup_signal(sig);
+}
+
 static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
@@ -1206,10 +1222,9 @@ bad_fork_cleanup_mm:
 	if (p->mm)
 		mmput(p->mm);
 bad_fork_cleanup_signal:
-	exit_signal(p);
+	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	if (p->sighand)
-		__exit_sighand(p);
+	__exit_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index 1d7f4463c32d..54e9ef673e68 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -395,23 +395,10 @@ void __exit_signal(struct task_struct *tsk)
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
-		/*
-		 * We are cleaning up the signal_struct here.
-		 */
-		exit_thread_group_keys(sig);
-		kmem_cache_free(signal_cachep, sig);
+		__cleanup_signal(sig);
 	}
 }
 
-void exit_signal(struct task_struct *tsk)
-{
-	atomic_dec(&tsk->signal->live);
-
-	write_lock_irq(&tasklist_lock);
-	__exit_signal(tsk);
-	write_unlock_irq(&tasklist_lock);
-}
-
 /*
  * Flush all handlers for a task.
  */
-- 
cgit v1.2.3


From 29ff471234d53c7235db287bc52f91884c2977c6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:17 -0800
Subject: [PATCH] cleanup __exit_signal()

This patch factors out duplicated code under 'if' branches.  Also, BUG_ON()
conversions and whitespace cleanups.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 54e9ef673e68..ca1fa854e469 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -341,24 +341,20 @@ void __exit_sighand(struct task_struct *tsk)
  */
 void __exit_signal(struct task_struct *tsk)
 {
-	struct signal_struct * sig = tsk->signal;
-	struct sighand_struct * sighand;
+	struct signal_struct *sig = tsk->signal;
+	struct sighand_struct *sighand;
+
+	BUG_ON(!sig);
+	BUG_ON(!atomic_read(&sig->count));
 
-	if (!sig)
-		BUG();
-	if (!atomic_read(&sig->count))
-		BUG();
 	rcu_read_lock();
 	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
+
 	posix_cpu_timers_exit(tsk);
-	if (atomic_dec_and_test(&sig->count)) {
+	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);
-		tsk->signal = NULL;
-		__exit_sighand(tsk);
-		spin_unlock(&sighand->siglock);
-		flush_sigqueue(&sig->shared_pending);
-	} else {
+	else {
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
@@ -369,7 +365,6 @@ void __exit_signal(struct task_struct *tsk)
 		}
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
-		tsk->signal = NULL;
 		/*
 		 * Accumulate here the counters for all threads but the
 		 * group leader as they die, so they can be added into
@@ -387,14 +382,18 @@ void __exit_signal(struct task_struct *tsk)
 		sig->nvcsw += tsk->nvcsw;
 		sig->nivcsw += tsk->nivcsw;
 		sig->sched_time += tsk->sched_time;
-		__exit_sighand(tsk);
-		spin_unlock(&sighand->siglock);
-		sig = NULL;	/* Marker for below.  */
+		sig = NULL; /* Marker for below. */
 	}
+
+	tsk->signal = NULL;
+	__exit_sighand(tsk);
+	spin_unlock(&sighand->siglock);
 	rcu_read_unlock();
+
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
+		flush_sigqueue(&sig->shared_pending);
 		__cleanup_signal(sig);
 	}
 }
-- 
cgit v1.2.3


From c81addc9d3a0ebff2155e0cd86f90820ab97147e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:17 -0800
Subject: [PATCH] rename __exit_sighand to cleanup_sighand

Cosmetic, rename __exit_sighand to cleanup_sighand and move it close to
copy_sighand().

This matches copy_signal/cleanup_signal naming, and I think it is easier to
follow.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  2 +-
 kernel/fork.c         | 12 +++++++++++-
 kernel/signal.c       | 19 ++-----------------
 3 files changed, 14 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7dd430b697aa..921148277da9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1150,8 +1150,8 @@ extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
 extern void __cleanup_signal(struct signal_struct *);
+extern void cleanup_sighand(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
-extern void __exit_sighand(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0aff28cdbadd..12cdd9fc9d02 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,6 +803,16 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	return 0;
 }
 
+void cleanup_sighand(struct task_struct *tsk)
+{
+	struct sighand_struct * sighand = tsk->sighand;
+
+	/* Ok, we're done with the signal handlers */
+	tsk->sighand = NULL;
+	if (atomic_dec_and_test(&sighand->count))
+		kmem_cache_free(sighand_cachep, sighand);
+}
+
 static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct signal_struct *sig;
@@ -1224,7 +1234,7 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	__exit_sighand(p);
+	cleanup_sighand(p);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
diff --git a/kernel/signal.c b/kernel/signal.c
index ca1fa854e469..b29c868bd5ee 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -310,9 +310,7 @@ static void flush_sigqueue(struct sigpending *queue)
 /*
  * Flush all pending signals for a task.
  */
-
-void
-flush_signals(struct task_struct *t)
+void flush_signals(struct task_struct *t)
 {
 	unsigned long flags;
 
@@ -323,19 +321,6 @@ flush_signals(struct task_struct *t)
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
-/*
- * This function expects the tasklist_lock write-locked.
- */
-void __exit_sighand(struct task_struct *tsk)
-{
-	struct sighand_struct * sighand = tsk->sighand;
-
-	/* Ok, we're done with the signal handlers */
-	tsk->sighand = NULL;
-	if (atomic_dec_and_test(&sighand->count))
-		kmem_cache_free(sighand_cachep, sighand);
-}
-
 /*
  * This function expects the tasklist_lock write-locked.
  */
@@ -386,7 +371,7 @@ void __exit_signal(struct task_struct *tsk)
 	}
 
 	tsk->signal = NULL;
-	__exit_sighand(tsk);
+	cleanup_sighand(tsk);
 	spin_unlock(&sighand->siglock);
 	rcu_read_unlock();
 
-- 
cgit v1.2.3


From 6a14c5c9da0b4c34b5be783403c54f0396fcfe77 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:18 -0800
Subject: [PATCH] move __exit_signal() to kernel/exit.c

__exit_signal() is private to release_task() now.  I think it is better to
make it static in kernel/exit.c and export flush_sigqueue() instead - this
function is much more simple and straightforward.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h  |  1 -
 include/linux/signal.h |  2 ++
 kernel/exit.c          | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/signal.c        | 65 +-------------------------------------------------
 4 files changed, 66 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 921148277da9..a913fca9e70d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,6 @@ extern void exit_thread(void);
 extern void exit_files(struct task_struct *);
 extern void __cleanup_signal(struct signal_struct *);
 extern void cleanup_sighand(struct task_struct *);
-extern void __exit_signal(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index b7d093520bb6..162a8fd10b29 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -249,6 +249,8 @@ static inline void init_sigpending(struct sigpending *sig)
 	INIT_LIST_HEAD(&sig->list);
 }
 
+extern void flush_sigqueue(struct sigpending *queue);
+
 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
 static inline int valid_signal(unsigned long sig)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index 77c35efad88c..3823ec89d7b8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -29,6 +29,7 @@
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
+#include <linux/posix-timers.h>
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
 #include <linux/futex.h>
@@ -62,6 +63,68 @@ static void __unhash_process(struct task_struct *p)
 	remove_parent(p);
 }
 
+/*
+ * This function expects the tasklist_lock write-locked.
+ */
+static void __exit_signal(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	struct sighand_struct *sighand;
+
+	BUG_ON(!sig);
+	BUG_ON(!atomic_read(&sig->count));
+
+	rcu_read_lock();
+	sighand = rcu_dereference(tsk->sighand);
+	spin_lock(&sighand->siglock);
+
+	posix_cpu_timers_exit(tsk);
+	if (atomic_dec_and_test(&sig->count))
+		posix_cpu_timers_exit_group(tsk);
+	else {
+		/*
+		 * If there is any task waiting for the group exit
+		 * then notify it:
+		 */
+		if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
+			wake_up_process(sig->group_exit_task);
+			sig->group_exit_task = NULL;
+		}
+		if (tsk == sig->curr_target)
+			sig->curr_target = next_thread(tsk);
+		/*
+		 * Accumulate here the counters for all threads but the
+		 * group leader as they die, so they can be added into
+		 * the process-wide totals when those are taken.
+		 * The group leader stays around as a zombie as long
+		 * as there are other threads.  When it gets reaped,
+		 * the exit.c code will add its counts into these totals.
+		 * We won't ever get here for the group leader, since it
+		 * will have been the last reference on the signal_struct.
+		 */
+		sig->utime = cputime_add(sig->utime, tsk->utime);
+		sig->stime = cputime_add(sig->stime, tsk->stime);
+		sig->min_flt += tsk->min_flt;
+		sig->maj_flt += tsk->maj_flt;
+		sig->nvcsw += tsk->nvcsw;
+		sig->nivcsw += tsk->nivcsw;
+		sig->sched_time += tsk->sched_time;
+		sig = NULL; /* Marker for below. */
+	}
+
+	tsk->signal = NULL;
+	cleanup_sighand(tsk);
+	spin_unlock(&sighand->siglock);
+	rcu_read_unlock();
+
+	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+	flush_sigqueue(&tsk->pending);
+	if (sig) {
+		flush_sigqueue(&sig->shared_pending);
+		__cleanup_signal(sig);
+	}
+}
+
 void release_task(struct task_struct * p)
 {
 	int zap_leader;
diff --git a/kernel/signal.c b/kernel/signal.c
index b29c868bd5ee..6ea49f742a2f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,7 +22,6 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/ptrace.h>
-#include <linux/posix-timers.h>
 #include <linux/signal.h>
 #include <linux/audit.h>
 #include <linux/capability.h>
@@ -295,7 +294,7 @@ static void __sigqueue_free(struct sigqueue *q)
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
-static void flush_sigqueue(struct sigpending *queue)
+void flush_sigqueue(struct sigpending *queue)
 {
 	struct sigqueue *q;
 
@@ -321,68 +320,6 @@ void flush_signals(struct task_struct *t)
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
-/*
- * This function expects the tasklist_lock write-locked.
- */
-void __exit_signal(struct task_struct *tsk)
-{
-	struct signal_struct *sig = tsk->signal;
-	struct sighand_struct *sighand;
-
-	BUG_ON(!sig);
-	BUG_ON(!atomic_read(&sig->count));
-
-	rcu_read_lock();
-	sighand = rcu_dereference(tsk->sighand);
-	spin_lock(&sighand->siglock);
-
-	posix_cpu_timers_exit(tsk);
-	if (atomic_dec_and_test(&sig->count))
-		posix_cpu_timers_exit_group(tsk);
-	else {
-		/*
-		 * If there is any task waiting for the group exit
-		 * then notify it:
-		 */
-		if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
-			wake_up_process(sig->group_exit_task);
-			sig->group_exit_task = NULL;
-		}
-		if (tsk == sig->curr_target)
-			sig->curr_target = next_thread(tsk);
-		/*
-		 * Accumulate here the counters for all threads but the
-		 * group leader as they die, so they can be added into
-		 * the process-wide totals when those are taken.
-		 * The group leader stays around as a zombie as long
-		 * as there are other threads.  When it gets reaped,
-		 * the exit.c code will add its counts into these totals.
-		 * We won't ever get here for the group leader, since it
-		 * will have been the last reference on the signal_struct.
-		 */
-		sig->utime = cputime_add(sig->utime, tsk->utime);
-		sig->stime = cputime_add(sig->stime, tsk->stime);
-		sig->min_flt += tsk->min_flt;
-		sig->maj_flt += tsk->maj_flt;
-		sig->nvcsw += tsk->nvcsw;
-		sig->nivcsw += tsk->nivcsw;
-		sig->sched_time += tsk->sched_time;
-		sig = NULL; /* Marker for below. */
-	}
-
-	tsk->signal = NULL;
-	cleanup_sighand(tsk);
-	spin_unlock(&sighand->siglock);
-	rcu_read_unlock();
-
-	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
-	flush_sigqueue(&tsk->pending);
-	if (sig) {
-		flush_sigqueue(&sig->shared_pending);
-		__cleanup_signal(sig);
-	}
-}
-
 /*
  * Flush all handlers for a task.
  */
-- 
cgit v1.2.3


From 35f5cad8c4bab94ecc5acdc4055df5ea12dc76f8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:19 -0800
Subject: [PATCH] revert "Optimize sys_times for a single thread process"

This patch reverts 'CONFIG_SMP && thread_group_empty()' optimization in
sys_times().  The reason is that the next patch breaks memory ordering which
is needed for that optimization.

tasklist_lock in sys_times() will be eliminated completely by further patch.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c |  6 +----
 kernel/sys.c  | 86 ++++++++++++++++++-----------------------------------------
 2 files changed, 27 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3823ec89d7b8..6b2e4cf3e140 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -139,11 +139,7 @@ repeat:
 	ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
-	/*
-	 * Note that the fastpath in sys_times depends on __exit_signal having
-	 * updated the counters before a task is removed from the tasklist of
-	 * the process by __unhash_process.
-	 */
+
 	__unhash_process(p);
 
 	/*
diff --git a/kernel/sys.c b/kernel/sys.c
index c93d37f71aef..84371fdc660b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1202,69 +1202,35 @@ asmlinkage long sys_times(struct tms __user * tbuf)
 	 */
 	if (tbuf) {
 		struct tms tmp;
+		struct task_struct *tsk = current;
+		struct task_struct *t;
 		cputime_t utime, stime, cutime, cstime;
 
-#ifdef CONFIG_SMP
-		if (thread_group_empty(current)) {
-			/*
-			 * Single thread case without the use of any locks.
-			 *
-			 * We may race with release_task if two threads are
-			 * executing. However, release task first adds up the
-			 * counters (__exit_signal) before  removing the task
-			 * from the process tasklist (__unhash_process).
-			 * __exit_signal also acquires and releases the
-			 * siglock which results in the proper memory ordering
-			 * so that the list modifications are always visible
-			 * after the counters have been updated.
-			 *
-			 * If the counters have been updated by the second thread
-			 * but the thread has not yet been removed from the list
-			 * then the other branch will be executing which will
-			 * block on tasklist_lock until the exit handling of the
-			 * other task is finished.
-			 *
-			 * This also implies that the sighand->siglock cannot
-			 * be held by another processor. So we can also
-			 * skip acquiring that lock.
-			 */
-			utime = cputime_add(current->signal->utime, current->utime);
-			stime = cputime_add(current->signal->utime, current->stime);
-			cutime = current->signal->cutime;
-			cstime = current->signal->cstime;
-		} else
-#endif
-		{
-
-			/* Process with multiple threads */
-			struct task_struct *tsk = current;
-			struct task_struct *t;
-
-			read_lock(&tasklist_lock);
-			utime = tsk->signal->utime;
-			stime = tsk->signal->stime;
-			t = tsk;
-			do {
-				utime = cputime_add(utime, t->utime);
-				stime = cputime_add(stime, t->stime);
-				t = next_thread(t);
-			} while (t != tsk);
+		read_lock(&tasklist_lock);
+		utime = tsk->signal->utime;
+		stime = tsk->signal->stime;
+		t = tsk;
+		do {
+			utime = cputime_add(utime, t->utime);
+			stime = cputime_add(stime, t->stime);
+			t = next_thread(t);
+		} while (t != tsk);
+
+		/*
+		 * While we have tasklist_lock read-locked, no dying thread
+		 * can be updating current->signal->[us]time.  Instead,
+		 * we got their counts included in the live thread loop.
+		 * However, another thread can come in right now and
+		 * do a wait call that updates current->signal->c[us]time.
+		 * To make sure we always see that pair updated atomically,
+		 * we take the siglock around fetching them.
+		 */
+		spin_lock_irq(&tsk->sighand->siglock);
+		cutime = tsk->signal->cutime;
+		cstime = tsk->signal->cstime;
+		spin_unlock_irq(&tsk->sighand->siglock);
+		read_unlock(&tasklist_lock);
 
-			/*
-			 * While we have tasklist_lock read-locked, no dying thread
-			 * can be updating current->signal->[us]time.  Instead,
-			 * we got their counts included in the live thread loop.
-			 * However, another thread can come in right now and
-			 * do a wait call that updates current->signal->c[us]time.
-			 * To make sure we always see that pair updated atomically,
-			 * we take the siglock around fetching them.
-			 */
-			spin_lock_irq(&tsk->sighand->siglock);
-			cutime = tsk->signal->cutime;
-			cstime = tsk->signal->cstime;
-			spin_unlock_irq(&tsk->sighand->siglock);
-			read_unlock(&tasklist_lock);
-		}
 		tmp.tms_utime = cputime_to_clock_t(utime);
 		tmp.tms_stime = cputime_to_clock_t(stime);
 		tmp.tms_cutime = cputime_to_clock_t(cutime);
-- 
cgit v1.2.3


From 5876700cd399112ecfa70df36203c8c6660d84f8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:20 -0800
Subject: [PATCH] do __unhash_process() under ->siglock

This patch moves __unhash_process() call from realease_task() to
__exit_signal(), so __detach_pid() is called with ->siglock held.

This means we don't need tasklist_lock to iterate over thread group anymore:

	copy_process() was already changed to do attach_pid()
	under ->siglock.

	Eric's "pidhash-kill-switch_exec_pids.patch" from -mm
	changed de_thread() so it doesn't touch PIDTYPE_TGID.

NOTE: de_thread() still needs some attention.  It still changes task->pid
lockless.  Taking ->sighand.siglock here allows to do more tasklist_lock
removals.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 6b2e4cf3e140..44d6c6e3896d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,6 +112,8 @@ static void __exit_signal(struct task_struct *tsk)
 		sig = NULL; /* Marker for below. */
 	}
 
+	__unhash_process(tsk);
+
 	tsk->signal = NULL;
 	cleanup_sighand(tsk);
 	spin_unlock(&sighand->siglock);
@@ -140,8 +142,6 @@ repeat:
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
 
-	__unhash_process(p);
-
 	/*
 	 * If we are the last non-leader member of the thread
 	 * group, and the leader is zombie, then notify the
-- 
cgit v1.2.3


From 7d7185c818925ba5fe90efa75840d0b415032774 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:21 -0800
Subject: [PATCH] sys_times: don't take tasklist_lock

sys_times: don't take tasklist_lock

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 84371fdc660b..7ef7f6054c28 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1206,7 +1206,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
 		struct task_struct *t;
 		cputime_t utime, stime, cutime, cstime;
 
-		read_lock(&tasklist_lock);
+		spin_lock_irq(&tsk->sighand->siglock);
 		utime = tsk->signal->utime;
 		stime = tsk->signal->stime;
 		t = tsk;
@@ -1216,20 +1216,9 @@ asmlinkage long sys_times(struct tms __user * tbuf)
 			t = next_thread(t);
 		} while (t != tsk);
 
-		/*
-		 * While we have tasklist_lock read-locked, no dying thread
-		 * can be updating current->signal->[us]time.  Instead,
-		 * we got their counts included in the live thread loop.
-		 * However, another thread can come in right now and
-		 * do a wait call that updates current->signal->c[us]time.
-		 * To make sure we always see that pair updated atomically,
-		 * we take the siglock around fetching them.
-		 */
-		spin_lock_irq(&tsk->sighand->siglock);
 		cutime = tsk->signal->cutime;
 		cstime = tsk->signal->cstime;
 		spin_unlock_irq(&tsk->sighand->siglock);
-		read_unlock(&tasklist_lock);
 
 		tmp.tms_utime = cputime_to_clock_t(utime);
 		tmp.tms_stime = cputime_to_clock_t(stime);
-- 
cgit v1.2.3


From 6108ccd3e2f3012d5eec582e0af4d75e693824da Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:22 -0800
Subject: [PATCH] relax sig_needs_tasklist()

handle_stop_signal() does not need tasklist_lock for SIG_KERNEL_STOP_MASK
signals anymore.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 6ea49f742a2f..e99ec2f891a0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -146,8 +146,7 @@ static kmem_cache_t *sigqueue_cachep;
 #define sig_kernel_stop(sig) \
 		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_STOP_MASK))
 
-#define sig_needs_tasklist(sig) \
-		(((sig) < SIGRTMIN)  && T(sig, SIG_KERNEL_STOP_MASK | M(SIGCONT)))
+#define sig_needs_tasklist(sig)	((sig) == SIGCONT)
 
 #define sig_user_defined(t, signr) \
 	(((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) &&	\
-- 
cgit v1.2.3


From a122b341b74c08020f6521b615acca6a692aac79 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:22 -0800
Subject: [PATCH] do_signal_stop: don't take tasklist_lock

do_signal_stop() does not need tasklist_lock anymore.  So it does not need to
do misc re-checks, and we can simplify the code.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 69 ++++++++++++++-------------------------------------------
 1 file changed, 17 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index e99ec2f891a0..e3bdec914626 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1682,8 +1682,7 @@ out:
  * Returns nonzero if we've actually stopped and released the siglock.
  * Returns zero if we didn't stop and still hold the siglock.
  */
-static int
-do_signal_stop(int signr)
+static int do_signal_stop(int signr)
 {
 	struct signal_struct *sig = current->signal;
 	struct sighand_struct *sighand = current->sighand;
@@ -1703,7 +1702,6 @@ do_signal_stop(int signr)
 		set_current_state(TASK_STOPPED);
 		if (stop_count == 0)
 			sig->flags = SIGNAL_STOP_STOPPED;
-		spin_unlock_irq(&sighand->siglock);
 	}
 	else if (thread_group_empty(current)) {
 		/*
@@ -1712,71 +1710,38 @@ do_signal_stop(int signr)
 		current->exit_code = current->signal->group_exit_code = signr;
 		set_current_state(TASK_STOPPED);
 		sig->flags = SIGNAL_STOP_STOPPED;
-		spin_unlock_irq(&sighand->siglock);
 	}
 	else {
 		/*
+		 * (sig->group_stop_count == 0)
 		 * There is no group stop already in progress.
-		 * We must initiate one now, but that requires
-		 * dropping siglock to get both the tasklist lock
-		 * and siglock again in the proper order.  Note that
-		 * this allows an intervening SIGCONT to be posted.
-		 * We need to check for that and bail out if necessary.
+		 * We must initiate one now.
 		 */
 		struct task_struct *t;
 
-		spin_unlock_irq(&sighand->siglock);
-
-		/* signals can be posted during this window */
-
-		read_lock(&tasklist_lock);
-		spin_lock_irq(&sighand->siglock);
+		current->exit_code = signr;
+		sig->group_exit_code = signr;
 
-		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) {
+		stop_count = 0;
+		for (t = next_thread(current); t != current; t = next_thread(t))
 			/*
-			 * Another stop or continue happened while we
-			 * didn't have the lock.  We can just swallow this
-			 * signal now.  If we raced with a SIGCONT, that
-			 * should have just cleared it now.  If we raced
-			 * with another processor delivering a stop signal,
-			 * then the SIGCONT that wakes us up should clear it.
+			 * Setting state to TASK_STOPPED for a group
+			 * stop is always done with the siglock held,
+			 * so this check has no races.
 			 */
-			read_unlock(&tasklist_lock);
-			return 0;
-		}
-
-		if (sig->group_stop_count == 0) {
-			sig->group_exit_code = signr;
-			stop_count = 0;
-			for (t = next_thread(current); t != current;
-			     t = next_thread(t))
-				/*
-				 * Setting state to TASK_STOPPED for a group
-				 * stop is always done with the siglock held,
-				 * so this check has no races.
-				 */
-				if (!t->exit_state &&
-				    !(t->state & (TASK_STOPPED|TASK_TRACED))) {
-					stop_count++;
-					signal_wake_up(t, 0);
-				}
-			sig->group_stop_count = stop_count;
-		}
-		else {
-			/* A race with another thread while unlocked.  */
-			signr = sig->group_exit_code;
-			stop_count = --sig->group_stop_count;
-		}
+			if (!t->exit_state &&
+			    !(t->state & (TASK_STOPPED|TASK_TRACED))) {
+				stop_count++;
+				signal_wake_up(t, 0);
+			}
+		sig->group_stop_count = stop_count;
 
-		current->exit_code = signr;
 		set_current_state(TASK_STOPPED);
 		if (stop_count == 0)
 			sig->flags = SIGNAL_STOP_STOPPED;
-
-		spin_unlock_irq(&sighand->siglock);
-		read_unlock(&tasklist_lock);
 	}
 
+	spin_unlock_irq(&sighand->siglock);
 	finish_stop(stop_count);
 	return 1;
 }
-- 
cgit v1.2.3


From aacc90944d4b1f2fcec73a8127eb60a3a701ce1c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:23 -0800
Subject: [PATCH] do_group_exit: don't take tasklist_lock

do_group_exit() takes tasklist_lock for zap_other_threads(), this is unneeded
now.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 44d6c6e3896d..aea23e713cf4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -985,7 +985,6 @@ do_group_exit(int exit_code)
 	else if (!thread_group_empty(current)) {
 		struct signal_struct *const sig = current->signal;
 		struct sighand_struct *const sighand = current->sighand;
-		read_lock(&tasklist_lock);
 		spin_lock_irq(&sighand->siglock);
 		if (sig->flags & SIGNAL_GROUP_EXIT)
 			/* Another thread got here before we took the lock.  */
@@ -995,7 +994,6 @@ do_group_exit(int exit_code)
 			zap_other_threads(current);
 		}
 		spin_unlock_irq(&sighand->siglock);
-		read_unlock(&tasklist_lock);
 	}
 
 	do_exit(exit_code);
-- 
cgit v1.2.3


From 88531f725bd52e37a7be726860e4ff3f09031d89 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:24 -0800
Subject: [PATCH] do_sigaction: don't take tasklist_lock

do_sigaction() does not need tasklist_lock anymore, we can simplify the code.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index e3bdec914626..2dfaa5076c31 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2301,8 +2301,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
 	return kill_proc_info(sig, &info, pid);
 }
 
-int
-do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
+int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct k_sigaction *k;
 	sigset_t mask;
@@ -2328,6 +2327,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 	if (act) {
 		sigdelsetmask(&act->sa.sa_mask,
 			      sigmask(SIGKILL) | sigmask(SIGSTOP));
+		*k = *act;
 		/*
 		 * POSIX 3.3.1.3:
 		 *  "Setting a signal action to SIG_IGN for a signal that is
@@ -2340,19 +2340,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 		 *   be discarded, whether or not it is blocked"
 		 */
 		if (act->sa.sa_handler == SIG_IGN ||
-		    (act->sa.sa_handler == SIG_DFL &&
-		     sig_kernel_ignore(sig))) {
-			/*
-			 * This is a fairly rare case, so we only take the
-			 * tasklist_lock once we're sure we'll need it.
-			 * Now we must do this little unlock and relock
-			 * dance to maintain the lock hierarchy.
-			 */
+		   (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
 			struct task_struct *t = current;
-			spin_unlock_irq(&t->sighand->siglock);
-			read_lock(&tasklist_lock);
-			spin_lock_irq(&t->sighand->siglock);
-			*k = *act;
 			sigemptyset(&mask);
 			sigaddset(&mask, sig);
 			rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2361,12 +2350,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 				recalc_sigpending_tsk(t);
 				t = next_thread(t);
 			} while (t != current);
-			spin_unlock_irq(&current->sighand->siglock);
-			read_unlock(&tasklist_lock);
-			return 0;
 		}
-
-		*k = *act;
 	}
 
 	spin_unlock_irq(&current->sighand->siglock);
-- 
cgit v1.2.3


From 47e65328a7b1cdfc4e3102e50d60faf94ebba7d3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:25 -0800
Subject: [PATCH] pids: kill PIDTYPE_TGID

This patch kills PIDTYPE_TGID pid_type thus saving one hash table in
kernel/pid.c and speeding up subthreads create/destroy a bit.  It is also a
preparation for the further tref/pids rework.

This patch adds 'struct list_head thread_group' to 'struct task_struct'
instead.

We don't detach group leader from PIDTYPE_PID namespace until another
thread inherits it's ->pid == ->tgid, so we are safe wrt premature
free_pidmap(->tgid) call.

Currently there are no users of find_task_by_pid_type(PIDTYPE_TGID).
Should the need arise, we can use find_task_by_pid()->group_leader.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-By: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pid.h   |  1 -
 include/linux/sched.h | 11 ++++++++---
 kernel/exit.c         | 10 +---------
 kernel/fork.c         |  4 +++-
 4 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 099e70ecf7c7..5b9082cc600f 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -4,7 +4,6 @@
 enum pid_type
 {
 	PIDTYPE_PID,
-	PIDTYPE_TGID,
 	PIDTYPE_PGID,
 	PIDTYPE_SID,
 	PIDTYPE_MAX
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a913fca9e70d..99855f694ebd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -752,6 +752,7 @@ struct task_struct {
 
 	/* PID/PID hash table linkage. */
 	struct pid pids[PIDTYPE_MAX];
+	struct list_head thread_group;
 
 	struct completion *vfork_done;		/* for vfork() */
 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
@@ -1192,13 +1193,17 @@ extern void wait_task_inactive(task_t * p);
 #define while_each_thread(g, t) \
 	while ((t = next_thread(t)) != g)
 
-extern task_t * FASTCALL(next_thread(const task_t *p));
-
 #define thread_group_leader(p)	(p->pid == p->tgid)
 
+static inline task_t *next_thread(task_t *p)
+{
+	return list_entry(rcu_dereference(p->thread_group.next),
+				task_t, thread_group);
+}
+
 static inline int thread_group_empty(task_t *p)
 {
-	return list_empty(&p->pids[PIDTYPE_TGID].pid_list);
+	return list_empty(&p->thread_group);
 }
 
 #define delay_group_leader(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index aea23e713cf4..22399caf7574 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@ static void __unhash_process(struct task_struct *p)
 {
 	nr_threads--;
 	detach_pid(p, PIDTYPE_PID);
-	detach_pid(p, PIDTYPE_TGID);
 	if (thread_group_leader(p)) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
@@ -59,7 +58,7 @@ static void __unhash_process(struct task_struct *p)
 		list_del_init(&p->tasks);
 		__get_cpu_var(process_counts)--;
 	}
-
+	list_del_rcu(&p->thread_group);
 	remove_parent(p);
 }
 
@@ -964,13 +963,6 @@ asmlinkage long sys_exit(int error_code)
 	do_exit((error_code&0xff)<<8);
 }
 
-task_t fastcall *next_thread(const task_t *p)
-{
-	return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
-}
-
-EXPORT_SYMBOL(next_thread);
-
 /*
  * Take down every thread in the group.  This is called by fatal signals
  * as well as by sys_exit_group (below).
diff --git a/kernel/fork.c b/kernel/fork.c
index 12cdd9fc9d02..bc551efb5fd4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1112,6 +1112,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * We dont wake it up yet.
 	 */
 	p->group_leader = p;
+	INIT_LIST_HEAD(&p->thread_group);
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
@@ -1165,7 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags,
 			retval = -EAGAIN;
 			goto bad_fork_cleanup_namespace;
 		}
+
 		p->group_leader = current->group_leader;
+		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 
 		if (current->signal->group_stop_count > 0) {
 			/*
@@ -1213,7 +1216,6 @@ static task_t *copy_process(unsigned long clone_flags,
 			list_add_tail(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
 		}
-		attach_pid(p, PIDTYPE_TGID, p->tgid);
 		attach_pid(p, PIDTYPE_PID, p->pid);
 		nr_threads++;
 	}
-- 
cgit v1.2.3


From 4a2c7a7837da1b91468e50426066d988050e4d56 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:26 -0800
Subject: [PATCH] make fork() atomic wrt pgrp/session signals

Eric W. Biederman wrote:
>
> Ok. SUSV3/Posix is clear, fork is atomic with respect
> to signals.  Either a signal comes before or after a
> fork but not during. (See the rationale section).
> http://www.opengroup.org/onlinepubs/000095399/functions/fork.html
>
> The tasklist_lock does not stop forks from adding to a process
> group. The forks stall while the tasklist_lock is held, but a fork
> that began before we grabbed the tasklist_lock simply completes
> afterwards, and the child does not receive the signal.

This also means that SIGSTOP or sig_kernel_coredump() signal can't
be delivered to pgrp/session reliably.

With this patch copy_process() returns -ERESTARTNOINTR when it
detects a pending signal, fork() will be restarted transparently
after handling the signals.

This patch also deletes now unneeded "group_stop_count > 0" check,
copy_process() can no longer succeed while group stop in progress.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-By: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index bc551efb5fd4..aa50c848fae7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1136,16 +1136,6 @@ static task_t *copy_process(unsigned long clone_flags,
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
 
-	/*
-	 * Check for pending SIGKILL! The new thread should not be allowed
-	 * to slip out of an OOM kill. (or normal SIGKILL.)
-	 */
-	if (sigismember(&current->pending.signal, SIGKILL)) {
-		write_unlock_irq(&tasklist_lock);
-		retval = -EINTR;
-		goto bad_fork_cleanup_namespace;
-	}
-
 	/* CLONE_PARENT re-uses the old parent */
 	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
 		p->real_parent = current->real_parent;
@@ -1154,6 +1144,23 @@ static task_t *copy_process(unsigned long clone_flags,
 	p->parent = p->real_parent;
 
 	spin_lock(&current->sighand->siglock);
+
+	/*
+	 * Process group and session signals need to be delivered to just the
+	 * parent before the fork or both the parent and the child after the
+	 * fork. Restart if a signal comes in before we add the new process to
+	 * it's process group.
+	 * A fatal signal pending means that current will exit, so the new
+	 * thread can't slip out of an OOM kill (or normal SIGKILL).
+ 	 */
+ 	recalc_sigpending();
+	if (signal_pending(current)) {
+		spin_unlock(&current->sighand->siglock);
+		write_unlock_irq(&tasklist_lock);
+		retval = -ERESTARTNOINTR;
+		goto bad_fork_cleanup_namespace;
+	}
+
 	if (clone_flags & CLONE_THREAD) {
 		/*
 		 * Important: if an exit-all has been started then
@@ -1170,16 +1177,6 @@ static task_t *copy_process(unsigned long clone_flags,
 		p->group_leader = current->group_leader;
 		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
 
-		if (current->signal->group_stop_count > 0) {
-			/*
-			 * There is an all-stop in progress for the group.
-			 * We ourselves will stop as soon as we check signals.
-			 * Make the new thread part of that group stop too.
-			 */
-			current->signal->group_stop_count++;
-			set_tsk_thread_flag(p, TIF_SIGPENDING);
-		}
-
 		if (!cputime_eq(current->signal->it_virt_expires,
 				cputime_zero) ||
 		    !cputime_eq(current->signal->it_prof_expires,
-- 
cgit v1.2.3


From a7e5328a06a2beee3a2bbfaf87ce2a7bbe937de1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:27 -0800
Subject: [PATCH] cleanup __exit_signal->cleanup_sighand path

Move 'tsk->sighand = NULL' from cleanup_sighand() to __exit_signal().  This
makes the exit path more understandable and allows us to do
cleanup_sighand() outside of ->siglock protected section.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 2 +-
 kernel/exit.c         | 3 ++-
 kernel/fork.c         | 8 ++------
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 99855f694ebd..d04186d8cc68 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1151,7 +1151,7 @@ extern void exit_thread(void);
 
 extern void exit_files(struct task_struct *);
 extern void __cleanup_signal(struct signal_struct *);
-extern void cleanup_sighand(struct task_struct *);
+extern void __cleanup_sighand(struct sighand_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
diff --git a/kernel/exit.c b/kernel/exit.c
index 22399caf7574..bc0ec674d3f4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -114,10 +114,11 @@ static void __exit_signal(struct task_struct *tsk)
 	__unhash_process(tsk);
 
 	tsk->signal = NULL;
-	cleanup_sighand(tsk);
+	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 	rcu_read_unlock();
 
+	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
diff --git a/kernel/fork.c b/kernel/fork.c
index aa50c848fae7..b3f7a1bb5e55 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,12 +803,8 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	return 0;
 }
 
-void cleanup_sighand(struct task_struct *tsk)
+void __cleanup_sighand(struct sighand_struct *sighand)
 {
-	struct sighand_struct * sighand = tsk->sighand;
-
-	/* Ok, we're done with the signal handlers */
-	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
 		kmem_cache_free(sighand_cachep, sighand);
 }
@@ -1233,7 +1229,7 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 	cleanup_signal(p);
 bad_fork_cleanup_sighand:
-	cleanup_sighand(p);
+	__cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
 	exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
-- 
cgit v1.2.3


From dac27f4a09c274db048e80d2511102e540ac9e3a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:28 -0800
Subject: [PATCH] simplify do_signal_stop()

do_signal_stop() considers 'thread_group_empty()' as a special case.
This was needed to avoid taking tasklist_lock. Since this lock is
unneeded any longer, we can remove this special case and simplify
the code even more.

Also, before this patch, finish_stop() was called with stop_count == -1
for 'thread_group_empty()' case. This is not strictly wrong, but confusing
and unneeded.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 32 ++++++++------------------------
 1 file changed, 8 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 2dfaa5076c31..efba39626e66 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1685,8 +1685,7 @@ out:
 static int do_signal_stop(int signr)
 {
 	struct signal_struct *sig = current->signal;
-	struct sighand_struct *sighand = current->sighand;
-	int stop_count = -1;
+	int stop_count;
 
 	if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED))
 		return 0;
@@ -1696,30 +1695,14 @@ static int do_signal_stop(int signr)
 		 * There is a group stop in progress.  We don't need to
 		 * start another one.
 		 */
-		signr = sig->group_exit_code;
 		stop_count = --sig->group_stop_count;
-		current->exit_code = signr;
-		set_current_state(TASK_STOPPED);
-		if (stop_count == 0)
-			sig->flags = SIGNAL_STOP_STOPPED;
-	}
-	else if (thread_group_empty(current)) {
-		/*
-		 * Lock must be held through transition to stopped state.
-		 */
-		current->exit_code = current->signal->group_exit_code = signr;
-		set_current_state(TASK_STOPPED);
-		sig->flags = SIGNAL_STOP_STOPPED;
-	}
-	else {
+	} else {
 		/*
-		 * (sig->group_stop_count == 0)
 		 * There is no group stop already in progress.
 		 * We must initiate one now.
 		 */
 		struct task_struct *t;
 
-		current->exit_code = signr;
 		sig->group_exit_code = signr;
 
 		stop_count = 0;
@@ -1735,13 +1718,14 @@ static int do_signal_stop(int signr)
 				signal_wake_up(t, 0);
 			}
 		sig->group_stop_count = stop_count;
-
-		set_current_state(TASK_STOPPED);
-		if (stop_count == 0)
-			sig->flags = SIGNAL_STOP_STOPPED;
 	}
 
-	spin_unlock_irq(&sighand->siglock);
+	if (stop_count == 0)
+		sig->flags = SIGNAL_STOP_STOPPED;
+	current->exit_code = sig->group_exit_code;
+	__set_current_state(TASK_STOPPED);
+
+	spin_unlock_irq(&current->sighand->siglock);
 	finish_stop(stop_count);
 	return 1;
 }
-- 
cgit v1.2.3


From 883606a7c9e84e206f96c38f88c4bd66df72f51e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:28 -0800
Subject: [PATCH] finish_stop: don't check stop_count < 0

Remove an obscure 'stop_count < 0' check in finish_stop().  The previous patch
made this case impossible.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index efba39626e66..24d5059ab0a9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1657,7 +1657,7 @@ finish_stop(int stop_count)
 	 * a group stop in progress and we are the last to stop,
 	 * report to the parent.  When ptraced, every thread reports itself.
 	 */
-	if (stop_count < 0 || (current->ptrace & PT_PTRACED))
+	if (current->ptrace & PT_PTRACED)
 		to_self = 1;
 	else if (stop_count == 0)
 		to_self = 0;
-- 
cgit v1.2.3


From a1d5e21e3e388fb2c16463d007e788b1e41b74f1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:29 -0800
Subject: [PATCH] do_notify_parent_cldstop: remove 'to_self' param

The previous patch has changed callsites of do_notify_parent_cldstop() so that
to_self == (->ptrace & PT_PTRACED) always (as it should be).  We can remove
this parameter now.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 24d5059ab0a9..0528a959daa9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -591,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 }
 
 /* forward decl */
-static void do_notify_parent_cldstop(struct task_struct *tsk,
-				     int to_self,
-				     int why);
+static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
 
 /*
  * Handle magic process-wide effects of stop/continue signals.
@@ -643,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			p->signal->group_stop_count = 0;
 			p->signal->flags = SIGNAL_STOP_CONTINUED;
 			spin_unlock(&p->sighand->siglock);
-			do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
+			do_notify_parent_cldstop(p, CLD_STOPPED);
 			spin_lock(&p->sighand->siglock);
 		}
 		rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -684,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 			p->signal->flags = SIGNAL_STOP_CONTINUED;
 			p->signal->group_exit_code = 0;
 			spin_unlock(&p->sighand->siglock);
-			do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
+			do_notify_parent_cldstop(p, CLD_CONTINUED);
 			spin_lock(&p->sighand->siglock);
 		} else {
 			/*
@@ -1519,14 +1517,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 	spin_unlock_irqrestore(&psig->siglock, flags);
 }
 
-static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
+static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 {
 	struct siginfo info;
 	unsigned long flags;
 	struct task_struct *parent;
 	struct sighand_struct *sighand;
 
-	if (to_self)
+	if (tsk->ptrace & PT_PTRACED)
 		parent = tsk->parent;
 	else {
 		tsk = tsk->group_leader;
@@ -1601,7 +1599,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 		   !(current->ptrace & PT_ATTACHED)) &&
 	    (likely(current->parent->signal != current->signal) ||
 	     !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
-		do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
+		do_notify_parent_cldstop(current, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
 		schedule();
 	} else {
@@ -1650,25 +1648,17 @@ void ptrace_notify(int exit_code)
 static void
 finish_stop(int stop_count)
 {
-	int to_self;
-
 	/*
 	 * If there are no other threads in the group, or if there is
 	 * a group stop in progress and we are the last to stop,
 	 * report to the parent.  When ptraced, every thread reports itself.
 	 */
-	if (current->ptrace & PT_PTRACED)
-		to_self = 1;
-	else if (stop_count == 0)
-		to_self = 0;
-	else
-		goto out;
-
-	read_lock(&tasklist_lock);
-	do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
-	read_unlock(&tasklist_lock);
+	if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
+		read_lock(&tasklist_lock);
+		do_notify_parent_cldstop(current, CLD_STOPPED);
+		read_unlock(&tasklist_lock);
+	}
 
-out:
 	schedule();
 	/*
 	 * Now we don't run again until continued.
-- 
cgit v1.2.3


From 547679087bc60277d11b11631d826895762c4505 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 28 Mar 2006 16:11:30 -0800
Subject: [PATCH] send_sigqueue: simplify and fix the race

send_sigqueue() checks PF_EXITING, then locks p->sighand->siglock.  This is
unsafe: 'p' can exit in between and set ->sighand = NULL.  The race is
theoretical, the window is tiny and irqs are disabled by the caller, so I
don't think we need the fix for -stable tree.

Convert send_sigqueue() to use lock_task_sighand() helper.

Also, delete 'p->flags & PF_EXITING' re-check, it is unneeded and the
comment is wrong.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 41 ++++-------------------------------------
 1 file changed, 4 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 0528a959daa9..4922928d91f6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1309,12 +1309,10 @@ void sigqueue_free(struct sigqueue *q)
 	__sigqueue_free(q);
 }
 
-int
-send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
+int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 {
 	unsigned long flags;
 	int ret = 0;
-	struct sighand_struct *sh;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
 
@@ -1328,48 +1326,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	 */
 	rcu_read_lock();
 
-	if (unlikely(p->flags & PF_EXITING)) {
+	if (!likely(lock_task_sighand(p, &flags))) {
 		ret = -1;
 		goto out_err;
 	}
 
-retry:
-	sh = rcu_dereference(p->sighand);
-
-	spin_lock_irqsave(&sh->siglock, flags);
-	if (p->sighand != sh) {
-		/* We raced with exec() in a multithreaded process... */
-		spin_unlock_irqrestore(&sh->siglock, flags);
-		goto retry;
-	}
-
-	/*
-	 * We do the check here again to handle the following scenario:
-	 *
-	 * CPU 0		CPU 1
-	 * send_sigqueue
-	 * check PF_EXITING
-	 * interrupt		exit code running
-	 *			__exit_signal
-	 *			lock sighand->siglock
-	 *			unlock sighand->siglock
-	 * lock sh->siglock
-	 * add(tsk->pending) 	flush_sigqueue(tsk->pending)
-	 *
-	 */
-
-	if (unlikely(p->flags & PF_EXITING)) {
-		ret = -1;
-		goto out;
-	}
-
 	if (unlikely(!list_empty(&q->list))) {
 		/*
 		 * If an SI_TIMER entry is already queue just increment
 		 * the overrun count.
 		 */
-		if (q->info.si_code != SI_TIMER)
-			BUG();
+		BUG_ON(q->info.si_code != SI_TIMER);
 		q->info.si_overrun++;
 		goto out;
 	}
@@ -1385,7 +1352,7 @@ retry:
 		signal_wake_up(p, sig == SIGKILL);
 
 out:
-	spin_unlock_irqrestore(&sh->siglock, flags);
+	unlock_task_sighand(p, &flags);
 out_err:
 	rcu_read_unlock();
 
-- 
cgit v1.2.3