From b20367a6c2a0cd937cb1f0a8cf848f1402fef99c Mon Sep 17 00:00:00 2001
From: Jordan Hargrave <jordan_hargrave@dell.com>
Date: Fri, 7 Apr 2006 19:50:18 +0200
Subject: [PATCH] x86_64: Fix drift with HPET timer enabled

If the HPET timer is enabled, the clock can drift by ~3 seconds a day.
This is due to the HPET timer not being initialized with the correct
setting (still using PIT count).

If HZ changes, this drift can become even more pronounced.

This patch initializes tick_nsec with the correct setting for the HPET
timer.

Vojtech comments:

  "It's not entirely correct (it assumes the HPET ticks totally
   exactly), but it's significantly better than assuming the PIT error
   there."

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index c3a874f1393c..471ab8710b8f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1455,7 +1455,7 @@ static void time_interpolator_update(long delta_nsec)
 	 */
 	if (jiffies % INTERPOLATOR_ADJUST == 0)
 	{
-		if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
+		if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
 			time_interpolator->nsec_per_cyc--;
 		if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
 			time_interpolator->nsec_per_cyc++;
-- 
cgit v1.2.3


From b92ce55893745e011edae70830b8bc863be881f9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 11 Apr 2006 13:52:07 +0200
Subject: [PATCH] splice: add direct fd <-> fd splicing support

It's more efficient for sendfile() emulation. Basically we cache an
internal private pipe and just use that as the intermediate area for
pages. Direct splicing is not available from sys_splice(), it is only
meant to be used for sendfile() emulation.

Additional patch from Ingo Molnar to avoid the PIPE_BUFFERS loop at
exit for the normal fast path.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c                 |  10 +++-
 fs/splice.c               | 148 ++++++++++++++++++++++++++++++++++++++++------
 include/linux/fs.h        |   2 +
 include/linux/pipe_fs_i.h |   1 +
 include/linux/sched.h     |   6 ++
 kernel/exit.c             |   4 ++
 6 files changed, 150 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/fs/pipe.c b/fs/pipe.c
index 705b48692627..036536f072c9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -691,12 +691,10 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
 	return info;
 }
 
-void free_pipe_info(struct inode *inode)
+void __free_pipe_info(struct pipe_inode_info *info)
 {
 	int i;
-	struct pipe_inode_info *info = inode->i_pipe;
 
-	inode->i_pipe = NULL;
 	for (i = 0; i < PIPE_BUFFERS; i++) {
 		struct pipe_buffer *buf = info->bufs + i;
 		if (buf->ops)
@@ -707,6 +705,12 @@ void free_pipe_info(struct inode *inode)
 	kfree(info);
 }
 
+void free_pipe_info(struct inode *inode)
+{
+	__free_pipe_info(inode->i_pipe);
+	inode->i_pipe = NULL;
+}
+
 static struct vfsmount *pipe_mnt __read_mostly;
 static int pipefs_delete_dentry(struct dentry *dentry)
 {
diff --git a/fs/splice.c b/fs/splice.c
index a5326127aad5..c47b561edac0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -680,8 +680,7 @@ EXPORT_SYMBOL(generic_splice_sendpage);
  * Attempt to initiate a splice from pipe to file.
  */
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-			   loff_t __user *off_out, size_t len,
-			   unsigned int flags)
+			   size_t len, unsigned int flags)
 {
 	loff_t pos;
 	int ret;
@@ -692,9 +691,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (!(out->f_mode & FMODE_WRITE))
 		return -EBADF;
 
-	if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t)))
-		return -EFAULT;
-
 	pos = out->f_pos;
 
 	ret = rw_verify_area(WRITE, out, &pos, len);
@@ -707,9 +703,8 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 /*
  * Attempt to initiate a splice from a file to a pipe.
  */
-static long do_splice_to(struct file *in, loff_t __user *off_in,
-			 struct pipe_inode_info *pipe, size_t len,
-			 unsigned int flags)
+static long do_splice_to(struct file *in, struct pipe_inode_info *pipe,
+			 size_t len, unsigned int flags)
 {
 	loff_t pos, isize, left;
 	int ret;
@@ -720,9 +715,6 @@ static long do_splice_to(struct file *in, loff_t __user *off_in,
 	if (!(in->f_mode & FMODE_READ))
 		return -EBADF;
 
-	if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t)))
-		return -EFAULT;
-
 	pos = in->f_pos;
 
 	ret = rw_verify_area(READ, in, &pos, len);
@@ -740,6 +732,118 @@ static long do_splice_to(struct file *in, loff_t __user *off_in,
 	return in->f_op->splice_read(in, pipe, len, flags);
 }
 
+long do_splice_direct(struct file *in, struct file *out, size_t len,
+		      unsigned int flags)
+{
+	struct pipe_inode_info *pipe;
+	long ret, bytes;
+	umode_t i_mode;
+	int i;
+
+	/*
+	 * We require the input to be a regular file or a block device, as we
+	 * don't want to randomly drop data for e.g. socket -> socket splicing.
+	 * Use the piped splicing for that!
+	 */
+	i_mode = in->f_dentry->d_inode->i_mode;
+	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
+		return -EINVAL;
+
+	/*
+	 * Neither in nor out is a pipe: set up an internal pipe cached in the
+	 * current task and move the wanted data from 'in' to 'out' through it.
+	 */
+	pipe = current->splice_pipe;
+	if (!pipe) {
+		pipe = alloc_pipe_info(NULL);
+		if (!pipe)
+			return -ENOMEM;
+
+		/*
+		 * We don't have an immediate reader, but we'll read the stuff
+		 * out of the pipe right after the move_to_pipe(). So set
+		 * PIPE_READERS appropriately.
+		 */
+		pipe->readers = 1;
+
+		current->splice_pipe = pipe;
+	}
+
+	/*
+	 * do the splice
+	 */
+	ret = 0;
+	bytes = 0;
+
+	while (len) {
+		size_t read_len, max_read_len;
+
+		/*
+		 * Do at most PIPE_BUFFERS pages worth of transfer:
+		 */
+		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
+
+		ret = do_splice_to(in, pipe, max_read_len, flags);
+		if (unlikely(ret < 0))
+			goto out_release;
+
+		read_len = ret;
+
+		/*
+		 * NOTE: nonblocking mode only applies to the input. We
+		 * must not do the output in nonblocking mode as then we
+		 * could get stuck data in the internal pipe:
+		 */
+		ret = do_splice_from(pipe, out, read_len,
+				     flags & ~SPLICE_F_NONBLOCK);
+		if (unlikely(ret < 0))
+			goto out_release;
+
+		bytes += ret;
+		len -= ret;
+
+		/*
+		 * In nonblocking mode, if we got back a short read then
+		 * that was due to either an IO error or due to the
+		 * pagecache entry not being there. In the IO error case
+		 * the _next_ splice attempt will produce a clean IO error
+		 * return value (not a short read), so in both cases it's
+		 * correct to break out of the loop here:
+		 */
+		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
+			break;
+	}
+
+	pipe->nrbufs = pipe->curbuf = 0;
+
+	return bytes;
+
+out_release:
+	/*
+	 * If we did an incomplete transfer we must release
+	 * the pipe buffers in question:
+	 */
+	for (i = 0; i < PIPE_BUFFERS; i++) {
+		struct pipe_buffer *buf = pipe->bufs + i;
+
+		if (buf->ops) {
+			buf->ops->release(pipe, buf);
+			buf->ops = NULL;
+		}
+	}
+	pipe->nrbufs = pipe->curbuf = 0;
+
+	/*
+	 * If we transferred some data, return the number of bytes:
+	 */
+	if (bytes > 0)
+		return bytes;
+
+	return ret;
+}
+
+EXPORT_SYMBOL(do_splice_direct);
+
 /*
  * Determine where to splice to/from.
  */
@@ -749,25 +853,33 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 {
 	struct pipe_inode_info *pipe;
 
-	if (off_out && out->f_op->llseek == no_llseek)
-		return -EINVAL;
-	if (off_in && in->f_op->llseek == no_llseek)
-		return -EINVAL;
-
 	pipe = in->f_dentry->d_inode->i_pipe;
 	if (pipe) {
 		if (off_in)
 			return -ESPIPE;
+		if (off_out) {
+			if (out->f_op->llseek == no_llseek)
+				return -EINVAL;
+			if (copy_from_user(&out->f_pos, off_out,
+					   sizeof(loff_t)))
+				return -EFAULT;
+		}
 
-		return do_splice_from(pipe, out, off_out, len, flags);
+		return do_splice_from(pipe, out, len, flags);
 	}
 
 	pipe = out->f_dentry->d_inode->i_pipe;
 	if (pipe) {
 		if (off_out)
 			return -ESPIPE;
+		if (off_in) {
+			if (in->f_op->llseek == no_llseek)
+				return -EINVAL;
+			if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t)))
+				return -EFAULT;
+		}
 
-		return do_splice_to(in, off_in, pipe, len, flags);
+		return do_splice_to(in, pipe, len, flags);
 	}
 
 	return -EINVAL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7e6454454fbd..9e8e2ee353b4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1613,6 +1613,8 @@ extern void do_generic_mapping_read(struct address_space *mapping,
 				    loff_t *, read_descriptor_t *, read_actor_t);
 extern ssize_t generic_file_splice_read(struct file *, struct pipe_inode_info *, size_t, unsigned int);
 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, size_t, unsigned int);
+extern long do_splice_direct(struct file *in, struct file *out,
+			     size_t len, unsigned int flags);
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 9cf99cb34c15..660e9d866e5d 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -58,6 +58,7 @@ void pipe_wait(struct pipe_inode_info *pipe);
 
 struct pipe_inode_info * alloc_pipe_info(struct inode * inode);
 void free_pipe_info(struct inode * inode);
+void __free_pipe_info(struct pipe_inode_info *);
 
 /*
  * splice is tied to pipes as a transport (at least for now), so we'll just
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 541f4828f5e7..e194ec75833d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -684,6 +684,7 @@ static inline void prefetch_stack(struct task_struct *t) { }
 
 struct audit_context;		/* See audit.c */
 struct mempolicy;
+struct pipe_inode_info;
 
 enum sleep_type {
 	SLEEP_NORMAL,
@@ -882,6 +883,11 @@ struct task_struct {
 
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
+
+	/*
+	 * cache last used pipe for splice
+	 */
+	struct pipe_inode_info *splice_pipe;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff --git a/kernel/exit.c b/kernel/exit.c
index 6c2eeb8f6390..1a9787ac6173 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -34,6 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/futex.h>
 #include <linux/compat.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -941,6 +942,9 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (tsk->io_context)
 		exit_io_context();
 
+	if (tsk->splice_pipe)
+		__free_pipe_info(tsk->splice_pipe);
+
 	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
 	BUG_ON(tsk->flags & PF_DEAD);
-- 
cgit v1.2.3


From 5ce74abe788a26698876e66b9c9ce7e7acc25413 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 10 Apr 2006 22:52:44 -0700
Subject: [PATCH] sched: fix interactive task starvation

Fix a starvation problem that occurs when a stream of highly interactive tasks
delays an array switch for extended periods despite EXPIRED_STARVING(rq) being
true.  AFAICT, the only choice is to enqueue awakening tasks on the expired
array in this case.

Without this patch, it can be nearly impossible to remotely login to a busy
server, and interactive shell commands can starve for minutes.

Also, convert the EXPIRED_STARVING macro into an inline function which humans
can understand.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 62 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dd153d6f8a04..2e8a146dd066 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -664,6 +664,48 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switches decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired, and switch periodically
+ * regardless, to ensure that highly interactive tasks do not starve
+ * the less fortunate for unreasonably long periods.
+ */
+static inline int expired_starving(runqueue_t *rq)
+{
+	int limit;
+
+	/*
+	 * Arrays were recently switched, all is well
+	 */
+	if (!rq->expired_timestamp)
+		return 0;
+
+	limit = STARVATION_LIMIT * rq->nr_running;
+
+	/*
+	 * It's time to switch arrays
+	 */
+	if (jiffies - rq->expired_timestamp >= limit)
+		return 1;
+
+	/*
+	 * There's a better selection in the expired array
+	 */
+	if (rq->curr->static_prio > rq->best_expired_prio)
+		return 1;
+
+	/*
+	 * All is well
+	 */
+	return 0;
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
@@ -671,7 +713,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
 {
 	prio_array_t *target = rq->active;
 
-	if (batch_task(p))
+	if (unlikely(batch_task(p) || expired_starving(rq)))
 		target = rq->expired;
 	enqueue_task(p, target);
 	rq->nr_running++;
@@ -2489,22 +2531,6 @@ unsigned long long current_sched_time(const task_t *tsk)
 	return ns;
 }
 
-/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
- */
-#define EXPIRED_STARVING(rq) \
-	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
-		(jiffies - (rq)->expired_timestamp >= \
-			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
-			((rq)->curr->static_prio > (rq)->best_expired_prio))
-
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2640,7 +2666,7 @@ void scheduler_tick(void)
 
 		if (!rq->expired_timestamp)
 			rq->expired_timestamp = jiffies;
-		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
 			enqueue_task(p, rq->expired);
 			if (p->static_prio < rq->best_expired_prio)
 				rq->best_expired_prio = p->static_prio;
-- 
cgit v1.2.3


From 8a5bc075b8d8cf7a87b3f08fad2fba0f5d13295e Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 10 Apr 2006 22:52:45 -0700
Subject: [PATCH] sched: don't awaken RT tasks on expired array

RT tasks are being awakened on the expired array when expired_starving() is
true, whereas they really should be excluded.  Fix.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2e8a146dd066..365f0b90b4de 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -713,7 +713,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
 {
 	prio_array_t *target = rq->active;
 
-	if (unlikely(batch_task(p) || expired_starving(rq)))
+	if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p))))
 		target = rq->expired;
 	enqueue_task(p, target);
 	rq->nr_running++;
-- 
cgit v1.2.3


From 3016b421534e2fa8a5eede1c12a3eba6164822f4 Mon Sep 17 00:00:00 2001
From: "Hyok S. Choi" <hyok.choi@samsung.com>
Date: Mon, 10 Apr 2006 22:53:06 -0700
Subject: [PATCH] frv: define MMU mode specific syscalls as 'cond_syscall' and
 clean up unneeded macros

For some architectures, a few syscalls are not linked in noMMU mode.  In
that case, the MMU-dependent syscalls need to be defined as
'cond_syscall'.  For example, the ARM architecture selectively links
sys_mlock depending on the mode configuration.

In the case of FRV, this has been managed by the #ifdef CONFIG_MMU macro
in arch/frv/kernel/entry.S.  However, these conditional macros are just
duplicates once the syscalls are defined as cond_syscall.  Compilation
was tested with FRV toolchains for both MMU and noMMU modes.

Signed-off-by: Hyok S. Choi <hyok.choi@samsung.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/entry.S | 26 ++++++++++----------------
 kernel/sys_ni.c         | 12 ++++++++++++
 2 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index 1d21c8d34d8a..a9b59527a741 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1170,12 +1170,6 @@ __syscall_badsys:
 # syscall vector table
 #
 ###############################################################################
-#ifdef CONFIG_MMU
-#define __MMU(X) X
-#else
-#define __MMU(X) sys_ni_syscall
-#endif
-
 	.section .rodata
 ALIGN
 	.globl		sys_call_table
@@ -1305,7 +1299,7 @@ sys_call_table:
 	.long sys_newuname
 	.long sys_ni_syscall	/* old "cacheflush" */
 	.long sys_adjtimex
-	.long __MMU(sys_mprotect) /* 125 */
+	.long sys_mprotect	/* 125 */
 	.long sys_sigprocmask
 	.long sys_ni_syscall	/* old "create_module" */
 	.long sys_init_module
@@ -1324,16 +1318,16 @@ sys_call_table:
 	.long sys_getdents
 	.long sys_select
 	.long sys_flock
-	.long __MMU(sys_msync)
+	.long sys_msync
 	.long sys_readv		/* 145 */
 	.long sys_writev
 	.long sys_getsid
 	.long sys_fdatasync
 	.long sys_sysctl
-	.long __MMU(sys_mlock)		/* 150 */
-	.long __MMU(sys_munlock)
-	.long __MMU(sys_mlockall)
-	.long __MMU(sys_munlockall)
+	.long sys_mlock		/* 150 */
+	.long sys_munlock
+	.long sys_mlockall
+	.long sys_munlockall
 	.long sys_sched_setparam
 	.long sys_sched_getparam   /* 155 */
 	.long sys_sched_setscheduler
@@ -1343,7 +1337,7 @@ sys_call_table:
 	.long sys_sched_get_priority_min  /* 160 */
 	.long sys_sched_rr_get_interval
 	.long sys_nanosleep
-	.long __MMU(sys_mremap)
+	.long sys_mremap
 	.long sys_setresuid16
 	.long sys_getresuid16	/* 165 */
 	.long sys_ni_syscall	/* for vm86 */
@@ -1398,8 +1392,8 @@ sys_call_table:
 	.long sys_setfsuid		/* 215 */
 	.long sys_setfsgid
 	.long sys_pivot_root
-	.long __MMU(sys_mincore)
-	.long __MMU(sys_madvise)
+	.long sys_mincore
+	.long sys_madvise
 	.long sys_getdents64	/* 220 */
 	.long sys_fcntl64
 	.long sys_ni_syscall	/* reserved for TUX */
@@ -1437,7 +1431,7 @@ sys_call_table:
 	.long sys_epoll_create
 	.long sys_epoll_ctl	/* 255 */
 	.long sys_epoll_wait
- 	.long __MMU(sys_remap_file_pages)
+ 	.long sys_remap_file_pages
  	.long sys_set_tid_address
  	.long sys_timer_create
  	.long sys_timer_settime		/* 260 */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d82864c4a617..5433195040f1 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -120,3 +120,15 @@ cond_syscall(sys32_sysctl);
 cond_syscall(ppc_rtas);
 cond_syscall(sys_spu_run);
 cond_syscall(sys_spu_create);
+
+/* MMU-dependent weak syscall entries */
+cond_syscall(sys_mprotect);
+cond_syscall(sys_msync);
+cond_syscall(sys_mlock);
+cond_syscall(sys_munlock);
+cond_syscall(sys_mlockall);
+cond_syscall(sys_munlockall);
+cond_syscall(sys_mincore);
+cond_syscall(sys_madvise);
+cond_syscall(sys_mremap);
+cond_syscall(sys_remap_file_pages);
-- 
cgit v1.2.3


From ba6edfcd1708da2e665f14eee76e87f39448ec40 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 10 Apr 2006 22:53:58 -0700
Subject: [PATCH] timer initialisation fix

We need the boot CPU's tvec_bases[] entry to be initialised super-early in
boot, for early_serial_setup().  That runs within setup_arch(), before even
per-cpu areas are initialised.

The patch changes tvec_bases to use compile-time initialisation, and adds a
separate array `tvec_base_done' to keep track of which CPU has had its
tvec_bases[] entry initialised (because we can no longer use the zeroness of
that tvec_bases[] entry to determine whether it has been initialised).

Thanks to Eugene Surovegin <ebs@ebshome.net> for diagnosing this.

Cc: Eugene Surovegin <ebs@ebshome.net>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 471ab8710b8f..883773788836 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -81,9 +81,10 @@ struct tvec_t_base_s {
 } ____cacheline_aligned_in_smp;
 
 typedef struct tvec_t_base_s tvec_base_t;
-static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
+
 tvec_base_t boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
@@ -1224,28 +1225,36 @@ static int __devinit init_timers_cpu(int cpu)
 {
 	int j;
 	tvec_base_t *base;
+	static char __devinitdata tvec_base_done[NR_CPUS];
 
-	base = per_cpu(tvec_bases, cpu);
-	if (!base) {
+	if (!tvec_base_done[cpu]) {
 		static char boot_done;
 
-		/*
-		 * Cannot do allocation in init_timers as that runs before the
-		 * allocator initializes (and would waste memory if there are
-		 * more possible CPUs than will ever be installed/brought up).
-		 */
 		if (boot_done) {
+			/*
+			 * The APs use this path later in boot
+			 */
 			base = kmalloc_node(sizeof(*base), GFP_KERNEL,
 						cpu_to_node(cpu));
 			if (!base)
 				return -ENOMEM;
 			memset(base, 0, sizeof(*base));
+			per_cpu(tvec_bases, cpu) = base;
 		} else {
-			base = &boot_tvec_bases;
+			/*
+			 * This is for the boot CPU - we use compile-time
+			 * static initialisation because per-cpu memory isn't
+			 * ready yet and because the memory allocators are not
+			 * initialised either.
+			 */
 			boot_done = 1;
+			base = &boot_tvec_bases;
 		}
-		per_cpu(tvec_bases, cpu) = base;
+		tvec_base_done[cpu] = 1;
+	} else {
+		base = per_cpu(tvec_bases, cpu);
 	}
+
 	spin_lock_init(&base->lock);
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
-- 
cgit v1.2.3


From aa7271076ae6547d7f370ad7e91ef86fdb318f17 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 10 Apr 2006 22:53:59 -0700
Subject: [PATCH] the scheduled unexport of panic_timeout

Implement the scheduled unexport of panic_timeout.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/feature-removal-schedule.txt | 8 --------
 include/linux/kernel.h                     | 2 +-
 kernel/panic.c                             | 1 -
 3 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 59d0c74c79c9..293fed113dff 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -71,14 +71,6 @@ Who:	Mauro Carvalho Chehab <mchehab@brturbo.com.br>
 
 ---------------------------
 
-What:	remove EXPORT_SYMBOL(panic_timeout)
-When:	April 2006
-Files:	kernel/panic.c
-Why:	No modular usage in the kernel.
-Who:	Adrian Bunk <bunk@stusta.de>
-
----------------------------
-
 What:	remove EXPORT_SYMBOL(insert_resource)
 When:	April 2006
 Files:	kernel/resource.c
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a3720f973ea5..e1bd0842f6a1 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -176,7 +176,7 @@ static inline void console_verbose(void)
 
 extern void bust_spinlocks(int yes);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
-extern __deprecated_for_modules int panic_timeout;
+extern int panic_timeout;
 extern int panic_on_oops;
 extern int tainted;
 extern const char *print_tainted(void);
diff --git a/kernel/panic.c b/kernel/panic.c
index f895c7c01d5b..cc2a4c9c36ac 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,6 @@ static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
 
 int panic_timeout;
-EXPORT_SYMBOL(panic_timeout);
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
-- 
cgit v1.2.3


From d824e66a9a427faf69c58f98dd7e1c3d1bb51c61 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Apr 2006 22:54:04 -0700
Subject: [PATCH] build kernel/irq/migration.c only if
 CONFIG_GENERIC_PENDING_IRQ is set

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/Makefile    | 3 ++-
 kernel/irq/migration.c | 5 +----
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2b33f852be3e..9f77f50d8143 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,4 +1,5 @@
 
-obj-y := handle.o manage.o spurious.o migration.o
+obj-y := handle.o manage.o spurious.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 52a8655fa080..134f9f2e0e39 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,6 +1,5 @@
-#include <linux/irq.h>
 
-#if defined(CONFIG_GENERIC_PENDING_IRQ)
+#include <linux/irq.h>
 
 void set_pending_irq(unsigned int irq, cpumask_t mask)
 {
@@ -61,5 +60,3 @@ void move_native_irq(int irq)
 	}
 	cpus_clear(pending_irq_cpumask[irq]);
 }
-
-#endif
-- 
cgit v1.2.3


From 5ef37b196467bf2f9d41e5579dd388c08b800f7c Mon Sep 17 00:00:00 2001
From: Joe Korty <joe.korty@ccur.com>
Date: Mon, 10 Apr 2006 22:54:13 -0700
Subject: [PATCH] add cpu_relax to hrtimer_cancel

Add a cpu_relax() to the hand-coded spinwait in hrtimer_cancel().

Signed-off-by: Joe Korty <joe.korty@ccur.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/hrtimer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f181ff4dd32e..d2a7296c8251 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -501,6 +501,7 @@ int hrtimer_cancel(struct hrtimer *timer)
 
 		if (ret >= 0)
 			return ret;
+		cpu_relax();
 	}
 }
 
-- 
cgit v1.2.3


From a145410dccdb44f81d3b56763ef9b6f721f4e47c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 11 Apr 2006 22:18:58 +0400
Subject: [PATCH] __group_complete_signal: remove bogus BUG_ON

Commit e56d090310d7625ecb43a1eeebd479f04affb48b

   [PATCH] RCU signal handling

made this BUG_ON() unsafe. This code runs under ->siglock,
while switch_exec_pids() takes tasklist_lock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 5ccaac505e8d..b14f895027c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -868,7 +868,6 @@ __group_complete_signal(int sig, struct task_struct *p)
 		if (t == NULL)
 			/* restart balancing at this thread */
 			t = p->signal->curr_target = p;
-		BUG_ON(t->tgid != p->tgid);
 
 		while (!wants_signal(sig, t)) {
 			t = next_thread(t);
-- 
cgit v1.2.3


From e57a5059846e55d82b86d96dde40e988598601b3 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 12 Apr 2006 16:30:20 -0700
Subject: [PATCH] fix non-leader exec under ptrace

This reverts most of commit 30e0fca6c1d7d26f3f2daa4dd2b12c51dadc778a.
It broke the case of non-leader MT exec when ptraced.
I think the bug it was intended to fix was already addressed by commit
788e05a67c343fa22f2ae1d3ca264e7f15c25eaf.

Signed-off-by: Roland McGrath <roland@redhat.com>
Acked-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ptrace.c | 7 ++-----
 kernel/signal.c | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0eeb7e66722c..4e0f0ec003f7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -56,10 +56,6 @@ void ptrace_untrace(task_t *child)
 			signal_wake_up(child, 1);
 		}
 	}
-	if (child->signal->flags & SIGNAL_GROUP_EXIT) {
-		sigaddset(&child->pending.signal, SIGKILL);
-		signal_wake_up(child, 1);
-	}
 	spin_unlock(&child->sighand->siglock);
 }
 
@@ -81,7 +77,8 @@ void __ptrace_unlink(task_t *child)
 		add_parent(child);
 	}
 
-	ptrace_untrace(child);
+	if (child->state == TASK_TRACED)
+		ptrace_untrace(child);
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index b14f895027c3..e5f8aea78ffe 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1754,9 +1754,9 @@ relock:
 			/* Let the debugger run.  */
 			ptrace_stop(signr, signr, info);
 
-			/* We're back.  Did the debugger cancel the sig or group_exit? */
+			/* We're back.  Did the debugger cancel the sig?  */
 			signr = current->exit_code;
-			if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT)
+			if (signr == 0)
 				continue;
 
 			current->exit_code = 0;
-- 
cgit v1.2.3


From 78a596b4490e17b9990d87b9d468ef5bb70daa10 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 31 Mar 2006 01:38:12 -0800
Subject: [PATCH] remove kernel/power/pm.c:pm_unregister()

Since the last user is removed in -mm, we can now remove this long deprecated
function.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pm_legacy.h |  7 -------
 kernel/power/pm.c         | 20 --------------------
 2 files changed, 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/pm_legacy.h b/include/linux/pm_legacy.h
index 1252b45face1..008932d73c35 100644
--- a/include/linux/pm_legacy.h
+++ b/include/linux/pm_legacy.h
@@ -15,11 +15,6 @@ extern int pm_active;
 struct pm_dev __deprecated *
 pm_register(pm_dev_t type, unsigned long id, pm_callback callback);
 
-/*
- * Unregister a device with power management
- */
-void __deprecated pm_unregister(struct pm_dev *dev);
-
 /*
  * Unregister all devices with matching callback
  */
@@ -41,8 +36,6 @@ static inline struct pm_dev *pm_register(pm_dev_t type,
 	return NULL;
 }
 
-static inline void pm_unregister(struct pm_dev *dev) {}
-
 static inline void pm_unregister_all(pm_callback callback) {}
 
 static inline int pm_send_all(pm_request_t rqst, void *data)
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 0f6908cce1dd..84063ac8fcfc 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,25 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
 	return dev;
 }
 
-/**
- *	pm_unregister -  unregister a device with power management
- *	@dev: device to unregister
- *
- *	Remove a device from the power management notification lists. The
- *	dev passed must be a handle previously returned by pm_register.
- */
- 
-void pm_unregister(struct pm_dev *dev)
-{
-	if (dev) {
-		mutex_lock(&pm_devs_lock);
-		list_del(&dev->entry);
-		mutex_unlock(&pm_devs_lock);
-
-		kfree(dev);
-	}
-}
-
 static void __pm_unregister(struct pm_dev *dev)
 {
 	if (dev) {
@@ -258,7 +239,6 @@ int pm_send_all(pm_request_t rqst, void *data)
 }
 
 EXPORT_SYMBOL(pm_register);
-EXPORT_SYMBOL(pm_unregister);
 EXPORT_SYMBOL(pm_unregister_all);
 EXPORT_SYMBOL(pm_send_all);
 EXPORT_SYMBOL(pm_active);
-- 
cgit v1.2.3


From 64541d19702cfdb7ea946fdc20faee849f6874b1 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 14 Apr 2006 12:43:15 -0600
Subject: [PATCH] kill unused __put_task_struct_cb

Somehow in the midst of dotting i's and crossing t's during
the merge up to rc1 we wound up keeping __put_task_struct_cb
when it should have been killed as it no longer has any users.
Sorry I probably should have caught this while it was
still in the -mm tree.

Having the old code there gets confusing when reading
through the code and trying to understand what is
happening.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 1 -
 kernel/fork.c         | 6 ------
 2 files changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3539c14e47e..b7d31e2e1729 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,7 +911,6 @@ static inline int pid_alive(struct task_struct *p)
 extern void free_task(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
-extern void __put_task_struct_cb(struct rcu_head *rhp);
 extern void __put_task_struct(struct task_struct *t);
 
 static inline void put_task_struct(struct task_struct *t)
diff --git a/kernel/fork.c b/kernel/fork.c
index 3384eb89cb1c..54b15f8cda53 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -124,12 +124,6 @@ void __put_task_struct(struct task_struct *tsk)
 		free_task(tsk);
 }
 
-void __put_task_struct_cb(struct rcu_head *rhp)
-{
-	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
-	__put_task_struct(tsk);
-}
-
 void __init fork_init(unsigned long mempages)
 {
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-- 
cgit v1.2.3


From 5e85d4abe3f43bb5362f384bab0e20ef082ce0b5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 18 Apr 2006 22:20:16 -0700
Subject: [PATCH] task: Make task list manipulations RCU safe

While we can currently walk through thread groups, process groups, and
sessions with just the rcu_read_lock, this opens the door to walking the
entire task list.

We already have all of the other RCU guarantees, so there is no cost in
doing this.  This should be enough for proc to stop taking the tasklist
lock during readdir.

prev_task was killed because it has no users, and using it will miss new
tasks when doing an rcu traversal.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             | 2 +-
 include/linux/sched.h | 3 +--
 kernel/exit.c         | 2 +-
 kernel/fork.c         | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 4121bb559739..3a79d97ac234 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -712,7 +712,7 @@ static int de_thread(struct task_struct *tsk)
 		attach_pid(current, PIDTYPE_PID,  current->pid);
 		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
 		attach_pid(current, PIDTYPE_SID,  current->signal->session);
-		list_add_tail(&current->tasks, &init_task.tasks);
+		list_add_tail_rcu(&current->tasks, &init_task.tasks);
 
 		current->group_leader = current;
 		leader->group_leader = current;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7d31e2e1729..29b7d4f87d20 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1192,8 +1192,7 @@ extern void wait_task_inactive(task_t * p);
 #define remove_parent(p)	list_del_init(&(p)->sibling)
 #define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
 
-#define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
-#define prev_task(p)	list_entry((p)->tasks.prev, struct task_struct, tasks)
+#define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
 
 #define for_each_process(p) \
 	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
diff --git a/kernel/exit.c b/kernel/exit.c
index 1a9787ac6173..f86434d7b3d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,7 +56,7 @@ static void __unhash_process(struct task_struct *p)
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
 
-		list_del_init(&p->tasks);
+		list_del_rcu(&p->tasks);
 		__get_cpu_var(process_counts)--;
 	}
 	list_del_rcu(&p->thread_group);
diff --git a/kernel/fork.c b/kernel/fork.c
index 54b15f8cda53..34515772611e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1204,7 +1204,7 @@ static task_t *copy_process(unsigned long clone_flags,
 			attach_pid(p, PIDTYPE_PGID, process_group(p));
 			attach_pid(p, PIDTYPE_SID, p->signal->session);
 
-			list_add_tail(&p->tasks, &init_task.tasks);
+			list_add_tail_rcu(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
 		}
 		attach_pid(p, PIDTYPE_PID, p->pid);
-- 
cgit v1.2.3


From 4a3b98a422a20dedf3a2a40c44892d6e7e665157 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 18 Apr 2006 22:20:29 -0700
Subject: [PATCH] swsusp: prevent possible image corruption on resume

The function free_pagedir() used by swsusp for freeing its internal data
structures clears the PG_nosave and PG_nosave_free flags for each page
being freed.

However, during resume PG_nosave_free set means that the page in
question is "unsafe" (ie.  it will be overwritten in the process of
restoring the saved system state from the image), so it should not be
used for the image data.

Therefore free_pagedir() should not clear PG_nosave_free if it's called
during resume (otherwise "unsafe" pages freed by it may be used for
storing the image data and the data may get corrupted later on).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c5863d02c89e..3eeedbb13b78 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -240,14 +240,15 @@ static void copy_data_pages(struct pbe *pblist)
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
 
-static void free_pagedir(struct pbe *pblist)
+static void free_pagedir(struct pbe *pblist, int clear_nosave_free)
 {
 	struct pbe *pbe;
 
 	while (pblist) {
 		pbe = (pblist + PB_PAGE_SKIP)->next;
 		ClearPageNosave(virt_to_page(pblist));
-		ClearPageNosaveFree(virt_to_page(pblist));
+		if (clear_nosave_free)
+			ClearPageNosaveFree(virt_to_page(pblist));
 		free_page((unsigned long)pblist);
 		pblist = pbe;
 	}
@@ -389,7 +390,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 		pbe->next = alloc_image_page(gfp_mask, safe_needed);
 	}
 	if (!pbe) { /* get_zeroed_page() failed */
-		free_pagedir(pblist);
+		free_pagedir(pblist, 1);
 		pblist = NULL;
         } else
 		create_pbe_list(pblist, nr_pages);
@@ -736,7 +737,7 @@ static int create_image(struct snapshot_handle *handle)
 		pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
 		if (pblist)
 			copy_page_backup_list(pblist, p);
-		free_pagedir(p);
+		free_pagedir(p, 0);
 		if (!pblist)
 			error = -ENOMEM;
 	}
-- 
cgit v1.2.3


From 5a7b46b369419493bab4de67b1526e9f76b22a7f Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 20 Apr 2006 06:41:39 +0900
Subject: [PATCH] Add more prevent_tail_call()

Those also break userland regs, as in the following:

   00000000 <sys_chown16>:
      0:	0f b7 44 24 0c       	movzwl 0xc(%esp),%eax
      5:	83 ca ff             	or     $0xffffffff,%edx
      8:	0f b7 4c 24 08       	movzwl 0x8(%esp),%ecx
      d:	66 83 f8 ff          	cmp    $0xffffffff,%ax
     11:	0f 44 c2             	cmove  %edx,%eax
     14:	66 83 f9 ff          	cmp    $0xffffffff,%cx
     18:	0f 45 d1             	cmovne %ecx,%edx
     1b:	89 44 24 0c          	mov    %eax,0xc(%esp)
     1f:	89 54 24 08          	mov    %edx,0x8(%esp)
     23:	e9 fc ff ff ff       	jmp    24 <sys_chown16+0x24>

where the tailcall at the end overwrites the incoming stack-frame.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
[ I would _really_ like to have a way to tell gcc about calling
  conventions. The "prevent_tail_call()" macro is pretty ugly ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/uid16.c | 59 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/uid16.c b/kernel/uid16.c
index aa25605027c8..187e2a423878 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -20,43 +20,67 @@
 
 asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
 {
-	return sys_chown(filename, low2highuid(user), low2highgid(group));
+	long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
 {
-	return sys_lchown(filename, low2highuid(user), low2highgid(group));
+	long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
 {
-	return sys_fchown(fd, low2highuid(user), low2highgid(group));
+	long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
 {
-	return sys_setregid(low2highgid(rgid), low2highgid(egid));
+	long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_setgid16(old_gid_t gid)
 {
-	return sys_setgid(low2highgid(gid));
+	long ret = sys_setgid(low2highgid(gid));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
 {
-	return sys_setreuid(low2highuid(ruid), low2highuid(euid));
+	long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_setuid16(old_uid_t uid)
 {
-	return sys_setuid(low2highuid(uid));
+	long ret = sys_setuid(low2highuid(uid));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
 {
-	return sys_setresuid(low2highuid(ruid), low2highuid(euid),
-		low2highuid(suid));
+	long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
+				 low2highuid(suid));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
@@ -72,8 +96,11 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
 
 asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
 {
-	return sys_setresgid(low2highgid(rgid), low2highgid(egid),
-		low2highgid(sgid));
+	long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
+				 low2highgid(sgid));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
@@ -89,12 +116,18 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
 
 asmlinkage long sys_setfsuid16(old_uid_t uid)
 {
-	return sys_setfsuid(low2highuid(uid));
+	long ret = sys_setfsuid(low2highuid(uid));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 asmlinkage long sys_setfsgid16(old_gid_t gid)
 {
-	return sys_setfsgid(low2highgid(gid));
+	long ret = sys_setfsgid(low2highgid(gid));
+	/* avoid REGPARM breakage on x86: */
+	prevent_tail_call(ret);
+	return ret;
 }
 
 static int groups16_to_user(old_gid_t __user *grouplist,
-- 
cgit v1.2.3


From a0aa7f68afeeb92f6274b395177c20e617c8ed2d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 20 Apr 2006 13:05:33 +0200
Subject: [PATCH] Don't inherit ->splice_pipe across forks

It's really task private, so clear that field on fork after copying
task structure.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 34515772611e..d2fa57d480d4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -180,6 +180,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
 	tsk->btrace_seq = 0;
+	tsk->splice_pipe = NULL;
 	return tsk;
 }
 
-- 
cgit v1.2.3


From 7522a8423bed9931cbac5502b9c0657bde2700ea Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Thu, 20 Apr 2006 02:43:11 -0700
Subject: [PATCH] kprobes: NULL out non-relevant fields in struct kretprobe

In cases where a struct kretprobe's *_handler fields are non-NULL, it is
possible to cause a system crash, due to the possibility of calls ending up
in zombie functions.  Documentation clearly states that unused *_handlers
should be set to NULL, but kprobe users sometimes fail to do so.

Fix it by setting the non-relevant fields of the struct kretprobe to NULL.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1156eb0977d0..1fbf466a29aa 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -585,6 +585,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
 	int i;
 
 	rp->kp.pre_handler = pre_handler_kretprobe;
+	rp->kp.post_handler = NULL;
+	rp->kp.fault_handler = NULL;
+	rp->kp.break_handler = NULL;
 
 	/* Pre-allocate memory for max kretprobe instances */
 	if (rp->maxactive <= 0) {
-- 
cgit v1.2.3