From c672af35d54992b88d3c48133bd62cc3386fb2f9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:36:59 +0100 Subject: signal: Fix SIGCONT notification code After a task receives SIGCONT, its parent is notified via SIGCHLD with its siginfo describing what the notified event is. If SIGCONT is received while the child process is stopped, the code should be CLD_CONTINUED. If SIGCONT is recieved while the child process is in the process of being stopped, it should be CLD_STOPPED. Which code to use is determined in prepare_signal() and recorded in signal->flags using SIGNAL_CLD_CONTINUED|STOP flags. get_signal_deliver() should test these flags and then notify accoringly; however, it incorrectly tested SIGNAL_STOP_CONTINUED instead of SIGNAL_CLD_CONTINUED, thus incorrectly notifying CLD_CONTINUED if the signal is delivered before the task is wait(2)ed and CLD_STOPPED if the state was fetched already. Fix it by testing SIGNAL_CLD_CONTINUED. While at it, uncompress the ?: test into if/else clause for better readability. Signed-off-by: Tejun Heo Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath --- kernel/signal.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 31751868de88..e26274abf3a9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1853,8 +1853,13 @@ relock: * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { - int why = (signal->flags & SIGNAL_STOP_CONTINUED) - ? CLD_CONTINUED : CLD_STOPPED; + int why; + + if (signal->flags & SIGNAL_CLD_CONTINUED) + why = CLD_CONTINUED; + else + why = CLD_STOPPED; + signal->flags &= ~SIGNAL_CLD_MASK; why = tracehook_notify_jctl(why, CLD_CONTINUED); -- cgit v1.2.3 From 9f2bf6513a6cca0b00cbbf67ba6197017cfba548 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: ptrace: Remove the extra wake_up_state() from ptrace_detach() This wake_up_state() has a turbulent history. This is a remnant from ancient ptrace implementation and patently wrong. Commit 95a3540d (ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic) removed it but the change was reverted later by commit edaba2c5 (ptrace: revert "ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic") citing compatibility breakage and general brokeness of the whole group stop / ptrace interaction. Then, recently, it got converted from wake_up_process() to wake_up_state() to make it less dangerous. Digging through the mailing archives, the compatibility breakage doesn't seem to be critical in the sense that the behavior isn't well defined or reliable to begin with and it seems to have been agreed to remove the wakeup with proper cleanup of the whole thing. Now that the group stop and its interaction with ptrace are being cleaned up, it's high time to finally kill this silliness. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath --- kernel/ptrace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e2302e40b360..6acf8954017c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -312,8 +312,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) if (child->ptrace) { child->exit_code = data; dead = __ptrace_detach(current, child); - if (!child->exit_state) - wake_up_state(child, TASK_TRACED | TASK_STOPPED); } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 71db5eb99c960e9c30e4b3ed04103c513b6251b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: signal: Remove superflous try_to_freeze() loop in do_signal_stop() do_signal_stop() is used only by get_signal_to_deliver() and after a successful signal stop, it always calls try_to_freeze(), so the try_to_freeze() loop around schedule() in do_signal_stop() is superflous and confusing. Remove it. Signed-off-by: Tejun Heo Acked-by: Rafael J. Wysocki Acked-by: Oleg Nesterov Acked-by: Roland McGrath --- kernel/signal.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e26274abf3a9..f4db76986ec1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1781,9 +1781,7 @@ static int do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ - do { - schedule(); - } while (try_to_freeze()); + schedule(); tracehook_finish_jctl(); current->exit_code = 0; -- cgit v1.2.3 From edf2ed153bcae52de70db00a98b0e81a5668e563 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: ptrace: Kill tracehook_notify_jctl() tracehook_notify_jctl() aids in determining whether and what to report to the parent when a task is stopped or continued. The function also adds an extra requirement that siglock may be released across it, which is currently unused and quite difficult to satisfy in well-defined manner. As job control and the notifications are about to receive major overhaul, remove the tracehook and open code it. If ever necessary, let's factor it out after the overhaul. * Oleg spotted incorrect CLD_CONTINUED/STOPPED selection when ptraced. Fixed. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Roland McGrath --- include/linux/tracehook.h | 27 --------------------------- kernel/signal.c | 34 ++++++++++++++-------------------- 2 files changed, 14 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3a2e66d88a32..b073f3c8adc3 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -468,33 +468,6 @@ static inline int tracehook_get_signal(struct task_struct *task, return 0; } -/** - * tracehook_notify_jctl - report about job control stop/continue - * @notify: zero, %CLD_STOPPED or %CLD_CONTINUED - * @why: %CLD_STOPPED or %CLD_CONTINUED - * - * This is called when we might call do_notify_parent_cldstop(). - * - * @notify is zero if we would not ordinarily send a %SIGCHLD, - * or is the %CLD_STOPPED or %CLD_CONTINUED .si_code for %SIGCHLD. - * - * @why is %CLD_STOPPED when about to stop for job control; - * we are already in %TASK_STOPPED state, about to call schedule(). - * It might also be that we have just exited (check %PF_EXITING), - * but need to report that a group-wide stop is complete. - * - * @why is %CLD_CONTINUED when waking up after job control stop and - * ready to make a delayed @notify report. - * - * Return the %CLD_* value for %SIGCHLD, or zero to generate no signal. - * - * Called with the siglock held. - */ -static inline int tracehook_notify_jctl(int notify, int why) -{ - return notify ?: (current->ptrace & PT_PTRACED) ? why : 0; -} - /** * tracehook_finish_jctl - report about return from job control stop * diff --git a/kernel/signal.c b/kernel/signal.c index f4db76986ec1..03d874e1058f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1727,7 +1727,7 @@ void ptrace_notify(int exit_code) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int notify; + int notify = 0; if (!sig->group_stop_count) { struct task_struct *t; @@ -1759,19 +1759,16 @@ static int do_signal_stop(int signr) * a group stop in progress and we are the last to stop, report * to the parent. When ptraced, every thread reports itself. */ - notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; - notify = tracehook_notify_jctl(notify, CLD_STOPPED); - /* - * tracehook_notify_jctl() can drop and reacquire siglock, so - * we keep ->group_stop_count != 0 before the call. If SIGCONT - * or SIGKILL comes in between ->group_stop_count == 0. - */ - if (sig->group_stop_count) { - if (!--sig->group_stop_count) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); + if (!--sig->group_stop_count) { + sig->flags = SIGNAL_STOP_STOPPED; + notify = CLD_STOPPED; } + if (task_ptrace(current)) + notify = CLD_STOPPED; + + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); if (notify) { @@ -1860,14 +1857,11 @@ relock: signal->flags &= ~SIGNAL_CLD_MASK; - why = tracehook_notify_jctl(why, CLD_CONTINUED); spin_unlock_irq(&sighand->siglock); - if (why) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); - } + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); goto relock; } @@ -2034,7 +2028,7 @@ void exit_signals(struct task_struct *tsk) if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); + group_stop = CLD_STOPPED; } out: spin_unlock_irq(&tsk->sighand->siglock); -- cgit v1.2.3 From fe1bc6a0954611b806f9e158eb0817cf8ba21660 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: ptrace: Add @why to ptrace_stop() To prepare for cleanup of the interaction between group stop and ptrace, add @why to ptrace_stop(). Existing users are updated such that there is no behavior change. Signed-off-by: Tejun Heo Acked-by: Roland McGrath --- kernel/signal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 03d874e1058f..95ac42dc3bcb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1617,7 +1617,7 @@ static int sigkill_pending(struct task_struct *tsk) * If we actually decide not to stop at all because the tracer * is gone, we keep current->exit_code unless clear_code. */ -static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) +static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { @@ -1655,7 +1655,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); if (may_ptrace_stop()) { - do_notify_parent_cldstop(current, CLD_TRAPPED); + do_notify_parent_cldstop(current, why); /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -1714,7 +1714,7 @@ void ptrace_notify(int exit_code) /* Let the debugger run. */ spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, 1, &info); + ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); spin_unlock_irq(¤t->sighand->siglock); } @@ -1795,7 +1795,7 @@ static int ptrace_signal(int signr, siginfo_t *info, ptrace_signal_deliver(regs, cookie); /* Let the debugger run. */ - ptrace_stop(signr, 0, info); + ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ signr = current->exit_code; -- cgit v1.2.3 From e5c1902e9260a0075ea52cb5ef627a8d9aaede89 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath --- include/linux/sched.h | 6 +++++ kernel/signal.c | 62 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b601be3dace..85f51042c2b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1260,6 +1260,7 @@ struct task_struct { int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ + unsigned int group_stop; /* GROUP_STOP_*, siglock protected */ /* ??? */ unsigned int personality; unsigned did_exec:1; @@ -1771,6 +1772,11 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) +/* + * task->group_stop flags + */ +#define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ + #ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ diff --git a/kernel/signal.c b/kernel/signal.c index 95ac42dc3bcb..ecb20089eaff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -223,6 +223,52 @@ static inline void print_dropped_signal(int sig) current->comm, current->pid, sig); } +/** + * task_clear_group_stop_pending - clear pending group stop + * @task: target task + * + * Clear group stop states for @task. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static void task_clear_group_stop_pending(struct task_struct *task) +{ + task->group_stop &= ~GROUP_STOP_CONSUME; +} + +/** + * task_participate_group_stop - participate in a group stop + * @task: task participating in a group stop + * + * @task is participating in a group stop. Group stop states are cleared + * and the group stop count is consumed if %GROUP_STOP_CONSUME was set. If + * the consumption completes the group stop, the appropriate %SIGNAL_* + * flags are set. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static bool task_participate_group_stop(struct task_struct *task) +{ + struct signal_struct *sig = task->signal; + bool consume = task->group_stop & GROUP_STOP_CONSUME; + + task_clear_group_stop_pending(task); + + if (!consume) + return false; + + if (!WARN_ON_ONCE(sig->group_stop_count == 0)) + sig->group_stop_count--; + + if (!sig->group_stop_count) { + sig->flags = SIGNAL_STOP_STOPPED; + return true; + } + return false; +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an @@ -1645,7 +1691,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * we must participate in the bookkeeping. */ if (current->signal->group_stop_count > 0) - --current->signal->group_stop_count; + task_participate_group_stop(current); current->last_siginfo = info; current->exit_code = exit_code; @@ -1730,6 +1776,7 @@ static int do_signal_stop(int signr) int notify = 0; if (!sig->group_stop_count) { + unsigned int gstop = GROUP_STOP_CONSUME; struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1741,6 +1788,7 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; + current->group_stop = gstop; sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) /* @@ -1750,19 +1798,19 @@ static int do_signal_stop(int signr) */ if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { + t->group_stop = gstop; sig->group_stop_count++; signal_wake_up(t, 0); - } + } else + task_clear_group_stop_pending(t); } /* * If there are no other threads in the group, or if there is * a group stop in progress and we are the last to stop, report * to the parent. When ptraced, every thread reports itself. */ - if (!--sig->group_stop_count) { - sig->flags = SIGNAL_STOP_STOPPED; + if (task_participate_group_stop(current)) notify = CLD_STOPPED; - } if (task_ptrace(current)) notify = CLD_STOPPED; @@ -2026,10 +2074,8 @@ void exit_signals(struct task_struct *tsk) recalc_sigpending_and_wake(t); if (unlikely(tsk->signal->group_stop_count) && - !--tsk->signal->group_stop_count) { - tsk->signal->flags = SIGNAL_STOP_STOPPED; + task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; - } out: spin_unlock_irq(&tsk->sighand->siglock); -- cgit v1.2.3 From 39efa3ef3a376a4e53de2f82fc91182459d34200 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: signal: Use GROUP_STOP_PENDING to stop once for a single group stop Currently task->signal->group_stop_count is used to decide whether to stop for group stop. However, if there is a task in the group which is taking a long time to stop, other tasks which are continued by ptrace would repeatedly stop for the same group stop until the group stop is complete. Conversely, if a ptraced task is in TASK_TRACED state, the debugger won't get notified of group stops which is inconsistent compared to the ptraced task in any other state. This patch introduces GROUP_STOP_PENDING which tracks whether a task is yet to stop for the group stop in progress. The flag is set when a group stop starts and cleared when the task stops the first time for the group stop, and consulted whenever whether the task should participate in a group stop needs to be determined. Note that now tasks in TASK_TRACED also participate in group stop. This results in the following behavior changes. * For a single group stop, a ptracer would see at most one stop reported. * A ptracee in TASK_TRACED now also participates in group stop and the tracer would get the notification. However, as a ptraced task could be in TASK_STOPPED state or any ptrace trap could consume group stop, the notification may still be missing. These will be addressed with further patches. * A ptracee may start a group stop while one is still in progress if the tracer let it continue with stop signal delivery. Group stop code handles this correctly. Oleg: * Spotted that a task might skip signal check even when its GROUP_STOP_PENDING is set. Fixed by updating recalc_sigpending_tsk() to check GROUP_STOP_PENDING instead of group_stop_count. * Pointed out that task->group_stop should be cleared whenever task->signal->group_stop_count is cleared. Fixed accordingly. * Pointed out the behavior inconsistency between TASK_TRACED and RUNNING and the last behavior change. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath --- fs/exec.c | 1 + include/linux/sched.h | 3 +++ kernel/signal.c | 36 +++++++++++++++++++++--------------- 3 files changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 5e62d26a4fec..8328beb9016f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1659,6 +1659,7 @@ static int zap_process(struct task_struct *start, int exit_code) t = start; do { + task_clear_group_stop_pending(t); if (t != current && t->mm) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); diff --git a/include/linux/sched.h b/include/linux/sched.h index 85f51042c2b8..b2a17dfbdbad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1775,8 +1775,11 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * task->group_stop flags */ +#define GROUP_STOP_PENDING (1 << 16) /* task should stop for group stop */ #define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ +extern void task_clear_group_stop_pending(struct task_struct *task); + #ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ diff --git a/kernel/signal.c b/kernel/signal.c index ecb20089eaff..a2e7a6527d24 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if (t->signal->group_stop_count > 0 || + if ((t->group_stop & GROUP_STOP_PENDING) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); @@ -232,19 +232,19 @@ static inline void print_dropped_signal(int sig) * CONTEXT: * Must be called with @task->sighand->siglock held. */ -static void task_clear_group_stop_pending(struct task_struct *task) +void task_clear_group_stop_pending(struct task_struct *task) { - task->group_stop &= ~GROUP_STOP_CONSUME; + task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * - * @task is participating in a group stop. Group stop states are cleared - * and the group stop count is consumed if %GROUP_STOP_CONSUME was set. If - * the consumption completes the group stop, the appropriate %SIGNAL_* - * flags are set. + * @task has GROUP_STOP_PENDING set and is participating in a group stop. + * Group stop states are cleared and the group stop count is consumed if + * %GROUP_STOP_CONSUME was set. If the consumption completes the group + * stop, the appropriate %SIGNAL_* flags are set. * * CONTEXT: * Must be called with @task->sighand->siglock held. @@ -254,6 +254,8 @@ static bool task_participate_group_stop(struct task_struct *task) struct signal_struct *sig = task->signal; bool consume = task->group_stop & GROUP_STOP_CONSUME; + WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); + task_clear_group_stop_pending(task); if (!consume) @@ -765,6 +767,9 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) t = p; do { unsigned int state; + + task_clear_group_stop_pending(t); + rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); /* * If there is a handler for SIGCONT, we must make @@ -906,6 +911,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { + task_clear_group_stop_pending(t); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); @@ -1139,6 +1145,7 @@ int zap_other_threads(struct task_struct *p) p->signal->group_stop_count = 0; while_each_thread(p, t) { + task_clear_group_stop_pending(t); count++; /* Don't bother with already dead threads */ @@ -1690,7 +1697,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * If there is a group stop in progress, * we must participate in the bookkeeping. */ - if (current->signal->group_stop_count > 0) + if (current->group_stop & GROUP_STOP_PENDING) task_participate_group_stop(current); current->last_siginfo = info; @@ -1775,8 +1782,8 @@ static int do_signal_stop(int signr) struct signal_struct *sig = current->signal; int notify = 0; - if (!sig->group_stop_count) { - unsigned int gstop = GROUP_STOP_CONSUME; + if (!(current->group_stop & GROUP_STOP_PENDING)) { + unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1796,8 +1803,7 @@ static int do_signal_stop(int signr) * stop is always done with the siglock held, * so this check has no races. */ - if (!(t->flags & PF_EXITING) && - !task_is_stopped_or_traced(t)) { + if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { t->group_stop = gstop; sig->group_stop_count++; signal_wake_up(t, 0); @@ -1926,8 +1932,8 @@ relock: if (unlikely(signr != 0)) ka = return_ka; else { - if (unlikely(signal->group_stop_count > 0) && - do_signal_stop(0)) + if (unlikely(current->group_stop & + GROUP_STOP_PENDING) && do_signal_stop(0)) goto relock; signr = dequeue_signal(current, ¤t->blocked, @@ -2073,7 +2079,7 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(t) && !(t->flags & PF_EXITING)) recalc_sigpending_and_wake(t); - if (unlikely(tsk->signal->group_stop_count) && + if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: -- cgit v1.2.3 From 0ae8ce1c8c5b9007ce6bfc83ec2aa0dfce5bbed3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: ptrace: Participate in group stop from ptrace_stop() iff the task is trapping for group stop Currently, ptrace_stop() unconditionally participates in group stop bookkeeping. This is unnecessary and inaccurate. Make it only participate if the task is trapping for group stop - ie. if @why is CLD_STOPPED. As ptrace_stop() currently is not used when trapping for group stop, this equals to disabling group stop participation from ptrace_stop(). A visible behavior change is increased likelihood of delayed group stop completion if the thread group contains one or more ptraced tasks. This is to preapre for further cleanup of the interaction between group stop and ptrace. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath --- kernel/signal.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a2e7a6527d24..9f36dd2e8d5a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1694,10 +1694,13 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) } /* - * If there is a group stop in progress, - * we must participate in the bookkeeping. + * If @why is CLD_STOPPED, we're trapping to participate in a group + * stop. Do the bookkeeping. Note that if SIGCONT was delievered + * while siglock was released for the arch hook, PENDING could be + * clear now. We act as if SIGCONT is received after TASK_TRACED + * is entered - ignore it. */ - if (current->group_stop & GROUP_STOP_PENDING) + if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) task_participate_group_stop(current); current->last_siginfo = info; -- cgit v1.2.3 From 5224fa3660ad3881d2f2ad726d22614117963f10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: ptrace: Make do_signal_stop() use ptrace_stop() if the task is being ptraced A ptraced task would still stop at do_signal_stop() when it's stopping for stop signals and do_signal_stop() behaves the same whether the task is ptraced or not. However, in addition to stopping, ptrace_stop() also does ptrace specific stuff like calling architecture specific callbacks, so this behavior makes the code more fragile and difficult to understand. This patch makes do_signal_stop() test whether the task is ptraced and use ptrace_stop() if so. This renders tracehook_notify_jctl() rather pointless as the ptrace notification is now handled by ptrace_stop() regardless of the return value from the tracehook. It probably is a good idea to update it. This doesn't solve the whole problem as tasks already in stopped state would stay in the regular stop when ptrace attached. That part will be handled by the next patch. Oleg pointed out that this makes a userland-visible change. Before, SIGCONT would be able to wake up a task in group stop even if the task is ptraced if the tracer hasn't issued another ptrace command afterwards (as the next ptrace commands transitions the state into TASK_TRACED which ignores SIGCONT wakeups). With this and the next patch, SIGCONT may race with the transition into TASK_TRACED and is ignored if the tracee already entered TASK_TRACED. Another userland visible change of this and the next patch is that the ptracee's state would now be TASK_TRACED where it used to be TASK_STOPPED, which is visible via fs/proc. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath Cc: Jan Kratochvil --- kernel/signal.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9f36dd2e8d5a..418776c41d24 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1783,7 +1783,6 @@ void ptrace_notify(int exit_code) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int notify = 0; if (!(current->group_stop & GROUP_STOP_PENDING)) { unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; @@ -1813,29 +1812,37 @@ static int do_signal_stop(int signr) } else task_clear_group_stop_pending(t); } - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, report - * to the parent. When ptraced, every thread reports itself. - */ - if (task_participate_group_stop(current)) - notify = CLD_STOPPED; - if (task_ptrace(current)) - notify = CLD_STOPPED; current->exit_code = sig->group_exit_code; __set_current_state(TASK_STOPPED); - spin_unlock_irq(¤t->sighand->siglock); + if (likely(!task_ptrace(current))) { + int notify = 0; - if (notify) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, notify); - read_unlock(&tasklist_lock); - } + /* + * If there are no other threads in the group, or if there + * is a group stop in progress and we are the last to stop, + * report to the parent. + */ + if (task_participate_group_stop(current)) + notify = CLD_STOPPED; - /* Now we don't run again until woken by SIGCONT or SIGKILL */ - schedule(); + spin_unlock_irq(¤t->sighand->siglock); + + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + schedule(); + + spin_lock_irq(¤t->sighand->siglock); + } else + ptrace_stop(current->exit_code, CLD_STOPPED, 0, NULL); + + spin_unlock_irq(¤t->sighand->siglock); tracehook_finish_jctl(); current->exit_code = 0; -- cgit v1.2.3 From d79fdd6d96f46fabb779d86332e3677c6f5c2a4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:00 +0100 Subject: ptrace: Clean transitions between TASK_STOPPED and TRACED Currently, if the task is STOPPED on ptrace attach, it's left alone and the state is silently changed to TRACED on the next ptrace call. The behavior breaks the assumption that arch_ptrace_stop() is called before any task is poked by ptrace and is ugly in that a task manipulates the state of another task directly. With GROUP_STOP_PENDING, the transitions between TASK_STOPPED and TRACED can be made clean. The tracer can use the flag to tell the tracee to retry stop on attach and detach. On retry, the tracee will enter the desired state in the correct way. The lower 16bits of task->group_stop is used to remember the signal number which caused the last group stop. This is used while retrying for ptrace attach as the original group_exit_code could have been consumed with wait(2) by then. As the real parent may wait(2) and consume the group_exit_code anytime, the group_exit_code needs to be saved separately so that it can be used when switching from regular sleep to ptrace_stop(). This is recorded in the lower 16bits of task->group_stop. If a task is already stopped and there's no intervening SIGCONT, a ptrace request immediately following a successful PTRACE_ATTACH should always succeed even if the tracer doesn't wait(2) for attach completion; however, with this change, the tracee might still be TASK_RUNNING trying to enter TASK_TRACED which would cause the following request to fail with -ESRCH. This intermediate state is hidden from the ptracer by setting GROUP_STOP_TRAPPING on attach and making ptrace_check_attach() wait for it to clear on its signal->wait_chldexit. Completing the transition or getting killed clears TRAPPING and wakes up the tracer. Note that the STOPPED -> RUNNING -> TRACED transition is still visible to other threads which are in the same group as the ptracer and the reverse transition is visible to all. Please read the comments for details. Oleg: * Spotted a race condition where a task may retry group stop without proper bookkeeping. Fixed by redoing bookkeeping on retry. * Spotted that the transition is visible to userland in several different ways. Most are fixed with GROUP_STOP_TRAPPING. Unhandled corner case is documented. * Pointed out not setting GROUP_STOP_SIGMASK on an already stopped task would result in more consistent behavior. * Pointed out that calling ptrace_stop() from do_signal_stop() in TASK_STOPPED can race with group stop start logic and then confuse the TRAPPING wait in ptrace_check_attach(). ptrace_stop() is now called with TASK_RUNNING. * Suggested using signal->wait_chldexit instead of bit wait. * Spotted a race condition between TRACED transition and clearing of TRAPPING. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Roland McGrath Cc: Jan Kratochvil --- include/linux/sched.h | 2 ++ kernel/ptrace.c | 49 ++++++++++++++++++++++++++++---- kernel/signal.c | 79 ++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 112 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b2a17dfbdbad..456d80ed3b78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1775,8 +1775,10 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * task->group_stop flags */ +#define GROUP_STOP_SIGMASK 0xffff /* signr of the last group stop */ #define GROUP_STOP_PENDING (1 << 16) /* task should stop for group stop */ #define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ +#define GROUP_STOP_TRAPPING (1 << 18) /* switching from STOPPED to TRACED */ extern void task_clear_group_stop_pending(struct task_struct *task); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6acf8954017c..745fc2dd00c5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -49,14 +49,22 @@ static void ptrace_untrace(struct task_struct *child) spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { /* - * If the group stop is completed or in progress, - * this thread was already counted as stopped. + * If group stop is completed or in progress, it should + * participate in the group stop. Set GROUP_STOP_PENDING + * before kicking it. + * + * This involves TRACED -> RUNNING -> STOPPED transition + * which is similar to but in the opposite direction of + * what happens while attaching to a stopped task. + * However, in this direction, the intermediate RUNNING + * state is not hidden even from the current ptracer and if + * it immediately re-attaches and performs a WNOHANG + * wait(2), it may fail. */ if (child->signal->flags & SIGNAL_STOP_STOPPED || child->signal->group_stop_count) - __set_task_state(child, TASK_STOPPED); - else - signal_wake_up(child, 1); + child->group_stop |= GROUP_STOP_PENDING; + signal_wake_up(child, 1); } spin_unlock(&child->sighand->siglock); } @@ -165,6 +173,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) static int ptrace_attach(struct task_struct *task) { + bool wait_trap = false; int retval; audit_ptrace(task); @@ -204,12 +213,42 @@ static int ptrace_attach(struct task_struct *task) __ptrace_link(task, current); send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); + spin_lock(&task->sighand->siglock); + + /* + * If the task is already STOPPED, set GROUP_STOP_PENDING and + * TRAPPING, and kick it so that it transits to TRACED. TRAPPING + * will be cleared if the child completes the transition or any + * event which clears the group stop states happens. We'll wait + * for the transition to complete before returning from this + * function. + * + * This hides STOPPED -> RUNNING -> TRACED transition from the + * attaching thread but a different thread in the same group can + * still observe the transient RUNNING state. IOW, if another + * thread's WNOHANG wait(2) on the stopped tracee races against + * ATTACH, the wait(2) may fail due to the transient RUNNING. + * + * The following task_is_stopped() test is safe as both transitions + * in and out of STOPPED are protected by siglock. + */ + if (task_is_stopped(task)) { + task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; + signal_wake_up(task, 1); + wait_trap = true; + } + + spin_unlock(&task->sighand->siglock); + retval = 0; unlock_tasklist: write_unlock_irq(&tasklist_lock); unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: + if (wait_trap) + wait_event(current->signal->wait_chldexit, + !(task->group_stop & GROUP_STOP_TRAPPING)); return retval; } diff --git a/kernel/signal.c b/kernel/signal.c index 418776c41d24..1e199919ae09 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -223,6 +223,26 @@ static inline void print_dropped_signal(int sig) current->comm, current->pid, sig); } +/** + * task_clear_group_stop_trapping - clear group stop trapping bit + * @task: target task + * + * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it + * and wake up the ptracer. Note that we don't need any further locking. + * @task->siglock guarantees that @task->parent points to the ptracer. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static void task_clear_group_stop_trapping(struct task_struct *task) +{ + if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { + task->group_stop &= ~GROUP_STOP_TRAPPING; + __wake_up_sync(&task->parent->signal->wait_chldexit, + TASK_UNINTERRUPTIBLE, 1); + } +} + /** * task_clear_group_stop_pending - clear pending group stop * @task: target task @@ -1706,8 +1726,20 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) current->last_siginfo = info; current->exit_code = exit_code; - /* Let the debugger run. */ - __set_current_state(TASK_TRACED); + /* + * TRACED should be visible before TRAPPING is cleared; otherwise, + * the tracer might fail do_wait(). + */ + set_current_state(TASK_TRACED); + + /* + * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and + * transition to TASK_TRACED should be atomic with respect to + * siglock. This hsould be done after the arch hook as siglock is + * released and regrabbed across it. + */ + task_clear_group_stop_trapping(current); + spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); if (may_ptrace_stop()) { @@ -1788,6 +1820,9 @@ static int do_signal_stop(int signr) unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; struct task_struct *t; + /* signr will be recorded in task->group_stop for retries */ + WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); + if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) return 0; @@ -1797,25 +1832,27 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; - current->group_stop = gstop; + current->group_stop &= ~GROUP_STOP_SIGMASK; + current->group_stop |= signr | gstop; sig->group_stop_count = 1; - for (t = next_thread(current); t != current; t = next_thread(t)) + for (t = next_thread(current); t != current; + t = next_thread(t)) { + t->group_stop &= ~GROUP_STOP_SIGMASK; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { - t->group_stop = gstop; + t->group_stop |= signr | gstop; sig->group_stop_count++; signal_wake_up(t, 0); - } else + } else { task_clear_group_stop_pending(t); + } + } } - - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); - +retry: if (likely(!task_ptrace(current))) { int notify = 0; @@ -1827,6 +1864,7 @@ static int do_signal_stop(int signr) if (task_participate_group_stop(current)) notify = CLD_STOPPED; + __set_current_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); if (notify) { @@ -1839,13 +1877,28 @@ static int do_signal_stop(int signr) schedule(); spin_lock_irq(¤t->sighand->siglock); - } else - ptrace_stop(current->exit_code, CLD_STOPPED, 0, NULL); + } else { + ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, + CLD_STOPPED, 0, NULL); + current->exit_code = 0; + } + + /* + * GROUP_STOP_PENDING could be set if another group stop has + * started since being woken up or ptrace wants us to transit + * between TASK_STOPPED and TRACED. Retry group stop. + */ + if (current->group_stop & GROUP_STOP_PENDING) { + WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); + goto retry; + } + + /* PTRACE_ATTACH might have raced with task killing, clear trapping */ + task_clear_group_stop_trapping(current); spin_unlock_irq(¤t->sighand->siglock); tracehook_finish_jctl(); - current->exit_code = 0; return 1; } -- cgit v1.2.3 From e3bd058f62896ec7a2c605ed62a0a811e9147947 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: ptrace: Collapse ptrace_untrace() into __ptrace_unlink() Remove the extra task_is_traced() check in __ptrace_unlink() and collapse ptrace_untrace() into __ptrace_unlink(). This is to prepare for further changes. While at it, drop the comment on top of ptrace_untrace() and convert __ptrace_unlink() comment to docbook format. Detailed comment will be added by the next patch. This patch doesn't cause any visible behavior changes. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/ptrace.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 745fc2dd00c5..e6098434b533 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -37,15 +37,23 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) child->parent = new_parent; } -/* - * Turn a tracing stop into a normal stop now, since with no tracer there - * would be no way to wake it up with SIGCONT or SIGKILL. If there was a - * signal sent that would resume the child, but didn't because it was in - * TASK_TRACED, resume it now. - * Requires that irqs be disabled. +/** + * __ptrace_unlink - unlink ptracee and restore its execution state + * @child: ptracee to be unlinked + * + * Remove @child from the ptrace list, move it back to the original parent. + * + * CONTEXT: + * write_lock_irq(tasklist_lock) */ -static void ptrace_untrace(struct task_struct *child) +void __ptrace_unlink(struct task_struct *child) { + BUG_ON(!child->ptrace); + + child->ptrace = 0; + child->parent = child->real_parent; + list_del_init(&child->ptrace_entry); + spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { /* @@ -69,24 +77,6 @@ static void ptrace_untrace(struct task_struct *child) spin_unlock(&child->sighand->siglock); } -/* - * unptrace a task: move it back to its original parent and - * remove it from the ptrace list. - * - * Must be called with the tasklist lock write-held. - */ -void __ptrace_unlink(struct task_struct *child) -{ - BUG_ON(!child->ptrace); - - child->ptrace = 0; - child->parent = child->real_parent; - list_del_init(&child->ptrace_entry); - - if (task_is_traced(child)) - ptrace_untrace(child); -} - /* * Check that we have indeed attached to the thing.. */ -- cgit v1.2.3 From 0e9f0a4abfd80f8adca624538d479d95159b16d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: ptrace: Always put ptracee into appropriate execution state Currently, __ptrace_unlink() wakes up the tracee iff it's in TASK_TRACED. For unlinking from PTRACE_DETACH, this is correct as the tracee is guaranteed to be in TASK_TRACED or dead; however, unlinking also happens when the ptracer exits and in this case the ptracee can be in any state and ptrace might be left running even if the group it belongs to is stopped. This patch updates __ptrace_unlink() such that GROUP_STOP_PENDING is reinstated regardless of the ptracee's current state as long as it's alive and makes sure that signal_wake_up() is called if execution state transition is necessary. Test case follows. #include #include #include #include #include static const struct timespec ts1s = { .tv_sec = 1 }; int main(void) { pid_t tracee; siginfo_t si; tracee = fork(); if (tracee == 0) { while (1) { nanosleep(&ts1s, NULL); write(1, ".", 1); } } ptrace(PTRACE_ATTACH, tracee, NULL, NULL); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); write(1, "exiting", 7); return 0; } Before the patch, after the parent process exits, the child is left running and prints out "." every second. exiting..... (continues) After the patch, the group stop initiated by the implied SIGSTOP from PTRACE_ATTACH is re-established when the parent exits. exiting Signed-off-by: Tejun Heo Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov --- kernel/ptrace.c | 59 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e6098434b533..43485866749a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -41,7 +41,26 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) * __ptrace_unlink - unlink ptracee and restore its execution state * @child: ptracee to be unlinked * - * Remove @child from the ptrace list, move it back to the original parent. + * Remove @child from the ptrace list, move it back to the original parent, + * and restore the execution state so that it conforms to the group stop + * state. + * + * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer + * exiting. For PTRACE_DETACH, unless the ptracee has been killed between + * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. + * If the ptracer is exiting, the ptracee can be in any state. + * + * After detach, the ptracee should be in a state which conforms to the + * group stop. If the group is stopped or in the process of stopping, the + * ptracee should be put into TASK_STOPPED; otherwise, it should be woken + * up from TASK_TRACED. + * + * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, + * it goes through TRACED -> RUNNING -> STOPPED transition which is similar + * to but in the opposite direction of what happens while attaching to a + * stopped task. However, in this direction, the intermediate RUNNING + * state is not hidden even from the current ptracer and if it immediately + * re-attaches and performs a WNOHANG wait(2), it may fail. * * CONTEXT: * write_lock_irq(tasklist_lock) @@ -55,25 +74,25 @@ void __ptrace_unlink(struct task_struct *child) list_del_init(&child->ptrace_entry); spin_lock(&child->sighand->siglock); - if (task_is_traced(child)) { - /* - * If group stop is completed or in progress, it should - * participate in the group stop. Set GROUP_STOP_PENDING - * before kicking it. - * - * This involves TRACED -> RUNNING -> STOPPED transition - * which is similar to but in the opposite direction of - * what happens while attaching to a stopped task. - * However, in this direction, the intermediate RUNNING - * state is not hidden even from the current ptracer and if - * it immediately re-attaches and performs a WNOHANG - * wait(2), it may fail. - */ - if (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count) - child->group_stop |= GROUP_STOP_PENDING; - signal_wake_up(child, 1); - } + + /* + * Reinstate GROUP_STOP_PENDING if group stop is in effect and + * @child isn't dead. + */ + if (!(child->flags & PF_EXITING) && + (child->signal->flags & SIGNAL_STOP_STOPPED || + child->signal->group_stop_count)) + child->group_stop |= GROUP_STOP_PENDING; + + /* + * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick + * @child in the butt. Note that @resume should be used iff @child + * is in TASK_TRACED; otherwise, we might unduly disrupt + * TASK_KILLABLE sleeps. + */ + if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) + signal_wake_up(child, task_is_traced(child)); + spin_unlock(&child->sighand->siglock); } -- cgit v1.2.3 From 408a37de6c95832a4880a88a742f89f0cc554d06 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: job control: Don't set group_stop exit_code if re-entering job control stop While ptraced, a task may be resumed while the containing process is still job control stopped. If the task receives another stop signal in this state, it will still initiate group stop, which generates group_exit_code, which the real parent would be able to see once the ptracer detaches. In this scenario, the real parent may see two consecutive CLD_STOPPED events from two stop signals without intervening SIGCONT, which normally is impossible. Test case follows. #include #include #include #include int main(void) { pid_t tracee; siginfo_t si; tracee = fork(); if (!tracee) while (1) pause(); kill(tracee, SIGSTOP); waitid(P_PID, tracee, &si, WSTOPPED); if (!fork()) { ptrace(PTRACE_ATTACH, tracee, NULL, NULL); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_DETACH, tracee, NULL, NULL); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG); if (si.si_pid) printf("st=%02d c=%02d\n", si.si_status, si.si_code); } return 0; } Before the patch, the latter waitid() in polling mode reports the second stopped event generated by the implied SIGSTOP of PTRACE_ATTACH. st=19 c=05 ^C After the patch, the second event is not reported. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/signal.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1e199919ae09..2f2c8f6ee01f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1827,10 +1827,27 @@ static int do_signal_stop(int signr) unlikely(signal_group_exit(sig))) return 0; /* - * There is no group stop already in progress. - * We must initiate one now. + * There is no group stop already in progress. We must + * initiate one now. + * + * While ptraced, a task may be resumed while group stop is + * still in effect and then receive a stop signal and + * initiate another group stop. This deviates from the + * usual behavior as two consecutive stop signals can't + * cause two group stops when !ptraced. + * + * The condition can be distinguished by testing whether + * SIGNAL_STOP_STOPPED is already set. Don't generate + * group_exit_code in such case. + * + * This is not necessary for SIGNAL_STOP_CONTINUED because + * an intervening stop signal is required to cause two + * continued events regardless of ptrace. */ - sig->group_exit_code = signr; + if (!(sig->flags & SIGNAL_STOP_STOPPED)) + sig->group_exit_code = signr; + else + WARN_ON_ONCE(!task_ptrace(current)); current->group_stop &= ~GROUP_STOP_SIGMASK; current->group_stop |= signr | gstop; -- cgit v1.2.3 From 823b018e5b1196d810790559357447948f644548 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: job control: Small reorganization of wait_consider_task() Move EXIT_DEAD test in wait_consider_task() above ptrace check. As ptraced tasks can't be EXIT_DEAD, this change doesn't cause any behavior change. This is to prepare for further changes. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/exit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f9a45ebcc7b1..b4a935c72159 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1537,6 +1537,10 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } + /* dead body doesn't have much to contribute */ + if (p->exit_state == EXIT_DEAD) + return 0; + if (likely(!ptrace) && unlikely(task_ptrace(p))) { /* * This child is hidden by ptrace. @@ -1546,9 +1550,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (p->exit_state == EXIT_DEAD) - return 0; - /* * We don't reap group leaders with subthreads. */ -- cgit v1.2.3 From 9b84cca2564b9a5b2d064fb44d2a55a5b44473a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: job control: Fix ptracer wait(2) hang and explain notask_error clearing wait(2) and friends allow access to stopped/continued states through zombies, which is required as the states are process-wide and should be accessible whether the leader task is alive or undead. wait_consider_task() implements this by always clearing notask_error and going through wait_task_stopped/continued() for unreaped zombies. However, while ptraced, the stopped state is per-task and as such if the ptracee became a zombie, there's no further stopped event to listen to and wait(2) and friends should return -ECHILD on the tracee. Fix it by clearing notask_error only if WCONTINUED | WEXITED is set for ptraced zombies. While at it, document why clearing notask_error is safe for each case. Test case follows. #include #include #include #include #include #include #include static void *nooper(void *arg) { pause(); return NULL; } int main(void) { const struct timespec ts1s = { .tv_sec = 1 }; pid_t tracee, tracer; siginfo_t si; tracee = fork(); if (tracee == 0) { pthread_t thr; pthread_create(&thr, NULL, nooper, NULL); nanosleep(&ts1s, NULL); printf("tracee exiting\n"); pthread_exit(NULL); /* let subthread run */ } tracer = fork(); if (tracer == 0) { ptrace(PTRACE_ATTACH, tracee, NULL, NULL); while (1) { if (waitid(P_PID, tracee, &si, WSTOPPED) < 0) { perror("waitid"); break; } ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } return 0; } waitid(P_PID, tracer, &si, WEXITED); kill(tracee, SIGKILL); return 0; } Before the patch, after the tracee becomes a zombie, the tracer's waitid(WSTOPPED) never returns and the program doesn't terminate. tracee exiting ^C After the patch, tracee exiting triggers waitid() to fail. tracee exiting waitid: No child processes -v2: Oleg pointed out that exited in addition to continued can happen for ptraced dead group leader. Clear notask_error for ptraced child on WEXITED too. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/exit.c | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b4a935c72159..84d13d6bb30b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1550,17 +1550,41 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - /* - * We don't reap group leaders with subthreads. - */ - if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) - return wait_task_zombie(wo, p); + /* slay zombie? */ + if (p->exit_state == EXIT_ZOMBIE) { + /* we don't reap group leaders with subthreads */ + if (!delay_group_leader(p)) + return wait_task_zombie(wo, p); - /* - * It's stopped or running now, so it might - * later continue, exit, or stop again. - */ - wo->notask_error = 0; + /* + * Allow access to stopped/continued state via zombie by + * falling through. Clearing of notask_error is complex. + * + * When !@ptrace: + * + * If WEXITED is set, notask_error should naturally be + * cleared. If not, subset of WSTOPPED|WCONTINUED is set, + * so, if there are live subthreads, there are events to + * wait for. If all subthreads are dead, it's still safe + * to clear - this function will be called again in finite + * amount time once all the subthreads are released and + * will then return without clearing. + * + * When @ptrace: + * + * Stopped state is per-task and thus can't change once the + * target task dies. Only continued and exited can happen. + * Clear notask_error if WCONTINUED | WEXITED. + */ + if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) + wo->notask_error = 0; + } else { + /* + * @p is alive and it's gonna stop, continue or exit, so + * there always is something to wait for. + */ + wo->notask_error = 0; + } if (task_stopped_code(p, ptrace)) return wait_task_stopped(wo, ptrace, p); -- cgit v1.2.3 From 45cb24a1da53beb70f09efccc0373f6a47a9efe0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: job control: Allow access to job control events through ptracees Currently a real parent can't access job control stopped/continued events through a ptraced child. This utterly breaks job control when the children are ptraced. For example, if a program is run from an interactive shell and then strace(1) attaches to it, pressing ^Z would send SIGTSTP and strace(1) would notice it but the shell has no way to tell whether the child entered job control stop and thus can't tell when to take over the terminal - leading to awkward lone ^Z on the terminal. Because the job control and ptrace stopped states are independent, there is no reason to prevent real parents from accessing the stopped state regardless of ptrace. The continued state isn't separate but ptracers don't have any use for them as ptracees can never resume without explicit command from their ptracers, so as long as ptracers don't consume it, it should be fine. Although this is a behavior change, because the previous behavior is utterly broken when viewed from real parents and the change is only visible to real parents, I don't think it's necessary to make this behavior optional. One situation to be careful about is when a task from the real parent's group is ptracing. The parent group is the recipient of both ptrace and job control stop events and one stop can be reported as both job control and ptrace stops. As this can break the current ptrace users, suppress job control stopped events for these cases. If a real parent ptracer wants to know about both job control and ptrace stops, it can create a separate process to serve the role of real parent. Note that this only updates wait(2) side of things. The real parent can access the states via wait(2) but still is not properly notified (woken up and delivered signal). Test case polls wait(2) with WNOHANG to work around. Notification will be updated by future patches. Test case follows. #include #include #include #include #include #include #include int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED | WNOHANG); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); nanosleep(&ts100ms, NULL); } return 0; } Before the patch, while ptraced, the parent can't see any job control events. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After the patch, tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that wait(2) should be suppressed for the real parent's group instead of only the real parent task itself. Updated accordingly. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/exit.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 84d13d6bb30b..1a0f10f0a4db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1541,17 +1541,19 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (p->exit_state == EXIT_DEAD) return 0; - if (likely(!ptrace) && unlikely(task_ptrace(p))) { + /* slay zombie? */ + if (p->exit_state == EXIT_ZOMBIE) { /* - * This child is hidden by ptrace. - * We aren't allowed to see it now, but eventually we will. + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the real + * parent when the ptracer detaches. */ - wo->notask_error = 0; - return 0; - } + if (likely(!ptrace) && unlikely(task_ptrace(p))) { + /* it will become visible, clear notask_error */ + wo->notask_error = 0; + return 0; + } - /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) return wait_task_zombie(wo, p); @@ -1579,6 +1581,20 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { + /* + * If @p is ptraced by a task in its real parent's group, + * hide group stop/continued state when looking at @p as + * the real parent; otherwise, a single stop can be + * reported twice as group and ptrace stops. + * + * If a ptracer wants to distinguish the two events for its + * own children, it should create a separate process which + * takes the role of real parent. + */ + if (likely(!ptrace) && task_ptrace(p) && + same_thread_group(p->parent, p->real_parent)) + return 0; + /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. @@ -1586,9 +1602,18 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, wo->notask_error = 0; } + /* + * Wait for stopped. Depending on @ptrace, different stopped state + * is used and the two don't interact with each other. + */ if (task_stopped_code(p, ptrace)) return wait_task_stopped(wo, ptrace, p); + /* + * Wait for continued. There's only one continued state and the + * ptracer can consume it which can confuse the real parent. Don't + * use WCONTINUED from ptracer. You don't need or want it. + */ return wait_task_continued(wo, p); } -- cgit v1.2.3 From 75b95953a56969a990e6ce154b260be83818fe71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: job control: Add @for_ptrace to do_notify_parent_cldstop() Currently, do_notify_parent_cldstop() determines whether the notification is for the real parent or ptracer. Move the decision to the caller by adding @for_ptrace parameter to do_notify_parent_cldstop(). All the callers are updated to pass task_ptrace(target_task), so this patch doesn't cause any behavior difference. While at it, add function comment to do_notify_parent_cldstop(). Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/signal.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2f2c8f6ee01f..69d60540a680 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1595,16 +1595,30 @@ int do_notify_parent(struct task_struct *tsk, int sig) return ret; } -static void do_notify_parent_cldstop(struct task_struct *tsk, int why) +/** + * do_notify_parent_cldstop - notify parent of stopped/continued state change + * @tsk: task reporting the state change + * @for_ptracer: the notification is for ptracer + * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report + * + * Notify @tsk's parent that the stopped/continued state has changed. If + * @for_ptracer is %false, @tsk's group leader notifies to its real parent. + * If %true, @tsk reports to @tsk->parent which should be the ptracer. + * + * CONTEXT: + * Must be called with tasklist_lock at least read locked. + */ +static void do_notify_parent_cldstop(struct task_struct *tsk, + bool for_ptracer, int why) { struct siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; - if (task_ptrace(tsk)) + if (for_ptracer) { parent = tsk->parent; - else { + } else { tsk = tsk->group_leader; parent = tsk->real_parent; } @@ -1743,7 +1757,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); if (may_ptrace_stop()) { - do_notify_parent_cldstop(current, why); + do_notify_parent_cldstop(current, task_ptrace(current), why); /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -1886,7 +1900,8 @@ retry: if (notify) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, notify); + do_notify_parent_cldstop(current, task_ptrace(current), + notify); read_unlock(&tasklist_lock); } @@ -1982,6 +1997,7 @@ relock: * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { + struct task_struct *leader; int why; if (signal->flags & SIGNAL_CLD_CONTINUED) @@ -1994,7 +2010,8 @@ relock: spin_unlock_irq(&sighand->siglock); read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); + leader = current->group_leader; + do_notify_parent_cldstop(leader, task_ptrace(leader), why); read_unlock(&tasklist_lock); goto relock; } @@ -2167,7 +2184,7 @@ out: if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, group_stop); + do_notify_parent_cldstop(tsk, task_ptrace(tsk), group_stop); read_unlock(&tasklist_lock); } } -- cgit v1.2.3 From 62bcf9d992ecc19ea4f37ff57ee0b3333e3e843e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: job control: Job control stop notifications should always go to the real parent The stopped notifications in do_signal_stop() and exit_signals() are always for the completion of job control. The one in do_signal_stop() may be delivered to the ptracer if PTRACE_ATTACH races with notification and the one in exit_signals() if task exits while ptraced. In both cases, the notifications are meaningless and confusing to the ptracer as it never accesses the group stop state while the real parent would miss notifications for the events it is watching. Make sure these notifications always go to the real parent by calling do_notify_parent_cld_stop() with %false @for_ptrace. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/signal.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 69d60540a680..9f10b246fd46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1898,10 +1898,18 @@ retry: __set_current_state(TASK_STOPPED); spin_unlock_irq(¤t->sighand->siglock); + /* + * Notify the parent of the group stop completion. Because + * we're not holding either the siglock or tasklist_lock + * here, ptracer may attach inbetween; however, this is for + * group stop and should always be delivered to the real + * parent of the group leader. The new ptracer will get + * its notification when this task transitions into + * TASK_TRACED. + */ if (notify) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, task_ptrace(current), - notify); + do_notify_parent_cldstop(current, false, notify); read_unlock(&tasklist_lock); } @@ -2182,9 +2190,13 @@ void exit_signals(struct task_struct *tsk) out: spin_unlock_irq(&tsk->sighand->siglock); + /* + * If group stop has completed, deliver the notification. This + * should always go to the real parent of the group leader. + */ if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, task_ptrace(tsk), group_stop); + do_notify_parent_cldstop(tsk, false, group_stop); read_unlock(&tasklist_lock); } } -- cgit v1.2.3 From ceb6bd67f9b9db765e1c29405f26e8460391badd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: job control: Notify the real parent of job control events regardless of ptrace With recent changes, job control and ptrace stopped states are properly separated and accessible to the real parent and the ptracer respectively; however, notifications of job control stopped/continued events to the real parent while ptraced are still missing. A ptracee participates in group stop in ptrace_stop() but the completion isn't notified. If participation results in completion of group stop, notify the real parent of the event. The ptrace and group stops are separate and can be handled as such. However, when the real parent and the ptracer are in the same thread group, only the ptrace stop event is visible through wait(2) and the duplicate notifications are different from the current behavior and are confusing. Suppress group stop notification in such cases. The continued state is shared between the real parent and the ptracer but is only meaningful to the real parent. Always notify the real parent and notify the ptracer too for backward compatibility. Similar to stop notification, if the real parent is the ptracer, suppress a duplicate notification. Test case follows. #include #include #include #include #include #include #include int main(void) { const struct timespec ts100ms = { .tv_nsec = 100000000 }; pid_t tracee, tracer; siginfo_t si; int i; tracee = fork(); if (tracee == 0) { while (1) { printf("tracee: SIGSTOP\n"); raise(SIGSTOP); nanosleep(&ts100ms, NULL); printf("tracee: SIGCONT\n"); raise(SIGCONT); nanosleep(&ts100ms, NULL); } } waitid(P_PID, tracee, &si, WSTOPPED | WNOHANG | WNOWAIT); tracer = fork(); if (tracer == 0) { nanosleep(&ts100ms, NULL); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); for (i = 0; i < 11; i++) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED); if (si.si_pid && si.si_code == CLD_TRAPPED) ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); } printf("tracer: EXITING\n"); return 0; } while (1) { si.si_pid = 0; waitid(P_PID, tracee, &si, WSTOPPED | WCONTINUED | WEXITED); if (si.si_pid) printf("mommy : WAIT status=%02d code=%02d\n", si.si_status, si.si_code); } return 0; } Before this patch, while ptraced, the real parent doesn't get notifications for job control events, so although it can access those events, the later waitid(2) call never wakes up. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracee: SIGCONT tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C After this patch, it works as expected. tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP mommy : WAIT status=19 code=05 tracee: SIGCONT mommy : WAIT status=18 code=06 tracee: SIGSTOP tracer: EXITING mommy : WAIT status=19 code=05 ^C -v2: Oleg pointed out that * Group stop notification to the real parent should also happen when ptracer detach races with ptrace_stop(). * real_parent_is_ptracer() should be testing thread group equality not the task itself as wait(2) and stop/cont notifications are normally thread-group wide. Both issues are fixed accordingly. -v3: real_parent_is_ptracer() updated to test child->real_parent instead of child->group_leader->real_parent per Oleg's suggestion. Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/signal.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 9f10b246fd46..f65403da4101 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1693,6 +1693,15 @@ static int sigkill_pending(struct task_struct *tsk) sigismember(&tsk->signal->shared_pending.signal, SIGKILL); } +/* + * Test whether the target task of the usual cldstop notification - the + * real_parent of @child - is in the same group as the ptracer. + */ +static bool real_parent_is_ptracer(struct task_struct *child) +{ + return same_thread_group(child->parent, child->real_parent); +} + /* * This must be called with current->sighand->siglock held. * @@ -1708,6 +1717,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { + bool gstop_done = false; + if (arch_ptrace_stop_needed(exit_code, info)) { /* * The arch code has something special to do before a @@ -1735,7 +1746,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * is entered - ignore it. */ if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) - task_participate_group_stop(current); + gstop_done = task_participate_group_stop(current); current->last_siginfo = info; current->exit_code = exit_code; @@ -1757,7 +1768,20 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); if (may_ptrace_stop()) { - do_notify_parent_cldstop(current, task_ptrace(current), why); + /* + * Notify parents of the stop. + * + * While ptraced, there are two parents - the ptracer and + * the real_parent of the group_leader. The ptracer should + * know about every stop while the real parent is only + * interested in the completion of group stop. The states + * for the two don't interact with each other. Notify + * separately unless they're gonna be duplicates. + */ + do_notify_parent_cldstop(current, true, why); + if (gstop_done && !real_parent_is_ptracer(current)) + do_notify_parent_cldstop(current, false, why); + /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -1772,7 +1796,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) /* * By the time we got the lock, our tracer went away. * Don't drop the lock yet, another tracer may come. + * + * If @gstop_done, the ptracer went away between group stop + * completion and here. During detach, it would have set + * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED + * in do_signal_stop() on return, so notifying the real + * parent of the group stop completion is enough. */ + if (gstop_done) + do_notify_parent_cldstop(current, false, why); + __set_current_state(TASK_RUNNING); if (clear_code) current->exit_code = 0; @@ -2017,10 +2050,24 @@ relock: spin_unlock_irq(&sighand->siglock); + /* + * Notify the parent that we're continuing. This event is + * always per-process and doesn't make whole lot of sense + * for ptracers, who shouldn't consume the state via + * wait(2) either, but, for backward compatibility, notify + * the ptracer of the group leader too unless it's gonna be + * a duplicate. + */ read_lock(&tasklist_lock); + + do_notify_parent_cldstop(current, false, why); + leader = current->group_leader; - do_notify_parent_cldstop(leader, task_ptrace(leader), why); + if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) + do_notify_parent_cldstop(leader, true, why); + read_unlock(&tasklist_lock); + goto relock; } -- cgit v1.2.3 From 244056f9dbbc6dc4126a301c745fa3dd67d8af3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Mar 2011 10:37:01 +0100 Subject: job control: Don't send duplicate job control stop notification while ptraced Just as group_exit_code shouldn't be generated when a PTRACE_CONT'd task re-enters job control stop, notifiction for the event should be suppressed too. The logic is the same as the group_exit_code generation suppression in do_signal_stop(), if SIGNAL_STOP_STOPPED is already set, the task is re-entering job control stop without intervening SIGCONT and the notifications should be suppressed. Test case follows. #include #include #include #include #include #include static const struct timespec ts100ms = { .tv_nsec = 100000000 }; static pid_t tracee, tracer; static const char *pid_who(pid_t pid) { return pid == tracee ? "tracee" : (pid == tracer ? "tracer" : "mommy "); } static void sigchld_sigaction(int signo, siginfo_t *si, void *ucxt) { printf("%s: SIG status=%02d code=%02d (%s)\n", pid_who(getpid()), si->si_status, si->si_code, pid_who(si->si_pid)); } int main(void) { const struct sigaction chld_sa = { .sa_sigaction = sigchld_sigaction, .sa_flags = SA_SIGINFO|SA_RESTART }; siginfo_t si; sigaction(SIGCHLD, &chld_sa, NULL); tracee = fork(); if (!tracee) { tracee = getpid(); while (1) pause(); } kill(tracee, SIGSTOP); waitid(P_PID, tracee, &si, WSTOPPED); tracer = fork(); if (!tracer) { tracer = getpid(); ptrace(PTRACE_ATTACH, tracee, NULL, NULL); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); ptrace(PTRACE_CONT, tracee, NULL, (void *)(long)si.si_status); waitid(P_PID, tracee, &si, WSTOPPED); printf("tracer: detaching\n"); ptrace(PTRACE_DETACH, tracee, NULL, NULL); return 0; } while (1) pause(); return 0; } Before the patch, the parent gets the second notification for the tracee after the tracer detaches. si_status is zero because group_exit_code is not set by the group stop completion which triggered this notification. mommy : SIG status=19 code=05 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: SIG status=19 code=04 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: detaching mommy : SIG status=00 code=05 (tracee) mommy : SIG status=00 code=01 (tracer) ^C After the patch, the duplicate notification is gone. mommy : SIG status=19 code=05 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: SIG status=19 code=04 (tracee) tracer: SIG status=00 code=05 (tracee) tracer: detaching mommy : SIG status=00 code=01 (tracer) ^C Signed-off-by: Tejun Heo Acked-by: Oleg Nesterov --- kernel/signal.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f65403da4101..f799a054f292 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -268,6 +268,10 @@ void task_clear_group_stop_pending(struct task_struct *task) * * CONTEXT: * Must be called with @task->sighand->siglock held. + * + * RETURNS: + * %true if group stop completion should be notified to the parent, %false + * otherwise. */ static bool task_participate_group_stop(struct task_struct *task) { @@ -284,7 +288,11 @@ static bool task_participate_group_stop(struct task_struct *task) if (!WARN_ON_ONCE(sig->group_stop_count == 0)) sig->group_stop_count--; - if (!sig->group_stop_count) { + /* + * Tell the caller to notify completion iff we are entering into a + * fresh group stop. Read comment in do_signal_stop() for details. + */ + if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { sig->flags = SIGNAL_STOP_STOPPED; return true; } -- cgit v1.2.3 From 0415b00d175e0d8945e6785aad21b5f157976ce0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Mar 2011 18:50:09 +0100 Subject: percpu: Always align percpu output section to PAGE_SIZE Percpu allocator honors alignment request upto PAGE_SIZE and both the percpu addresses in the percpu address space and the translated kernel addresses should be aligned accordingly. The calculation of the former depends on the alignment of percpu output section in the kernel image. The linker script macros PERCPU_VADDR() and PERCPU() are used to define this output section and the latter takes @align parameter. Several architectures are using @align smaller than PAGE_SIZE breaking percpu memory alignment. This patch removes @align parameter from PERCPU(), renames it to PERCPU_SECTION() and makes it always align to PAGE_SIZE. While at it, add PCPU_SETUP_BUG_ON() checks such that alignment problems are reliably detected and remove percpu alignment comment recently added in workqueue.c as the condition would trigger BUG way before reaching there. For um, this patch raises the alignment of percpu area. As the area is in .init, there shouldn't be any noticeable difference. This problem was discovered by David Howells while debugging boot failure on mn10300. Signed-off-by: Tejun Heo Acked-by: Mike Frysinger Cc: uclinux-dist-devel@blackfin.uclinux.org Cc: David Howells Cc: Jeff Dike Cc: user-mode-linux-devel@lists.sourceforge.net --- arch/alpha/kernel/vmlinux.lds.S | 2 +- arch/arm/kernel/vmlinux.lds.S | 2 +- arch/blackfin/kernel/vmlinux.lds.S | 2 +- arch/cris/kernel/vmlinux.lds.S | 2 +- arch/frv/kernel/vmlinux.lds.S | 2 +- arch/m32r/kernel/vmlinux.lds.S | 2 +- arch/mips/kernel/vmlinux.lds.S | 2 +- arch/mn10300/kernel/vmlinux.lds.S | 2 +- arch/parisc/kernel/vmlinux.lds.S | 2 +- arch/powerpc/kernel/vmlinux.lds.S | 2 +- arch/s390/kernel/vmlinux.lds.S | 2 +- arch/sh/kernel/vmlinux.lds.S | 2 +- arch/sparc/kernel/vmlinux.lds.S | 2 +- arch/tile/kernel/vmlinux.lds.S | 2 +- arch/um/include/asm/common.lds.S | 2 +- arch/x86/kernel/vmlinux.lds.S | 2 +- arch/xtensa/kernel/vmlinux.lds.S | 2 +- include/asm-generic/vmlinux.lds.h | 17 ++++++++--------- kernel/workqueue.c | 4 +--- mm/percpu.c | 2 ++ 20 files changed, 28 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 433be2a24f31..8d57948c0aef 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -39,7 +39,7 @@ SECTIONS __init_begin = ALIGN(PAGE_SIZE); INIT_TEXT_SECTION(PAGE_SIZE) INIT_DATA_SECTION(16) - PERCPU(L1_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(L1_CACHE_BYTES) /* Align to THREAD_SIZE rather than PAGE_SIZE here so any padding page needed for the THREAD_SIZE aligned init_task gets freed after init */ . = ALIGN(THREAD_SIZE); diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index b4348e62ef06..e5287f21badc 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -82,7 +82,7 @@ SECTIONS #endif } - PERCPU(32, PAGE_SIZE) + PERCPU_SECTION(32) #ifndef CONFIG_XIP_KERNEL . = ALIGN(PAGE_SIZE); diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 854fa49f1c3e..8d85c8c6f857 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -136,7 +136,7 @@ SECTIONS . = ALIGN(16); INIT_DATA_SECTION(16) - PERCPU(32, PAGE_SIZE) + PERCPU_SECTION(32) .exit.data : { diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index 728bbd9e7d4c..a6990cb0f098 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -102,7 +102,7 @@ SECTIONS #endif __vmlinux_end = .; /* Last address of the physical file. */ #ifdef CONFIG_ETRAX_ARCH_V32 - PERCPU(32, PAGE_SIZE) + PERCPU_SECTION(32) .init.ramfs : { INIT_RAM_FS diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 0daae8af5787..7e958d829ec9 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -37,7 +37,7 @@ SECTIONS _einittext = .; INIT_DATA_SECTION(8) - PERCPU(L1_CACHE_BYTES, 4096) + PERCPU_SECTION(L1_CACHE_BYTES) . = ALIGN(PAGE_SIZE); __init_end = .; diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index c194d64cdbb9..2e7ccf7299a0 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -53,7 +53,7 @@ SECTIONS __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) INIT_DATA_SECTION(16) - PERCPU(32, PAGE_SIZE) + PERCPU_SECTION(32) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 832afbb87588..8616709452b1 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -115,7 +115,7 @@ SECTIONS EXIT_DATA } - PERCPU(1 << CONFIG_MIPS_L1_CACHE_SHIFT, PAGE_SIZE) + PERCPU_SECTION(1 << CONFIG_MIPS_L1_CACHE_SHIFT) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index 968bcd2cb022..6f702a6ab395 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -70,7 +70,7 @@ SECTIONS .exit.text : { EXIT_TEXT; } .exit.data : { EXIT_DATA; } - PERCPU(32, PAGE_SIZE) + PERCPU_SECTION(32) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 8f1e4efd143e..85b86617fe0b 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -145,7 +145,7 @@ SECTIONS EXIT_DATA } - PERCPU(L1_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(L1_CACHE_BYTES) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index b9150f07d266..920276c0f6a1 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -160,7 +160,7 @@ SECTIONS INIT_RAM_FS } - PERCPU(L1_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(L1_CACHE_BYTES) . = ALIGN(8); .machine.desc : AT(ADDR(.machine.desc) - LOAD_OFFSET) { diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 1bc18cdb525b..56fe6bc81fee 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -77,7 +77,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) - PERCPU(0x100, PAGE_SIZE) + PERCPU_SECTION(0x100) . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index af4d46187a79..731c10ce67b5 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -66,7 +66,7 @@ SECTIONS __machvec_end = .; } - PERCPU(L1_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(L1_CACHE_BYTES) /* * .exit.text is discarded at runtime, not link time, to deal with diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index 92b557afe535..c0220759003e 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -108,7 +108,7 @@ SECTIONS __sun4v_2insn_patch_end = .; } - PERCPU(SMP_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(SMP_CACHE_BYTES) . = ALIGN(PAGE_SIZE); __init_end = .; diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S index 38f64fafdc10..631f10de12fe 100644 --- a/arch/tile/kernel/vmlinux.lds.S +++ b/arch/tile/kernel/vmlinux.lds.S @@ -60,7 +60,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); VMLINUX_SYMBOL(_sinitdata) = .; INIT_DATA_SECTION(16) :data =0 - PERCPU(L2_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(L2_CACHE_BYTES) . = ALIGN(PAGE_SIZE); VMLINUX_SYMBOL(_einitdata) = .; diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index 34bede8aad4a..4938de5512d2 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -42,7 +42,7 @@ INIT_SETUP(0) } - PERCPU(32, 32) + PERCPU_SECTION(32) .initcall.init : { INIT_CALLS diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 624a2016198e..3e9fb5d54f96 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -319,7 +319,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(INTERNODE_CACHE_BYTES) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index a2820065927e..88ecea3facb4 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -155,7 +155,7 @@ SECTIONS INIT_RAM_FS } - PERCPU(XCHAL_ICACHE_LINESIZE, PAGE_SIZE) + PERCPU_SECTION(XCHAL_ICACHE_LINESIZE) /* We need this dummy segment here */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 32c45e5fe0ab..f301cea5ca2d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -15,7 +15,7 @@ * HEAD_TEXT_SECTION * INIT_TEXT_SECTION(PAGE_SIZE) * INIT_DATA_SECTION(...) - * PERCPU(CACHELINE_SIZE, PAGE_SIZE) + * PERCPU_SECTION(CACHELINE_SIZE) * __init_end = .; * * _stext = .; @@ -709,7 +709,7 @@ * * Note that this macros defines __per_cpu_load as an absolute symbol. * If there is no need to put the percpu section at a predetermined - * address, use PERCPU(). + * address, use PERCPU_SECTION. */ #define PERCPU_VADDR(cacheline, vaddr, phdr) \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ @@ -729,20 +729,19 @@ . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data..percpu); /** - * PERCPU - define output section for percpu area, simple version + * PERCPU_SECTION - define output section for percpu area, simple version * @cacheline: cacheline size - * @align: required alignment * - * Align to @align and outputs output section for percpu area. This macro - * doesn't manipulate @vaddr or @phdr and __per_cpu_load and + * Align to PAGE_SIZE and outputs output section for percpu area. This + * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and * __per_cpu_start will be identical. * - * This macro is equivalent to ALIGN(@align); PERCPU_VADDR(@cacheline,,) + * This macro is equivalent to ALIGN(PAGE_SIZE); PERCPU_VADDR(@cacheline,,) * except that __per_cpu_load is defined as a relative symbol against * .data..percpu which is required for relocatable x86_32 configuration. */ -#define PERCPU(cacheline, align) \ - . = ALIGN(align); \ +#define PERCPU_SECTION(cacheline) \ + . = ALIGN(PAGE_SIZE); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ VMLINUX_SYMBOL(__per_cpu_start) = .; \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04ef830690ec..d30a502e8c6d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2860,9 +2860,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) } } - /* just in case, make sure it's actually aligned - * - this is affected by PERCPU() alignment in vmlinux.lds.S - */ + /* just in case, make sure it's actually aligned */ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 0 : -ENOMEM; } diff --git a/mm/percpu.c b/mm/percpu.c index 3f930018aa60..c5feb79f5995 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1216,8 +1216,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); #endif PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); -- cgit v1.2.3 From 1deac632fc3dcff33a6df3e82ef10c738ac13fe6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 1 Apr 2011 20:11:50 +0200 Subject: signal: prepare_signal(SIGCONT) shouldn't play with TIF_SIGPENDING prepare_signal(SIGCONT) should never set TIF_SIGPENDING or wake up the TASK_INTERRUPTIBLE threads. We are going to call complete_signal() which should pick the right thread correctly. All we need is to wake up the TASK_STOPPED threads. If the task was stopped, it can't return to usermode without taking ->siglock. Otherwise we don't care, and the spurious TIF_SIGPENDING can't be useful. The comment says: * If there is a handler for SIGCONT, we must make * sure that no thread returns to user mode before * we post the signal It is not clear what this means. Probably, "when there's only a single thread" and this continues to be true. Otherwise, even if this SIGCONT is not private, with or without this change only one thread can dequeue SIGCONT, other threads can happily return to user mode before before that thread handles this signal. Note also that wake_up_state(t, __TASK_STOPPED) can't race with the task which changes its state, TASK_STOPPED state is protected by ->siglock as well. In short: when it comes to signal delivery, SIGCONT is the normal signal and does not need any special support. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/signal.c | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f799a054f292..38ea9e2f1831 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -788,37 +788,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } else if (sig == SIGCONT) { unsigned int why; /* - * Remove all stop signals from all queues, - * and wake all threads. + * Remove all stop signals from all queues, wake all threads. */ rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); t = p; do { - unsigned int state; - task_clear_group_stop_pending(t); - rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - /* - * If there is a handler for SIGCONT, we must make - * sure that no thread returns to user mode before - * we post the signal, in case it was the only - * thread eligible to run the signal handler--then - * it must not do anything between resuming and - * running the handler. With the TIF_SIGPENDING - * flag set, the thread will pause and acquire the - * siglock that we hold now and until we've queued - * the pending signal. - * - * Wake up the stopped thread _after_ setting - * TIF_SIGPENDING - */ - state = __TASK_STOPPED; - if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { - set_tsk_thread_flag(t, TIF_SIGPENDING); - state |= TASK_INTERRUPTIBLE; - } - wake_up_state(t, state); + wake_up_state(t, __TASK_STOPPED); } while_each_thread(p, t); /* -- cgit v1.2.3 From 780006eac2fe7f4d2582da16a096e5a44c4767ff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 1 Apr 2011 20:12:16 +0200 Subject: signal: do_signal_stop: Remove the unneeded task_clear_group_stop_pending() PF_EXITING or TASK_STOPPED has already called task_participate_group_stop() and cleared its ->group_stop. No need to do task_clear_group_stop_pending() when we start the new group stop. Add a small comment to explain the !task_is_stopped() check. Note that this check is not exactly right and it can lead to unnecessary stop later if the thread is TASK_PTRACED. What we need is task_participated_in_group_stop(), this will be solved later. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/signal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 38ea9e2f1831..e9abc69dc0d8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1866,7 +1866,8 @@ static int do_signal_stop(int signr) * still in effect and then receive a stop signal and * initiate another group stop. This deviates from the * usual behavior as two consecutive stop signals can't - * cause two group stops when !ptraced. + * cause two group stops when !ptraced. That is why we + * also check !task_is_stopped(t) below. * * The condition can be distinguished by testing whether * SIGNAL_STOP_STOPPED is already set. Don't generate @@ -1896,8 +1897,6 @@ static int do_signal_stop(int signr) t->group_stop |= signr | gstop; sig->group_stop_count++; signal_wake_up(t, 0); - } else { - task_clear_group_stop_pending(t); } } } -- cgit v1.2.3 From ee77f075921730b2b465880f9fd4367003bdab39 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 1 Apr 2011 20:12:38 +0200 Subject: signal: Turn SIGNAL_STOP_DEQUEUED into GROUP_STOP_DEQUEUED This patch moves SIGNAL_STOP_DEQUEUED from signal_struct->flags to task_struct->group_stop, and thus makes it per-thread. Like SIGNAL_STOP_DEQUEUED, GROUP_STOP_DEQUEUED can be false-positive after return from get_signal_to_deliver(), this is fine. The only purpose of this bit is: we can drop ->siglock after __dequeue_signal() returns the sig_kernel_stop() signal and before we call do_signal_stop(), in this case we must not miss SIGCONT if it comes in between. But, unlike SIGNAL_STOP_DEQUEUED, GROUP_STOP_DEQUEUED can not be false-positive in do_signal_stop() if multiple threads dequeue the sig_kernel_stop() signal at the same time. Consider two threads T1 and T2, SIGTTIN has a hanlder. - T1 dequeues SIGTSTP and sets SIGNAL_STOP_DEQUEUED, then it drops ->siglock - SIGCONT comes and clears SIGNAL_STOP_DEQUEUED, SIGTSTP should be cancelled. - T2 dequeues SIGTTIN and sets SIGNAL_STOP_DEQUEUED again. Since we have a handler we should not stop, T2 returns to usermode to run the handler. - T1 continues, calls do_signal_stop() and wrongly starts the group stop because SIGNAL_STOP_DEQUEUED was restored in between. With or without this change: - we need to do something with ptrace_signal() which can return SIGSTOP, but this needs another discussion - SIGSTOP can be lost if it races with the mt exec, will be fixed later. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- include/linux/sched.h | 6 +++--- kernel/signal.c | 14 ++++---------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 456d80ed3b78..8cef82d4cf77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -652,9 +652,8 @@ struct signal_struct { * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ -#define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ -#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ -#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ +#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ +#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ /* * Pending notifications to parent. */ @@ -1779,6 +1778,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define GROUP_STOP_PENDING (1 << 16) /* task should stop for group stop */ #define GROUP_STOP_CONSUME (1 << 17) /* consume group stop count */ #define GROUP_STOP_TRAPPING (1 << 18) /* switching from STOPPED to TRACED */ +#define GROUP_STOP_DEQUEUED (1 << 19) /* stop signal dequeued */ extern void task_clear_group_stop_pending(struct task_struct *task); diff --git a/kernel/signal.c b/kernel/signal.c index e9abc69dc0d8..4f7312b49b2d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -254,7 +254,8 @@ static void task_clear_group_stop_trapping(struct task_struct *task) */ void task_clear_group_stop_pending(struct task_struct *task) { - task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME); + task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | + GROUP_STOP_DEQUEUED); } /** @@ -602,7 +603,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + current->group_stop |= GROUP_STOP_DEQUEUED; } if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* @@ -821,13 +822,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) signal->flags = why | SIGNAL_STOP_CONTINUED; signal->group_stop_count = 0; signal->group_exit_code = 0; - } else { - /* - * We are not stopped, but there could be a stop - * signal in the middle of being processed after - * being removed from the queue. Clear that too. - */ - signal->flags &= ~SIGNAL_STOP_DEQUEUED; } } @@ -1855,7 +1849,7 @@ static int do_signal_stop(int signr) /* signr will be recorded in task->group_stop for retries */ WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); - if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || + if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) return 0; /* -- cgit v1.2.3 From 321fb561971ba0f10ce18c0f8a4b9fbfc7cef4b9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 1 Apr 2011 20:13:01 +0200 Subject: ptrace: ptrace_check_attach() should not do s/STOPPED/TRACED/ After "ptrace: Clean transitions between TASK_STOPPED and TRACED" d79fdd6d96f46fabb779d86332e3677c6f5c2a4f, ptrace_check_attach() should never see a TASK_STOPPED tracee and s/STOPPED/TRACED/ is no longer legal. Add the warning. Note: ptrace_check_attach() can be greatly simplified, in particular it doesn't need tasklist. But I'd prefer another patch for that. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/ptrace.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43485866749a..20d5efdeee02 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -112,16 +112,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ read_lock(&tasklist_lock); if ((child->ptrace & PT_PTRACED) && child->parent == current) { - ret = 0; /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ spin_lock_irq(&child->sighand->siglock); - if (task_is_stopped(child)) - child->state = TASK_TRACED; - else if (!task_is_traced(child) && !kill) - ret = -ESRCH; + WARN_ON_ONCE(task_is_stopped(child)); + if (task_is_traced(child) || kill) + ret = 0; spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); -- cgit v1.2.3 From 17f60a7da150fdd0cfb9756f86a262daa72c835f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 1 Apr 2011 17:07:50 -0400 Subject: capabilites: allow the application of capability limits to usermode helpers There is no way to limit the capabilities of usermodehelpers. This problem reared its head recently when someone complained that any user with cap_net_admin was able to load arbitrary kernel modules, even though the user didn't have cap_sys_module. The reason is because the actual load is done by a usermode helper and those always have the full cap set. This patch addes new sysctls which allow us to bound the permissions of usermode helpers. /proc/sys/kernel/usermodehelper/bset /proc/sys/kernel/usermodehelper/inheritable You must have CAP_SYS_MODULE and CAP_SETPCAP to change these (changes are &= ONLY). When the kernel launches a usermodehelper it will do so with these as the bset and pI. -v2: make globals static create spinlock to protect globals -v3: require both CAP_SETPCAP and CAP_SYS_MODULE -v4: fix the typo s/CAP_SET_PCAP/CAP_SETPCAP/ because I didn't commit Signed-off-by: Eric Paris No-objection-from: Serge E. Hallyn Acked-by: David Howells Acked-by: Serge E. Hallyn Acked-by: Andrew G. Morgan Signed-off-by: James Morris --- include/linux/kmod.h | 3 ++ kernel/kmod.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 6 ++++ 3 files changed, 109 insertions(+) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 6efd7a78de6a..79bb98d71858 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -24,6 +24,7 @@ #include #include #include +#include #define KMOD_PATH_LEN 256 @@ -109,6 +110,8 @@ call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) NULL, NULL, NULL); } +extern struct ctl_table usermodehelper_table[]; + extern void usermodehelper_init(void); extern int usermodehelper_disable(void); diff --git a/kernel/kmod.c b/kernel/kmod.c index 9cd0591c96a2..06fdea2819b6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,13 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); + #ifdef CONFIG_MODULES /* @@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module); static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + struct cred *new; int retval; spin_lock_irq(¤t->sighand->siglock); @@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data) goto fail; } + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto fail; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + commit_creds(new); + retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); @@ -418,6 +440,84 @@ unlock: } EXPORT_SYMBOL(call_usermodehelper_exec); +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write and array of ulongs from userspace. Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; + void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b17..965134bed6cd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -615,6 +616,11 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = random_table, }, + { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, { .procname = "overflowuid", .data = &overflowuid, -- cgit v1.2.3 From ffa8e59df047d57e812a04f7d6baf6a25c652c0c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 1 Apr 2011 17:08:34 -0400 Subject: capabilities: do not drop CAP_SETPCAP from the initial task In olden' days of yore CAP_SETPCAP had special meaning for the init task. We actually have code to make sure that CAP_SETPCAP wasn't in pE of things using the init_cred. But CAP_SETPCAP isn't so special any more and we don't have a reason to special case dropping it for init or kthreads.... Signed-off-by: Eric Paris Acked-by: Andrew G. Morgan Signed-off-by: James Morris --- include/linux/capability.h | 6 ++++-- kernel/capability.c | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 16ee8b49a200..11d562863e49 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -412,7 +412,6 @@ extern const kernel_cap_t __cap_init_eff_set; # define CAP_EMPTY_SET ((kernel_cap_t){{ 0, 0 }}) # define CAP_FULL_SET ((kernel_cap_t){{ ~0, ~0 }}) -# define CAP_INIT_EFF_SET ((kernel_cap_t){{ ~CAP_TO_MASK(CAP_SETPCAP), ~0 }}) # define CAP_FS_SET ((kernel_cap_t){{ CAP_FS_MASK_B0 \ | CAP_TO_MASK(CAP_LINUX_IMMUTABLE), \ CAP_FS_MASK_B1 } }) @@ -423,10 +422,10 @@ extern const kernel_cap_t __cap_init_eff_set; #endif /* _KERNEL_CAPABILITY_U32S != 2 */ #define CAP_INIT_INH_SET CAP_EMPTY_SET +#define CAP_INIT_EFF_SET CAP_FULL_SET # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) # define cap_set_full(c) do { (c) = __cap_full_set; } while (0) -# define cap_set_init_eff(c) do { (c) = __cap_init_eff_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) #define cap_lower(c, flag) ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag)) @@ -547,6 +546,9 @@ extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool task_ns_capable(struct task_struct *t, int cap); +extern const kernel_cap_t __cap_empty_set; +extern const kernel_cap_t __cap_full_set; + /** * nsown_capable - Check superior capability to one's own user_ns * @cap: The capability in question diff --git a/kernel/capability.c b/kernel/capability.c index bf0c734d0c12..2a374d512ead 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -23,11 +23,9 @@ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; const kernel_cap_t __cap_full_set = CAP_FULL_SET; -const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; EXPORT_SYMBOL(__cap_empty_set); EXPORT_SYMBOL(__cap_full_set); -EXPORT_SYMBOL(__cap_init_eff_set); int file_caps_enabled = 1; -- cgit v1.2.3 From 5163b583a036b103c3cec7171d6731c125773ed6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 1 Apr 2011 17:08:39 -0400 Subject: capabilities: delete unused cap_set_full unused code. Clean it up. Signed-off-by: Eric Paris Acked-by: David Howells Acked-by: Andrew G. Morgan Signed-off-by: James Morris --- include/linux/capability.h | 2 -- kernel/capability.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 11d562863e49..8d0da30dad23 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -425,7 +425,6 @@ extern const kernel_cap_t __cap_init_eff_set; #define CAP_INIT_EFF_SET CAP_FULL_SET # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) -# define cap_set_full(c) do { (c) = __cap_full_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) #define cap_lower(c, flag) ((c).cap[CAP_TO_INDEX(flag)] &= ~CAP_TO_MASK(flag)) @@ -547,7 +546,6 @@ extern bool ns_capable(struct user_namespace *ns, int cap); extern bool task_ns_capable(struct task_struct *t, int cap); extern const kernel_cap_t __cap_empty_set; -extern const kernel_cap_t __cap_full_set; /** * nsown_capable - Check superior capability to one's own user_ns diff --git a/kernel/capability.c b/kernel/capability.c index 2a374d512ead..14ea4210a530 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -22,10 +22,8 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -const kernel_cap_t __cap_full_set = CAP_FULL_SET; EXPORT_SYMBOL(__cap_empty_set); -EXPORT_SYMBOL(__cap_full_set); int file_caps_enabled = 1; -- cgit v1.2.3 From a3232d2fa2e3cbab3e76d91cdae5890fee8a4034 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 1 Apr 2011 17:08:45 -0400 Subject: capabilities: delete all CAP_INIT macros The CAP_INIT macros of INH, BSET, and EFF made sense at one point in time, but now days they aren't helping. Just open code the logic in the init_cred. Signed-off-by: Eric Paris Acked-by: David Howells Signed-off-by: James Morris --- include/linux/capability.h | 3 --- include/linux/init_task.h | 7 ------- kernel/cred.c | 6 +++--- 3 files changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 8d0da30dad23..04fed72809de 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -421,9 +421,6 @@ extern const kernel_cap_t __cap_init_eff_set; #endif /* _KERNEL_CAPABILITY_U32S != 2 */ -#define CAP_INIT_INH_SET CAP_EMPTY_SET -#define CAP_INIT_EFF_SET CAP_FULL_SET - # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index caa151fbebb7..1f277204de34 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -83,13 +83,6 @@ extern struct group_info init_groups; #define INIT_IDS #endif -/* - * Because of the reduced scope of CAP_SETPCAP when filesystem - * capabilities are in effect, it is safe to allow CAP_SETPCAP to - * be available in the default configuration. - */ -# define CAP_INIT_BSET CAP_FULL_SET - #ifdef CONFIG_RCU_BOOST #define INIT_TASK_RCU_BOOST() \ .rcu_boost_mutex = NULL, diff --git a/kernel/cred.c b/kernel/cred.c index 5557b55048df..b982f0863ae9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -49,10 +49,10 @@ struct cred init_cred = { .magic = CRED_MAGIC, #endif .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_INIT_INH_SET, + .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_INIT_EFF_SET, - .cap_bset = CAP_INIT_BSET, + .cap_effective = CAP_FULL_SET, + .cap_bset = CAP_FULL_SET, .user = INIT_USER, .group_info = &init_groups, #ifdef CONFIG_KEYS -- cgit v1.2.3 From 6eab04a87677a37cf15b52e2b4b4fd57917102ad Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Fri, 8 Apr 2011 19:49:08 -0700 Subject: treewide: remove extra semicolons Signed-off-by: Justin P. Mattock Signed-off-by: Jiri Kosina --- arch/arm/mach-at91/at91cap9_devices.c | 2 +- arch/arm/mach-at91/at91sam9g45_devices.c | 2 +- arch/arm/mach-at91/at91sam9rl_devices.c | 2 +- arch/arm/mach-tegra/tegra2_clocks.c | 2 +- arch/s390/hypfs/hypfs.h | 2 +- arch/um/drivers/mmapper_kern.c | 2 +- drivers/gpio/langwell_gpio.c | 2 +- drivers/leds/leds-mc13783.c | 2 +- drivers/net/can/softing/softing_main.c | 2 +- drivers/net/wireless/ath/ath9k/phy.h | 2 +- drivers/net/wireless/iwlwifi/iwl-agn.c | 2 +- drivers/net/wireless/rtlwifi/core.c | 2 +- drivers/net/wireless/rtlwifi/pci.c | 2 +- drivers/net/wireless/rtlwifi/rtl8192ce/hw.c | 4 ++-- drivers/power/intel_mid_battery.c | 2 +- drivers/scsi/be2iscsi/be_main.c | 4 ++-- drivers/scsi/lpfc/lpfc_bsg.c | 2 +- drivers/scsi/pm8001/pm8001_init.c | 2 +- drivers/scsi/qla2xxx/qla_isr.c | 4 ++-- drivers/target/target_core_alua.c | 4 ++-- drivers/target/target_core_transport.c | 2 +- drivers/tty/serial/mrst_max3110.c | 2 +- drivers/video/mxsfb.c | 2 +- drivers/xen/evtchn.c | 2 +- drivers/xen/swiotlb-xen.c | 2 +- fs/logfs/readwrite.c | 2 +- fs/ocfs2/refcounttree.c | 2 +- kernel/pm_qos_params.c | 2 +- mm/hugetlb.c | 2 +- net/netfilter/ipset/ip_set_bitmap_ip.c | 2 +- net/sunrpc/addr.c | 2 +- security/selinux/selinuxfs.c | 8 ++++---- tools/power/x86/turbostat/turbostat.c | 2 +- 33 files changed, 40 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-at91/at91cap9_devices.c b/arch/arm/mach-at91/at91cap9_devices.c index d1f775e86353..308ce7a87edd 100644 --- a/arch/arm/mach-at91/at91cap9_devices.c +++ b/arch/arm/mach-at91/at91cap9_devices.c @@ -171,7 +171,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) */ usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-at91/at91sam9g45_devices.c b/arch/arm/mach-at91/at91sam9g45_devices.c index 1e8f275c17f6..5e9f8a4c38df 100644 --- a/arch/arm/mach-at91/at91sam9g45_devices.c +++ b/arch/arm/mach-at91/at91sam9g45_devices.c @@ -256,7 +256,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) { usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-at91/at91sam9rl_devices.c b/arch/arm/mach-at91/at91sam9rl_devices.c index 53aaa94df75a..c49262bddd85 100644 --- a/arch/arm/mach-at91/at91sam9rl_devices.c +++ b/arch/arm/mach-at91/at91sam9rl_devices.c @@ -145,7 +145,7 @@ void __init at91_add_device_usba(struct usba_platform_data *data) */ usba_udc_data.pdata.vbus_pin = -EINVAL; usba_udc_data.pdata.num_ep = ARRAY_SIZE(usba_udc_ep); - memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep));; + memcpy(usba_udc_data.ep, usba_udc_ep, sizeof(usba_udc_ep)); if (data && data->vbus_pin > 0) { at91_set_gpio_input(data->vbus_pin, 0); diff --git a/arch/arm/mach-tegra/tegra2_clocks.c b/arch/arm/mach-tegra/tegra2_clocks.c index 6d7c4eea4dcb..3b6f290fde4c 100644 --- a/arch/arm/mach-tegra/tegra2_clocks.c +++ b/arch/arm/mach-tegra/tegra2_clocks.c @@ -337,7 +337,7 @@ static int tegra2_super_clk_set_parent(struct clk *c, struct clk *p) const struct clk_mux_sel *sel; int shift; - val = clk_readl(c->reg + SUPER_CLK_MUX);; + val = clk_readl(c->reg + SUPER_CLK_MUX); BUG_ON(((val & SUPER_STATE_MASK) != SUPER_STATE_RUN) && ((val & SUPER_STATE_MASK) != SUPER_STATE_IDLE)); shift = ((val & SUPER_STATE_MASK) == SUPER_STATE_IDLE) ? diff --git a/arch/s390/hypfs/hypfs.h b/arch/s390/hypfs/hypfs.h index 80c1526f2af3..d9df5a060a83 100644 --- a/arch/s390/hypfs/hypfs.h +++ b/arch/s390/hypfs/hypfs.h @@ -47,7 +47,7 @@ struct hypfs_dbfs_data { void *buf; void *buf_free_ptr; size_t size; - struct hypfs_dbfs_file *dbfs_file;; + struct hypfs_dbfs_file *dbfs_file; struct kref kref; }; diff --git a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c index 7e0619c2c2c6..c0ef803c7c70 100644 --- a/arch/um/drivers/mmapper_kern.c +++ b/arch/um/drivers/mmapper_kern.c @@ -116,7 +116,7 @@ static int __init mmapper_init(void) if (err) { printk(KERN_ERR "mmapper - misc_register failed, err = %d\n", err); - return err;; + return err; } return 0; } diff --git a/drivers/gpio/langwell_gpio.c b/drivers/gpio/langwell_gpio.c index 560ab648cf18..1b06f67e1f69 100644 --- a/drivers/gpio/langwell_gpio.c +++ b/drivers/gpio/langwell_gpio.c @@ -122,7 +122,7 @@ static int lnw_gpio_direction_output(struct gpio_chip *chip, lnw_gpio_set(chip, offset, value); spin_lock_irqsave(&lnw->lock, flags); value = readl(gpdr); - value |= BIT(offset % 32);; + value |= BIT(offset % 32); writel(value, gpdr); spin_unlock_irqrestore(&lnw->lock, flags); return 0; diff --git a/drivers/leds/leds-mc13783.c b/drivers/leds/leds-mc13783.c index f05bb08d0f09..f369e56d6547 100644 --- a/drivers/leds/leds-mc13783.c +++ b/drivers/leds/leds-mc13783.c @@ -234,7 +234,7 @@ static int __devinit mc13783_leds_prepare(struct platform_device *pdev) MC13783_LED_Cx_PERIOD; if (pdata->flags & MC13783_LED_TRIODE_TC3) - reg |= MC13783_LED_Cx_TRIODE_TC_BIT;; + reg |= MC13783_LED_Cx_TRIODE_TC_BIT; ret = mc13783_reg_write(dev, MC13783_REG_LED_CONTROL_5, reg); if (ret) diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c index aeea9f9ff6e8..8d7595344a62 100644 --- a/drivers/net/can/softing/softing_main.c +++ b/drivers/net/can/softing/softing_main.c @@ -797,7 +797,7 @@ static __devinit int softing_pdev_probe(struct platform_device *pdev) ret = -EINVAL; pres = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!pres) - goto platform_resource_failed;; + goto platform_resource_failed; card->dpram_phys = pres->start; card->dpram_size = pres->end - pres->start + 1; card->dpram = ioremap_nocache(card->dpram_phys, card->dpram_size); diff --git a/drivers/net/wireless/ath/ath9k/phy.h b/drivers/net/wireless/ath/ath9k/phy.h index 5e3d7496986e..3a7cd7523d5c 100644 --- a/drivers/net/wireless/ath/ath9k/phy.h +++ b/drivers/net/wireless/ath/ath9k/phy.h @@ -54,7 +54,7 @@ #define RF_BANK_SETUP(_bank, _iniarray, _col) do { \ int i; \ for (i = 0; i < (_iniarray)->ia_rows; i++) \ - (_bank)[i] = INI_RA((_iniarray), i, _col);; \ + (_bank)[i] = INI_RA((_iniarray), i, _col); \ } while (0) #define AR_PHY_TIMING11_SPUR_FREQ_SD 0x3FF00000 diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 581dc9f10273..31233a330372 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -2062,7 +2062,7 @@ static const char *desc_lookup(u32 num) max = ARRAY_SIZE(advanced_lookup) - 1; for (i = 0; i < max; i++) { if (advanced_lookup[i].num == num) - break;; + break; } return advanced_lookup[i].name; } diff --git a/drivers/net/wireless/rtlwifi/core.c b/drivers/net/wireless/rtlwifi/core.c index e4f4aee8f298..d52fa27103c6 100644 --- a/drivers/net/wireless/rtlwifi/core.c +++ b/drivers/net/wireless/rtlwifi/core.c @@ -719,7 +719,7 @@ static void rtl_op_set_tsf(struct ieee80211_hw *hw, u64 tsf) { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); - u8 bibss = (mac->opmode == NL80211_IFTYPE_ADHOC) ? 1 : 0;; + u8 bibss = (mac->opmode == NL80211_IFTYPE_ADHOC) ? 1 : 0; mac->tsf = tsf; rtlpriv->cfg->ops->set_hw_reg(hw, HW_VAR_CORRECT_TSF, (u8 *) (&bibss)); diff --git a/drivers/net/wireless/rtlwifi/pci.c b/drivers/net/wireless/rtlwifi/pci.c index 9cd7703c2a30..baa2efeeedf6 100644 --- a/drivers/net/wireless/rtlwifi/pci.c +++ b/drivers/net/wireless/rtlwifi/pci.c @@ -1226,7 +1226,7 @@ static unsigned int _rtl_mac_to_hwqueue(__le16 fc, hw_queue_index = VI_QUEUE; break; case 2: - hw_queue_index = BE_QUEUE;; + hw_queue_index = BE_QUEUE; break; case 3: hw_queue_index = BK_QUEUE; diff --git a/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c b/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c index 05477f465a75..7432c85a8dff 100644 --- a/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c +++ b/drivers/net/wireless/rtlwifi/rtl8192ce/hw.c @@ -729,7 +729,7 @@ static bool _rtl92ce_init_mac(struct ieee80211_hw *hw) rtl_write_word(rtlpriv, REG_CR, 0x2ff); if (_rtl92ce_llt_table_init(hw) == false) - return false;; + return false; rtl_write_dword(rtlpriv, REG_HISR, 0xffffffff); rtl_write_byte(rtlpriv, REG_HISRE, 0xff); @@ -786,7 +786,7 @@ static bool _rtl92ce_init_mac(struct ieee80211_hw *hw) rtl_write_dword(rtlpriv, REG_MCUTST_1, 0x0); - return true;; + return true; } static void _rtl92ce_hw_configure(struct ieee80211_hw *hw) diff --git a/drivers/power/intel_mid_battery.c b/drivers/power/intel_mid_battery.c index bce3a01da2f0..cffcb7c00b00 100644 --- a/drivers/power/intel_mid_battery.c +++ b/drivers/power/intel_mid_battery.c @@ -522,7 +522,7 @@ static int pmic_battery_set_charger(struct pmic_power_module_info *pbi, if (retval) { dev_warn(pbi->dev, "%s(): ipc pmic read failed\n", __func__); - return retval;; + return retval; } return 0; diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index 24e20ba9633c..44ac4aef879f 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -618,7 +618,7 @@ static void beiscsi_get_params(struct beiscsi_hba *phba) + BE2_NOPOUT_REQ)); phba->params.cxns_per_ctrl = phba->fw_config.iscsi_cid_count; phba->params.asyncpdus_per_ctrl = phba->fw_config.iscsi_cid_count * 2; - phba->params.icds_per_ctrl = phba->fw_config.iscsi_icd_count;; + phba->params.icds_per_ctrl = phba->fw_config.iscsi_icd_count; phba->params.num_sge_per_io = BE2_SGE; phba->params.defpdu_hdr_sz = BE2_DEFPDU_HDR_SZ; phba->params.defpdu_data_sz = BE2_DEFPDU_DATA_SZ; @@ -781,7 +781,7 @@ static irqreturn_t be_isr(int irq, void *dev_id) int isr; phba = dev_id; - ctrl = &phba->ctrl;; + ctrl = &phba->ctrl; isr = ioread32(ctrl->csr + CEV_ISR0_OFFSET + (PCI_FUNC(ctrl->pdev->devfn) * CEV_ISR_SIZE)); if (!isr) diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c index 0dd43bb91618..04fef038b1ff 100644 --- a/drivers/scsi/lpfc/lpfc_bsg.c +++ b/drivers/scsi/lpfc/lpfc_bsg.c @@ -595,7 +595,7 @@ lpfc_bsg_rport_els(struct fc_bsg_job *job) dd_data->context_un.iocb.cmdiocbq = cmdiocbq; dd_data->context_un.iocb.rspiocbq = rspiocbq; dd_data->context_un.iocb.set_job = job; - dd_data->context_un.iocb.bmp = NULL;; + dd_data->context_un.iocb.bmp = NULL; dd_data->context_un.iocb.ndlp = ndlp; if (phba->cfg_poll & DISABLE_FCP_RING_INT) { diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 002360da01e3..172cefb6deb9 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -160,7 +160,7 @@ static void pm8001_free(struct pm8001_hba_info *pm8001_ha) static void pm8001_tasklet(unsigned long opaque) { struct pm8001_hba_info *pm8001_ha; - pm8001_ha = (struct pm8001_hba_info *)opaque;; + pm8001_ha = (struct pm8001_hba_info *)opaque; if (unlikely(!pm8001_ha)) BUG_ON(1); PM8001_CHIP_DISP->isr(pm8001_ha); diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c index d17ed9a94a0c..8b4d99a224f3 100644 --- a/drivers/scsi/qla2xxx/qla_isr.c +++ b/drivers/scsi/qla2xxx/qla_isr.c @@ -1051,7 +1051,7 @@ qla2x00_ct_entry(scsi_qla_host_t *vha, struct req_que *req, } DEBUG2(qla2x00_dump_buffer((uint8_t *)pkt, sizeof(*pkt))); } else { - bsg_job->reply->result = DID_OK << 16;; + bsg_job->reply->result = DID_OK << 16; bsg_job->reply->reply_payload_rcv_len = bsg_job->reply_payload.payload_len; bsg_job->reply_len = 0; @@ -1146,7 +1146,7 @@ qla24xx_els_ct_entry(scsi_qla_host_t *vha, struct req_que *req, DEBUG2(qla2x00_dump_buffer((uint8_t *)pkt, sizeof(*pkt))); } else { - bsg_job->reply->result = DID_OK << 16;; + bsg_job->reply->result = DID_OK << 16; bsg_job->reply->reply_payload_rcv_len = bsg_job->reply_payload.payload_len; bsg_job->reply_len = 0; } diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index 2c5fcfed5934..e7dddafe73f6 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -1036,7 +1036,7 @@ core_alua_allocate_lu_gp(const char *name, int def_group) lu_gp = kmem_cache_zalloc(t10_alua_lu_gp_cache, GFP_KERNEL); if (!(lu_gp)) { printk(KERN_ERR "Unable to allocate struct t10_alua_lu_gp\n"); - return ERR_PTR(-ENOMEM);; + return ERR_PTR(-ENOMEM); } INIT_LIST_HEAD(&lu_gp->lu_gp_list); INIT_LIST_HEAD(&lu_gp->lu_gp_mem_list); @@ -1044,7 +1044,7 @@ core_alua_allocate_lu_gp(const char *name, int def_group) atomic_set(&lu_gp->lu_gp_ref_cnt, 0); if (def_group) { - lu_gp->lu_gp_id = se_global->alua_lu_gps_counter++;; + lu_gp->lu_gp_id = se_global->alua_lu_gps_counter++; lu_gp->lu_gp_valid_id = 1; se_global->alua_lu_gps_count++; } diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index ff9ace01e27a..39ac190080e2 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2131,7 +2131,7 @@ static void transport_failure_reset_queue_depth(struct se_device *dev) { unsigned long flags; - spin_lock_irqsave(&SE_HBA(dev)->hba_queue_lock, flags);; + spin_lock_irqsave(&SE_HBA(dev)->hba_queue_lock, flags); atomic_inc(&dev->depth_left); atomic_inc(&SE_HBA(dev)->left_queue_depth); spin_unlock_irqrestore(&SE_HBA(dev)->hba_queue_lock, flags); diff --git a/drivers/tty/serial/mrst_max3110.c b/drivers/tty/serial/mrst_max3110.c index 37e13c3d91d9..8890612a8d9a 100644 --- a/drivers/tty/serial/mrst_max3110.c +++ b/drivers/tty/serial/mrst_max3110.c @@ -56,7 +56,7 @@ struct uart_max3110 { wait_queue_head_t wq; struct task_struct *main_thread; struct task_struct *read_thread; - struct mutex thread_mutex;; + struct mutex thread_mutex; u32 baud; u16 cur_conf; diff --git a/drivers/video/mxsfb.c b/drivers/video/mxsfb.c index 7d0284882984..0b2f2dd41416 100644 --- a/drivers/video/mxsfb.c +++ b/drivers/video/mxsfb.c @@ -401,7 +401,7 @@ static int mxsfb_set_par(struct fb_info *fb_info) writel(CTRL1_FIFO_CLEAR, host->base + LCDC_CTRL1 + REG_SET); ctrl = CTRL_BYPASS_COUNT | CTRL_MASTER | - CTRL_SET_BUS_WIDTH(host->ld_intf_width);; + CTRL_SET_BUS_WIDTH(host->ld_intf_width); switch (fb_info->var.bits_per_pixel) { case 16: diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index ef11daf0cafe..dbc13e94b612 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -470,7 +470,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) filp->private_data = u; - return nonseekable_open(inode, filp);; + return nonseekable_open(inode, filp); } static int evtchn_release(struct inode *inode, struct file *filp) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 54469c3eeacd..65ea21a97492 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -54,7 +54,7 @@ u64 start_dma_addr; static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) { - return phys_to_machine(XPADDR(paddr)).maddr;; + return phys_to_machine(XPADDR(paddr)).maddr; } static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index ee99a9f5dfd3..3dcb3a60c331 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -481,7 +481,7 @@ static int inode_write_alias(struct super_block *sb, val = inode_val0(inode); break; case INODE_USED_OFS: - val = cpu_to_be64(li->li_used_bytes);; + val = cpu_to_be64(li->li_used_bytes); break; case INODE_SIZE_OFS: val = cpu_to_be64(i_size_read(inode)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c384d634872a..c4feceda2d96 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3707,7 +3707,7 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, context->cow_start = cow_start; context->cow_len = cow_len; context->ref_tree = ref_tree; - context->ref_root_bh = ref_root_bh;; + context->ref_root_bh = ref_root_bh; context->cow_object = xv; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 0da058bff8eb..beb184689af9 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -385,7 +385,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, s32 value; unsigned long flags; struct pm_qos_object *o; - struct pm_qos_request_list *pm_qos_req = filp->private_data;; + struct pm_qos_request_list *pm_qos_req = filp->private_data; if (!pm_qos_req) return -EINVAL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb0b7c128015..838fe25f704c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -475,7 +475,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - goto err;; + goto err; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index bca96990218d..cc2992299e74 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -293,7 +293,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], for (; !before(ip_to, ip); ip += map->hosts) { id = ip_to_id(map, ip); - ret = adtfn(set, &id, timeout);; + ret = adtfn(set, &id, timeout); if (ret && !ip_set_eexist(ret, flags)) return ret; diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 1419d0cdbbac..4195233c4914 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -151,7 +151,7 @@ static size_t rpc_pton4(const char *buf, const size_t buflen, return 0; sin->sin_family = AF_INET; - return sizeof(struct sockaddr_in);; + return sizeof(struct sockaddr_in); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index ea39cb742ae5..47b7d624a6e1 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -280,7 +280,7 @@ static ssize_t sel_write_disable(struct file *file, const char __user *buf, length = -ENOMEM; if (count >= PAGE_SIZE) - goto out;; + goto out; /* No partial writes. */ length = -EINVAL; @@ -876,12 +876,12 @@ static ssize_t sel_write_user(struct file *file, char *buf, size_t size) length = task_has_security(current, SECURITY__COMPUTE_USER); if (length) - goto out;; + goto out; length = -ENOMEM; con = kzalloc(size + 1, GFP_KERNEL); if (!con) - goto out;; + goto out; length = -ENOMEM; user = kzalloc(size + 1, GFP_KERNEL); @@ -941,7 +941,7 @@ static ssize_t sel_write_member(struct file *file, char *buf, size_t size) length = -ENOMEM; scon = kzalloc(size + 1, GFP_KERNEL); if (!scon) - goto out;; + goto out; length = -ENOMEM; tcon = kzalloc(size + 1, GFP_KERNEL); diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 362a0cb448db..6d8ef4a3a9b5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -990,7 +990,7 @@ int fork_it(char **argv) if (!retval) print_counters(cnt_delta); - fprintf(stderr, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec/1000000.0);; + fprintf(stderr, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec/1000000.0); return 0; } -- cgit v1.2.3 From 687566990640e476aeeed844947f2ecadc4717d3 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Tue, 29 Mar 2011 09:36:51 -0700 Subject: arch:Kconfig.locks Remove unused config option. Signed-off-by: Justin P. Mattock Acked-by: Steven Rostedt Acked-by: Heiko Carstens Signed-off-by: Jiri Kosina --- arch/Kconfig | 3 --- kernel/Kconfig.locks | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index f78c2be4242b..8d24bacaa61e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -144,9 +144,6 @@ config HAVE_CLK config HAVE_DMA_API_DEBUG bool -config HAVE_DEFAULT_NO_SPIN_MUTEXES - bool - config HAVE_HW_BREAKPOINT bool depends on PERF_EVENTS diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 88c92fb44618..5068e2a4e75f 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE config MUTEX_SPIN_ON_OWNER - def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES + def_bool SMP && !DEBUG_MUTEXES -- cgit v1.2.3 From 7816c45bf13255157c00fb8aca86cb64d825e878 Mon Sep 17 00:00:00 2001 From: Roland Vossen Date: Thu, 7 Apr 2011 11:20:58 +0200 Subject: modules: Enabled dynamic debugging for staging modules Driver modules from the staging directory are marked 'tainted' by module.c. Subsequently, tainted modules are denied dynamic debugging. This is unwanted behavior, since staging modules should be able to use the dynamic debugging mechanism. Please merge this also into the staging-linus branch. Signed-off-by: Roland Vossen Acked-by: Jason Baron Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d5938a5c19c4..4d5c16aae745 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2790,7 +2790,7 @@ static struct module *load_module(void __user *umod, } /* This has to be done once we're sure module name is unique. */ - if (!mod->taints) + if (!mod->taints || mod->taints == (1U<taints) + if (!mod->taints || mod->taints == (1U< Date: Wed, 27 Apr 2011 15:10:49 +0200 Subject: audit: acquire creds selectively to reduce atomic op overhead Commit c69e8d9c01db ("CRED: Use RCU to access another task's creds and to release a task's own creds") added calls to get_task_cred and put_cred in audit_filter_rules. Profiling with a large number of audit rules active on the exit chain shows that we are spending upto 48% in this routine for syscall intensive tests, most of which is in the atomic ops. 1. The code should be accessing tsk->cred rather than tsk->real_cred. 2. Since tsk is current (or tsk is being created by copy_process) access to tsk->cred without rcu read lock is possible. At the request of the audit maintainer, a new flag has been added to audit_filter_rules in order to make this explicit and guide future code. Signed-off-by: Tony Jones Acked-by: Eric Paris Signed-off-by: Jiri Kosina --- kernel/auditsc.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b33513a08beb..00d79df03e76 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 - * otherwise. */ + * otherwise. + * + * If task_creation is true, this is an explicit indication that we are + * filtering a task rule at task creation time. This and tsk == current are + * the only situations where tsk->cred may be accessed without an rcu read lock. + */ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, struct audit_names *name, - enum audit_state *state) + enum audit_state *state, + bool task_creation) { - const struct cred *cred = get_task_cred(tsk); + const struct cred *cred; int i, j, need_sid = 1; u32 sid; + cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); + for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; int result = 0; @@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk, break; } - if (!result) { - put_cred(cred); + if (!result) return 0; - } } if (ctx) { @@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } - put_cred(cred); return 1; } @@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { + if (audit_filter_rules(tsk, &e->rule, NULL, NULL, + &state, true)) { if (state == AUDIT_RECORD_CONTEXT) *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); @@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, list_for_each_entry_rcu(e, list, list) { if ((e->rule.mask[word] & bit) == bit && audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state)) { + &state, false)) { rcu_read_unlock(); ctx->current_state = state; return state; @@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) list_for_each_entry_rcu(e, list, list) { if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { + audit_filter_rules(tsk, &e->rule, ctx, n, + &state, false)) { rcu_read_unlock(); ctx->current_state = state; return; -- cgit v1.2.3 From 0edceb7bcd82802f721f3c94eed9b3e2869d3740 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 19:17:37 +0200 Subject: signal: introduce retarget_shared_pending() No functional changes. Move the notify-other-threads code from exit_signals() to the new helper, retarget_shared_pending(). Signed-off-by: Oleg Nesterov Reviewed-by: Matt Fleming Acked-by: Tejun Heo --- kernel/signal.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c15e9792b088..5341e2141904 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2198,10 +2198,25 @@ relock: return signr; } +/* + * It could be that complete_signal() picked us to notify about the + * group-wide signal. Another thread should be notified now to take + * the signal since we will not. + */ +static void retarget_shared_pending(struct task_struct *tsk) +{ + struct task_struct *t; + + t = tsk; + while_each_thread(tsk, t) { + if (!signal_pending(t) && !(t->flags & PF_EXITING)) + recalc_sigpending_and_wake(t); + } +} + void exit_signals(struct task_struct *tsk) { int group_stop = 0; - struct task_struct *t; if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; @@ -2217,14 +2232,7 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - /* - * It could be that __group_complete_signal() choose us to - * notify about group-wide signal. Another thread should be - * woken now to take the signal since we will not. - */ - for (t = tsk; (t = next_thread(t)) != tsk; ) - if (!signal_pending(t) && !(t->flags & PF_EXITING)) - recalc_sigpending_and_wake(t); + retarget_shared_pending(tsk); if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && task_participate_group_stop(tsk)) -- cgit v1.2.3 From f646e227b88a164a841d6b6dd969d8a45272dd83 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 19:18:39 +0200 Subject: signal: retarget_shared_pending: consider shared/unblocked signals only exit_signals() checks signal_pending() before retarget_shared_pending() but this is suboptimal. We can avoid the while_each_thread() loop in case when there are no shared signals visible to us. Add the "shared_pending.signal & ~blocked" check. We don't use tsk->blocked directly but pass ~blocked as an argument, this is needed for the next patch. Note: we can optimize this more. while_each_thread(t) can check t->blocked into account and stop after every pending signal has the new target, see the next patch. Signed-off-by: Oleg Nesterov Reviewed-by: Matt Fleming Acked-by: Tejun Heo --- kernel/signal.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5341e2141904..06214b55dc5e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2203,10 +2203,15 @@ relock: * group-wide signal. Another thread should be notified now to take * the signal since we will not. */ -static void retarget_shared_pending(struct task_struct *tsk) +static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) { + sigset_t retarget; struct task_struct *t; + sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); + if (sigisemptyset(&retarget)) + return; + t = tsk; while_each_thread(tsk, t) { if (!signal_pending(t) && !(t->flags & PF_EXITING)) @@ -2217,6 +2222,7 @@ static void retarget_shared_pending(struct task_struct *tsk) void exit_signals(struct task_struct *tsk) { int group_stop = 0; + sigset_t unblocked; if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; @@ -2232,7 +2238,9 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - retarget_shared_pending(tsk); + unblocked = tsk->blocked; + signotset(&unblocked); + retarget_shared_pending(tsk, &unblocked); if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && task_participate_group_stop(tsk)) -- cgit v1.2.3 From fec9993db093acfc3999a364e31f8adae41fcb28 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 19:50:21 +0200 Subject: signal: retarget_shared_pending: optimize while_each_thread() loop retarget_shared_pending() blindly does recalc_sigpending_and_wake() for every sub-thread, this is suboptimal. We can check t->blocked and stop looping once every bit in shared_pending has the new target. Note: we do not take task_is_stopped_or_traced(t) into account, we are not trying to speed up the signal delivery or to avoid the unnecessary (but harmless) signal_wake_up(0) in this unlikely case. Signed-off-by: Oleg Nesterov Reviewed-by: Matt Fleming Acked-by: Tejun Heo --- kernel/signal.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 06214b55dc5e..707736e64e0e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2200,8 +2200,8 @@ relock: /* * It could be that complete_signal() picked us to notify about the - * group-wide signal. Another thread should be notified now to take - * the signal since we will not. + * group-wide signal. Other threads should be notified now to take + * the shared signals in @which since we will not. */ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) { @@ -2214,8 +2214,19 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) t = tsk; while_each_thread(tsk, t) { - if (!signal_pending(t) && !(t->flags & PF_EXITING)) - recalc_sigpending_and_wake(t); + if (t->flags & PF_EXITING) + continue; + + if (!has_pending_signals(&retarget, &t->blocked)) + continue; + /* Remove the signals this thread can handle. */ + sigandsets(&retarget, &retarget, &t->blocked); + + if (!signal_pending(t)) + signal_wake_up(t, 0); + + if (sigisemptyset(&retarget)) + break; } } -- cgit v1.2.3 From 73ef4aeb61b53fce464a7e24ef03a26f98b2f617 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 19:54:20 +0200 Subject: signal: sigprocmask: narrow the scope of ->siglock No functional changes, preparation to simplify the review of the next change. 1. We can read current->block lockless, nobody else can ever change this mask. 2. Calculate the resulting sigset_t outside of ->siglock into the temporary variable, then take ->siglock and change ->blocked. Also, kill the stale comment about BKL. Signed-off-by: Oleg Nesterov Reviewed-by: Matt Fleming Acked-by: Tejun Heo --- kernel/signal.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 707736e64e0e..e8308e3238c1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2299,12 +2299,6 @@ long do_no_restart_syscall(struct restart_block *param) return -EINTR; } -/* - * We don't need to get the kernel lock - this is all local to this - * particular thread.. (and that's good, because this is _heavily_ - * used by various programs) - */ - /* * This is also useful for kernel threads that want to temporarily * (or permanently) block certain signals. @@ -2315,30 +2309,33 @@ long do_no_restart_syscall(struct restart_block *param) */ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { - int error; + struct task_struct *tsk = current; + sigset_t newset; - spin_lock_irq(¤t->sighand->siglock); + /* Lockless, only current can change ->blocked, never from irq */ if (oldset) - *oldset = current->blocked; + *oldset = tsk->blocked; - error = 0; switch (how) { case SIG_BLOCK: - sigorsets(¤t->blocked, ¤t->blocked, set); + sigorsets(&newset, &tsk->blocked, set); break; case SIG_UNBLOCK: - signandsets(¤t->blocked, ¤t->blocked, set); + signandsets(&newset, &tsk->blocked, set); break; case SIG_SETMASK: - current->blocked = *set; + newset = *set; break; default: - error = -EINVAL; + return -EINVAL; } + + spin_lock_irq(&tsk->sighand->siglock); + tsk->blocked = newset; recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + spin_unlock_irq(&tsk->sighand->siglock); - return error; + return 0; } /** -- cgit v1.2.3 From e6fa16ab9c1e9b344428e6fea4d29e3cc4b28fb0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 20:59:41 +0200 Subject: signal: sigprocmask() should do retarget_shared_pending() In short, almost every changing of current->blocked is wrong, or at least can lead to the unexpected results. For example. Two threads T1 and T2, T1 sleeps in sigtimedwait/pause/etc. kill(tgid, SIG) can pick T2 for TIF_SIGPENDING. If T2 calls sigprocmask() and blocks SIG before it notices the pending signal, nobody else can handle this pending shared signal. I am not sure this is bug, but at least this looks strange imho. T1 should not sleep forever, there is a signal which should wake it up. This patch moves the code which actually changes ->blocked into the new helper, set_current_blocked() and changes this code to call retarget_shared_pending() as exit_signals() does. We should only care about the signals we just blocked, we use "newset & ~current->blocked" as a mask. We do not check !sigisemptyset(newblocked), retarget_shared_pending() is cheap unless mask & shared_pending. Note: for this particular case we could simply change sigprocmask() to return -EINTR if signal_pending(), but then we should change other callers and, more importantly, if we need this fix then set_current_blocked() will have more callers and some of them can't restart. See the next patch as a random example. Signed-off-by: Oleg Nesterov Reviewed-by: Matt Fleming Acked-by: Tejun Heo --- include/linux/signal.h | 1 + kernel/signal.c | 29 ++++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index fcd2b14b1932..ba009c167275 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -243,6 +243,7 @@ extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info); extern long do_sigpending(void __user *, unsigned long); extern int sigprocmask(int, sigset_t *, sigset_t *); +extern void set_current_blocked(const sigset_t *); extern int show_unhandled_signals; struct pt_regs; diff --git a/kernel/signal.c b/kernel/signal.c index e8308e3238c1..8aa3a2e226af 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2299,6 +2299,29 @@ long do_no_restart_syscall(struct restart_block *param) return -EINTR; } +/** + * set_current_blocked - change current->blocked mask + * @newset: new mask + * + * It is wrong to change ->blocked directly, this helper should be used + * to ensure the process can't miss a shared signal we are going to block. + */ +void set_current_blocked(const sigset_t *newset) +{ + struct task_struct *tsk = current; + + spin_lock_irq(&tsk->sighand->siglock); + if (signal_pending(tsk) && !thread_group_empty(tsk)) { + sigset_t newblocked; + /* A set of now blocked but previously unblocked signals. */ + signandsets(&newblocked, newset, ¤t->blocked); + retarget_shared_pending(tsk, &newblocked); + } + tsk->blocked = *newset; + recalc_sigpending(); + spin_unlock_irq(&tsk->sighand->siglock); +} + /* * This is also useful for kernel threads that want to temporarily * (or permanently) block certain signals. @@ -2330,11 +2353,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return -EINVAL; } - spin_lock_irq(&tsk->sighand->siglock); - tsk->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(&tsk->sighand->siglock); - + set_current_blocked(&newset); return 0; } -- cgit v1.2.3 From bb7efee2ca63b08795ffb3cda96fc89d2e641b79 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 21:18:10 +0200 Subject: signal: cleanup sys_rt_sigprocmask() sys_rt_sigprocmask() looks unnecessarily complicated, simplify it. We can just read current->blocked lockless unconditionally before anything else and then copy-to-user it if needed. At worst we copy 4 words on mips. We could copy-to-user the old mask first and simplify the code even more, but the patch tries to keep the current behaviour: we change current->block even if copy_to_user(oset) fails. Signed-off-by: Oleg Nesterov Reviewed-by: Matt Fleming Acked-by: Tejun Heo --- kernel/signal.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8aa3a2e226af..bb9200070ea0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2364,40 +2364,34 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) * @oset: previous value of signal mask if non-null * @sigsetsize: size of sigset_t type */ -SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, +SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, sigset_t __user *, oset, size_t, sigsetsize) { - int error = -EINVAL; sigset_t old_set, new_set; + int error; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) - goto out; + return -EINVAL; - if (set) { - error = -EFAULT; - if (copy_from_user(&new_set, set, sizeof(*set))) - goto out; + old_set = current->blocked; + + if (nset) { + if (copy_from_user(&new_set, nset, sizeof(sigset_t))) + return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - error = sigprocmask(how, &new_set, &old_set); + error = sigprocmask(how, &new_set, NULL); if (error) - goto out; - if (oset) - goto set_old; - } else if (oset) { - spin_lock_irq(¤t->sighand->siglock); - old_set = current->blocked; - spin_unlock_irq(¤t->sighand->siglock); + return error; + } - set_old: - error = -EFAULT; - if (copy_to_user(oset, &old_set, sizeof(*oset))) - goto out; + if (oset) { + if (copy_to_user(oset, &old_set, sizeof(sigset_t))) + return -EFAULT; } - error = 0; -out: - return error; + + return 0; } long do_sigpending(void __user *set, unsigned long sigsetsize) -- cgit v1.2.3 From fe0faa005d43bc44c357631d51c273806086caa4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 21:24:19 +0200 Subject: signal: sys_rt_sigtimedwait: simplify the timeout logic No functional changes, cleanup compat_sys_rt_sigtimedwait() and sys_rt_sigtimedwait(). Calculate the timeout before we take ->siglock, this simplifies and lessens the code. Use timespec_valid() to check the timespec. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Reviewed-by: Matt Fleming --- kernel/compat.c | 42 ++++++++++++++++++------------------------ kernel/signal.c | 48 +++++++++++++++++++++--------------------------- 2 files changed, 39 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 38b1d2c1cbe8..06cbb0619531 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -893,7 +893,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, int sig; struct timespec t; siginfo_t info; - long ret, timeout = 0; + long ret, timeout; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; @@ -904,36 +904,30 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); signotset(&s); + timeout = MAX_SCHEDULE_TIMEOUT; if (uts) { if (get_compat_timespec (&t, uts)) return -EFAULT; - if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 - || t.tv_sec < 0) + if (!timespec_valid(&t)) return -EINVAL; + timeout = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); } spin_lock_irq(¤t->sighand->siglock); sig = dequeue_signal(current, &s, &info); - if (!sig) { - timeout = MAX_SCHEDULE_TIMEOUT; - if (uts) - timeout = timespec_to_jiffies(&t) - +(t.tv_sec || t.tv_nsec); - if (timeout) { - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &s); - - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } + if (!sig && timeout) { + current->real_blocked = current->blocked; + sigandsets(¤t->blocked, ¤t->blocked, &s); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + timeout = schedule_timeout_interruptible(timeout); + + spin_lock_irq(¤t->sighand->siglock); + sig = dequeue_signal(current, &s, &info); + current->blocked = current->real_blocked; + siginitset(¤t->real_blocked, 0); + recalc_sigpending(); } spin_unlock_irq(¤t->sighand->siglock); @@ -943,7 +937,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } - }else { + } else { ret = timeout?-EINTR:-EAGAIN; } return ret; diff --git a/kernel/signal.c b/kernel/signal.c index bb9200070ea0..c734619554f6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2519,7 +2519,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, sigset_t these; struct timespec ts; siginfo_t info; - long timeout = 0; + long timeout; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) @@ -2535,41 +2535,35 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); signotset(&these); + timeout = MAX_SCHEDULE_TIMEOUT; if (uts) { if (copy_from_user(&ts, uts, sizeof(ts))) return -EFAULT; - if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 - || ts.tv_sec < 0) + if (!timespec_valid(&ts)) return -EINVAL; + timeout = timespec_to_jiffies(&ts) + (ts.tv_sec || ts.tv_nsec); } spin_lock_irq(¤t->sighand->siglock); sig = dequeue_signal(current, &these, &info); - if (!sig) { - timeout = MAX_SCHEDULE_TIMEOUT; - if (uts) - timeout = (timespec_to_jiffies(&ts) - + (ts.tv_sec || ts.tv_nsec)); + if (!sig && timeout) { + /* + * None ready -- temporarily unblock those we're + * interested while we are sleeping in so that we'll + * be awakened when they arrive. + */ + current->real_blocked = current->blocked; + sigandsets(¤t->blocked, ¤t->blocked, &these); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); - if (timeout) { - /* - * None ready -- temporarily unblock those we're - * interested while we are sleeping in so that we'll - * be awakened when they arrive. - */ - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &these); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } + timeout = schedule_timeout_interruptible(timeout); + + spin_lock_irq(¤t->sighand->siglock); + sig = dequeue_signal(current, &these, &info); + current->blocked = current->real_blocked; + siginitset(¤t->real_blocked, 0); + recalc_sigpending(); } spin_unlock_irq(¤t->sighand->siglock); -- cgit v1.2.3 From 943df1485a8ff0e600729e082e568ece04d4de9e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 21:44:14 +0200 Subject: signal: introduce do_sigtimedwait() to factor out compat/native code Factor out the common code in sys_rt_sigtimedwait/compat_sys_rt_sigtimedwait to the new helper, do_sigtimedwait(). Add the comment to document the extra tick we add to timespec_to_jiffies(ts), thanks to Linus who explained this to me. Perhaps it would be better to move compat_sys_rt_sigtimedwait() into signal.c under CONFIG_COMPAT, then we can make do_sigtimedwait() static. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Reviewed-by: Matt Fleming --- include/linux/signal.h | 2 + kernel/compat.c | 41 ++++-------------- kernel/signal.c | 110 +++++++++++++++++++++++++++++-------------------- 3 files changed, 74 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index ba009c167275..782546d661ba 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -242,6 +242,8 @@ extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info); extern long do_sigpending(void __user *, unsigned long); +extern int do_sigtimedwait(const sigset_t *, siginfo_t *, + const struct timespec *); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(const sigset_t *); extern int show_unhandled_signals; diff --git a/kernel/compat.c b/kernel/compat.c index 06cbb0619531..9214dcd087b7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -890,10 +890,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, { compat_sigset_t s32; sigset_t s; - int sig; struct timespec t; siginfo_t info; - long ret, timeout; + long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; @@ -901,45 +900,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) return -EFAULT; sigset_from_compat(&s, &s32); - sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); - signotset(&s); - timeout = MAX_SCHEDULE_TIMEOUT; if (uts) { - if (get_compat_timespec (&t, uts)) + if (get_compat_timespec(&t, uts)) return -EFAULT; - if (!timespec_valid(&t)) - return -EINVAL; - timeout = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); } - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - if (!sig && timeout) { - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &s); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } - spin_unlock_irq(¤t->sighand->siglock); + ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); - if (sig) { - ret = sig; - if (uinfo) { - if (copy_siginfo_to_user32(uinfo, &info)) - ret = -EFAULT; - } - } else { - ret = timeout?-EINTR:-EAGAIN; + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; } + return ret; } diff --git a/kernel/signal.c b/kernel/signal.c index c734619554f6..1ab89f677424 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2503,6 +2503,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif +/** + * do_sigtimedwait - wait for queued signals specified in @which + * @which: queued signals to wait for + * @info: if non-null, the signal's siginfo is returned here + * @ts: upper bound on process time suspension + */ +int do_sigtimedwait(const sigset_t *which, siginfo_t *info, + const struct timespec *ts) +{ + struct task_struct *tsk = current; + long timeout = MAX_SCHEDULE_TIMEOUT; + sigset_t mask = *which; + int sig; + + if (ts) { + if (!timespec_valid(ts)) + return -EINVAL; + timeout = timespec_to_jiffies(ts); + /* + * We can be close to the next tick, add another one + * to ensure we will wait at least the time asked for. + */ + if (ts->tv_sec || ts->tv_nsec) + timeout++; + } + + /* + * Invert the set of allowed signals to get those we want to block. + */ + sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + signotset(&mask); + + spin_lock_irq(&tsk->sighand->siglock); + sig = dequeue_signal(tsk, &mask, info); + if (!sig && timeout) { + /* + * None ready, temporarily unblock those we're interested + * while we are sleeping in so that we'll be awakened when + * they arrive. + */ + tsk->real_blocked = tsk->blocked; + sigandsets(&tsk->blocked, &tsk->blocked, &mask); + recalc_sigpending(); + spin_unlock_irq(&tsk->sighand->siglock); + + timeout = schedule_timeout_interruptible(timeout); + + spin_lock_irq(&tsk->sighand->siglock); + sig = dequeue_signal(tsk, &mask, info); + tsk->blocked = tsk->real_blocked; + siginitset(&tsk->real_blocked, 0); + recalc_sigpending(); + } + spin_unlock_irq(&tsk->sighand->siglock); + + if (sig) + return sig; + return timeout ? -EINTR : -EAGAIN; +} + /** * sys_rt_sigtimedwait - synchronously wait for queued signals specified * in @uthese @@ -2515,11 +2575,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct timespec __user *, uts, size_t, sigsetsize) { - int ret, sig; sigset_t these; struct timespec ts; siginfo_t info; - long timeout; + int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) @@ -2528,55 +2587,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; - /* - * Invert the set of allowed signals to get those we - * want to block. - */ - sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); - signotset(&these); - - timeout = MAX_SCHEDULE_TIMEOUT; if (uts) { if (copy_from_user(&ts, uts, sizeof(ts))) return -EFAULT; - if (!timespec_valid(&ts)) - return -EINVAL; - timeout = timespec_to_jiffies(&ts) + (ts.tv_sec || ts.tv_nsec); } - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - if (!sig && timeout) { - /* - * None ready -- temporarily unblock those we're - * interested while we are sleeping in so that we'll - * be awakened when they arrive. - */ - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &these); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } - spin_unlock_irq(¤t->sighand->siglock); + ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); - if (sig) { - ret = sig; - if (uinfo) { - if (copy_siginfo_to_user(uinfo, &info)) - ret = -EFAULT; - } - } else { - ret = -EAGAIN; - if (timeout) - ret = -EINTR; + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user(uinfo, &info)) + ret = -EFAULT; } return ret; -- cgit v1.2.3 From b182801ab35f7a0afb3cdf8ba5df464d04206b46 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 21:56:14 +0200 Subject: signal: do_sigtimedwait() needs retarget_shared_pending() do_sigtimedwait() changes current->blocked and thus it needs set_current_blocked()->retarget_shared_pending(). We could use set_current_blocked() directly. It is fine to change ->real_blocked from all-zeroes to ->blocked and vice versa lockless, but this is not immediately clear, looks racy, and needs a huge comment to explain why this is correct. To keep the things simple this patch adds the new static helper, __set_task_blocked() which should be called with ->siglock held. This way we can change both ->real_blocked and ->blocked atomically under ->siglock as the current code does. This is more understandable. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Reviewed-by: Matt Fleming --- kernel/signal.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1ab89f677424..4d97e11d7672 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2299,6 +2299,18 @@ long do_no_restart_syscall(struct restart_block *param) return -EINTR; } +static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) +{ + if (signal_pending(tsk) && !thread_group_empty(tsk)) { + sigset_t newblocked; + /* A set of now blocked but previously unblocked signals. */ + signandsets(&newblocked, newset, ¤t->blocked); + retarget_shared_pending(tsk, &newblocked); + } + tsk->blocked = *newset; + recalc_sigpending(); +} + /** * set_current_blocked - change current->blocked mask * @newset: new mask @@ -2311,14 +2323,7 @@ void set_current_blocked(const sigset_t *newset) struct task_struct *tsk = current; spin_lock_irq(&tsk->sighand->siglock); - if (signal_pending(tsk) && !thread_group_empty(tsk)) { - sigset_t newblocked; - /* A set of now blocked but previously unblocked signals. */ - signandsets(&newblocked, newset, ¤t->blocked); - retarget_shared_pending(tsk, &newblocked); - } - tsk->blocked = *newset; - recalc_sigpending(); + __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); } @@ -2541,7 +2546,8 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when - * they arrive. + * they arrive. Unblocking is always fine, we can avoid + * set_current_blocked(). */ tsk->real_blocked = tsk->blocked; sigandsets(&tsk->blocked, &tsk->blocked, &mask); @@ -2551,10 +2557,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, timeout = schedule_timeout_interruptible(timeout); spin_lock_irq(&tsk->sighand->siglock); - sig = dequeue_signal(tsk, &mask, info); - tsk->blocked = tsk->real_blocked; + __set_task_blocked(tsk, &tsk->real_blocked); siginitset(&tsk->real_blocked, 0); - recalc_sigpending(); + sig = dequeue_signal(tsk, &mask, info); } spin_unlock_irq(&tsk->sighand->siglock); -- cgit v1.2.3 From 702a5073fdb71eb29cd4912575289fb5044c1894 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Apr 2011 22:01:27 +0200 Subject: signal: rename signandsets() to sigandnsets() As Tejun and Linus pointed out, "nand" is the wrong name for "x & ~y", it should be "andn". Rename signandsets() as suggested. Suggested-by: Tejun Heo Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo --- include/linux/signal.h | 6 +++--- kernel/signal.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index 782546d661ba..7e2526374fd7 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -123,13 +123,13 @@ _SIG_SET_BINOP(sigorsets, _sig_or) #define _sig_and(x,y) ((x) & (y)) _SIG_SET_BINOP(sigandsets, _sig_and) -#define _sig_nand(x,y) ((x) & ~(y)) -_SIG_SET_BINOP(signandsets, _sig_nand) +#define _sig_andn(x,y) ((x) & ~(y)) +_SIG_SET_BINOP(sigandnsets, _sig_andn) #undef _SIG_SET_BINOP #undef _sig_or #undef _sig_and -#undef _sig_nand +#undef _sig_andn #define _SIG_SET_OP(name, op) \ static inline void name(sigset_t *set) \ diff --git a/kernel/signal.c b/kernel/signal.c index 4d97e11d7672..e7ee4e642c5a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -669,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) if (sigisemptyset(&m)) return 0; - signandsets(&s->signal, &s->signal, mask); + sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); @@ -2304,7 +2304,7 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) if (signal_pending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. */ - signandsets(&newblocked, newset, ¤t->blocked); + sigandnsets(&newblocked, newset, ¤t->blocked); retarget_shared_pending(tsk, &newblocked); } tsk->blocked = *newset; @@ -2349,7 +2349,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) sigorsets(&newset, &tsk->blocked, set); break; case SIG_UNBLOCK: - signandsets(&newset, &tsk->blocked, set); + sigandnsets(&newset, &tsk->blocked, set); break; case SIG_SETMASK: newset = *set; -- cgit v1.2.3 From b013c399245a88a73aaa031279f0c4d7cea7fe68 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 28 Apr 2011 11:36:20 +0200 Subject: signal: cleanup sys_sigprocmask() Cleanup. Remove the unneeded goto's, we can simply read blocked.sig[0] unconditionally and then copy-to-user it if oset != NULL. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Reviewed-by: Matt Fleming --- kernel/signal.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e7ee4e642c5a..c0af959b8530 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2889,29 +2889,28 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) /** * sys_sigprocmask - examine and change blocked signals * @how: whether to add, remove, or set signals - * @set: signals to add or remove (if non-null) + * @nset: signals to add or remove (if non-null) * @oset: previous value of signal mask if non-null * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. */ -SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, +SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { - int error; old_sigset_t old_set, new_set; + int error; - if (set) { - error = -EFAULT; - if (copy_from_user(&new_set, set, sizeof(*set))) - goto out; - new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); + old_set = current->blocked.sig[0]; - spin_lock_irq(¤t->sighand->siglock); - old_set = current->blocked.sig[0]; + if (nset) { + if (copy_from_user(&new_set, nset, sizeof(*nset))) + return -EFAULT; + new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); error = 0; + spin_lock_irq(¤t->sighand->siglock); switch (how) { default: error = -EINVAL; @@ -2930,19 +2929,15 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); if (error) - goto out; - if (oset) - goto set_old; - } else if (oset) { - old_set = current->blocked.sig[0]; - set_old: - error = -EFAULT; + return error; + } + + if (oset) { if (copy_to_user(oset, &old_set, sizeof(*oset))) - goto out; + return -EFAULT; } - error = 0; -out: - return error; + + return 0; } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ -- cgit v1.2.3 From b12a03ce4880bd13786a98db6de494a3e0123129 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 May 2011 16:48:57 +0200 Subject: hrtimers: Prepare for cancel on clock was set timers Make clock_was_set() unconditional and rename hres_timers_resume to hrtimers_resume. This is a preparatory patch for hrtimers which are cancelled when clock realtime was set. Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 16 ++---- kernel/hrtimer.c | 125 ++++++++++++++++++++++------------------------ kernel/time/timekeeping.c | 2 +- 3 files changed, 65 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 62f500c724f9..4135c88fe4fa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -148,9 +148,7 @@ struct hrtimer_clock_base { ktime_t resolution; ktime_t (*get_time)(void); ktime_t softirq_time; -#ifdef CONFIG_HIGH_RES_TIMERS ktime_t offset; -#endif }; enum hrtimer_base_type { @@ -256,8 +254,6 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) #ifdef CONFIG_HIGH_RES_TIMERS struct clock_event_device; -extern void clock_was_set(void); -extern void hres_timers_resume(void); extern void hrtimer_interrupt(struct clock_event_device *dev); /* @@ -291,16 +287,8 @@ extern void hrtimer_peek_ahead_timers(void); # define MONOTONIC_RES_NSEC LOW_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_LOW_RES -/* - * clock_was_set() is a NOP for non- high-resolution systems. The - * time-sorted order guarantees that a timer does not expire early and - * is expired in the next softirq when the clock was advanced. - */ -static inline void clock_was_set(void) { } static inline void hrtimer_peek_ahead_timers(void) { } -static inline void hres_timers_resume(void) { } - /* * In non high resolution mode the time reference is taken from * the base softirq time variable. @@ -316,11 +304,13 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) } #endif +extern void clock_was_set(void); +extern void hrtimers_resume(void); + extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); extern ktime_t ktime_get_boottime(void); - DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index dbbbf7d43080..c145ed643bca 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -621,66 +621,6 @@ static int hrtimer_reprogram(struct hrtimer *timer, return res; } - -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base; - struct timespec realtime_offset, wtm, sleep; - - if (!hrtimer_hres_active()) - return; - - get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, - &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - base = &__get_cpu_var(hrtimer_bases); - - /* Adjust CLOCK_REALTIME offset */ - raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} - -/* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. - */ -void clock_was_set(void) -{ - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -} - -/* - * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): - */ -void hres_timers_resume(void) -{ - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hres_timers_resume() called with IRQs enabled!"); - - retrigger_next_event(NULL); -} - /* * Initialize the high resolution related parts of cpu_base */ @@ -714,12 +654,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } +static void retrigger_next_event(void *arg); + /* * Switch to high resolution mode */ static int hrtimer_switch_to_hres(void) { - int cpu = smp_processor_id(); + int i, cpu = smp_processor_id(); struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; @@ -735,9 +677,8 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; - base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -764,6 +705,62 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } #endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct timespec realtime_offset, xtim, wtm, sleep; + + if (!hrtimer_hres_active()) + return; + + get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); + + /* Adjust CLOCK_REALTIME offset */ + raw_spin_lock(&base->lock); + base->clock_base[HRTIMER_BASE_REALTIME].offset = + timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); + + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); +} + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 1); +} + +/* + * During resume we might have to reprogram the high resolution timer + * interrupt (on the local CPU): + */ +void hrtimers_resume(void) +{ + WARN_ONCE(!irqs_disabled(), + KERN_INFO "hrtimers_resume() called with IRQs enabled!"); + + retrigger_next_event(NULL); +} + static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { #ifdef CONFIG_TIMER_STATS diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8e6a05a5915a..a61b8fa2d39a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -680,7 +680,7 @@ static void timekeeping_resume(void) clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); /* Resume hrtimers */ - hres_timers_resume(); + hrtimers_resume(); } static int timekeeping_suspend(void) -- cgit v1.2.3 From 99ee5315dac6211e972fa3f23bcc9a0343ff58c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 27 Apr 2011 14:16:42 +0200 Subject: timerfd: Allow timers to be cancelled when clock was set Some applications must be aware of clock realtime being set backward. A simple example is a clock applet which arms a timer for the next minute display. If clock realtime is set backward then the applet displays a stale time for the amount of time which the clock was set backwards. Due to that applications poll the time because we don't have an interface. Extend the timerfd interface by adding a flag which puts the timer onto a different internal realtime clock. All timers on this clock are expired whenever the clock was set. The timerfd core records the monotonic offset when the timer is created. When the timer is armed, then the current offset is compared to the previous recorded offset. When it has changed, then timerfd_settime returns -ECANCELED. When a timer is read the offset is compared and if it changed -ECANCELED returned to user space. Periodic timers are not rearmed in the cancelation case. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: Chris Friesen Tested-by: Kay Sievers Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Davide Libenzi Reviewed-by: Alexander Shishkin Link: http://lkml.kernel.org/r/%3Calpine.LFD.2.02.1104271359580.3323%40ionos%3E Signed-off-by: Thomas Gleixner --- fs/timerfd.c | 57 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/hrtimer.h | 2 ++ include/linux/time.h | 6 +++++ include/linux/timerfd.h | 3 ++- kernel/hrtimer.c | 36 +++++++++++++++++++++++++++++- kernel/time/timekeeping.c | 15 +++++++++++++ 6 files changed, 111 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/timerfd.c b/fs/timerfd.c index 8c4fc1425b3e..7e14c9e7c4ee 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -26,10 +26,12 @@ struct timerfd_ctx { struct hrtimer tmr; ktime_t tintv; + ktime_t moffs; wait_queue_head_t wqh; u64 ticks; int expired; int clockid; + bool might_cancel; }; /* @@ -59,24 +61,52 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; } -static void timerfd_setup(struct timerfd_ctx *ctx, int flags, - const struct itimerspec *ktmr) +static bool timerfd_canceled(struct timerfd_ctx *ctx) +{ + ktime_t moffs; + + if (!ctx->might_cancel) + return false; + + moffs = ktime_get_monotonic_offset(); + + if (moffs.tv64 == ctx->moffs.tv64) + return false; + + ctx->moffs = moffs; + return true; +} + +static int timerfd_setup(struct timerfd_ctx *ctx, int flags, + const struct itimerspec *ktmr) { enum hrtimer_mode htmode; ktime_t texp; + int clockid = ctx->clockid; htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; + ctx->might_cancel = false; + if (htmode == HRTIMER_MODE_ABS && ctx->clockid == CLOCK_REALTIME && + (flags & TFD_TIMER_CANCELON_SET)) { + clockid = CLOCK_REALTIME_COS; + ctx->might_cancel = true; + } + texp = timespec_to_ktime(ktmr->it_value); ctx->expired = 0; ctx->ticks = 0; ctx->tintv = timespec_to_ktime(ktmr->it_interval); - hrtimer_init(&ctx->tmr, ctx->clockid, htmode); + hrtimer_init(&ctx->tmr, clockid, htmode); hrtimer_set_expires(&ctx->tmr, texp); ctx->tmr.function = timerfd_tmrproc; - if (texp.tv64 != 0) + if (texp.tv64 != 0) { hrtimer_start(&ctx->tmr, texp, htmode); + if (timerfd_canceled(ctx)) + return -ECANCELED; + } + return 0; } static int timerfd_release(struct inode *inode, struct file *file) @@ -118,8 +148,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, res = -EAGAIN; else res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); + if (ctx->ticks) { ticks = ctx->ticks; + + /* + * If clock has changed, we do not care about the + * ticks and we do not rearm the timer. Userspace must + * reevaluate anyway. + */ + if (timerfd_canceled(ctx)) { + ticks = 0; + ctx->expired = 0; + res = -ECANCELED; + } + if (ctx->expired && ctx->tintv.tv64) { /* * If tintv.tv64 != 0, this is a periodic timer that @@ -183,6 +226,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) init_waitqueue_head(&ctx->wqh); ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + ctx->moffs = ktime_get_monotonic_offset(); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); @@ -199,6 +243,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, struct file *file; struct timerfd_ctx *ctx; struct itimerspec ktmr, kotmr; + int ret; if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) return -EFAULT; @@ -240,14 +285,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, /* * Re-program the timer to the new value ... */ - timerfd_setup(ctx, flags, &ktmr); + ret = timerfd_setup(ctx, flags, &ktmr); spin_unlock_irq(&ctx->wqh.lock); fput(file); if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) return -EFAULT; - return 0; + return ret; } SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4135c88fe4fa..eda4ccde0730 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -155,6 +155,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_BOOTTIME, + HRTIMER_BASE_REALTIME_COS, HRTIMER_MAX_CLOCK_BASES, }; @@ -310,6 +311,7 @@ extern void hrtimers_resume(void); extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); extern ktime_t ktime_get_boottime(void); +extern ktime_t ktime_get_monotonic_offset(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/include/linux/time.h b/include/linux/time.h index b3061782dec3..a9242773eb24 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -302,6 +302,12 @@ struct itimerval { * The IDs of various hardware clocks: */ #define CLOCK_SGI_CYCLE 10 + +#ifdef __KERNEL__ +/* This clock is not exposed to user space */ +#define CLOCK_REALTIME_COS 15 +#endif + #define MAX_CLOCKS 16 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) #define CLOCKS_MONO CLOCK_MONOTONIC diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index 2d0792983f8c..e9571fc8f1a0 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -19,6 +19,7 @@ * shared O_* flags. */ #define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_CANCELON_SET (1 << 1) #define TFD_CLOEXEC O_CLOEXEC #define TFD_NONBLOCK O_NONBLOCK @@ -26,6 +27,6 @@ /* Flags for timerfd_create. */ #define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS /* Flags for timerfd_settime. */ -#define TFD_SETTIME_FLAGS TFD_TIMER_ABSTIME +#define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCELON_SET) #endif /* _LINUX_TIMERFD_H */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c145ed643bca..eabcbd781433 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -78,6 +78,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, + { + .index = CLOCK_REALTIME_COS, + .get_time = &ktime_get_real, + .resolution = KTIME_LOW_RES, + }, } }; @@ -85,6 +90,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_REALTIME_COS] = HRTIMER_BASE_REALTIME_COS, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) @@ -110,6 +116,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_REALTIME_COS].softirq_time = xtim; } /* @@ -479,6 +486,8 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } +static void hrtimer_expire_cancelable(struct hrtimer_cpu_base *cpu_base); + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -715,9 +724,14 @@ static void retrigger_next_event(void *arg) struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); struct timespec realtime_offset, xtim, wtm, sleep; - if (!hrtimer_hres_active()) + if (!hrtimer_hres_active()) { + raw_spin_lock(&base->lock); + hrtimer_expire_cancelable(base); + raw_spin_unlock(&base->lock); return; + } + /* Optimized out for !HIGH_RES */ get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); @@ -727,6 +741,10 @@ static void retrigger_next_event(void *arg) timespec_to_ktime(realtime_offset); base->clock_base[HRTIMER_BASE_BOOTTIME].offset = timespec_to_ktime(sleep); + base->clock_base[HRTIMER_BASE_REALTIME_COS].offset = + timespec_to_ktime(realtime_offset); + + hrtimer_expire_cancelable(base); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); @@ -1222,6 +1240,22 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) timer->state &= ~HRTIMER_STATE_CALLBACK; } +static void hrtimer_expire_cancelable(struct hrtimer_cpu_base *cpu_base) +{ + struct timerqueue_node *node; + struct hrtimer_clock_base *base; + ktime_t now = ktime_get_real(); + + base = &cpu_base->clock_base[HRTIMER_BASE_REALTIME_COS]; + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + __run_hrtimer(timer, &now); + } +} + #ifdef CONFIG_HIGH_RES_TIMERS /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a61b8fa2d39a..342408cf68dd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1098,6 +1098,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } while (read_seqretry(&xtime_lock, seq)); } +/** + * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format + */ +ktime_t ktime_get_monotonic_offset(void) +{ + unsigned long seq; + struct timespec wtom; + + do { + seq = read_seqbegin(&xtime_lock); + wtom = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + return timespec_to_ktime(wtom); +} + /** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. -- cgit v1.2.3 From ce788f930b0cdf821de7ee8f84cfe8cf7fcb6311 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 May 2011 08:00:47 +0200 Subject: alarmtimer: Check return value of class_find_device() alarmtimer_late_init() uses class_find_device() to find a alarm capable rtc device. The match callback stores a pointer to the name in the char pointer handed in from the call site. alarmtimer_late_init() checks the char pointer for NULL, but the pointer is on the stack and not initialized to NULL before the call. So it can have random content when the match function did not identify a device, which leads to random access in the following rtc_open() call where the pointer is dereferenced Instead of relying on the char pointer, check the return value of class_find_device. If a device is found then the name pointer is valid as well. Reported-by: Ingo Molnar Cc: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/alarmtimer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9265014cb4db..e5db9b00751b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -669,11 +669,13 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) */ static int __init alarmtimer_init_late(void) { + struct device *dev; char *str; /* Find an rtc device and init the rtc_timer */ - class_find_device(rtc_class, NULL, &str, has_wakealarm); - if (str) + dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); + /* If we have a device then str is valid. See has_wakealarm() */ + if (dev) rtcdev = rtc_class_open(str); if (!rtcdev) { printk(KERN_WARNING "No RTC device found, ALARM timers will" -- cgit v1.2.3 From 179eb03268aa1da03d90f1566ea85dc1478d3ae3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 May 2011 08:18:34 +0200 Subject: alarmtimer: Drop device refcount after rtc_open() class_find_device() takes a refcount on the rtc device. rtc_open() takes another one, so we can drop it after the rtc_open() call. Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/time/alarmtimer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e5db9b00751b..c6027fe9a4e6 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -675,8 +675,14 @@ static int __init alarmtimer_init_late(void) /* Find an rtc device and init the rtc_timer */ dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); /* If we have a device then str is valid. See has_wakealarm() */ - if (dev) + if (dev) { rtcdev = rtc_class_open(str); + /* + * Drop the reference we got in class_find_device, + * rtc_open takes its own. + */ + put_device(dev); + } if (!rtcdev) { printk(KERN_WARNING "No RTC device found, ALARM timers will" " not wake from suspend"); -- cgit v1.2.3 From 7372b0b122af0f6675f3ab65bfd91c8a438e0480 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 4 May 2011 15:09:27 -0700 Subject: clockevents: Move C3 stop test outside lock Avoid taking broadcast_lock in the idle path for systems where the timer doesn't stop in C3. [ tglx: Removed the stale label and added comment ] Signed-off-by: Andi Kleen Cc: Dave Kleikamp Cc: Chris Mason Cc: Peter Zijlstra Cc: Tim Chen Cc: lenb@kernel.org Cc: paulmck@us.ibm.com Link: http://lkml.kernel.org/r/%3C20110504234806.GF2925%40one.firstfloor.org%3E Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index da800ffa810c..827e0f862da4 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason) unsigned long flags; int cpu; - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - /* * Periodic mode does not care about the enter/exit of power * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - goto out; + return; - bc = tick_broadcast_device.evtdev; + /* + * We are called with preemtion disabled from the depth of the + * idle code, so we can't be moved away. + */ cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; + return; + + bc = tick_broadcast_device.evtdev; + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); @@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) tick_program_event(dev->next_event, 1); } } - -out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3 From 228e548e602061b08ee8e8966f567c12aa079682 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 2 May 2011 20:21:35 +0000 Subject: net: Add sendmmsg socket system call This patch adds a multiple message send syscall and is the send version of the existing recvmmsg syscall. This is heavily based on the patch by Arnaldo that added recvmmsg. I wrote a microbenchmark to test the performance gains of using this new syscall: http://ozlabs.org/~anton/junkcode/sendmmsg_test.c The test was run on a ppc64 box with a 10 Gbit network card. The benchmark can send both UDP and RAW ethernet packets. 64B UDP batch pkts/sec 1 804570 2 872800 (+ 8 %) 4 916556 (+14 %) 8 939712 (+17 %) 16 952688 (+18 %) 32 956448 (+19 %) 64 964800 (+20 %) 64B raw socket batch pkts/sec 1 1201449 2 1350028 (+12 %) 4 1461416 (+22 %) 8 1513080 (+26 %) 16 1541216 (+28 %) 32 1553440 (+29 %) 64 1557888 (+30 %) We see a 20% improvement in throughput on UDP send and 30% on raw socket send. [ Add sparc syscall entries. -DaveM ] Signed-off-by: Anton Blanchard Signed-off-by: David S. Miller --- arch/powerpc/include/asm/systbl.h | 1 + arch/powerpc/include/asm/unistd.h | 3 +- arch/sparc/include/asm/unistd.h | 3 +- arch/sparc/kernel/systbls_32.S | 2 +- arch/sparc/kernel/systbls_64.S | 4 +- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 +- arch/x86/include/asm/unistd_64.h | 2 + arch/x86/kernel/syscall_table_32.S | 1 + include/linux/net.h | 1 + include/linux/socket.h | 2 + include/linux/syscalls.h | 2 + include/net/compat.h | 2 + kernel/sys_ni.c | 2 + net/compat.c | 16 ++- net/socket.c | 199 +++++++++++++++++++++++++++++-------- 16 files changed, 192 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 60f64b132bd4..8489d372077f 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -352,3 +352,4 @@ SYSCALL_SPU(name_to_handle_at) COMPAT_SYS_SPU(open_by_handle_at) COMPAT_SYS_SPU(clock_adjtime) SYSCALL_SPU(syncfs) +COMPAT_SYS_SPU(sendmmsg) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 3c215648ce6d..6d23c8193caa 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -371,10 +371,11 @@ #define __NR_open_by_handle_at 346 #define __NR_clock_adjtime 347 #define __NR_syncfs 348 +#define __NR_sendmmsg 349 #ifdef __KERNEL__ -#define __NR_syscalls 349 +#define __NR_syscalls 350 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index 9d897b6db983..c5387ed0add8 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -404,8 +404,9 @@ #define __NR_open_by_handle_at 333 #define __NR_clock_adjtime 334 #define __NR_syncfs 335 +#define __NR_sendmmsg 336 -#define NR_syscalls 336 +#define NR_syscalls 337 #ifdef __32bit_syscall_numbers__ /* Sparc 32-bit only has the "setresuid32", "getresuid32" variants, diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 47ac73c32e88..332c83ff7701 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -84,4 +84,4 @@ sys_call_table: /*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv /*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg, sys_fanotify_init /*330*/ .long sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime -/*335*/ .long sys_syncfs +/*335*/ .long sys_syncfs, sys_sendmmsg diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 4f3170c1ef47..43887ca0be0e 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -85,7 +85,7 @@ sys_call_table32: /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open, compat_sys_recvmmsg, sys_fanotify_init /*330*/ .word sys32_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, compat_sys_open_by_handle_at, compat_sys_clock_adjtime - .word sys_syncfs + .word sys_syncfs, compat_sys_sendmmsg #endif /* CONFIG_COMPAT */ @@ -162,4 +162,4 @@ sys_call_table: /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg, sys_fanotify_init /*330*/ .word sys_fanotify_mark, sys_prlimit64, sys_name_to_handle_at, sys_open_by_handle_at, sys_clock_adjtime - .word sys_syncfs + .word sys_syncfs, sys_sendmmsg diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 849a9d23c71d..95f5826be458 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -848,4 +848,5 @@ ia32_sys_call_table: .quad compat_sys_open_by_handle_at .quad compat_sys_clock_adjtime .quad sys_syncfs + .quad compat_sys_sendmmsg /* 345 */ ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index a755ef5e5977..fb6a625c99bf 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -350,10 +350,11 @@ #define __NR_open_by_handle_at 342 #define __NR_clock_adjtime 343 #define __NR_syncfs 344 +#define __NR_sendmmsg 345 #ifdef __KERNEL__ -#define NR_syscalls 345 +#define NR_syscalls 346 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 160fa76bd578..79f90eb15aad 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -677,6 +677,8 @@ __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) __SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) #define __NR_syncfs 306 __SYSCALL(__NR_syncfs, sys_syncfs) +#define __NR_sendmmsg 307 +__SYSCALL(__NR_sendmmsg, sys_sendmmsg) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index abce34d5c79d..32cbffb0c494 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -344,3 +344,4 @@ ENTRY(sys_call_table) .long sys_open_by_handle_at .long sys_clock_adjtime .long sys_syncfs + .long sys_sendmmsg /* 345 */ diff --git a/include/linux/net.h b/include/linux/net.h index 94de83c0f877..1da55e9b6f01 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -42,6 +42,7 @@ #define SYS_RECVMSG 17 /* sys_recvmsg(2) */ #define SYS_ACCEPT4 18 /* sys_accept4(2) */ #define SYS_RECVMMSG 19 /* sys_recvmmsg(2) */ +#define SYS_SENDMMSG 20 /* sys_sendmmsg(2) */ typedef enum { SS_FREE = 0, /* not allocated */ diff --git a/include/linux/socket.h b/include/linux/socket.h index d2b5e982f079..4ef98e422fde 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -333,5 +333,7 @@ struct timespec; extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec *timeout); +extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags); #endif /* not kernel and not glibc */ #endif /* _LINUX_SOCKET_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 83ecc1749ef6..ab71447d0c5a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -610,6 +610,8 @@ asmlinkage long sys_send(int, void __user *, size_t, unsigned); asmlinkage long sys_sendto(int, void __user *, size_t, unsigned, struct sockaddr __user *, int); asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags); +asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg, + unsigned int vlen, unsigned flags); asmlinkage long sys_recv(int, void __user *, size_t, unsigned); asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned, struct sockaddr __user *, int __user *); diff --git a/include/net/compat.h b/include/net/compat.h index 28d5428ec6a2..9ee75edcc295 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -43,6 +43,8 @@ extern int compat_sock_get_timestampns(struct sock *, struct timespec __user *); extern int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *); extern int verify_compat_iovec(struct msghdr *, struct iovec *, struct sockaddr *, int); extern asmlinkage long compat_sys_sendmsg(int,struct compat_msghdr __user *,unsigned); +extern asmlinkage long compat_sys_sendmmsg(int, struct compat_mmsghdr __user *, + unsigned, unsigned); extern asmlinkage long compat_sys_recvmsg(int,struct compat_msghdr __user *,unsigned); extern asmlinkage long compat_sys_recvmmsg(int, struct compat_mmsghdr __user *, unsigned, unsigned, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 25cc41cd8f33..97e966f171c6 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt); cond_syscall(compat_sys_getsockopt); cond_syscall(sys_shutdown); cond_syscall(sys_sendmsg); +cond_syscall(sys_sendmmsg); cond_syscall(compat_sys_sendmsg); +cond_syscall(compat_sys_sendmmsg); cond_syscall(sys_recvmsg); cond_syscall(sys_recvmmsg); cond_syscall(compat_sys_recvmsg); diff --git a/net/compat.c b/net/compat.c index 3649d5895361..c578d9382e19 100644 --- a/net/compat.c +++ b/net/compat.c @@ -722,11 +722,11 @@ EXPORT_SYMBOL(compat_mc_getsockopt); /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) -static unsigned char nas[20] = { +static unsigned char nas[21] = { AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), - AL(4), AL(5) + AL(4), AL(5), AL(4) }; #undef AL @@ -735,6 +735,13 @@ asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, uns return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } +asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, + unsigned vlen, unsigned int flags) +{ + return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT); +} + asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) { return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); @@ -780,7 +787,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args) u32 a[6]; u32 a0, a1; - if (call < SYS_SOCKET || call > SYS_RECVMMSG) + if (call < SYS_SOCKET || call > SYS_SENDMMSG) return -EINVAL; if (copy_from_user(a, args, nas[call])) return -EFAULT; @@ -839,6 +846,9 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args) case SYS_SENDMSG: ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); break; + case SYS_SENDMMSG: + ret = compat_sys_sendmmsg(a0, compat_ptr(a1), a[2], a[3]); + break; case SYS_RECVMSG: ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); break; diff --git a/net/socket.c b/net/socket.c index d25f5a9d6fa2..ed50255143d5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -551,11 +551,10 @@ int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) } EXPORT_SYMBOL(sock_tx_timestamp); -static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, - struct msghdr *msg, size_t size) +static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) { struct sock_iocb *si = kiocb_to_siocb(iocb); - int err; sock_update_classid(sock->sk); @@ -564,13 +563,17 @@ static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, si->msg = msg; si->size = size; - err = security_socket_sendmsg(sock, msg, size); - if (err) - return err; - return sock->ops->sendmsg(iocb, sock, msg, size); } +static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + int err = security_socket_sendmsg(sock, msg, size); + + return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size); +} + int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct kiocb iocb; @@ -586,6 +589,20 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } EXPORT_SYMBOL(sock_sendmsg); +int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_sendmsg_nosec(&iocb, sock, msg, size); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { @@ -1863,57 +1880,47 @@ SYSCALL_DEFINE2(shutdown, int, fd, int, how) #define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) #define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) -/* - * BSD sendmsg interface - */ - -SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags) +static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg, + struct msghdr *msg_sys, unsigned flags, int nosec) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; - struct socket *sock; struct sockaddr_storage address; struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; unsigned char ctl[sizeof(struct cmsghdr) + 20] __attribute__ ((aligned(sizeof(__kernel_size_t)))); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; - struct msghdr msg_sys; int err, ctl_len, iov_size, total_len; - int fput_needed; err = -EFAULT; if (MSG_CMSG_COMPAT & flags) { - if (get_compat_msghdr(&msg_sys, msg_compat)) + if (get_compat_msghdr(msg_sys, msg_compat)) return -EFAULT; - } else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) + } else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr))) return -EFAULT; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - goto out; - /* do not move before msg_sys is valid */ err = -EMSGSIZE; - if (msg_sys.msg_iovlen > UIO_MAXIOV) - goto out_put; + if (msg_sys->msg_iovlen > UIO_MAXIOV) + goto out; /* Check whether to allocate the iovec area */ err = -ENOMEM; - iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); - if (msg_sys.msg_iovlen > UIO_FASTIOV) { + iov_size = msg_sys->msg_iovlen * sizeof(struct iovec); + if (msg_sys->msg_iovlen > UIO_FASTIOV) { iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); if (!iov) - goto out_put; + goto out; } /* This will also move the address data into kernel space */ if (MSG_CMSG_COMPAT & flags) { - err = verify_compat_iovec(&msg_sys, iov, + err = verify_compat_iovec(msg_sys, iov, (struct sockaddr *)&address, VERIFY_READ); } else - err = verify_iovec(&msg_sys, iov, + err = verify_iovec(msg_sys, iov, (struct sockaddr *)&address, VERIFY_READ); if (err < 0) @@ -1922,17 +1929,17 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags) err = -ENOBUFS; - if (msg_sys.msg_controllen > INT_MAX) + if (msg_sys->msg_controllen > INT_MAX) goto out_freeiov; - ctl_len = msg_sys.msg_controllen; + ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = - cmsghdr_from_user_compat_to_kern(&msg_sys, sock->sk, ctl, + cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl)); if (err) goto out_freeiov; - ctl_buf = msg_sys.msg_control; - ctl_len = msg_sys.msg_controllen; + ctl_buf = msg_sys->msg_control; + ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); @@ -1941,21 +1948,22 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags) } err = -EFAULT; /* - * Careful! Before this, msg_sys.msg_control contains a user pointer. + * Careful! Before this, msg_sys->msg_control contains a user pointer. * Afterwards, it will be a kernel pointer. Thus the compiler-assisted * checking falls down on this. */ if (copy_from_user(ctl_buf, - (void __user __force *)msg_sys.msg_control, + (void __user __force *)msg_sys->msg_control, ctl_len)) goto out_freectl; - msg_sys.msg_control = ctl_buf; + msg_sys->msg_control = ctl_buf; } - msg_sys.msg_flags = flags; + msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) - msg_sys.msg_flags |= MSG_DONTWAIT; - err = sock_sendmsg(sock, &msg_sys, total_len); + msg_sys->msg_flags |= MSG_DONTWAIT; + err = (nosec ? sock_sendmsg_nosec : sock_sendmsg)(sock, msg_sys, + total_len); out_freectl: if (ctl_buf != ctl) @@ -1963,12 +1971,114 @@ out_freectl: out_freeiov: if (iov != iovstack) sock_kfree_s(sock->sk, iov, iov_size); -out_put: +out: + return err; +} + +/* + * BSD sendmsg interface + */ + +SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags) +{ + int fput_needed, err; + struct msghdr msg_sys; + struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed); + + if (!sock) + goto out; + + err = __sys_sendmsg(sock, msg, &msg_sys, flags, 0); + fput_light(sock->file, fput_needed); out: return err; } +/* + * Linux sendmmsg interface + */ + +int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, + unsigned int flags) +{ + int fput_needed, err, datagrams; + struct socket *sock; + struct mmsghdr __user *entry; + struct compat_mmsghdr __user *compat_entry; + struct msghdr msg_sys; + + datagrams = 0; + + sock = sockfd_lookup_light(fd, &err, &fput_needed); + if (!sock) + return err; + + err = sock_error(sock->sk); + if (err) + goto out_put; + + entry = mmsg; + compat_entry = (struct compat_mmsghdr __user *)mmsg; + + while (datagrams < vlen) { + /* + * No need to ask LSM for more than the first datagram. + */ + if (MSG_CMSG_COMPAT & flags) { + err = __sys_sendmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags, datagrams); + if (err < 0) + break; + err = __put_user(err, &compat_entry->msg_len); + ++compat_entry; + } else { + err = __sys_sendmsg(sock, (struct msghdr __user *)entry, + &msg_sys, flags, datagrams); + if (err < 0) + break; + err = put_user(err, &entry->msg_len); + ++entry; + } + + if (err) + break; + ++datagrams; + } + +out_put: + fput_light(sock->file, fput_needed); + + if (err == 0) + return datagrams; + + if (datagrams != 0) { + /* + * We may send less entries than requested (vlen) if the + * sock is non blocking... + */ + if (err != -EAGAIN) { + /* + * ... or if sendmsg returns an error after we + * send some datagrams, where we record the + * error to return on the next call or if the + * app asks about it using getsockopt(SO_ERROR). + */ + sock->sk->sk_err = -err; + } + + return datagrams; + } + + return err; +} + +SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags) +{ + return __sys_sendmmsg(fd, mmsg, vlen, flags); +} + static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg, struct msghdr *msg_sys, unsigned flags, int nosec) { @@ -2214,11 +2324,11 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ #define AL(x) ((x) * sizeof(unsigned long)) -static const unsigned char nargs[20] = { +static const unsigned char nargs[21] = { AL(0), AL(3), AL(3), AL(3), AL(2), AL(3), AL(3), AL(3), AL(4), AL(4), AL(4), AL(6), AL(6), AL(2), AL(5), AL(5), AL(3), AL(3), - AL(4), AL(5) + AL(4), AL(5), AL(4) }; #undef AL @@ -2238,7 +2348,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) int err; unsigned int len; - if (call < 1 || call > SYS_RECVMMSG) + if (call < 1 || call > SYS_SENDMMSG) return -EINVAL; len = nargs[call]; @@ -2313,6 +2423,9 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) case SYS_SENDMSG: err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); break; + case SYS_SENDMMSG: + err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]); + break; case SYS_RECVMSG: err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; -- cgit v1.2.3 From 2e4f7c7769a0b8b6b3e88b0436c6c634f146a4ef Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 9 May 2011 13:48:56 +0200 Subject: signal: sys_sigprocmask() needs retarget_shared_pending() sys_sigprocmask() changes current->blocked by hand. Convert this code to use set_current_blocked(). Signed-off-by: Oleg Nesterov --- kernel/signal.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index c0af959b8530..b87780ef7d28 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2900,7 +2900,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; - int error; + sigset_t new_blocked; old_set = current->blocked.sig[0]; @@ -2909,27 +2909,23 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, return -EFAULT; new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); - error = 0; - spin_lock_irq(¤t->sighand->siglock); + new_blocked = current->blocked; + switch (how) { - default: - error = -EINVAL; - break; case SIG_BLOCK: - sigaddsetmask(¤t->blocked, new_set); + sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: - sigdelsetmask(¤t->blocked, new_set); + sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: - current->blocked.sig[0] = new_set; + new_blocked.sig[0] = new_set; break; + default: + return -EINVAL; } - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - if (error) - return error; + set_current_blocked(&new_blocked); } if (oset) { -- cgit v1.2.3 From 40ae717d1e78d982bd469b2013a4cbf4ec1ca434 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 May 2011 11:52:22 +0200 Subject: ptrace: fix signal->wait_chldexit usage in task_clear_group_stop_trapping() GROUP_STOP_TRAPPING waiting mechanism piggybacks on signal->wait_chldexit which is primarily used to implement waiting for wait(2) and friends. When do_wait() waits on signal->wait_chldexit, it uses a custom wake up callback, child_wait_callback(), which expects the child task which is waking up the parent to be passed in as @key to filter out spurious wakeups. task_clear_group_stop_trapping() used __wake_up_sync() which uses NULL @key causing the following oops if the parent was doing do_wait(). BUG: unable to handle kernel NULL pointer dereference at 00000000000002d8 IP: [] child_wait_callback+0x29/0x80 PGD 1d899067 PUD 1e418067 PMD 0 Oops: 0000 [#1] PREEMPT SMP last sysfs file: /sys/devices/pci0000:00/0000:00:03.0/local_cpus CPU 2 Modules linked in: Pid: 4498, comm: test-continued Not tainted 2.6.39-rc6-work+ #32 Bochs Bochs RIP: 0010:[] [] child_wait_callback+0x29/0x80 RSP: 0000:ffff88001b889bf8 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffff88001fab3af8 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff88001d91df20 RBP: ffff88001b889c08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: ffff88001fb70550 R14: 0000000000000000 R15: 0000000000000001 FS: 00007f26ccae4700(0000) GS:ffff88001fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000000002d8 CR3: 000000001b8ac000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process test-continued (pid: 4498, threadinfo ffff88001b888000, task ffff88001fb88000) Stack: ffff88001b889c18 ffff88001fb70538 ffff88001b889c58 ffffffff810312f9 0000000000000001 0000000200000001 ffff88001b889c58 ffff88001fb70518 0000000000000002 0000000000000082 0000000000000001 0000000000000000 Call Trace: [] __wake_up_common+0x59/0x90 [] __wake_up_sync_key+0x53/0x80 [] __wake_up_sync+0x10/0x20 [] task_clear_jobctl_trapping+0x44/0x50 [] ptrace_stop+0x7c/0x290 [] do_signal_stop+0x28a/0x2d0 [] get_signal_to_deliver+0x14f/0x5a0 [] do_signal+0x75/0x7b0 [] do_notify_resume+0x5d/0x70 [] retint_signal+0x46/0x8c Code: 00 00 55 48 89 e5 53 48 83 ec 08 0f 1f 44 00 00 8b 47 d8 83 f8 03 74 3a 85 c0 49 89 c8 75 23 89 c0 48 8b 5f e0 4c 8d 0c 40 31 c0 <4b> 39 9c c8 d8 02 00 00 74 1d 48 83 c4 08 5b c9 c3 66 0f 1f 44 Fix it by using __wake_up_sync_key() and passing in the child as @key. I still think it's a mistake to piggyback on wait_chldexit for this. Given the relative low frequency of ptrace use, we would be much better off leaving already complex wait_chldexit alone and using bit waitqueue. Signed-off-by: Tejun Heo Reviewed-by: Oleg Nesterov --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b87780ef7d28..35c5f4b05344 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -238,8 +238,8 @@ static void task_clear_group_stop_trapping(struct task_struct *task) { if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { task->group_stop &= ~GROUP_STOP_TRAPPING; - __wake_up_sync(&task->parent->signal->wait_chldexit, - TASK_UNINTERRUPTIBLE, 1); + __wake_up_sync_key(&task->parent->signal->wait_chldexit, + TASK_UNINTERRUPTIBLE, 1, task); } } -- cgit v1.2.3 From 0663c6f8fa37d777ede74ff991a0cba3a42fcbd7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 17:48:52 -0800 Subject: ns: Introduce the setns syscall With the networking stack today there is demand to handle multiple network stacks at a time. Not in the context of containers but in the context of people doing interesting things with routing. There is also demand in the context of containers to have an efficient way to execute some code in the container itself. If nothing else it is very useful ad a debugging technique. Both problems can be solved by starting some form of login daemon in the namespaces people want access to, or you can play games by ptracing a process and getting the traced process to do things you want it to do. However it turns out that a login daemon or a ptrace puppet controller are more code, they are more prone to failure, and generally they are less efficient than simply changing the namespace of a process to a specified one. Pieces of this puzzle can also be solved by instead of coming up with a general purpose system call coming up with targed system calls perhaps socketat that solve a subset of the larger problem. Overall that appears to be more work for less reward. int setns(int fd, int nstype); The fd argument is a file descriptor referring to a proc file of the namespace you want to switch the process to. In the setns system call the nstype is 0 or specifies an clone flag of the namespace you intend to change to prevent changing a namespace unintentionally. v2: Most of the architecture support added by Daniel Lezcano v3: ported to v2.6.36-rc4 by: Eric W. Biederman v4: Moved wiring up of the system call to another patch v5: Cleaned up the system call arguments - Changed the order. - Modified nstype to take the standard clone flags. v6: Added missing error handling as pointed out by Matt Helsley Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- kernel/nsproxy.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a05d191ffdd9..5424e37673ed 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,6 +22,9 @@ #include #include #include +#include +#include +#include static struct kmem_cache *nsproxy_cachep; @@ -233,6 +236,45 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +SYSCALL_DEFINE2(setns, int, fd, int, nstype) +{ + const struct proc_ns_operations *ops; + struct task_struct *tsk = current; + struct nsproxy *new_nsproxy; + struct proc_inode *ei; + struct file *file; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return PTR_ERR(file); + + err = -EINVAL; + ei = PROC_I(file->f_dentry->d_inode); + ops = ei->ns_ops; + if (nstype && (ops->type != nstype)) + goto out; + + new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + if (IS_ERR(new_nsproxy)) { + err = PTR_ERR(new_nsproxy); + goto out; + } + + err = ops->install(new_nsproxy, ei->ns); + if (err) { + free_nsproxy(new_nsproxy); + goto out; + } + switch_task_namespaces(tsk, new_nsproxy); +out: + fput(file); + return err; +} + static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); -- cgit v1.2.3 From 34482e89a5218f0f9317abf1cfba3bb38b5c29dd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 18:43:27 -0800 Subject: ns proc: Add support for the uts namespace Acked-by: Daniel Lezcano Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 3 +++ include/linux/proc_fs.h | 1 + kernel/utsname.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) (limited to 'kernel') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dcbd483e9915..b017181f1273 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -19,6 +19,9 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif }; static const struct file_operations ns_file_operations = { diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 62126ec6ede9..52aa89df8a6d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -266,6 +266,7 @@ struct proc_ns_operations { int (*install)(struct nsproxy *nsproxy, void *ns); }; extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; union proc_op { int (*proc_get_link)(struct inode *, struct path *); diff --git a/kernel/utsname.c b/kernel/utsname.c index 44646179eaba..bff131b9510a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -15,6 +15,7 @@ #include #include #include +#include static struct uts_namespace *create_uts_ns(void) { @@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref) put_user_ns(ns->user_ns); kfree(ns); } + +static void *utsns_get(struct task_struct *task) +{ + struct uts_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->uts_ns; + get_uts_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void utsns_put(void *ns) +{ + put_uts_ns(ns); +} + +static int utsns_install(struct nsproxy *nsproxy, void *ns) +{ + get_uts_ns(ns); + put_uts_ns(nsproxy->uts_ns); + nsproxy->uts_ns = ns; + return 0; +} + +const struct proc_ns_operations utsns_operations = { + .name = "uts", + .type = CLONE_NEWUTS, + .get = utsns_get, + .put = utsns_put, + .install = utsns_install, +}; + -- cgit v1.2.3 From be84cb43833ee40a42e08f5425d20310f16229c7 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 9 May 2011 13:12:30 -0400 Subject: compat: fixes to allow working with tile arch The existing mechanism doesn't really provide enough to create the 64-bit "compat" ABI properly in a generic way, since the compat ABI is a mix of things were you can re-use the 64-bit versions of syscalls and things where you need a compat wrapper. To provide this in the most direct way possible, I added two new macros to go along with the existing __SYSCALL and __SC_3264 macros: __SC_COMP and SC_COMP_3264. These macros take an additional argument, typically a "compat_sys_xxx" function, which is passed to __SYSCALL if you define __SYSCALL_COMPAT when including the header, resulting in a pointer to the compat function being placed in the generated syscall table. The change also adds some missing definitions to so that it actually has declarations for all the compat syscalls, since the "[nr] = ##call" approach requires proper C declarations for all the functions included in the syscall table. Finally, compat.c defines compat_sys_sigpending() and compat_sys_sigprocmask() even if the underlying architecture doesn't request it, which tries to pull in undefined compat_old_sigset_t defines. We need to guard those compat syscall definitions with appropriate __ARCH_WANT_SYS_xxx ifdefs. Acked-by: Arnd Bergmann Signed-off-by: Chris Metcalf --- arch/tile/kernel/compat.c | 2 +- include/asm-generic/unistd.h | 221 ++++++++++++++++++++++++------------------- include/linux/compat.h | 187 ++++++++++++++++++++++++++++++++++++ kernel/compat.c | 8 ++ 4 files changed, 320 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c index dbc213adf5e1..e35a3975ca3b 100644 --- a/arch/tile/kernel/compat.c +++ b/arch/tile/kernel/compat.c @@ -135,7 +135,7 @@ long tile_compat_sys_msgrcv(int msqid, /* Provide the compat syscall number to call mapping. */ #undef __SYSCALL -#define __SYSCALL(nr, call) [nr] = (compat_##call), +#define __SYSCALL(nr, call) [nr] = (call), /* The generic versions of these don't work for Tile. */ #define compat_sys_msgrcv tile_compat_sys_msgrcv diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 07c40d5149de..33d524704883 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -24,16 +24,24 @@ #define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64) #endif +#ifdef __SYSCALL_COMPAT +#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _comp) +#define __SC_COMP_3264(_nr, _32, _64, _comp) __SYSCALL(_nr, _comp) +#else +#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _sys) +#define __SC_COMP_3264(_nr, _32, _64, _comp) __SC_3264(_nr, _32, _64) +#endif + #define __NR_io_setup 0 -__SYSCALL(__NR_io_setup, sys_io_setup) +__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup) #define __NR_io_destroy 1 __SYSCALL(__NR_io_destroy, sys_io_destroy) #define __NR_io_submit 2 -__SYSCALL(__NR_io_submit, sys_io_submit) +__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit) #define __NR_io_cancel 3 __SYSCALL(__NR_io_cancel, sys_io_cancel) #define __NR_io_getevents 4 -__SYSCALL(__NR_io_getevents, sys_io_getevents) +__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents) /* fs/xattr.c */ #define __NR_setxattr 5 @@ -67,7 +75,7 @@ __SYSCALL(__NR_getcwd, sys_getcwd) /* fs/cookies.c */ #define __NR_lookup_dcookie 18 -__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie) +__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie) /* fs/eventfd.c */ #define __NR_eventfd2 19 @@ -79,7 +87,7 @@ __SYSCALL(__NR_epoll_create1, sys_epoll_create1) #define __NR_epoll_ctl 21 __SYSCALL(__NR_epoll_ctl, sys_epoll_ctl) #define __NR_epoll_pwait 22 -__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait) +__SC_COMP(__NR_epoll_pwait, sys_epoll_pwait, compat_sys_epoll_pwait) /* fs/fcntl.c */ #define __NR_dup 23 @@ -87,7 +95,7 @@ __SYSCALL(__NR_dup, sys_dup) #define __NR_dup3 24 __SYSCALL(__NR_dup3, sys_dup3) #define __NR3264_fcntl 25 -__SC_3264(__NR3264_fcntl, sys_fcntl64, sys_fcntl) +__SC_COMP_3264(__NR3264_fcntl, sys_fcntl64, sys_fcntl, compat_sys_fcntl64) /* fs/inotify_user.c */ #define __NR_inotify_init1 26 @@ -99,7 +107,7 @@ __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) /* fs/ioctl.c */ #define __NR_ioctl 29 -__SYSCALL(__NR_ioctl, sys_ioctl) +__SC_COMP(__NR_ioctl, sys_ioctl, compat_sys_ioctl) /* fs/ioprio.c */ #define __NR_ioprio_set 30 @@ -129,26 +137,30 @@ __SYSCALL(__NR_renameat, sys_renameat) #define __NR_umount2 39 __SYSCALL(__NR_umount2, sys_umount) #define __NR_mount 40 -__SYSCALL(__NR_mount, sys_mount) +__SC_COMP(__NR_mount, sys_mount, compat_sys_mount) #define __NR_pivot_root 41 __SYSCALL(__NR_pivot_root, sys_pivot_root) /* fs/nfsctl.c */ #define __NR_nfsservctl 42 -__SYSCALL(__NR_nfsservctl, sys_nfsservctl) +__SC_COMP(__NR_nfsservctl, sys_nfsservctl, compat_sys_nfsservctl) /* fs/open.c */ #define __NR3264_statfs 43 -__SC_3264(__NR3264_statfs, sys_statfs64, sys_statfs) +__SC_COMP_3264(__NR3264_statfs, sys_statfs64, sys_statfs, \ + compat_sys_statfs64) #define __NR3264_fstatfs 44 -__SC_3264(__NR3264_fstatfs, sys_fstatfs64, sys_fstatfs) +__SC_COMP_3264(__NR3264_fstatfs, sys_fstatfs64, sys_fstatfs, \ + compat_sys_fstatfs64) #define __NR3264_truncate 45 -__SC_3264(__NR3264_truncate, sys_truncate64, sys_truncate) +__SC_COMP_3264(__NR3264_truncate, sys_truncate64, sys_truncate, \ + compat_sys_truncate64) #define __NR3264_ftruncate 46 -__SC_3264(__NR3264_ftruncate, sys_ftruncate64, sys_ftruncate) +__SC_COMP_3264(__NR3264_ftruncate, sys_ftruncate64, sys_ftruncate, \ + compat_sys_ftruncate64) #define __NR_fallocate 47 -__SYSCALL(__NR_fallocate, sys_fallocate) +__SC_COMP(__NR_fallocate, sys_fallocate, compat_sys_fallocate) #define __NR_faccessat 48 __SYSCALL(__NR_faccessat, sys_faccessat) #define __NR_chdir 49 @@ -166,7 +178,7 @@ __SYSCALL(__NR_fchownat, sys_fchownat) #define __NR_fchown 55 __SYSCALL(__NR_fchown, sys_fchown) #define __NR_openat 56 -__SYSCALL(__NR_openat, sys_openat) +__SC_COMP(__NR_openat, sys_openat, compat_sys_openat) #define __NR_close 57 __SYSCALL(__NR_close, sys_close) #define __NR_vhangup 58 @@ -182,7 +194,7 @@ __SYSCALL(__NR_quotactl, sys_quotactl) /* fs/readdir.c */ #define __NR_getdents64 61 -__SYSCALL(__NR_getdents64, sys_getdents64) +__SC_COMP(__NR_getdents64, sys_getdents64, compat_sys_getdents64) /* fs/read_write.c */ #define __NR3264_lseek 62 @@ -192,17 +204,17 @@ __SYSCALL(__NR_read, sys_read) #define __NR_write 64 __SYSCALL(__NR_write, sys_write) #define __NR_readv 65 -__SYSCALL(__NR_readv, sys_readv) +__SC_COMP(__NR_readv, sys_readv, compat_sys_readv) #define __NR_writev 66 -__SYSCALL(__NR_writev, sys_writev) +__SC_COMP(__NR_writev, sys_writev, compat_sys_writev) #define __NR_pread64 67 -__SYSCALL(__NR_pread64, sys_pread64) +__SC_COMP(__NR_pread64, sys_pread64, compat_sys_pread64) #define __NR_pwrite64 68 -__SYSCALL(__NR_pwrite64, sys_pwrite64) +__SC_COMP(__NR_pwrite64, sys_pwrite64, compat_sys_pwrite64) #define __NR_preadv 69 -__SYSCALL(__NR_preadv, sys_preadv) +__SC_COMP(__NR_preadv, sys_preadv, compat_sys_preadv) #define __NR_pwritev 70 -__SYSCALL(__NR_pwritev, sys_pwritev) +__SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev) /* fs/sendfile.c */ #define __NR3264_sendfile 71 @@ -210,17 +222,17 @@ __SC_3264(__NR3264_sendfile, sys_sendfile64, sys_sendfile) /* fs/select.c */ #define __NR_pselect6 72 -__SYSCALL(__NR_pselect6, sys_pselect6) +__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6) #define __NR_ppoll 73 -__SYSCALL(__NR_ppoll, sys_ppoll) +__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll) /* fs/signalfd.c */ #define __NR_signalfd4 74 -__SYSCALL(__NR_signalfd4, sys_signalfd4) +__SC_COMP(__NR_signalfd4, sys_signalfd4, compat_sys_signalfd4) /* fs/splice.c */ #define __NR_vmsplice 75 -__SYSCALL(__NR_vmsplice, sys_vmsplice) +__SC_COMP(__NR_vmsplice, sys_vmsplice, compat_sys_vmsplice) #define __NR_splice 76 __SYSCALL(__NR_splice, sys_splice) #define __NR_tee 77 @@ -243,23 +255,27 @@ __SYSCALL(__NR_fsync, sys_fsync) __SYSCALL(__NR_fdatasync, sys_fdatasync) #ifdef __ARCH_WANT_SYNC_FILE_RANGE2 #define __NR_sync_file_range2 84 -__SYSCALL(__NR_sync_file_range2, sys_sync_file_range2) +__SC_COMP(__NR_sync_file_range2, sys_sync_file_range2, \ + compat_sys_sync_file_range2) #else #define __NR_sync_file_range 84 -__SYSCALL(__NR_sync_file_range, sys_sync_file_range) +__SC_COMP(__NR_sync_file_range, sys_sync_file_range, \ + compat_sys_sync_file_range) #endif /* fs/timerfd.c */ #define __NR_timerfd_create 85 __SYSCALL(__NR_timerfd_create, sys_timerfd_create) #define __NR_timerfd_settime 86 -__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) +__SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \ + compat_sys_timerfd_settime) #define __NR_timerfd_gettime 87 -__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) +__SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \ + compat_sys_timerfd_gettime) /* fs/utimes.c */ #define __NR_utimensat 88 -__SYSCALL(__NR_utimensat, sys_utimensat) +__SC_COMP(__NR_utimensat, sys_utimensat, compat_sys_utimensat) /* kernel/acct.c */ #define __NR_acct 89 @@ -281,7 +297,7 @@ __SYSCALL(__NR_exit, sys_exit) #define __NR_exit_group 94 __SYSCALL(__NR_exit_group, sys_exit_group) #define __NR_waitid 95 -__SYSCALL(__NR_waitid, sys_waitid) +__SC_COMP(__NR_waitid, sys_waitid, compat_sys_waitid) /* kernel/fork.c */ #define __NR_set_tid_address 96 @@ -291,25 +307,27 @@ __SYSCALL(__NR_unshare, sys_unshare) /* kernel/futex.c */ #define __NR_futex 98 -__SYSCALL(__NR_futex, sys_futex) +__SC_COMP(__NR_futex, sys_futex, compat_sys_futex) #define __NR_set_robust_list 99 -__SYSCALL(__NR_set_robust_list, sys_set_robust_list) +__SC_COMP(__NR_set_robust_list, sys_set_robust_list, \ + compat_sys_set_robust_list) #define __NR_get_robust_list 100 -__SYSCALL(__NR_get_robust_list, sys_get_robust_list) +__SC_COMP(__NR_get_robust_list, sys_get_robust_list, \ + compat_sys_get_robust_list) /* kernel/hrtimer.c */ #define __NR_nanosleep 101 -__SYSCALL(__NR_nanosleep, sys_nanosleep) +__SC_COMP(__NR_nanosleep, sys_nanosleep, compat_sys_nanosleep) /* kernel/itimer.c */ #define __NR_getitimer 102 -__SYSCALL(__NR_getitimer, sys_getitimer) +__SC_COMP(__NR_getitimer, sys_getitimer, compat_sys_getitimer) #define __NR_setitimer 103 -__SYSCALL(__NR_setitimer, sys_setitimer) +__SC_COMP(__NR_setitimer, sys_setitimer, compat_sys_setitimer) /* kernel/kexec.c */ #define __NR_kexec_load 104 -__SYSCALL(__NR_kexec_load, sys_kexec_load) +__SC_COMP(__NR_kexec_load, sys_kexec_load, compat_sys_kexec_load) /* kernel/module.c */ #define __NR_init_module 105 @@ -319,23 +337,24 @@ __SYSCALL(__NR_delete_module, sys_delete_module) /* kernel/posix-timers.c */ #define __NR_timer_create 107 -__SYSCALL(__NR_timer_create, sys_timer_create) +__SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create) #define __NR_timer_gettime 108 -__SYSCALL(__NR_timer_gettime, sys_timer_gettime) +__SC_COMP(__NR_timer_gettime, sys_timer_gettime, compat_sys_timer_gettime) #define __NR_timer_getoverrun 109 __SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) #define __NR_timer_settime 110 -__SYSCALL(__NR_timer_settime, sys_timer_settime) +__SC_COMP(__NR_timer_settime, sys_timer_settime, compat_sys_timer_settime) #define __NR_timer_delete 111 __SYSCALL(__NR_timer_delete, sys_timer_delete) #define __NR_clock_settime 112 -__SYSCALL(__NR_clock_settime, sys_clock_settime) +__SC_COMP(__NR_clock_settime, sys_clock_settime, compat_sys_clock_settime) #define __NR_clock_gettime 113 -__SYSCALL(__NR_clock_gettime, sys_clock_gettime) +__SC_COMP(__NR_clock_gettime, sys_clock_gettime, compat_sys_clock_gettime) #define __NR_clock_getres 114 -__SYSCALL(__NR_clock_getres, sys_clock_getres) +__SC_COMP(__NR_clock_getres, sys_clock_getres, compat_sys_clock_getres) #define __NR_clock_nanosleep 115 -__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep) +__SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \ + compat_sys_clock_nanosleep) /* kernel/printk.c */ #define __NR_syslog 116 @@ -355,9 +374,11 @@ __SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler) #define __NR_sched_getparam 121 __SYSCALL(__NR_sched_getparam, sys_sched_getparam) #define __NR_sched_setaffinity 122 -__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity) +__SC_COMP(__NR_sched_setaffinity, sys_sched_setaffinity, \ + compat_sys_sched_setaffinity) #define __NR_sched_getaffinity 123 -__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity) +__SC_COMP(__NR_sched_getaffinity, sys_sched_getaffinity, \ + compat_sys_sched_getaffinity) #define __NR_sched_yield 124 __SYSCALL(__NR_sched_yield, sys_sched_yield) #define __NR_sched_get_priority_max 125 @@ -365,7 +386,8 @@ __SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) #define __NR_sched_get_priority_min 126 __SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) #define __NR_sched_rr_get_interval 127 -__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval) +__SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \ + compat_sys_sched_rr_get_interval) /* kernel/signal.c */ #define __NR_restart_syscall 128 @@ -377,21 +399,23 @@ __SYSCALL(__NR_tkill, sys_tkill) #define __NR_tgkill 131 __SYSCALL(__NR_tgkill, sys_tgkill) #define __NR_sigaltstack 132 -__SYSCALL(__NR_sigaltstack, sys_sigaltstack) +__SC_COMP(__NR_sigaltstack, sys_sigaltstack, compat_sys_sigaltstack) #define __NR_rt_sigsuspend 133 -__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend) /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ +__SC_COMP(__NR_rt_sigsuspend, sys_rt_sigsuspend, compat_sys_rt_sigsuspend) #define __NR_rt_sigaction 134 -__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction) /* __ARCH_WANT_SYS_RT_SIGACTION */ +__SC_COMP(__NR_rt_sigaction, sys_rt_sigaction, compat_sys_rt_sigaction) #define __NR_rt_sigprocmask 135 __SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask) #define __NR_rt_sigpending 136 __SYSCALL(__NR_rt_sigpending, sys_rt_sigpending) #define __NR_rt_sigtimedwait 137 -__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait) +__SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \ + compat_sys_rt_sigtimedwait) #define __NR_rt_sigqueueinfo 138 -__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo) +__SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \ + compat_sys_rt_sigqueueinfo) #define __NR_rt_sigreturn 139 -__SYSCALL(__NR_rt_sigreturn, sys_rt_sigreturn) /* sys_rt_sigreturn_wrapper, */ +__SC_COMP(__NR_rt_sigreturn, sys_rt_sigreturn, compat_sys_rt_sigreturn) /* kernel/sys.c */ #define __NR_setpriority 140 @@ -421,7 +445,7 @@ __SYSCALL(__NR_setfsuid, sys_setfsuid) #define __NR_setfsgid 152 __SYSCALL(__NR_setfsgid, sys_setfsgid) #define __NR_times 153 -__SYSCALL(__NR_times, sys_times) +__SC_COMP(__NR_times, sys_times, compat_sys_times) #define __NR_setpgid 154 __SYSCALL(__NR_setpgid, sys_setpgid) #define __NR_getpgid 155 @@ -441,11 +465,11 @@ __SYSCALL(__NR_sethostname, sys_sethostname) #define __NR_setdomainname 162 __SYSCALL(__NR_setdomainname, sys_setdomainname) #define __NR_getrlimit 163 -__SYSCALL(__NR_getrlimit, sys_getrlimit) +__SC_COMP(__NR_getrlimit, sys_getrlimit, compat_sys_getrlimit) #define __NR_setrlimit 164 -__SYSCALL(__NR_setrlimit, sys_setrlimit) +__SC_COMP(__NR_setrlimit, sys_setrlimit, compat_sys_setrlimit) #define __NR_getrusage 165 -__SYSCALL(__NR_getrusage, sys_getrusage) +__SC_COMP(__NR_getrusage, sys_getrusage, compat_sys_getrusage) #define __NR_umask 166 __SYSCALL(__NR_umask, sys_umask) #define __NR_prctl 167 @@ -455,11 +479,11 @@ __SYSCALL(__NR_getcpu, sys_getcpu) /* kernel/time.c */ #define __NR_gettimeofday 169 -__SYSCALL(__NR_gettimeofday, sys_gettimeofday) +__SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday) #define __NR_settimeofday 170 -__SYSCALL(__NR_settimeofday, sys_settimeofday) +__SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday) #define __NR_adjtimex 171 -__SYSCALL(__NR_adjtimex, sys_adjtimex) +__SC_COMP(__NR_adjtimex, sys_adjtimex, compat_sys_adjtimex) /* kernel/timer.c */ #define __NR_getpid 172 @@ -477,39 +501,40 @@ __SYSCALL(__NR_getegid, sys_getegid) #define __NR_gettid 178 __SYSCALL(__NR_gettid, sys_gettid) #define __NR_sysinfo 179 -__SYSCALL(__NR_sysinfo, sys_sysinfo) +__SC_COMP(__NR_sysinfo, sys_sysinfo, compat_sys_sysinfo) /* ipc/mqueue.c */ #define __NR_mq_open 180 -__SYSCALL(__NR_mq_open, sys_mq_open) +__SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open) #define __NR_mq_unlink 181 __SYSCALL(__NR_mq_unlink, sys_mq_unlink) #define __NR_mq_timedsend 182 -__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend) +__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, compat_sys_mq_timedsend) #define __NR_mq_timedreceive 183 -__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive) +__SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \ + compat_sys_mq_timedreceive) #define __NR_mq_notify 184 -__SYSCALL(__NR_mq_notify, sys_mq_notify) +__SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify) #define __NR_mq_getsetattr 185 -__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) +__SC_COMP(__NR_mq_getsetattr, sys_mq_getsetattr, compat_sys_mq_getsetattr) /* ipc/msg.c */ #define __NR_msgget 186 __SYSCALL(__NR_msgget, sys_msgget) #define __NR_msgctl 187 -__SYSCALL(__NR_msgctl, sys_msgctl) +__SC_COMP(__NR_msgctl, sys_msgctl, compat_sys_msgctl) #define __NR_msgrcv 188 -__SYSCALL(__NR_msgrcv, sys_msgrcv) +__SC_COMP(__NR_msgrcv, sys_msgrcv, compat_sys_msgrcv) #define __NR_msgsnd 189 -__SYSCALL(__NR_msgsnd, sys_msgsnd) +__SC_COMP(__NR_msgsnd, sys_msgsnd, compat_sys_msgsnd) /* ipc/sem.c */ #define __NR_semget 190 __SYSCALL(__NR_semget, sys_semget) #define __NR_semctl 191 -__SYSCALL(__NR_semctl, sys_semctl) +__SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl) #define __NR_semtimedop 192 -__SYSCALL(__NR_semtimedop, sys_semtimedop) +__SC_COMP(__NR_semtimedop, sys_semtimedop, compat_sys_semtimedop) #define __NR_semop 193 __SYSCALL(__NR_semop, sys_semop) @@ -517,9 +542,9 @@ __SYSCALL(__NR_semop, sys_semop) #define __NR_shmget 194 __SYSCALL(__NR_shmget, sys_shmget) #define __NR_shmctl 195 -__SYSCALL(__NR_shmctl, sys_shmctl) +__SC_COMP(__NR_shmctl, sys_shmctl, compat_sys_shmctl) #define __NR_shmat 196 -__SYSCALL(__NR_shmat, sys_shmat) +__SC_COMP(__NR_shmat, sys_shmat, compat_sys_shmat) #define __NR_shmdt 197 __SYSCALL(__NR_shmdt, sys_shmdt) @@ -543,21 +568,21 @@ __SYSCALL(__NR_getpeername, sys_getpeername) #define __NR_sendto 206 __SYSCALL(__NR_sendto, sys_sendto) #define __NR_recvfrom 207 -__SYSCALL(__NR_recvfrom, sys_recvfrom) +__SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom) #define __NR_setsockopt 208 -__SYSCALL(__NR_setsockopt, sys_setsockopt) +__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt) #define __NR_getsockopt 209 -__SYSCALL(__NR_getsockopt, sys_getsockopt) +__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt) #define __NR_shutdown 210 __SYSCALL(__NR_shutdown, sys_shutdown) #define __NR_sendmsg 211 -__SYSCALL(__NR_sendmsg, sys_sendmsg) +__SC_COMP(__NR_sendmsg, sys_sendmsg, compat_sys_sendmsg) #define __NR_recvmsg 212 -__SYSCALL(__NR_recvmsg, sys_recvmsg) +__SC_COMP(__NR_recvmsg, sys_recvmsg, compat_sys_recvmsg) /* mm/filemap.c */ #define __NR_readahead 213 -__SYSCALL(__NR_readahead, sys_readahead) +__SC_COMP(__NR_readahead, sys_readahead, compat_sys_readahead) /* mm/nommu.c, also with MMU */ #define __NR_brk 214 @@ -573,19 +598,19 @@ __SYSCALL(__NR_add_key, sys_add_key) #define __NR_request_key 218 __SYSCALL(__NR_request_key, sys_request_key) #define __NR_keyctl 219 -__SYSCALL(__NR_keyctl, sys_keyctl) +__SC_COMP(__NR_keyctl, sys_keyctl, compat_sys_keyctl) /* arch/example/kernel/sys_example.c */ #define __NR_clone 220 -__SYSCALL(__NR_clone, sys_clone) /* .long sys_clone_wrapper */ +__SYSCALL(__NR_clone, sys_clone) #define __NR_execve 221 -__SYSCALL(__NR_execve, sys_execve) /* .long sys_execve_wrapper */ +__SC_COMP(__NR_execve, sys_execve, compat_sys_execve) #define __NR3264_mmap 222 __SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap) /* mm/fadvise.c */ #define __NR3264_fadvise64 223 -__SYSCALL(__NR3264_fadvise64, sys_fadvise64_64) +__SC_COMP(__NR3264_fadvise64, sys_fadvise64_64, compat_sys_fadvise64_64) /* mm/, CONFIG_MMU only */ #ifndef __ARCH_NOMMU @@ -612,25 +637,26 @@ __SYSCALL(__NR_madvise, sys_madvise) #define __NR_remap_file_pages 234 __SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) #define __NR_mbind 235 -__SYSCALL(__NR_mbind, sys_mbind) +__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind) #define __NR_get_mempolicy 236 -__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) +__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy) #define __NR_set_mempolicy 237 -__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) +__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy) #define __NR_migrate_pages 238 -__SYSCALL(__NR_migrate_pages, sys_migrate_pages) +__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages) #define __NR_move_pages 239 -__SYSCALL(__NR_move_pages, sys_move_pages) +__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages) #endif #define __NR_rt_tgsigqueueinfo 240 -__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) +__SC_COMP(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo, \ + compat_sys_rt_tgsigqueueinfo) #define __NR_perf_event_open 241 __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_accept4 242 __SYSCALL(__NR_accept4, sys_accept4) #define __NR_recvmmsg 243 -__SYSCALL(__NR_recvmmsg, sys_recvmmsg) +__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg) /* * Architectures may provide up to 16 syscalls of their own @@ -639,19 +665,20 @@ __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __NR_arch_specific_syscall 244 #define __NR_wait4 260 -__SYSCALL(__NR_wait4, sys_wait4) +__SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4) #define __NR_prlimit64 261 __SYSCALL(__NR_prlimit64, sys_prlimit64) #define __NR_fanotify_init 262 __SYSCALL(__NR_fanotify_init, sys_fanotify_init) #define __NR_fanotify_mark 263 __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) -#define __NR_name_to_handle_at 264 +#define __NR_name_to_handle_at 264 __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) -#define __NR_open_by_handle_at 265 -__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) +#define __NR_open_by_handle_at 265 +__SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \ + compat_sys_open_by_handle_at) #define __NR_clock_adjtime 266 -__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) +__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime) #define __NR_syncfs 267 __SYSCALL(__NR_syncfs, sys_syncfs) diff --git a/include/linux/compat.h b/include/linux/compat.h index 5778b559d59c..e94184834b71 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -209,6 +210,18 @@ struct compat_robust_list_head { compat_uptr_t list_op_pending; }; +struct compat_statfs; +struct compat_statfs64; +struct compat_old_linux_dirent; +struct compat_linux_dirent; +struct linux_dirent64; +struct compat_msghdr; +struct compat_mmsghdr; +struct compat_sysinfo; +struct compat_sysctl_args; +struct compat_kexec_segment; +struct compat_mq_attr; + extern void compat_exit_robust_list(struct task_struct *curr); asmlinkage long @@ -331,9 +344,13 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); +asmlinkage long compat_sys_utime(const char __user *filename, + struct compat_utimbuf __user *t); asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags); +asmlinkage long compat_sys_time(compat_time_t __user *tloc); +asmlinkage long compat_sys_stime(compat_time_t __user *tptr); asmlinkage long compat_sys_signalfd(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); @@ -350,11 +367,181 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, int flags); asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t); +asmlinkage long compat_sys_utimes(const char __user *filename, + struct compat_timeval __user *t); +asmlinkage long compat_sys_newstat(const char __user * filename, + struct compat_stat __user *statbuf); +asmlinkage long compat_sys_newlstat(const char __user * filename, + struct compat_stat __user *statbuf); asmlinkage long compat_sys_newfstatat(unsigned int dfd, const char __user * filename, struct compat_stat __user *statbuf, int flag); +asmlinkage long compat_sys_newfstat(unsigned int fd, + struct compat_stat __user * statbuf); +asmlinkage long compat_sys_statfs(const char __user *pathname, + struct compat_statfs __user *buf); +asmlinkage long compat_sys_fstatfs(unsigned int fd, + struct compat_statfs __user *buf); +asmlinkage long compat_sys_statfs64(const char __user *pathname, + compat_size_t sz, + struct compat_statfs64 __user *buf); +asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, + struct compat_statfs64 __user *buf); +asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, + unsigned long arg); +asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, + unsigned long arg); +asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p); +asmlinkage long compat_sys_io_getevents(aio_context_t ctx_id, + unsigned long min_nr, + unsigned long nr, + struct io_event __user *events, + struct compat_timespec __user *timeout); +asmlinkage long compat_sys_io_submit(aio_context_t ctx_id, int nr, + u32 __user *iocb); +asmlinkage long compat_sys_mount(const char __user * dev_name, + const char __user * dir_name, + const char __user * type, unsigned long flags, + const void __user * data); +asmlinkage long compat_sys_old_readdir(unsigned int fd, + struct compat_old_linux_dirent __user *, + unsigned int count); +asmlinkage long compat_sys_getdents(unsigned int fd, + struct compat_linux_dirent __user *dirent, + unsigned int count); +asmlinkage long compat_sys_getdents64(unsigned int fd, + struct linux_dirent64 __user * dirent, + unsigned int count); +asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *, + unsigned int nr_segs, unsigned int flags); +asmlinkage long compat_sys_open(const char __user *filename, int flags, + int mode); asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode); +asmlinkage long compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, + int flags); +asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, + compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, + void __user *sig); +asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, + unsigned int nfds, + struct compat_timespec __user *tsp, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize); +#if (defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)) && !defined(CONFIG_NFSD_DEPRECATED) +union compat_nfsctl_res; +struct compat_nfsctl_arg; +asmlinkage long compat_sys_nfsservctl(int cmd, + struct compat_nfsctl_arg __user *arg, + union compat_nfsctl_res __user *res); +#else +long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2); +#endif +asmlinkage long compat_sys_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags); +asmlinkage long compat_sys_get_mempolicy(int __user *policy, + compat_ulong_t __user *nmask, + compat_ulong_t maxnode, + compat_ulong_t addr, + compat_ulong_t flags); +asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, + compat_ulong_t maxnode); +asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, + compat_ulong_t mode, + compat_ulong_t __user *nmask, + compat_ulong_t maxnode, compat_ulong_t flags); + +asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, + char __user *optval, unsigned int optlen); +asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, + unsigned flags); +asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, + unsigned int flags); +asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, + unsigned flags); +asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, size_t len, + unsigned flags, struct sockaddr __user *addr, + int __user *addrlen); +asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, + unsigned vlen, unsigned int flags, + struct compat_timespec __user *timeout); +asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp); +asmlinkage long compat_sys_getitimer(int which, + struct compat_itimerval __user *it); +asmlinkage long compat_sys_setitimer(int which, + struct compat_itimerval __user *in, + struct compat_itimerval __user *out); +asmlinkage long compat_sys_times(struct compat_tms __user *tbuf); +asmlinkage long compat_sys_setrlimit(unsigned int resource, + struct compat_rlimit __user *rlim); +asmlinkage long compat_sys_getrlimit (unsigned int resource, + struct compat_rlimit __user *rlim); +asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru); +asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, + unsigned int len, + compat_ulong_t __user *user_mask_ptr); +asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, + unsigned int len, + compat_ulong_t __user *user_mask_ptr); +asmlinkage long compat_sys_timer_create(clockid_t which_clock, + struct compat_sigevent __user *timer_event_spec, + timer_t __user *created_timer_id); +asmlinkage long compat_sys_timer_settime(timer_t timer_id, int flags, + struct compat_itimerspec __user *new, + struct compat_itimerspec __user *old); +asmlinkage long compat_sys_timer_gettime(timer_t timer_id, + struct compat_itimerspec __user *setting); +asmlinkage long compat_sys_clock_settime(clockid_t which_clock, + struct compat_timespec __user *tp); +asmlinkage long compat_sys_clock_gettime(clockid_t which_clock, + struct compat_timespec __user *tp); +asmlinkage long compat_sys_clock_adjtime(clockid_t which_clock, + struct compat_timex __user *tp); +asmlinkage long compat_sys_clock_getres(clockid_t which_clock, + struct compat_timespec __user *tp); +asmlinkage long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, + struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp); +asmlinkage long compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, + struct compat_siginfo __user *uinfo, + struct compat_timespec __user *uts, compat_size_t sigsetsize); +asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, + compat_size_t sigsetsize); +asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info); +asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, + unsigned long arg); +asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, + struct compat_timespec __user *utime, u32 __user *uaddr2, + u32 val3); +asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, + char __user *optval, int __user *optlen); +asmlinkage long compat_sys_kexec_load(unsigned long entry, + unsigned long nr_segments, + struct compat_kexec_segment __user *, + unsigned long flags); +asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, + const struct compat_mq_attr __user *u_mqstat, + struct compat_mq_attr __user *u_omqstat); +asmlinkage long compat_sys_mq_notify(mqd_t mqdes, + const struct compat_sigevent __user *u_notification); +asmlinkage long compat_sys_mq_open(const char __user *u_name, + int oflag, compat_mode_t mode, + struct compat_mq_attr __user *u_attr); +asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, + const char __user *u_msg_ptr, + size_t msg_len, unsigned int msg_prio, + const struct compat_timespec __user *u_abs_timeout); +asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, + char __user *u_msg_ptr, + size_t msg_len, unsigned int __user *u_msg_prio, + const struct compat_timespec __user *u_abs_timeout); +asmlinkage long compat_sys_socketcall(int call, u32 __user *args); +asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args); extern ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, diff --git a/kernel/compat.c b/kernel/compat.c index 38b1d2c1cbe8..80c28562f57b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) return compat_jiffies_to_clock_t(jiffies); } +#ifdef __ARCH_WANT_SYS_SIGPENDING + /* * Assumption: old_sigset_t and compat_old_sigset_t are both * types that can be passed to put_user()/get_user(). @@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) return ret; } +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK + asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, compat_old_sigset_t __user *oset) { @@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, return ret; } +#endif + asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { -- cgit v1.2.3 From 19e274630c9e23a84d5940af83cf5db35103f968 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 12 May 2011 10:47:23 +0200 Subject: job control: reorganize wait_task_stopped() wait_task_stopped() tested task_stopped_code() without acquiring siglock and, if stop condition existed, called wait_task_stopped() and directly returned the result. This patch moves the initial task_stopped_code() testing into wait_task_stopped() and make wait_consider_task() fall through to wait_task_continue() on 0 return. This is for the following two reasons. * Because the initial task_stopped_code() test is done without acquiring siglock, it may race against SIGCONT generation. The stopped condition might have been replaced by continued state by the time wait_task_stopped() acquired siglock. This may lead to unexpected failure of WNOHANG waits. This reorganization addresses this single race case but there are other cases - TASK_RUNNING -> TASK_STOPPED transition and EXIT_* transitions. * Scheduled ptrace updates require changes to the initial test which would fit better inside wait_task_stopped(). Signed-off-by: Tejun Heo Reviewed-by: Oleg Nesterov Signed-off-by: Oleg Nesterov --- kernel/exit.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5cbc83e83a5d..33837936b98c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1377,11 +1377,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) return NULL; } -/* - * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. +/** + * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED + * @wo: wait options + * @ptrace: is the wait for ptrace + * @p: task to wait for + * + * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. + * + * CONTEXT: + * read_lock(&tasklist_lock), which is released if return value is + * non-zero. Also, grabs and releases @p->sighand->siglock. + * + * RETURNS: + * 0 if wait condition didn't exist and search for other wait conditions + * should continue. Non-zero return, -errno on failure and @p's pid on + * success, implies that tasklist_lock is released and wait condition + * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) @@ -1397,6 +1409,9 @@ static int wait_task_stopped(struct wait_opts *wo, if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; + if (!task_stopped_code(p, ptrace)) + return 0; + exit_code = 0; spin_lock_irq(&p->sighand->siglock); @@ -1607,8 +1622,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ - if (task_stopped_code(p, ptrace)) - return wait_task_stopped(wo, ptrace, p); + ret = wait_task_stopped(wo, ptrace, p); + if (ret) + return ret; /* * Wait for continued. There's only one continued state and the -- cgit v1.2.3 From 571d76acdab95876aeff869ab6449f826c23aa43 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Mon, 16 May 2011 14:23:44 -0400 Subject: arch/tile: support signal "exception-trace" hook This change adds support for /proc/sys/debug/exception-trace to tile. Like x86 and sparc, by default it is set to "1", generating a one-line printk whenever a user process crashes. By setting it to "2", we get a much more complete userspace diagnostic at crash time, including a user-space backtrace, register dump, and memory dump around the address of the crash. Some vestiges of the Tilera-internal version of this support are removed with this patch (the show_crashinfo variable and the arch_coredump_signal function). We retain a "crashinfo" boot parameter which allows you to set the boot-time value of exception-trace. Signed-off-by: Chris Metcalf --- arch/tile/include/asm/processor.h | 7 --- arch/tile/include/asm/signal.h | 4 ++ arch/tile/kernel/compat_signal.c | 4 +- arch/tile/kernel/signal.c | 128 ++++++++++++++++++++++++++++++++++++-- arch/tile/kernel/single_step.c | 4 ++ arch/tile/kernel/traps.c | 1 + arch/tile/mm/fault.c | 24 ++++--- kernel/sysctl.c | 2 +- 8 files changed, 151 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h index d6b43ddfcc04..34c1e01ffb5e 100644 --- a/arch/tile/include/asm/processor.h +++ b/arch/tile/include/asm/processor.h @@ -257,10 +257,6 @@ static inline void cpu_relax(void) barrier(); } -struct siginfo; -extern void arch_coredump_signal(struct siginfo *, struct pt_regs *); -#define arch_coredump_signal arch_coredump_signal - /* Info on this processor (see fs/proc/cpuinfo.c) */ struct seq_operations; extern const struct seq_operations cpuinfo_op; @@ -271,9 +267,6 @@ extern char chip_model[64]; /* Data on which physical memory controller corresponds to which NUMA node. */ extern int node_controller[]; -/* Do we dump information to the console when a user application crashes? */ -extern int show_crashinfo; - #if CHIP_HAS_CBOX_HOME_MAP() /* Does the heap allocator return hash-for-home pages by default? */ extern int hash_default; diff --git a/arch/tile/include/asm/signal.h b/arch/tile/include/asm/signal.h index 81d92a45cd4b..1e1e616783eb 100644 --- a/arch/tile/include/asm/signal.h +++ b/arch/tile/include/asm/signal.h @@ -28,6 +28,10 @@ struct pt_regs; int restore_sigcontext(struct pt_regs *, struct sigcontext __user *); int setup_sigcontext(struct sigcontext __user *, struct pt_regs *); void do_signal(struct pt_regs *regs); +void signal_fault(const char *type, struct pt_regs *, + void __user *frame, int sig); +void trace_unhandled_signal(const char *type, struct pt_regs *regs, + unsigned long address, int signo); #endif #endif /* _ASM_TILE_SIGNAL_H */ diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index dbb0dfc7bece..a7869ad62776 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -317,7 +317,7 @@ long compat_sys_rt_sigreturn(struct pt_regs *regs) return 0; badframe: - force_sig(SIGSEGV, current); + signal_fault("bad sigreturn frame", regs, frame, 0); return 0; } @@ -431,6 +431,6 @@ int compat_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; give_sigsegv: - force_sigsegv(sig, current); + signal_fault("bad setup frame", regs, frame, sig); return -EFAULT; } diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 1260321155f1..bedaf4e9f3a7 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -39,7 +39,6 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss, stack_t __user *, uoss, struct pt_regs *, regs) { @@ -78,6 +77,13 @@ int restore_sigcontext(struct pt_regs *regs, return err; } +void signal_fault(const char *type, struct pt_regs *regs, + void __user *frame, int sig) +{ + trace_unhandled_signal(type, regs, (unsigned long)frame, SIGSEGV); + force_sigsegv(sig, current); +} + /* The assembly shim for this function arranges to ignore the return value. */ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs) { @@ -105,7 +111,7 @@ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs) return 0; badframe: - force_sig(SIGSEGV, current); + signal_fault("bad sigreturn frame", regs, frame, 0); return 0; } @@ -231,7 +237,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; give_sigsegv: - force_sigsegv(sig, current); + signal_fault("bad setup frame", regs, frame, sig); return -EFAULT; } @@ -245,7 +251,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, { int ret; - /* Are we from a system call? */ if (regs->faultnum == INT_SWINT_1) { /* If so, check system call restarting.. */ @@ -363,3 +368,118 @@ done: /* Avoid double syscall restart if there are nested signals. */ regs->faultnum = INT_SWINT_1_SIGRETURN; } + +int show_unhandled_signals = 1; + +static int __init crashinfo(char *str) +{ + unsigned long val; + const char *word; + + if (*str == '\0') + val = 2; + else if (*str != '=' || strict_strtoul(++str, 0, &val) != 0) + return 0; + show_unhandled_signals = val; + switch (show_unhandled_signals) { + case 0: + word = "No"; + break; + case 1: + word = "One-line"; + break; + default: + word = "Detailed"; + break; + } + pr_info("%s crash reports will be generated on the console\n", word); + return 1; +} +__setup("crashinfo", crashinfo); + +static void dump_mem(void __user *address) +{ + void __user *addr; + enum { region_size = 256, bytes_per_line = 16 }; + int i, j, k; + int found_readable_mem = 0; + + pr_err("\n"); + if (!access_ok(VERIFY_READ, address, 1)) { + pr_err("Not dumping at address 0x%lx (kernel address)\n", + (unsigned long)address); + return; + } + + addr = (void __user *) + (((unsigned long)address & -bytes_per_line) - region_size/2); + if (addr > address) + addr = NULL; + for (i = 0; i < region_size; + addr += bytes_per_line, i += bytes_per_line) { + unsigned char buf[bytes_per_line]; + char line[100]; + if (copy_from_user(buf, addr, bytes_per_line)) + continue; + if (!found_readable_mem) { + pr_err("Dumping memory around address 0x%lx:\n", + (unsigned long)address); + found_readable_mem = 1; + } + j = sprintf(line, REGFMT":", (unsigned long)addr); + for (k = 0; k < bytes_per_line; ++k) + j += sprintf(&line[j], " %02x", buf[k]); + pr_err("%s\n", line); + } + if (!found_readable_mem) + pr_err("No readable memory around address 0x%lx\n", + (unsigned long)address); +} + +void trace_unhandled_signal(const char *type, struct pt_regs *regs, + unsigned long address, int sig) +{ + struct task_struct *tsk = current; + + if (show_unhandled_signals == 0) + return; + + /* If the signal is handled, don't show it here. */ + if (!is_global_init(tsk)) { + void __user *handler = + tsk->sighand->action[sig-1].sa.sa_handler; + if (handler != SIG_IGN && handler != SIG_DFL) + return; + } + + /* Rate-limit the one-line output, not the detailed output. */ + if (show_unhandled_signals <= 1 && !printk_ratelimit()) + return; + + printk("%s%s[%d]: %s at %lx pc "REGFMT" signal %d", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), type, address, regs->pc, sig); + + print_vma_addr(KERN_CONT " in ", regs->pc); + + printk(KERN_CONT "\n"); + + if (show_unhandled_signals > 1) { + switch (sig) { + case SIGILL: + case SIGFPE: + case SIGSEGV: + case SIGBUS: + pr_err("User crash: signal %d," + " trap %ld, address 0x%lx\n", + sig, regs->faultnum, address); + show_regs(regs); + dump_mem((void __user *)address); + break; + default: + pr_err("User crash: signal %d, trap %ld\n", + sig, regs->faultnum); + break; + } + } +} diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c index 86df5a239b70..4032ca8e51b6 100644 --- a/arch/tile/kernel/single_step.c +++ b/arch/tile/kernel/single_step.c @@ -186,6 +186,8 @@ static tile_bundle_bits rewrite_load_store_unaligned( .si_code = SEGV_MAPERR, .si_addr = addr }; + trace_unhandled_signal("segfault", regs, + (unsigned long)addr, SIGSEGV); force_sig_info(info.si_signo, &info, current); return (tile_bundle_bits) 0; } @@ -196,6 +198,8 @@ static tile_bundle_bits rewrite_load_store_unaligned( .si_code = BUS_ADRALN, .si_addr = addr }; + trace_unhandled_signal("unaligned trap", regs, + (unsigned long)addr, SIGBUS); force_sig_info(info.si_signo, &info, current); return (tile_bundle_bits) 0; } diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index 5474fc2e77e8..f9803dfa7357 100644 --- a/arch/tile/kernel/traps.c +++ b/arch/tile/kernel/traps.c @@ -308,6 +308,7 @@ void __kprobes do_trap(struct pt_regs *regs, int fault_num, info.si_addr = (void __user *)address; if (signo == SIGILL) info.si_trapno = fault_num; + trace_unhandled_signal("trap", regs, address, signo); force_sig_info(signo, &info, current); } diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 24ca54a0703b..25b7b90fd620 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -43,8 +43,11 @@ #include -static noinline void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, int fault_num, struct task_struct *tsk) +static noinline void force_sig_info_fault(const char *type, int si_signo, + int si_code, unsigned long address, + int fault_num, + struct task_struct *tsk, + struct pt_regs *regs) { siginfo_t info; @@ -59,6 +62,7 @@ static noinline void force_sig_info_fault(int si_signo, int si_code, info.si_code = si_code; info.si_addr = (void __user *)address; info.si_trapno = fault_num; + trace_unhandled_signal(type, regs, address, si_signo); force_sig_info(si_signo, &info, tsk); } @@ -71,11 +75,12 @@ SYSCALL_DEFINE2(cmpxchg_badaddr, unsigned long, address, struct pt_regs *, regs) { if (address >= PAGE_OFFSET) - force_sig_info_fault(SIGSEGV, SEGV_MAPERR, address, - INT_DTLB_MISS, current); + force_sig_info_fault("atomic segfault", SIGSEGV, SEGV_MAPERR, + address, INT_DTLB_MISS, current, regs); else - force_sig_info_fault(SIGBUS, BUS_ADRALN, address, - INT_UNALIGN_DATA, current); + force_sig_info_fault("atomic alignment fault", SIGBUS, + BUS_ADRALN, address, + INT_UNALIGN_DATA, current, regs); /* * Adjust pc to point at the actual instruction, which is unusual @@ -471,8 +476,8 @@ bad_area_nosemaphore: */ local_irq_enable(); - force_sig_info_fault(SIGSEGV, si_code, address, - fault_num, tsk); + force_sig_info_fault("segfault", SIGSEGV, si_code, address, + fault_num, tsk, regs); return 0; } @@ -547,7 +552,8 @@ do_sigbus: if (is_kernel_mode) goto no_context; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, fault_num, tsk); + force_sig_info_fault("bus error", SIGBUS, BUS_ADRERR, address, + fault_num, tsk, regs); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b17..aaec9342a33c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1496,7 +1496,7 @@ static struct ctl_table fs_table[] = { static struct ctl_table debug_table[] = { #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) + defined(CONFIG_S390) || defined(CONFIG_TILE) { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.2.3 From c0e299b1a91cbdb21ae08e382a4176200398bc36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 May 2011 10:50:52 +0200 Subject: clockevents/source: Use u64 to make 32bit happy unsigned long is not 64bit on 32bit machine. Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- kernel/time/clocksource.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 22a9da9a9c96..c027d4f602f1 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { - unsigned long sec; + u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d9d5f8c885f6..1c95fd677328 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -639,7 +639,7 @@ static void clocksource_enqueue(struct clocksource *cs) */ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { - unsigned long sec; + u64 sec; /* * Calc the maximum number of seconds which we can run before -- cgit v1.2.3 From f05998d4b80632f2cc00f108da503066ef5d38d5 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 10:09:38 -0700 Subject: sched: Cleanup set_load_weight() Avoid using long repetitious names; make this simpler and nicer to read. No functional change introduced in this patch. Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1305738580-9924-2-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c62acf45d3b9..d036048b59a4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1778,17 +1778,20 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + struct load_weight *load = &p->se.load; + /* * SCHED_IDLE tasks get minimal weight: */ if (p->policy == SCHED_IDLE) { - p->se.load.weight = WEIGHT_IDLEPRIO; - p->se.load.inv_weight = WMULT_IDLEPRIO; + load->weight = WEIGHT_IDLEPRIO; + load->inv_weight = WMULT_IDLEPRIO; return; } - p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; - p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; + load->weight = prio_to_weight[prio]; + load->inv_weight = prio_to_wmult[prio]; } static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -- cgit v1.2.3 From 1399fa7807a1a5998bbf147e80668e9950661dfa Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 10:09:39 -0700 Subject: sched: Introduce SCHED_POWER_SCALE to scale cpu_power calculations SCHED_LOAD_SCALE is used to increase nice resolution and to scale cpu_power calculations in the scheduler. This patch introduces SCHED_POWER_SCALE and converts all uses of SCHED_LOAD_SCALE for scaling cpu_power to use SCHED_POWER_SCALE instead. This is a preparatory patch for increasing the resolution of SCHED_LOAD_SCALE, and there is no need to increase resolution for cpu_power calculations. Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Link: http://lkml.kernel.org/r/1305738580-9924-3-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 ++++++++----- kernel/sched.c | 4 ++-- kernel/sched_fair.c | 52 ++++++++++++++++++++++++++------------------------- 3 files changed, 37 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 12211e1666e2..f2f440221b70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -787,18 +787,21 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; -/* - * sched-domains (multiprocessor balancing) declarations: - */ - /* * Increase resolution of nice-level calculations: */ #define SCHED_LOAD_SHIFT 10 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) -#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE +/* + * Increase resolution of cpu_power calculations + */ +#define SCHED_POWER_SHIFT 10 +#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) +/* + * sched-domains (multiprocessor balancing) declarations: + */ #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ diff --git a/kernel/sched.c b/kernel/sched.c index d036048b59a4..375e9c677d58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6530,7 +6530,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_LOAD_SCALE) { + if (group->cpu_power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", group->cpu_power); } @@ -7905,7 +7905,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_LOAD_SCALE; + rq->cpu_power = SCHED_POWER_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 37f22626225e..e32a9b70ee9c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1584,7 +1584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; @@ -1722,7 +1722,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) nr_running += cpu_rq(i)->cfs.nr_running; } - capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); if (tmp->flags & SD_POWERSAVINGS_BALANCE) nr_running /= 2; @@ -2570,7 +2570,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { - return SCHED_LOAD_SCALE; + return SCHED_POWER_SCALE; } unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) @@ -2607,10 +2607,10 @@ unsigned long scale_rt_power(int cpu) available = total - rq->rt_avg; } - if (unlikely((s64)total < SCHED_LOAD_SCALE)) - total = SCHED_LOAD_SCALE; + if (unlikely((s64)total < SCHED_POWER_SCALE)) + total = SCHED_POWER_SCALE; - total >>= SCHED_LOAD_SHIFT; + total >>= SCHED_POWER_SHIFT; return div_u64(available, total); } @@ -2618,7 +2618,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_LOAD_SCALE; + unsigned long power = SCHED_POWER_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { @@ -2627,7 +2627,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) else power *= default_scale_smt_power(sd, cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; } sdg->cpu_power_orig = power; @@ -2637,10 +2637,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) else power *= default_scale_freq_power(sd, cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; power *= scale_rt_power(cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; if (!power) power = 1; @@ -2682,7 +2682,7 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_LOAD_SCALE + * Only siblings can have significantly less than SCHED_POWER_SCALE */ if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; @@ -2770,7 +2770,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; /* * Consider the group unbalanced when the imbalance is larger @@ -2787,7 +2787,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); sgs->group_weight = group->group_weight; @@ -2961,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd, return 0; *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, - SCHED_LOAD_SCALE); + SCHED_POWER_SCALE); return 1; } @@ -2990,7 +2991,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, cpu_avg_load_per_task(this_cpu); scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_LOAD_SCALE; + * SCHED_POWER_SCALE; scaled_busy_load_per_task /= sds->busiest->cpu_power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= @@ -3009,10 +3010,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, min(sds->busiest_load_per_task, sds->max_load); pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); - pwr_now /= SCHED_LOAD_SCALE; + pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / sds->busiest->cpu_power; if (sds->max_load > tmp) pwr_move += sds->busiest->cpu_power * @@ -3020,15 +3021,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, /* Amount of load we'd add */ if (sds->max_load * sds->busiest->cpu_power < - sds->busiest_load_per_task * SCHED_LOAD_SCALE) + sds->busiest_load_per_task * SCHED_POWER_SCALE) tmp = (sds->max_load * sds->busiest->cpu_power) / sds->this->cpu_power; else - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / sds->this->cpu_power; pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; + pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) @@ -3070,7 +3071,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity = (sds->busiest_nr_running - sds->busiest_group_capacity); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); load_above_capacity /= sds->busiest->cpu_power; } @@ -3090,7 +3091,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * sds->busiest->cpu_power, (sds->avg_load - sds->this_load) * sds->this->cpu_power) - / SCHED_LOAD_SCALE; + / SCHED_POWER_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -3159,7 +3160,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; /* * If the busiest group is imbalanced the below checks don't @@ -3238,7 +3239,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, for_each_cpu(i, sched_group_cpus(group)) { unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + unsigned long capacity = DIV_ROUND_CLOSEST(power, + SCHED_POWER_SCALE); unsigned long wl; if (!capacity) @@ -3263,7 +3265,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, * the load can be moved away from the cpu that is potentially * running at a lower capacity. */ - wl = (wl * SCHED_LOAD_SCALE) / power; + wl = (wl * SCHED_POWER_SCALE) / power; if (wl > max_load) { max_load = wl; -- cgit v1.2.3 From c8b281161dfa4bb5d5be63fb036ce19347b88c63 Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 18 May 2011 14:37:48 -0700 Subject: sched: Increase SCHED_LOAD_SCALE resolution Introduce SCHED_LOAD_RESOLUTION, which scales is added to SCHED_LOAD_SHIFT and increases the resolution of SCHED_LOAD_SCALE. This patch sets the value of SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all sched entities by a factor of 1024. With this extra resolution, we can handle deeper cgroup hiearchies and the scheduler can do better shares distribution and load load balancing on larger systems (especially for low weight task groups). This does not change the existing user interface, the scaled weights are only used internally. We do not modify prio_to_weight values or inverses, but use the original weights when calculating the inverse which is used to scale execution time delta in calc_delta_mine(). This ensures we do not lose accuracy when accounting time to the sched entities. Thanks to Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness. Below is some analysis of the performance costs/improvements of this patch. 1. Micro-arch performance costs: Experiment was to run Ingo's pipe_test_100k 200 times with the task pinned to one cpu. I measured instruction, cycles and stalled-cycles for the runs. See: http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389 for more info. -tip (baseline): Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs): 964,991,769 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.05% ) 1,171,186,635 cycles # 0.000 GHz ( +- 0.08% ) 306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% ) 314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% ) 1.122405684 seconds time elapsed ( +- 0.05% ) -tip+patches: Performance counter stats for './load-scale/pipe-test-100k' (200 runs): 963,624,821 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.04% ) 1,175,215,649 cycles # 0.000 GHz ( +- 0.08% ) 315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% ) 316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% ) 1.122238659 seconds time elapsed ( +- 0.06% ) With this patch, instructions decrease by ~0.10% and cycles increase by 0.27%. This doesn't look statistically significant. The number of stalled cycles in the backend increased from 26.16% to 26.83%. This can be attributed to the shifts we do in c_d_m() and other places. The fraction of stalled cycles in the frontend remains about the same, at 26.96% compared to 26.89% in -tip. 2. Balancing low-weight task groups Test setup: run 50 tasks with random sleep/busy times (biased around 100ms) in a low weight container (with cpu.shares = 2). Measure %idle as reported by mpstat over a 10s window. -tip (baseline): 06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00 06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00 06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00 06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00 06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00 06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00 06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00 06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00 06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00 06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00 Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60 -tip+patches: 06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00 06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20 06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00 06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61 06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00 06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00 06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00 06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00 06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00 06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00 Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58 We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). Signed-off-by: Nikhil Rao Acked-by: Peter Zijlstra Cc: Nikunj A. Dadhania Cc: Srivatsa Vaddagiri Cc: Stephan Barwolf Cc: Mike Galbraith Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 23 +++++++++++++++++++++-- kernel/sched.c | 28 ++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f440221b70..c34a718e20dd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -788,9 +788,28 @@ enum cpu_idle_type { }; /* - * Increase resolution of nice-level calculations: + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. */ -#define SCHED_LOAD_SHIFT 10 +#if BITS_PER_LONG > 32 +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) /* diff --git a/kernel/sched.c b/kernel/sched.c index 375e9c677d58..bb504e1839e5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -293,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock); * limitation from this.) */ #define MIN_SHARES 2 -#define MAX_SHARES (1UL << 18) +#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif @@ -1330,13 +1330,25 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - tmp = (u64)delta_exec * weight; + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. + */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; if (!lw->inv_weight) { - if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; else - lw->inv_weight = WMULT_CONST / lw->weight; + lw->inv_weight = WMULT_CONST / w; } /* @@ -1785,12 +1797,12 @@ static void set_load_weight(struct task_struct *p) * SCHED_IDLE tasks get minimal weight: */ if (p->policy == SCHED_IDLE) { - load->weight = WEIGHT_IDLEPRIO; + load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; } - load->weight = prio_to_weight[prio]; + load->weight = scale_load(prio_to_weight[prio]); load->inv_weight = prio_to_wmult[prio]; } @@ -8809,14 +8821,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), shareval); + return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); } static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) { struct task_group *tg = cgroup_tg(cgrp); - return (u64) tg->shares; + return (u64) scale_load_down(tg->shares); } #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From 268bb0ce3e87872cb9290c322b0d35bce230d88f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 May 2011 12:50:29 -0700 Subject: sanitize usage Commit e66eed651fd1 ("list: remove prefetching from regular list iterators") removed the include of prefetch.h from list.h, which uncovered several cases that had apparently relied on that rather obscure header file dependency. So this fixes things up a bit, using grep -L linux/prefetch.h $(git grep -l '[^a-z_]prefetchw*(' -- '*.[ch]') grep -L 'prefetchw*(' $(git grep -l 'linux/prefetch.h' -- '*.[ch]') to guide us in finding files that either need inclusion, or have it despite not needing it. There are more of them around (mostly network drivers), but this gets many core ones. Reported-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- arch/ia64/hp/common/sba_iommu.c | 1 + arch/ia64/mm/fault.c | 1 + arch/powerpc/lib/sstep.c | 1 + arch/sh/kernel/cpu/sh4/sq.c | 1 + arch/x86/include/asm/uaccess.h | 1 - arch/x86/include/asm/uaccess_32.h | 1 - arch/x86/include/asm/uaccess_64.h | 1 - arch/x86/mm/fault.c | 1 + drivers/misc/sgi-gru/grufault.c | 1 + drivers/misc/sgi-gru/grumain.c | 1 + drivers/net/acenic.c | 1 + drivers/net/ehea/ehea_main.c | 1 + drivers/staging/pohmelfs/inode.c | 1 + drivers/usb/gadget/goku_udc.c | 1 + drivers/usb/gadget/imx_udc.c | 1 + drivers/usb/gadget/omap_udc.c | 1 + drivers/usb/gadget/pxa25x_udc.c | 1 + drivers/usb/gadget/pxa27x_udc.c | 1 + drivers/usb/host/isp1362-hcd.c | 1 + drivers/usb/host/sl811-hcd.c | 1 + drivers/video/udlfb.c | 1 + fs/btrfs/extent_io.c | 1 + fs/dcache.c | 1 + fs/logfs/dev_bdev.c | 1 + include/asm-generic/xor.h | 2 +- kernel/rcutiny.c | 1 + kernel/rcutree.c | 1 + mm/page_alloc.c | 1 + mm/prio_tree.c | 1 + mm/slab.c | 1 + mm/vmscan.c | 1 + net/core/dst.c | 1 + net/core/pktgen.c | 1 + net/core/skbuff.c | 1 + tools/perf/util/include/linux/list.h | 2 +- 35 files changed, 32 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 4ce8d1358fee..c04dd576f333 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include /* ia64_get_itc() */ #include diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 0799fea4c588..20b359376128 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index ae5189ab0049..f73daa6f3970 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index 14726eef1ce0..f0907995b4c9 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 99f0ad753f32..99ddd148a760 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 088d09fb1615..566e803cc602 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -6,7 +6,6 @@ */ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 316708d5af92..1c66d30971ad 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -6,7 +6,6 @@ */ #include #include -#include #include #include #include diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 20e3f8702d1e..bcb394dfbb35 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -12,6 +12,7 @@ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ #include /* hstate_index_to_shift */ +#include /* prefetchw */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 38657cdaf54d..c4acac74725c 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "gru.h" #include "grutables.h" diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index f8538bbd0bfa..ae16c8cb4f3e 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "gru.h" #include "grutables.h" diff --git a/drivers/net/acenic.c b/drivers/net/acenic.c index ee648fe5d96f..01560bb67a7a 100644 --- a/drivers/net/acenic.c +++ b/drivers/net/acenic.c @@ -68,6 +68,7 @@ #include #include #include +#include #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) #include diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index cf79cf759e13..2c60435f2beb 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index c93ef207b0b4..c0f0ac7c1cdb 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "netfs.h" diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c index 48a760220baf..bf6e11c758d5 100644 --- a/drivers/usb/gadget/goku_udc.c +++ b/drivers/usb/gadget/goku_udc.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/imx_udc.c b/drivers/usb/gadget/imx_udc.c index 5408186afc35..ade40066decf 100644 --- a/drivers/usb/gadget/imx_udc.c +++ b/drivers/usb/gadget/imx_udc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c index cb5cd422f3f5..82fd24935332 100644 --- a/drivers/usb/gadget/omap_udc.c +++ b/drivers/usb/gadget/omap_udc.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/pxa25x_udc.c b/drivers/usb/gadget/pxa25x_udc.c index 444b60aa15e9..365c02fc25fc 100644 --- a/drivers/usb/gadget/pxa25x_udc.c +++ b/drivers/usb/gadget/pxa25x_udc.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/pxa27x_udc.c b/drivers/usb/gadget/pxa27x_udc.c index 78a39a41547d..57607696735c 100644 --- a/drivers/usb/gadget/pxa27x_udc.c +++ b/drivers/usb/gadget/pxa27x_udc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/host/isp1362-hcd.c b/drivers/usb/host/isp1362-hcd.c index f97570a847ca..9c37dad3e816 100644 --- a/drivers/usb/host/isp1362-hcd.c +++ b/drivers/usb/host/isp1362-hcd.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index 18b7099a8125..fafccc2fd331 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/drivers/video/udlfb.c b/drivers/video/udlfb.c index 68041d9dc260..695066b5b2e6 100644 --- a/drivers/video/udlfb.c +++ b/drivers/video/udlfb.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include