From 44b99462d9d776522e174d6c531ce5ccef309e26 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 22 Jun 2012 11:50:05 -0700 Subject: ring-buffer: Fix crash due to uninitialized new_pages list head The new_pages list head in the cpu_buffer is not initialized. When adding pages to the ring buffer, if the memory allocation fails in ring_buffer_resize, the clean up handler tries to free up the allocated pages from all the cpu buffers. The panic is caused by referencing the uninitialized new_pages list head. Initializing the new_pages list head in rb_allocate_cpu_buffer fixes this. Link: http://lkml.kernel.org/r/1340391005-10880-1-git-send-email-vnagarnaik@google.com Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..ba39cbabdc9f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) rb_init_page(bpage->page); INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) -- cgit v1.2.3 From 48fdc72f23ad9a9956e524a47843135d0bbc3317 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Fri, 29 Jun 2012 12:31:41 -0700 Subject: ring-buffer: Fix accounting of entries when removing pages When removing pages from the ring buffer, its state is not reset. This means that the counters need to be correctly updated to account for the pages removed. Update the overrun counter to reflect the removed events from the pages. Link: http://lkml.kernel.org/r/1340998301-1715-1-git-send-email-vnagarnaik@google.com Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ba39cbabdc9f..f765465bffe4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1347,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) * If something was added to this page, it was full * since it is not the tail page. So we deduct the * bytes consumed in ring buffer from here. - * No need to update overruns, since this page is - * deleted from ring buffer and its entries are - * already accounted for. + * Increment overrun to account for the lost events. */ + local_add(page_entries, &cpu_buffer->overrun); local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); } -- cgit v1.2.3 From cba6d0d64ee53772b285d0c0c288deefbeaf7775 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jul 2012 07:08:42 -0700 Subject: Revert "rcu: Move PREEMPT_RCU preemption to switch_to() invocation" This reverts commit 616c310e83b872024271c915c1b9ab505b9efad9. (Move PREEMPT_RCU preemption to switch_to() invocation). Testing by Sasha Levin showed that this can result in deadlock due to invoking the scheduler when one of the runqueue locks is held. Because this commit was simply a performance optimization, revert it. Reported-by: Sasha Levin Signed-off-by: Paul E. 
McKenney Tested-by: Sasha Levin --- arch/um/drivers/mconsole_kern.c | 1 - include/linux/rcupdate.h | 1 - include/linux/rcutiny.h | 6 ++++++ include/linux/sched.h | 10 ---------- kernel/rcutree.c | 1 + kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 14 +++++++++++--- kernel/sched/core.c | 1 - 8 files changed, 19 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 88e466b159dc..43b39d61b538 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -705,7 +705,6 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; - rcu_switch_from(from); switch_to(from, to, from); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 26d1a47591f1..9cac722b169c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,7 +184,6 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); -extern void rcu_preempt_note_context_switch(void); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 854dc4c5c271..4e56a9c69a35 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,6 +87,10 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU +static inline void rcu_preempt_note_context_switch(void) +{ +} + static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; @@ -95,6 +99,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) #else /* #ifdef CONFIG_TINY_RCU */ +void rcu_preempt_note_context_switch(void); int rcu_preempt_needs_cpu(void); static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) @@ -108,6 +113,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) static inline void rcu_note_context_switch(int cpu) { rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(); } /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4059c0f33f07..06a4c5f4f55c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1871,22 +1871,12 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_node_entry); } -static inline void rcu_switch_from(struct task_struct *prev) -{ - if (prev->rcu_read_lock_nesting != 0) - rcu_preempt_note_context_switch(); -} - #else static inline void rcu_copy_process(struct task_struct *p) { } -static inline void rcu_switch_from(struct task_struct *prev) -{ -} - #endif #ifdef CONFIG_SMP diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 38ecdda3f55f..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea056495783e..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); 
#ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5271a020887e..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = __this_cpu_ptr(rcu_preempt_state.rda); + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -1001,6 +1001,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +/* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + /* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..eaead2df6aa8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ - rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); -- cgit v1.2.3 From 164c33c6adee609b8b9062cce4c10f764d0dce13 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Mon, 25 Jun 2012 18:18:15 -0700 Subject: sched: Fix fork() error path to not crash In dup_task_struct(), if arch_dup_task_struct() fails, the clean up code fails to clean up correctly. That's because the clean up code depends on unininitalized ti->task pointer. We fix this by making sure that the task and thread_info know about each other before we attempt to take the error path. Signed-off-by: Salman Qazi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120626011815.11323.5533.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..f00e319d8376 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } err = arch_dup_task_struct(tsk, orig); - if (err) - goto out; + /* + * We defer looking at err, because we will need this setup + * for the clean up path to work correctly. 
+ */ tsk->stack = ti; - setup_thread_stack(tsk, orig); + + if (err) + goto out; + clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); -- cgit v1.2.3 From 5167e8d5417bf5c322a703d2927daec727ea40dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 15:52:09 +0200 Subject: sched/nohz: Rewrite and fix load-avg computation -- again Thanks to Charles Wang for spotting the defects in the current code: - If we go idle during the sample window -- after sampling, we get a negative bias because we can negate our own sample. - If we wake up during the sample window we get a positive bias because we push the sample to a known active period. So rewrite the entire nohz load-avg muck once again, now adding copious documentation to the code. Reported-and-tested-by: Doug Smythies Reported-and-tested-by: Charles Wang Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins [ minor edits ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++ kernel/sched/core.c | 275 ++++++++++++++++++++++++++++++++++------------- kernel/sched/idle_task.c | 1 - kernel/sched/sched.h | 2 - kernel/time/tick-sched.c | 2 + 5 files changed, 213 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4059c0f33f07..20cb7497c59c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1909,6 +1909,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, } #endif +#ifdef CONFIG_NO_HZ +void calc_load_enter_idle(void); +void calc_load_exit_idle(void); +#else +static inline void calc_load_enter_idle(void) { } +static inline void calc_load_exit_idle(void) { } +#endif /* CONFIG_NO_HZ */ + #ifndef CONFIG_CPUMASK_OFFSTACK static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..bb840405335d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2161,11 +2161,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. 
+ * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -2182,6 +2244,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -2193,30 +2258,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. 
*/ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; -void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_write_idx(void) { + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. + */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void) { - long delta = 0; + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; /* - * Its got a race, we don't care... + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -2302,66 +2455,39 @@ static void calc_global_nohz(void) { long delta, active, n; - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * It could be the one fold was all it took, we done! - */ - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? 
active * FIXED_1 : 0; - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} + calc_load_update += n * LOAD_FREQ; + } -static inline long calc_load_fold_idle(void) -{ - return 0; + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. + */ + smp_wmb(); + calc_load_idx++; } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2369,11 +2495,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -2384,12 +2517,7 @@ void calc_global_load(unsigned long ticks) calc_load_update += LOAD_FREQ; /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. 
*/ calc_global_nohz(); } @@ -2406,13 +2534,16 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); this_rq->calc_load_update += LOAD_FREQ; } +/* + * End of global load-average stuff + */ + /* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void) account_idle_ticks(ticks); #endif + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.3 From 5c53d819c71c63fdc91f30a59164583f68e2d63a Mon Sep 17 00:00:00 2001 From: liu chuansheng Date: Fri, 6 Jul 2012 09:50:08 -0700 Subject: printk: replacing the raw_spin_lock/unlock with raw_spin_lock/unlock_irq In function devkmsg_read/writev/llseek/poll/open()..., the function raw_spin_lock/unlock is used, there is potential deadlock case happening. CPU1: thread1 doing the cat /dev/kmsg: raw_spin_lock(&logbuf_lock); while (user->seq == log_next_seq) { when thread1 run here, at this time one interrupt is coming on CPU1 and running based on this thread,if the interrupt handle called the printk which need the logbuf_lock spin also, it will cause deadlock. So we should use raw_spin_lock/unlock_irq here. 
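The same lock-vs-asynchronous-interruption shape can be reproduced in plain userspace with a signal handler standing in for the interrupt. The sketch below is only an analogy with invented names, not kernel code: masking SIGALRM around the critical section plays the role of raw_spin_lock_irq().

  #include <signal.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <unistd.h>

  static atomic_flag lock = ATOMIC_FLAG_INIT;
  static volatile sig_atomic_t events;

  static void spin_lock(void)   { while (atomic_flag_test_and_set(&lock)) ; }
  static void spin_unlock(void) { atomic_flag_clear(&lock); }

  /* "interrupt" context: would spin forever if it preempted a lock holder */
  static void handler(int sig)
  {
          (void)sig;
          spin_lock();
          events++;
          spin_unlock();
          alarm(1);
  }

  int main(void)
  {
          sigset_t block, old;

          signal(SIGALRM, handler);
          sigemptyset(&block);
          sigaddset(&block, SIGALRM);
          alarm(1);

          for (int i = 0; i < 5; i++) {
                  /* analogue of raw_spin_lock_irq(): keep the async source
                   * masked for as long as the lock is held */
                  sigprocmask(SIG_BLOCK, &block, &old);
                  spin_lock();
                  printf("events so far: %d\n", (int)events);
                  spin_unlock();
                  sigprocmask(SIG_SETMASK, &old, NULL);
                  sleep(1);
          }
          return 0;
  }

Dropping the two sigprocmask() calls recreates the window described above: the handler can fire while the lock is held and spin on it forever.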
Acked-by: Kay Sievers Signed-off-by: liu chuansheng Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index dba18211685e..12886cd19cd9 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -430,20 +430,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); ret = wait_event_interruptible(log_wait, user->seq != log_next_seq); if (ret) goto out; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } if (user->seq < log_first_seq) { @@ -451,7 +451,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_first_idx; user->seq = log_first_seq; ret = -EPIPE; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); goto out; } @@ -501,7 +501,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, user->idx = log_next(user->idx); user->seq++; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (len > count) { ret = -EINVAL; @@ -528,7 +528,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); switch (whence) { case SEEK_SET: /* the first record */ @@ -552,7 +552,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) default: ret = -EINVAL; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -566,14 +566,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; ret = POLLIN|POLLRDNORM; } - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); return ret; } @@ -597,10 +597,10 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); - raw_spin_lock(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); user->idx = log_first_idx; user->seq = log_first_seq; - raw_spin_unlock(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); file->private_data = user; return 0; -- cgit v1.2.3 From e3f5a5f27153228569f3396049838e9727dae86e Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: escape the backslash character while exporting data Non-printable characters in the log data are hex-escaped to ensure safe post processing. We need to escape a backslash we find in the data, to be able to distinguish it from a backslash we add for the escaping. Also escape the non-printable character 127. Thanks to Miloslav Trmac for the heads up. 
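The escaping rule itself is easy to exercise outside the kernel. The following is a minimal userspace sketch of the same test (c < ' ' || c >= 127 || c == '\\') with hypothetical names, not the printk code itself:

  #include <stdio.h>

  /* Emit anything non-printable (including DEL, 127) and the backslash
   * itself as \xNN so the output can be unescaped unambiguously. */
  static void escape_line(const char *s, size_t len)
  {
          for (size_t i = 0; i < len; i++) {
                  unsigned char c = (unsigned char)s[i];

                  if (c < ' ' || c >= 127 || c == '\\')
                          printf("\\x%02x", c);
                  else
                          putchar(c);
          }
          putchar('\n');
  }

  int main(void)
  {
          const char msg[] = "path C:\\tmp\x01\x7f end";

          escape_line(msg, sizeof(msg) - 1);
          /* prints: path C:\x5ctmp\x01\x7f end */
          return 0;
  }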
Reported-by: Michael Neuling Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 12886cd19cd9..505863aa3a7f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -465,7 +465,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, for (i = 0; i < msg->text_len; i++) { unsigned char c = log_text(msg)[i]; - if (c < ' ' || c >= 128) + if (c < ' ' || c >= 127 || c == '\\') len += sprintf(user->buf + len, "\\x%02x", c); else user->buf[len++] = c; @@ -489,7 +489,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, continue; } - if (c < ' ' || c >= 128) { + if (c < ' ' || c >= 127 || c == '\\') { len += sprintf(user->buf + len, "\\x%02x", c); continue; } -- cgit v1.2.3 From 43a73a50b352cd3df25b3ced72033942a6a0f919 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: add the facility number to the syslog prefix After the recent split of facility and level into separate variables, we miss the facility value (always 0 for kernel-originated messages) in the syslog prefix. On Tue, Jul 3, 2012 at 12:45 PM, Dan Carpenter wrote: > Static checkers complain about the impossible condition here. > > In 084681d14e ('printk: flush continuation lines immediately to > console'), we changed msg->level from being a u16 to being an unsigned > 3 bit bitfield. Cc: Dan Carpenter Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 505863aa3a7f..37cde752cb8a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -818,15 +818,18 @@ static size_t print_time(u64 ts, char *buf) static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; + unsigned int prefix = (msg->facility << 3) | msg->level; if (syslog) { if (buf) { - len += sprintf(buf, "<%u>", msg->level); + len += sprintf(buf, "<%u>", prefix); } else { len += 3; - if (msg->level > 9) - len++; - if (msg->level > 99) + if (prefix > 999) + len += 3; + else if (prefix > 99) + len += 2; + else if (prefix > 9) len++; } } -- cgit v1.2.3 From cb424ffe9f45ad80267f2a98fbd9bf21caa0ce22 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: properly handle concurrent non-blocking read() from /proc/kmsg The /proc/kmsg read() interface is internally simply wired up to a sequence of syslog() syscalls, which might are racy between their checks and actions, regarding concurrency. In the (very uncommon) case of concurrent readers of /dev/kmsg, relying on usual O_NONBLOCK behavior, the recently introduced mutex might block an O_NONBLOCK reader in read(), when poll() returns for it, but another process has already read the data in the meantime. We've seen that while running artificial test setups and tools that "fight" about /proc/kmsg data. This restores the original /proc/kmsg behavior, where in case of concurrent read()s, poll() might wake up but the read() syscall will just return 0 to the caller, while another process has "stolen" the data. This is in the general case not the expected behavior, but it is the exact same one, that can easily be triggered with a 3.4 kernel, and some tools might just rely on it. 
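To make that concrete, a consumer written against this behavior simply treats a 0 (or EAGAIN) result after poll() as "another reader got the data" and polls again. A minimal sketch, assuming the process has the privileges needed to read /proc/kmsg and with error handling trimmed:

  #include <errno.h>
  #include <fcntl.h>
  #include <poll.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          char buf[4096];
          int fd = open("/proc/kmsg", O_RDONLY | O_NONBLOCK);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          for (;;) {
                  struct pollfd pfd = { .fd = fd, .events = POLLIN };
                  ssize_t n;

                  if (poll(&pfd, 1, -1) < 0)
                          break;

                  n = read(fd, buf, sizeof(buf));
                  if (n > 0)
                          fwrite(buf, 1, (size_t)n, stdout);
                  else if (n == 0 || (n < 0 && errno == EAGAIN))
                          continue;       /* another reader won the race */
                  else
                          break;
          }
          return 0;
  }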
The mutex is not needed, the original integrity issue which introduced it, is in the meantime covered by: "fill buffer with more than a single message for SYSLOG_ACTION_READ" 116e90b23f74d303e8d607c7a7d54f60f14ab9f2 Cc: Yuanhan Liu Acked-by: Jan Beulich Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 37cde752cb8a..be9a82b2f0b3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1021,7 +1021,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; static int saved_console_loglevel = -1; - static DEFINE_MUTEX(syslog_mutex); int error; error = check_syslog_permissions(type, from_file); @@ -1048,17 +1047,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } - error = mutex_lock_interruptible(&syslog_mutex); - if (error) - goto out; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); - if (error) { - mutex_unlock(&syslog_mutex); + if (error) goto out; - } error = syslog_print(buf, len); - mutex_unlock(&syslog_mutex); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: -- cgit v1.2.3 From 68b6507dc554ba015b5ed5e13b1ed4993cdf4024 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 6 Jul 2012 09:50:09 -0700 Subject: kmsg: make sure all messages reach a newly registered boot console We suppress printing kmsg records to the console, which are already printed immediately while we have received their fragments. Newly registered boot consoles print the entire kmsg buffer during registration. Clear the console-suppress flag after we skipped the record during its first storage, so any later print will see these records as usual. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index be9a82b2f0b3..f02f1f5ddc30 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1953,6 +1953,12 @@ skip: */ console_idx = log_next(console_idx); console_seq++; + /* + * We will get here again when we register a new + * CON_PRINTBUFFER console. Clear the flag so we + * will properly dump everything later. + */ + msg->flags &= ~LOG_NOCONS; goto skip; } -- cgit v1.2.3 From 7db5b3ca0ecdb2e8fad52a4770e4e320e61c77a6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Jul 2012 15:55:47 -0700 Subject: Revert "cgroup: superblock can't be released with active dentries" This reverts commit fa980ca87d15bb8a1317853f257a505990f3ffde. The commit was an attempt to fix a race condition where a cgroup hierarchy may be unmounted with positive dentry reference on root cgroup. While the commit made the race condition slightly more difficult to trigger, the race was still there and could be reliably triggered using a different test case. Revert the incorrect fix. The next commit will describe the race and fix it correctly. 
Signed-off-by: Tejun Heo LKML-Reference: <4FEEA5CB.8070809@huawei.com> Reported-by: shyju pv Cc: Sasha Levin Acked-by: Li Zefan --- kernel/cgroup.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2097684cf194..5f134a0e0e3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -901,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * We want to drop the active superblock reference from the - * cgroup creation after all the dentry refs are gone - - * kill_sb gets mighty unhappy otherwise. Mark - * dentry->d_fsdata with cgroup_diput() to tell - * cgroup_d_release() to call deactivate_super(). + * Drop the active superblock reference that we took when we + * created the cgroup */ - dentry->d_fsdata = cgroup_diput; + deactivate_super(cgrp->root->sb); /* * if we're getting rid of the cgroup, refcount should ensure @@ -933,13 +930,6 @@ static int cgroup_delete(const struct dentry *d) return 1; } -static void cgroup_d_release(struct dentry *dentry) -{ - /* did cgroup_diput() tell me to deactivate super? */ - if (dentry->d_fsdata == cgroup_diput) - deactivate_super(dentry->d_sb); -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1547,7 +1537,6 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, - .d_release = cgroup_d_release, }; struct inode *inode = -- cgit v1.2.3 From 5db9a4d99b0157a513944e9a44d29c9cec2e91dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Jul 2012 16:08:18 -0700 Subject: cgroup: fix cgroup hierarchy umount race 48ddbe1946 "cgroup: make css->refcnt clearing on cgroup removal optional" allowed a css to linger after the associated cgroup is removed. As a css holds a reference on the cgroup's dentry, it means that cgroup dentries may linger for a while. Destroying a superblock which has dentries with positive refcnts is a critical bug and triggers BUG() in vfs code. As each cgroup dentry holds an s_active reference, any lingering cgroup has both its dentry and the superblock pinned and thus preventing premature release of superblock. Unfortunately, after 48ddbe1946, there's a small window while releasing a cgroup which is directly under the root of the hierarchy. When a cgroup directory is released, vfs layer first deletes the corresponding dentry and then invokes dput() on the parent, which may recurse further, so when a cgroup directly below root cgroup is released, the cgroup is first destroyed - which releases the s_active it was holding - and then the dentry for the root cgroup is dput(). This creates a window where the root dentry's refcnt isn't zero but superblock's s_active is. If umount happens before or during this window, vfs will see the root dentry with non-zero refcnt and trigger BUG(). Before 48ddbe1946, this problem didn't exist because the last dentry reference was guaranteed to be put synchronously from rmdir(2) invocation which holds s_active around the whole process. Fix it by holding an extra superblock->s_active reference across dput() from css release, which is the dput() path added by 48ddbe1946 and the only one which doesn't hold an extra s_active ref across the final cgroup dput(). 
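The underlying pattern -- pin the outer object before dropping a reference that may tear down an inner one -- is worth seeing in isolation. The toy refcounting sketch below is only an analogy with invented names, not the cgroup/VFS code: async_drop() takes an extra parent reference around the child put, much as the fix takes s_active around dput().

  #include <stdatomic.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct parent { atomic_int ref; };
  struct child  { atomic_int ref; struct parent *p; };

  static void parent_put(struct parent *p)
  {
          if (atomic_fetch_sub(&p->ref, 1) == 1) {
                  printf("parent released\n");
                  free(p);
          }
  }

  static void child_put(struct child *c)
  {
          if (atomic_fetch_sub(&c->ref, 1) == 1) {
                  struct parent *p = c->p;

                  printf("child released\n");
                  free(c);
                  parent_put(p);          /* may otherwise be the last ref */
          }
  }

  static void async_drop(struct child *c)
  {
          struct parent *p = c->p;

          atomic_fetch_add(&p->ref, 1);   /* pin the parent across the put */
          child_put(c);
          parent_put(p);                  /* teardown happens here if last */
  }

  int main(void)
  {
          struct parent *p = malloc(sizeof(*p));
          struct child  *c = malloc(sizeof(*c));

          atomic_init(&p->ref, 1);
          atomic_init(&c->ref, 1);
          c->p = p;
          atomic_fetch_add(&p->ref, 1);   /* child's reference on parent */

          parent_put(p);                  /* drop the creator's reference */
          async_drop(c);                  /* late drop, parent pinned around it */
          return 0;
  }

With the pin in place, the parent_put() inside child_put() can never be the final one, so the actual teardown always runs at the controlled call site at the end of async_drop().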
Signed-off-by: Tejun Heo LKML-Reference: <4FEEA5CB.8070809@huawei.com> Reported-by: shyju pv Tested-by: shyju pv Cc: Sasha Levin Acked-by: Li Zefan --- kernel/cgroup.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5f134a0e0e3f..b303dfc7dce0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3883,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, dput_work); + struct dentry *dentry = css->cgroup->dentry; + struct super_block *sb = dentry->d_sb; - dput(css->cgroup->dentry); + atomic_inc(&sb->s_active); + dput(dentry); + deactivate_super(sb); } static void init_cgroup_css(struct cgroup_subsys_state *css, -- cgit v1.2.3 From eb02dac93708f581c99858a19162af8ca2b6bfcb Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 9 Jul 2012 10:05:10 -0700 Subject: kmsg: /proc/kmsg - support reading of partial log records Restore support for partial reads of any size on /proc/kmsg, in case the supplied read buffer is smaller than the record size. Some people seem to think is is ia good idea to run: $ dd if=/proc/kmsg bs=1 of=... as a klog bridge. Resolves-bug: https://bugzilla.kernel.org/show_bug.cgi?id=44211 Reported-by: Jukka Ollila Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f02f1f5ddc30..50c33411442d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -217,6 +217,7 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; +static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ static u64 log_first_seq; @@ -890,22 +891,33 @@ static int syslog_print(char __user *buf, int size) while (size > 0) { size_t n; + size_t skip; raw_spin_lock_irq(&logbuf_lock); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_partial = 0; } if (syslog_seq == log_next_seq) { raw_spin_unlock_irq(&logbuf_lock); break; } + + skip = syslog_partial; msg = log_from_idx(syslog_idx); n = msg_print_text(msg, true, text, LOG_LINE_MAX); - if (n <= size) { + if (n - syslog_partial <= size) { + /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; + n -= syslog_partial; + syslog_partial = 0; + } else if (!len){ + /* partial read(), remember position */ + n = size; + syslog_partial += n; } else n = 0; raw_spin_unlock_irq(&logbuf_lock); @@ -913,17 +925,15 @@ static int syslog_print(char __user *buf, int size) if (!n) break; - len += n; - size -= n; - buf += n; - n = copy_to_user(buf - n, text, n); - - if (n) { - len -= n; + if (copy_to_user(buf, text + skip, n)) { if (!len) len = -EFAULT; break; } + + len += n; + size -= n; + buf += n; } kfree(text); @@ -1107,6 +1117,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_partial = 0; } if (from_file) { /* @@ -1129,6 +1140,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) idx = log_next(idx); seq++; } + error -= syslog_partial; } raw_spin_unlock_irq(&logbuf_lock); break; -- cgit v1.2.3 From 
5becfb1df5ac8e491338e64b1029685ccad4b39c Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 9 Jul 2012 12:15:42 -0700 Subject: kmsg: merge continuation records while printing In (the unlikely) case our continuation merge buffer is busy, we unfortunately can not merge further continuation printk()s into a single record and have to store them separately, which leads to split-up output of these lines when they are printed. Add some flags about newlines and prefix existence to these records and try to reconstruct the full line again, when the separated records are printed. Reported-By: Michael Neuling Cc: Dave Jones Cc: Linus Torvalds Tested-By: Michael Neuling Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 120 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 50c33411442d..177fa49357a5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -194,8 +194,10 @@ static int console_may_schedule; */ enum log_flags { - LOG_DEFAULT = 0, - LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NOCONS = 1, /* already flushed, do not print to console */ + LOG_NEWLINE = 2, /* text ended with a newline */ + LOG_PREFIX = 4, /* text started with a prefix */ + LOG_CONT = 8, /* text is a fragment of a continuation line */ }; struct log { @@ -217,6 +219,7 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; +static enum log_flags syslog_prev; static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ @@ -839,13 +842,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; + bool prefix = true; + bool newline = true; size_t len = 0; + if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) + prefix = false; + + if (msg->flags & LOG_CONT) { + if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) + prefix = false; + + if (!(msg->flags & LOG_NEWLINE)) + newline = false; + } + do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -863,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog, text_len + 1>= size - len) break; - len += print_prefix(msg, syslog, buf + len); + if (prefix) + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - buf[len++] = '\n'; + if (next || newline) + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - len += print_prefix(msg, syslog, NULL); - len += text_len + 1; + if (prefix) + len += print_prefix(msg, syslog, NULL); + len += text_len; + if (next || newline) + len++; } + prefix = true; text = next; } while (text); @@ -898,6 +920,7 @@ static int syslog_print(char __user *buf, int size) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_prev = 0; syslog_partial = 0; } if (syslog_seq == log_next_seq) { @@ -907,11 +930,12 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, text, LOG_LINE_MAX); + n = 
msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; + syslog_prev = msg->flags; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -954,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; + enum log_flags prev; if (clear_seq < log_first_seq) { /* messages are gone, move to first available one */ @@ -967,10 +992,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) */ seq = clear_seq; idx = clear_idx; + prev = 0; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len += msg_print_text(msg, true, NULL, 0); + len += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; } @@ -978,10 +1004,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; + prev = 0; while (len > size && seq < log_next_seq) { struct log *msg = log_from_idx(idx); - len -= msg_print_text(msg, true, NULL, 0); + len -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; } @@ -990,17 +1017,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear) next_seq = log_next_seq; len = 0; + prev = 0; while (len >= 0 && seq < next_seq) { struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); if (textlen < 0) { len = textlen; break; } idx = log_next(idx); seq++; + prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -1013,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; + prev = 0; } } } @@ -1117,6 +1147,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; + syslog_prev = 0; syslog_partial = 0; } if (from_file) { @@ -1127,18 +1158,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) */ error = log_next_idx - syslog_idx; } else { - u64 seq; - u32 idx; + u64 seq = syslog_seq; + u32 idx = syslog_idx; + enum log_flags prev = syslog_prev; error = 0; - seq = syslog_seq; - idx = syslog_idx; while (seq < log_next_seq) { struct log *msg = log_from_idx(idx); - error += msg_print_text(msg, true, NULL, 0); + error += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } error -= syslog_partial; } @@ -1408,10 +1439,9 @@ asmlinkage int vprintk_emit(int facility, int level, static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; + enum log_flags lflags = 0; unsigned long flags; int this_cpu; - bool newline = false; - bool prefix = false; int printed_len = 0; boot_delay_msec(); @@ -1450,7 +1480,7 @@ asmlinkage int vprintk_emit(int facility, int level, recursion_bug = 0; printed_len += strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, LOG_DEFAULT, 0, + log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, printed_len); } @@ -1463,7 +1493,7 @@ asmlinkage int vprintk_emit(int facility, int level, /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { text_len--; - newline = true; + lflags |= LOG_NEWLINE; } /* strip syslog 
prefix and extract log level or control flags */ @@ -1473,7 +1503,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == -1) level = text[1] - '0'; case 'd': /* KERN_DEFAULT */ - prefix = true; + lflags |= LOG_PREFIX; case 'c': /* KERN_CONT */ text += 3; text_len -= 3; @@ -1483,22 +1513,20 @@ asmlinkage int vprintk_emit(int facility, int level, if (level == -1) level = default_message_loglevel; - if (dict) { - prefix = true; - newline = true; - } + if (dict) + lflags |= LOG_PREFIX|LOG_NEWLINE; - if (!newline) { + if (!(lflags & LOG_NEWLINE)) { /* * Flush the conflicting buffer. An earlier newline was missing, * or another task also prints continuation lines. */ - if (cont.len && (prefix || cont.owner != current)) + if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) cont_flush(); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, LOG_DEFAULT, 0, + log_store(facility, level, lflags | LOG_CONT, 0, dict, dictlen, text, text_len); } else { bool stored = false; @@ -1510,13 +1538,13 @@ asmlinkage int vprintk_emit(int facility, int level, * flush it out and store this line separately. */ if (cont.len && cont.owner == current) { - if (!prefix) + if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); cont_flush(); } if (!stored) - log_store(facility, level, LOG_DEFAULT, 0, + log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); } printed_len += text_len; @@ -1615,8 +1643,8 @@ static struct cont { static struct log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, bool syslog, - char *buf, size_t size) { return 0; } +static size_t msg_print_text(const struct log *msg, enum log_flags prev, + bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ @@ -1892,6 +1920,7 @@ void wake_up_klogd(void) /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; +static enum log_flags console_prev; /** * console_unlock - unlock the console system @@ -1952,6 +1981,7 @@ again: /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; + console_prev = 0; } skip: if (console_seq == log_next_seq) @@ -1975,10 +2005,11 @@ skip: } level = msg->level; - len = msg_print_text(msg, false, text, sizeof(text)); - + len = msg_print_text(msg, console_prev, false, + text, sizeof(text)); console_idx = log_next(console_idx); console_seq++; + console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2241,6 +2272,7 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; + console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. 
Only do this to the @@ -2534,8 +2566,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, } msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, - line, size); + l = msg_print_text(msg, 0, syslog, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -2575,6 +2606,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 idx; u64 next_seq; u32 next_idx; + enum log_flags prev; size_t l = 0; bool ret = false; @@ -2597,23 +2629,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; + prev = 0; while (seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l += msg_print_text(msg, true, NULL, 0); + l += msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; + prev = 0; while (l > size && seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l -= msg_print_text(msg, true, NULL, 0); + l -= msg_print_text(msg, prev, true, NULL, 0); idx = log_next(idx); seq++; + prev = msg->flags; } /* last message in next interation */ @@ -2621,14 +2657,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; + prev = 0; while (seq < dumper->next_seq) { struct log *msg = log_from_idx(idx); - l += msg_print_text(msg, syslog, - buf + l, size - l); - + l += msg_print_text(msg, prev, syslog, buf + l, size - l); idx = log_next(idx); seq++; + prev = msg->flags; } dumper->next_seq = next_seq; -- cgit v1.2.3 From f55a6faa384304c89cfef162768e88374d3312cb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:19 -0400 Subject: hrtimer: Provide clock_was_set_delayed() clock_was_set() cannot be called from hard interrupt context because it calls on_each_cpu(). For fixing the widely reported leap seconds issue it is necessary to call it from hard interrupt context, i.e. the timer tick code, which does the timekeeping updates. Provide a new function which denotes it in the hrtimer cpu base structure of the cpu on which it is called and raise the hrtimer softirq. We then execute the clock_was_set() notificiation from softirq context in run_hrtimer_softirq(). The hrtimer softirq is rarely used, so polling the flag there is not a performance issue. [ tglx: Made it depend on CONFIG_HIGH_RES_TIMERS. We really should get rid of all this ifdeffery ASAP ] Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-2-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 9 ++++++++- kernel/hrtimer.c | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0dc30c9f15..c9ec9400ee5b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -165,6 +165,7 @@ enum hrtimer_base_type { * @lock: lock protecting the base and associated clock bases * and timers * @active_bases: Bitfield to mark bases with active timers + * @clock_was_set: Indicates that clock was set from irq context. 
* @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode @@ -177,7 +178,8 @@ enum hrtimer_base_type { */ struct hrtimer_cpu_base { raw_spinlock_t lock; - unsigned long active_bases; + unsigned int active_bases; + unsigned int clock_was_set; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; @@ -286,6 +288,8 @@ extern void hrtimer_peek_ahead_timers(void); # define MONOTONIC_RES_NSEC HIGH_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_HIGH_RES +extern void clock_was_set_delayed(void); + #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC @@ -306,6 +310,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return 0; } + +static inline void clock_was_set_delayed(void) { } + #endif extern void clock_was_set(void); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf51682b..3c24fb2c25c8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -717,6 +717,19 @@ static int hrtimer_switch_to_hres(void) return 1; } +/* + * Called from timekeeping code to reprogramm the hrtimer interrupt + * device. If called from the timer interrupt context we defer it to + * softirq context. + */ +void clock_was_set_delayed(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -1395,6 +1408,13 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + if (cpu_base->clock_was_set) { + cpu_base->clock_was_set = 0; + clock_was_set(); + } + hrtimer_peek_ahead_timers(); } -- cgit v1.2.3 From 4873fa070ae84a4115f0b3c9dfabc224f1bc7c51 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:20 -0400 Subject: timekeeping: Fix leapsecond triggered load spike issue The timekeeping code misses an update of the hrtimer subsystem after a leap second happened. Due to that timers based on CLOCK_REALTIME are either expiring a second early or late depending on whether a leap second has been inserted or deleted until an operation is initiated which causes that update. Unless the update happens by some other means this discrepancy between the timekeeping and the hrtimer data stays forever and timers are expired either early or late. The reported immediate workaround - $ data -s "`date`" - is causing a call to clock_was_set() which updates the hrtimer data structures. See: http://www.sheeri.com/content/mysql-and-leap-second-high-cpu-and-fix Add the missing clock_was_set() call to update_wall_time() in case of a leap second event. The actual update is deferred to softirq context as the necessary smp function call cannot be invoked from hard interrupt context. 
Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-3-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00a1e8a..a413e5940e06 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -963,6 +963,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } /* Accumulate raw time */ @@ -1079,6 +1081,8 @@ static void update_wall_time(void) leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; timekeeper.wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } timekeeping_update(false); -- cgit v1.2.3 From 5b9fe759a678e05be4937ddf03d50e950207c1c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:21 -0400 Subject: timekeeping: Maintain ktime_t based offsets for hrtimers We need to update the hrtimer clock offsets from the hrtimer interrupt context. To avoid conversions from timespec to ktime_t maintain a ktime_t based representation of those offsets in the timekeeper. This puts the conversion overhead into the code which updates the underlying offsets and provides fast accessible values in the hrtimer interrupt. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-4-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a413e5940e06..1c038dac71a2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -70,6 +70,12 @@ struct timekeeper { /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
*/ struct timespec raw_time; + /* Offset clock monotonic -> clock realtime */ + ktime_t offs_real; + + /* Offset clock monotonic -> clock boottime */ + ktime_t offs_boot; + /* Seqlock for all timekeeper values */ seqlock_t lock; }; @@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } +static void update_rt_offset(void) +{ + struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + + set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); + timekeeper.offs_real = timespec_to_ktime(tmp); +} + /* must hold write on timekeeper.lock */ static void timekeeping_update(bool clearntp) { @@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp) timekeeper.ntp_error = 0; ntp_clear(); } + update_rt_offset(); update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -604,6 +619,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_rt_offset(); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -612,6 +628,12 @@ void __init timekeeping_init(void) /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; +static void update_sleep_time(struct timespec t) +{ + timekeeper.total_sleep_time = t; + timekeeper.offs_boot = timespec_to_ktime(t); +} + /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *delta); - timekeeper.total_sleep_time = timespec_add( - timekeeper.total_sleep_time, *delta); + update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta)); } -- cgit v1.2.3 From 196951e91262fccda81147d2bcf7fdab08668b40 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:23 -0400 Subject: hrtimers: Move lock held region in hrtimer_interrupt() We need to update the base offsets from this code and we need to do that under base->lock. Move the lock held region around the ktime_get() calls. The ktime_get() calls are going to be replaced with a function which gets the time and the offsets atomically. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-6-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 3c24fb2c25c8..8f320af837b5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1263,11 +1263,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; + raw_spin_lock(&cpu_base->lock); entry_time = now = ktime_get(); retry: expires_next.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1344,6 +1343,7 @@ retry: * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. 
*/ + raw_spin_lock(&cpu_base->lock); now = ktime_get(); cpu_base->nr_retries++; if (++retries < 3) @@ -1356,6 +1356,7 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; + raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); if (delta.tv64 > cpu_base->max_hang_time.tv64) cpu_base->max_hang_time = delta; -- cgit v1.2.3 From f6c06abfb3972ad4914cef57d8348fcb2932bc3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:24 -0400 Subject: timekeeping: Provide hrtimer update function To finally fix the infamous leap second issue and other race windows caused by functions which change the offsets between the various time bases (CLOCK_MONOTONIC, CLOCK_REALTIME and CLOCK_BOOTTIME) we need a function which atomically gets the current monotonic time and updates the offsets of CLOCK_REALTIME and CLOCK_BOOTTIME with minimalistic overhead. The previous patch which provides ktime_t offsets allows us to make this function almost as cheap as ktime_get() which is going to be replaced in hrtimer_interrupt(). Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-7-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 + kernel/time/timekeeping.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c9ec9400ee5b..cc07d2777bbe 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -327,6 +327,7 @@ extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); extern ktime_t ktime_get_boottime(void); extern ktime_t ktime_get_monotonic_offset(void); +extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1c038dac71a2..269b1fe5f2ae 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1271,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } while (read_seqretry(&timekeeper.lock, seq)); } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +{ + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqbegin(&timekeeper.lock); + + secs = timekeeper.xtime.tv_sec; + nsecs = timekeeper.xtime.tv_nsec; + nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + + *offs_real = timekeeper.offs_real; + *offs_boot = timekeeper.offs_boot; + } while (read_seqretry(&timekeeper.lock, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *offs_real); + return now; +} +#endif + /** * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format */ -- cgit v1.2.3 From 5baefd6d84163443215f4a99f6a20f054ef11236 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:25 -0400 Subject: hrtimer: Update hrtimer base offsets each hrtimer_interrupt The update of the hrtimer 
base offsets on all cpus cannot be made atomically from the timekeeper.lock held and interrupt disabled region as smp function calls are not allowed there. clock_was_set(), which enforces the update on all cpus, is called either from preemptible process context in case of do_settimeofday() or from the softirq context when the offset modification happened in the timer interrupt itself due to a leap second. In both cases there is a race window for an hrtimer interrupt between dropping timekeeper lock, enabling interrupts and clock_was_set() issuing the updates. Any interrupt which arrives in that window will see the new time but operate on stale offsets. So we need to make sure that an hrtimer interrupt always sees a consistent state of time and offsets. ktime_get_update_offsets() allows us to get the current monotonic time and update the per cpu hrtimer base offsets from hrtimer_interrupt() to capture a consistent state of monotonic time and the offsets. The function replaces the existing ktime_get() calls in hrtimer_interrupt(). The overhead of the new function vs. ktime_get() is minimal as it just adds two store operations. This ensures that any changes to realtime or boottime offsets are noticed and stored into the per-cpu hrtimer base structures, prior to any hrtimer expiration and guarantees that timers are not expired early. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-8-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8f320af837b5..6db7a5ed52b5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + + return ktime_get_update_offsets(offs_real, offs_boot); +} + /* * Retrigger next event is called after clock was set * @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); - struct timespec realtime_offset, xtim, wtm, sleep; if (!hrtimer_hres_active()) return; - /* Optimized out for !HIGH_RES */ - get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - + hrtimer_update_base(base); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); } @@ -710,7 +708,6 @@ static int hrtimer_switch_to_hres(void) base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); - /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); @@ -1264,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; raw_spin_lock(&cpu_base->lock); - entry_time = now = ktime_get(); + entry_time = now = 
hrtimer_update_base(cpu_base); retry: expires_next.tv64 = KTIME_MAX; /* @@ -1342,9 +1339,12 @@ retry: * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. */ raw_spin_lock(&cpu_base->lock); - now = ktime_get(); + now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; -- cgit v1.2.3 From 4229fb1dc6843c49a14bb098719f8a696cdc44f8 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 11 Jul 2012 14:02:11 -0700 Subject: c/r: prctl: less paranoid prctl_set_mm_exe_file() "no other files mapped" requirement from my previous patch (c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal) is too paranoid, it forbids operation even if there mapped one shared-anon vma. Let's check that current mm->exe_file already unmapped, in this case exe_file symlink already outdated and its changing is reasonable. Plus, this patch fixes exit code in case operation success. Signed-off-by: Konstantin Khlebnikov Reported-by: Cyrill Gorcunov Tested-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index e0c8ffc50d7f..2d39a84cd857 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask) #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; @@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) down_write(&mm->mmap_sem); /* - * Forbid mm->exe_file change if there are mapped other files. + * Forbid mm->exe_file change if old file still mapped. */ err = -EBUSY; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_file && !path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_unlock; + if (mm->exe_file) { + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_file && + path_equal(&vma->vm_file->f_path, + &mm->exe_file->f_path)) + goto exit_unlock; } /* @@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) goto exit_unlock; + err = 0; set_mm_exe_file(mm, exe_file); exit_unlock: up_write(&mm->mmap_sem); -- cgit v1.2.3 From 6b1859dba01c7d512b72d77e3fd7da8354235189 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:50 -0400 Subject: ntp: Fix STA_INS/DEL clearing bug In commit 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d, I introduced a bug that kept the STA_INS or STA_DEL bit from being cleared from time_status via adjtimex() without forcing STA_PLL first. Usually once the STA_INS is set, it isn't cleared until the leap second is applied, so its unlikely this affected anyone. However during testing I noticed it took some effort to cancel a leap second once STA_INS was set. 
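[ Editorial note: a minimal self-contained model of the fixed TIME_INS handling may make the change in the hunk below easier to follow. This is not the kernel's second_overflow(); the enum values, the helper name and main() are simplified stand-ins, and only the STA_INS cancellation path is modelled. ]

/*
 * Build: cc -o leap leap.c
 * Shows the fix: if userspace has cleared STA_INS via adjtimex(), the
 * state machine falls back to TIME_OK instead of staying armed.
 */
#include <stdio.h>

#define STA_INS  0x0010                  /* mirrors the adjtimex status bit */

enum time_state { TIME_OK, TIME_INS, TIME_OOP };

static enum time_state state = TIME_INS; /* a leap insertion is armed */
static int time_status = STA_INS;

/* called once per second with the current UTC seconds value */
static enum time_state second_overflow_model(long secs)
{
	switch (state) {
	case TIME_INS:
		if (!(time_status & STA_INS))
			state = TIME_OK;     /* the fix: a cleared bit cancels the leap */
		else if (secs % 86400 == 0)
			state = TIME_OOP;    /* midnight UTC: insert the leap second */
		break;
	default:
		break;
	}
	return state;
}

int main(void)
{
	time_status &= ~STA_INS;             /* userspace cancels via adjtimex() */
	printf("state after cancel: %d (0 == TIME_OK)\n",
	       second_overflow_model(12345));
	return 0;
}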
Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava CC: stable@vger.kernel.org # 3.4 Link: http://lkml.kernel.org/r/1342156917-25092-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 70b33abcc7bb..b7fbadc5c973 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -409,7 +409,9 @@ int second_overflow(unsigned long secs) time_state = TIME_DEL; break; case TIME_INS: - if (secs % 86400 == 0) { + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; time_tai++; @@ -418,7 +420,9 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if ((secs + 1) % 86400 == 0) { + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { leap = 1; time_tai--; time_state = TIME_WAIT; -- cgit v1.2.3 From 3e997130bd2e8c6f5aaa49d6e3161d4d29b43ab0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 12:50:42 -0400 Subject: timekeeping: Add missing update call in timekeeping_resume() The leap second rework unearthed another issue of inconsistent data. On timekeeping_resume() the timekeeper data is updated, but nothing calls timekeeping_update(), so now the update code in the timer interrupt sees stale values. This has been the case before those changes, but then the timer interrupt was using stale data as well so this went unnoticed for quite some time. Add the missing update call, so all the data is consistent everywhere. Reported-by: Andreas Schwab Reported-and-tested-by: "Rafael J. Wysocki" Reported-and-tested-by: Martin Steigerwald Cc: LKML Cc: Linux PM list Cc: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra , Cc: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 269b1fe5f2ae..3447cfaf11e7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -717,6 +717,7 @@ static void timekeeping_resume(void) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; + timekeeping_update(false); write_sequnlock_irqrestore(&timekeeper.lock, flags); touch_softlockup_watchdog(); -- cgit v1.2.3 From eea03c20ae38a55405c0865ed9adfccc400e4c8e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 Jul 2012 18:15:46 -0700 Subject: Make wait_for_device_probe() also do scsi_complete_async_scans() Commit a7a20d103994 ("sd: limit the scope of the async probe domain") make the SCSI device probing run device discovery in it's own async domain. However, as a result, the partition detection was no longer synchronized by async_synchronize_full() (which, despite the name, only synchronizes the global async space, not all of them). Which in turn meant that "wait_for_device_probe()" would not wait for the SCSI partitions to be parsed. And "wait_for_device_probe()" was what the boot time init code relied on for mounting the root filesystem. Now, most people never noticed this, because not only is it timing-dependent, but modern distributions all use initrd. So the root filesystem isn't actually on a disk at all. 
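[ Editorial note: the key point in the commit message above is that synchronizing only the global async domain misses work queued in a separate domain. A small userspace sketch of that pitfall follows; pthreads stand in for the kernel's async machinery and the thread names are invented. ]

/*
 * Build: cc -o domains domains.c -pthread
 * Joining only the "global" job misses the job in the separate domain,
 * which is why the patch folds the SCSI scan wait into
 * wait_for_device_probe().
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *probe_disk(void *arg)
{
	(void)arg;
	usleep(1000);
	puts("disk probed");
	return NULL;
}

static void *parse_partitions(void *arg)
{
	(void)arg;
	usleep(5000);
	puts("partitions parsed");
	return NULL;
}

int main(void)
{
	pthread_t global_domain, scsi_domain;

	pthread_create(&global_domain, NULL, probe_disk, NULL);
	pthread_create(&scsi_domain, NULL, parse_partitions, NULL);

	/* old behaviour: only the global domain is synchronized ... */
	pthread_join(global_domain, NULL);

	/* ... so the separate scan domain must be waited for explicitly,
	 * or the caller proceeds before the partitions exist */
	pthread_join(scsi_domain, NULL);

	puts("safe to mount root");
	return 0;
}

[ With that picture in mind, the remainder of the commit message explains why initrd-based setups usually masked the race. ]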
And then before they actually mount the final disk filesystem, they will have loaded the scsi-wait-scan module, which not only does the expected wait_for_device_probe(), but also does scsi_complete_async_scans(). [ Side note: scsi_complete_async_scans() had also been partially broken, but that was fixed in commit 43a8d39d0137 ("fix async probe regression"), so that same commit a7a20d103994 had actually broken setups even if you used scsi-wait-scan explicitly ] Solve this problem by just moving the scsi_complete_async_scans() call into wait_for_device_probe(). Everybody who wants to wait for device probing to finish really wants the SCSI probing to complete, so there's no reason not to do this. So now "wait_for_device_probe()" really does what the name implies, and properly waits for device probing to finish. This also removes the now unnecessary extra calls to scsi_complete_async_scans(). Reported-and-tested-by: Artem S. Tashkinov Cc: Dan Williams Cc: Alan Stern Cc: James Bottomley Cc: Borislav Petkov Cc: linux-scsi Signed-off-by: Linus Torvalds --- drivers/base/dd.c | 2 ++ drivers/scsi/scsi_wait_scan.c | 5 ----- include/linux/device.h | 2 -- kernel/power/hibernate.c | 8 -------- kernel/power/user.c | 2 -- 5 files changed, 2 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index dcb8a6e48692..4b01ab3d2c24 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "base.h" #include "power/power.h" @@ -332,6 +333,7 @@ void wait_for_device_probe(void) /* wait for the known devices to complete their probing */ wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); + scsi_complete_async_scans(); } EXPORT_SYMBOL_GPL(wait_for_device_probe); diff --git a/drivers/scsi/scsi_wait_scan.c b/drivers/scsi/scsi_wait_scan.c index ae7814874618..072734538876 100644 --- a/drivers/scsi/scsi_wait_scan.c +++ b/drivers/scsi/scsi_wait_scan.c @@ -22,11 +22,6 @@ static int __init wait_scan_init(void) * and might not yet have reached the scsi async scanning */ wait_for_device_probe(); - /* - * and then we wait for the actual asynchronous scsi scan - * to finish. - */ - scsi_complete_async_scans(); return 0; } diff --git a/include/linux/device.h b/include/linux/device.h index 161d96241b1b..6de94151ff6f 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -865,8 +865,6 @@ extern int (*platform_notify_remove)(struct device *dev); extern struct device *get_device(struct device *dev); extern void put_device(struct device *dev); -extern void wait_for_device_probe(void); - #ifdef CONFIG_DEVTMPFS extern int devtmpfs_create_node(struct device *dev); extern int devtmpfs_delete_node(struct device *dev); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8b53db38a279..238025f5472e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -27,7 +27,6 @@ #include #include #include -#include #include "power.h" @@ -748,13 +747,6 @@ static int software_resume(void) async_synchronize_full(); } - /* - * We can't depend on SCSI devices being available after loading - * one of their modules until scsi_complete_async_scans() is - * called and the resume device usually is a SCSI one. 
- */ - scsi_complete_async_scans(); - swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { error = -ENODEV; diff --git a/kernel/power/user.c b/kernel/power/user.c index 91b0fd021a95..4ed81e74f86f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -24,7 +24,6 @@ #include #include #include -#include #include @@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) * appear. */ wait_for_device_probe(); - scsi_complete_async_scans(); data->swap = -1; data->mode = O_WRONLY; -- cgit v1.2.3
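[ Editorial note: to close, a userspace sketch of the contract that ktime_get_update_offsets() introduces earlier in this series: one call returns monotonic "now" and fills in the realtime and boottime offsets taken together, so callers never mix a fresh time with stale offsets. This is not kernel code; the helper name is invented, and unlike the kernel version (which reads the timekeeper under its seqlock) the three clock reads here are not atomic, which is exactly the gap the kernel function closes. ]

/*
 * Build: cc -o offsets offsets.c   (older glibc may need -lrt)
 * CLOCK_BOOTTIME is Linux-specific.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static int64_t ts_ns(struct timespec ts)
{
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* returns CLOCK_MONOTONIC now; *offs_real / *offs_boot convert it to
 * CLOCK_REALTIME / CLOCK_BOOTTIME, mirroring offs_real and offs_boot */
static int64_t get_update_offsets(int64_t *offs_real, int64_t *offs_boot)
{
	struct timespec mono, real, boot;

	clock_gettime(CLOCK_MONOTONIC, &mono);
	clock_gettime(CLOCK_REALTIME, &real);
	clock_gettime(CLOCK_BOOTTIME, &boot);

	*offs_real = ts_ns(real) - ts_ns(mono);
	*offs_boot = ts_ns(boot) - ts_ns(mono);
	return ts_ns(mono);
}

int main(void)
{
	int64_t offs_real, offs_boot;
	int64_t now = get_update_offsets(&offs_real, &offs_boot);

	printf("mono=%lld realtime=%lld boottime=%lld\n",
	       (long long)now,
	       (long long)(now + offs_real),
	       (long long)(now + offs_boot));
	return 0;
}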