diff options
Diffstat (limited to 'kernel')
56 files changed, 1997 insertions, 1039 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index eca595e2fd52..2da48d3515eb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ + notifier.o ksysfs.o sched_clock.o cred.o \ async.o range.o obj-y += groups.o diff --git a/kernel/async.c b/kernel/async.c index d5fe7af0de2e..4c2843c0043e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -120,7 +120,7 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t calltime, delta, rettime; + ktime_t uninitialized_var(calltime), delta, rettime; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); @@ -269,7 +269,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); void async_synchronize_cookie_domain(async_cookie_t cookie, struct list_head *running) { - ktime_t starttime, delta, endtime; + ktime_t uninitialized_var(starttime), delta, endtime; if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d2b6ceea95d..453100a4159d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); -static DEFINE_SPINLOCK(release_list_lock); +static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); @@ -4014,11 +4014,11 @@ again: finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ @@ -4671,13 +4671,13 @@ static void check_for_release(struct cgroup *cgrp) * already queued for a userspace notification, queue * it now */ int need_schedule_work = 0; - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); if (need_schedule_work) schedule_work(&release_agent_work); } @@ -4729,7 +4729,7 @@ static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); mutex_lock(&cgroup_mutex); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; @@ -4738,7 +4738,7 @@ static void cgroup_release_agent(struct work_struct *work) struct cgroup, release_list); list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!pathbuf) goto continue_free; @@ -4768,9 +4768,9 @@ static void cgroup_release_agent(struct work_struct *work) continue_free: kfree(pathbuf); kfree(agentbuf); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } diff --git a/kernel/cred.c b/kernel/cred.c index 8ef31f53c44c..bb55d052d858 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -644,6 +644,9 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif const struct cred *old; struct cred *new; @@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + kmem_cache_free(cred_jar, new); + return NULL; + } +#endif + kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - atomic_inc(&init_tgcred.usage); - new->tgcred = &init_tgcred; + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = NULL; + new->tgcred = tgcred; new->request_key_auth = NULL; new->thread_keyring = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; diff --git a/kernel/freezer.c b/kernel/freezer.c index 7b01de98bb6a..66a594e8ad2f 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -67,7 +67,7 @@ static void fake_signal_wake_up(struct task_struct *p) unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); + signal_wake_up(p, 1); spin_unlock_irqrestore(&p->sighand->siglock, flags); } diff --git a/kernel/futex.c b/kernel/futex.c index 11cbe052b2e8..1511dff0cfd6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -854,7 +854,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) { struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; - u32 curval, newval; + u32 uninitialized_var(curval), newval; if (!pi_state) return -EINVAL; @@ -916,7 +916,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) static int unlock_futex_pi(u32 __user *uaddr, u32 uval) { - u32 oldval; + u32 uninitialized_var(oldval); /* * There is no waiter, so we unlock the futex. The owner died @@ -1576,7 +1576,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; struct task_struct *oldowner = pi_state->owner; - u32 uval, curval, newval; + u32 uval, uninitialized_var(curval), newval; int ret; /* Owner died? */ @@ -1793,7 +1793,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * * Returns: * 0 - uaddr contains val and hb has been locked - * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) @@ -2481,7 +2481,7 @@ err_unlock: */ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval, mval; + u32 uval, uninitialized_var(nval), mval; retry: if (get_user(uval, uaddr)) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d5828da3fd38..b57a3776de44 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -29,7 +29,11 @@ void irq_domain_add(struct irq_domain *domain) */ for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); - if (d || d->domain) { + if (!d) { + WARN(1, "error: assigning domain to non existant irq_desc"); + return; + } + if (d->domain) { /* things are broken; just report, don't clean up */ WARN(1, "error: irq_desc already assigned to a domain"); return; diff --git a/kernel/kmod.c b/kernel/kmod.c index ddc7644c1305..a4bea97c75b6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...) atomic_inc(&kmod_concurrent); if (atomic_read(&kmod_concurrent) > max_modprobes) { /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg++ < 5) + if (kmod_loop_msg < 5) { printk(KERN_ERR "request_module: runaway loop modprobe %s\n", module_name); + kmod_loop_msg++; + } atomic_dec(&kmod_concurrent); return -ENOMEM; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b30fd54eb985..2f193d0ba7f2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -78,10 +78,10 @@ static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned_in_smp; + raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; -static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); } @@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - spin_lock(&rp->lock); + raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); - spin_unlock(&rp->lock); + raw_spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); @@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, __acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spin_lock_irqsave(hlist_lock, *flags); } static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) __acquires(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_lock_irqsave(hlist_lock, *flags); } void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, @@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } static void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) __releases(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); ri->rp = rp; ri->task = current; @@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, kretprobe_table_unlock(hash, &flags); } else { rp->nmissed++; - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); } return 0; } @@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->maxactive = num_possible_cpus(); #endif } - spin_lock_init(&rp->lock); + raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -1959,7 +1959,7 @@ static int __init init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - spin_lock_init(&(kretprobe_table_locks[i].lock)); + raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 376066e10413..4ac8ebfcab59 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -58,7 +58,7 @@ #include <linux/list.h> #include <linux/stacktrace.h> -static DEFINE_SPINLOCK(latency_lock); +static DEFINE_RAW_SPINLOCK(latency_lock); #define MAXLR 128 static struct latency_record latency_record[MAXLR]; @@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p) if (!latencytop_enabled) return; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void clear_global_latency_tracing(void) { unsigned long flags; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&latency_record, 0, sizeof(latency_record)); - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void __sched @@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.max = usecs; store_stacktrace(tsk, &lat); - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); account_global_scheduler_latency(tsk, &lat); @@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static int lstats_show(struct seq_file *m, void *v) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 91d67ce3a8d5..e69434b070da 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -96,8 +96,13 @@ static int graph_lock(void) static inline int graph_unlock(void) { - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) + if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { + /* + * The lockdep graph lock isn't locked while we expect it to + * be, we're confused now, bye! + */ return DEBUG_LOCKS_WARN_ON(1); + } current->lockdep_recursion--; arch_spin_unlock(&lockdep_lock); @@ -134,6 +139,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) { if (!hlock->class_idx) { + /* + * Someone passed in garbage, we give up. + */ DEBUG_LOCKS_WARN_ON(1); return NULL; } @@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) */ list_for_each_entry(class, hash_head, hash_entry) { if (class->key == key) { + /* + * Huh! same key, different name? Did someone trample + * on some memory? We're most confused. + */ WARN_ON_ONCE(class->name != lock->name); return class; } @@ -800,6 +812,10 @@ out_unlock_set: else if (subclass < NR_LOCKDEP_CACHING_CLASSES) lock->class_cache[subclass] = class; + /* + * Hash collision, did we smoke some? We found a class with a matching + * hash but the subclass -- which is hashed in -- didn't match. + */ if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -926,7 +942,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); + WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ lock->parent = parent; lock->class->dep_gen_id = lockdep_dependency_gen_id; } @@ -936,7 +952,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); + WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1129,10 +1145,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, if (debug_locks_silent) return 0; - printk("\n=======================================================\n"); - printk( "[ INFO: possible circular locking dependency detected ]\n"); + printk("\n"); + printk("======================================================\n"); + printk("[ INFO: possible circular locking dependency detected ]\n"); print_kernel_version(); - printk( "-------------------------------------------------------\n"); + printk("-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); @@ -1196,6 +1213,9 @@ static noinline int print_bfs_bug(int ret) if (!debug_locks_off_graph_unlock()) return 0; + /* + * Breadth-first-search failed, graph got corrupted? + */ WARN(1, "lockdep bfs error:%d\n", ret); return 0; @@ -1463,11 +1483,12 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n======================================================\n"); - printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + printk("\n"); + printk("======================================================\n"); + printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); print_kernel_version(); - printk( "------------------------------------------------------\n"); + printk("------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, @@ -1692,10 +1713,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=============================================\n"); - printk( "[ INFO: possible recursive locking detected ]\n"); + printk("\n"); + printk("=============================================\n"); + printk("[ INFO: possible recursive locking detected ]\n"); print_kernel_version(); - printk( "---------------------------------------------\n"); + printk("---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); @@ -1944,6 +1966,11 @@ out_bug: if (!debug_locks_off_graph_unlock()) return 0; + /* + * Clearly we all shouldn't be here, but since we made it we + * can reliable say we messed up our state. See the above two + * gotos for reasons why we could possibly end up here. + */ WARN_ON(1); return 0; @@ -1975,6 +2002,11 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct held_lock *hlock_curr, *hlock_next; int i, j; + /* + * We might need to take the graph lock, ensure we've got IRQs + * disabled to make this an IRQ-safe lock.. for recursion reasons + * lockdep won't complain about its own locking errors. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; /* @@ -2126,6 +2158,10 @@ static void check_chain_key(struct task_struct *curr) hlock = curr->held_locks + i; if (chain_key != hlock->prev_chain_key) { debug_locks_off(); + /* + * We got mighty confused, our chain keys don't match + * with what we expect, someone trample on our task state? + */ WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, @@ -2133,6 +2169,9 @@ static void check_chain_key(struct task_struct *curr) return; } id = hlock->class_idx - 1; + /* + * Whoops ran out of static storage again? + */ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return; @@ -2144,6 +2183,10 @@ static void check_chain_key(struct task_struct *curr) } if (chain_key != curr->curr_chain_key) { debug_locks_off(); + /* + * More smoking hash instead of calculating it, damn see these + * numbers float.. I bet that a pink elephant stepped on my memory. + */ WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, @@ -2177,10 +2220,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=================================\n"); - printk( "[ INFO: inconsistent lock state ]\n"); + printk("\n"); + printk("=================================\n"); + printk("[ INFO: inconsistent lock state ]\n"); print_kernel_version(); - printk( "---------------------------------\n"); + printk("---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); @@ -2241,10 +2285,11 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=========================================================\n"); - printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); + printk("\n"); + printk("=========================================================\n"); + printk("[ INFO: possible irq lock inversion dependency detected ]\n"); print_kernel_version(); - printk( "---------------------------------------------------------\n"); + printk("---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); @@ -2525,12 +2570,24 @@ void trace_hardirqs_on_caller(unsigned long ip) return; } + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; + /* + * See the fine text that goes along with this variable definition. + */ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; + /* + * Can't allow enabling interrupts while in an interrupt handler, + * that's general bad form and such. Recursion, limited stack etc.. + */ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; @@ -2558,6 +2615,10 @@ void trace_hardirqs_off_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * So we're supposed to get called after you mask local IRQs, but for + * some reason the hardware doesn't quite think you did a proper job. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2590,6 +2651,10 @@ void trace_softirqs_on(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * We fancy IRQs being disabled here, see softirq.c, avoids + * funny state and nesting things. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2626,6 +2691,9 @@ void trace_softirqs_off(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * We fancy IRQs being disabled here, see softirq.c + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2637,6 +2705,9 @@ void trace_softirqs_off(unsigned long ip) curr->softirq_disable_ip = ip; curr->softirq_disable_event = ++curr->irq_events; debug_atomic_inc(softirqs_off_events); + /* + * Whoops, we wanted softirqs off, so why aren't they? + */ DEBUG_LOCKS_WARN_ON(!softirq_count()); } else debug_atomic_inc(redundant_softirqs_off); @@ -2661,6 +2732,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (!(gfp_mask & __GFP_FS)) return; + /* + * Oi! Can't be having __GFP_FS allocations with IRQs disabled. + */ if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; @@ -2773,13 +2847,13 @@ static int separate_irq_context(struct task_struct *curr, return 0; } -#else +#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ static inline int mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - WARN_ON(1); + WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ return 1; } @@ -2799,7 +2873,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask) { } -#endif +#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ /* * Mark a lock with a usage bit, and validate the state transition: @@ -2880,6 +2954,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->cpu = raw_smp_processor_id(); #endif + /* + * Can't be having no nameless bastards around this place! + */ if (DEBUG_LOCKS_WARN_ON(!name)) { lock->name = "NULL"; return; @@ -2887,6 +2964,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; + /* + * No key, no joy, we need to hash something. + */ if (DEBUG_LOCKS_WARN_ON(!key)) return; /* @@ -2894,6 +2974,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, */ if (!static_obj(key)) { printk("BUG: key %p not in .data!\n", key); + /* + * What it says above ^^^^^, I suggest you read it. + */ DEBUG_LOCKS_WARN_ON(1); return; } @@ -2932,6 +3015,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(!debug_locks)) return 0; + /* + * Lockdep should run with IRQs disabled, otherwise we could + * get an interrupt which would want to take locks, which would + * end up in lockdep and have you got a head-ache already? + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -2963,6 +3051,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * dependency checks are done) */ depth = curr->lockdep_depth; + /* + * Ran out of static storage for our per-task lock stack again have we? + */ if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; @@ -2981,6 +3072,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } hlock = curr->held_locks + depth; + /* + * Plain impossible, we just registered it and checked it weren't no + * NULL like.. I bet this mushroom I ate was good! + */ if (DEBUG_LOCKS_WARN_ON(!class)) return 0; hlock->class_idx = class_idx; @@ -3015,11 +3110,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ id = class - lock_classes; + /* + * Whoops, we did it again.. ran straight out of our static allocation. + */ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; if (!depth) { + /* + * How can we have a chain hash when we ain't got no keys?! + */ if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) return 0; chain_head = 1; @@ -3065,9 +3166,10 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n=====================================\n"); - printk( "[ BUG: bad unlock balance detected! ]\n"); - printk( "-------------------------------------\n"); + printk("\n"); + printk("=====================================\n"); + printk("[ BUG: bad unlock balance detected! ]\n"); + printk("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3091,6 +3193,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, { if (unlikely(!debug_locks)) return 0; + /* + * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -3120,6 +3225,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) if (!class) return 0; + /* + * References, but not a lock we're actually ref-counting? + * State got messed up, follow the sites that change ->references + * and try to make sense of it. + */ if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; @@ -3142,6 +3252,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, int i; depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; @@ -3177,6 +3291,10 @@ found_it: return 0; } + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; return 1; @@ -3201,6 +3319,10 @@ lock_release_non_nested(struct task_struct *curr, * of held locks: */ depth = curr->lockdep_depth; + /* + * So we're all set to release this lock.. wait what lock? We don't + * own any locks, you've been drinking again? + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; @@ -3253,6 +3375,10 @@ found_it: return 0; } + /* + * We had N bottles of beer on the wall, we drank one, but now + * there's not N-1 bottles of beer left on the wall... + */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) return 0; return 1; @@ -3283,6 +3409,9 @@ static int lock_release_nested(struct task_struct *curr, return lock_release_non_nested(curr, lock, ip); curr->lockdep_depth--; + /* + * No more locks, but somehow we've got hash left over, who left it? + */ if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) return 0; @@ -3365,10 +3494,13 @@ static void check_flags(unsigned long flags) * check if not in hardirq contexts: */ if (!hardirq_count()) { - if (softirq_count()) + if (softirq_count()) { + /* like the above, but with softirqs */ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); - else + } else { + /* lick the above, does it taste good? */ DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } } if (!debug_locks) @@ -3478,9 +3610,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n=================================\n"); - printk( "[ BUG: bad contention detected! ]\n"); - printk( "---------------------------------\n"); + printk("\n"); + printk("=================================\n"); + printk("[ BUG: bad contention detected! ]\n"); + printk("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3506,6 +3639,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) int i, contention_point, contending_point; depth = curr->lockdep_depth; + /* + * Whee, we contended on this lock, except it seems we're not + * actually trying to acquire anything much at all.. + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return; @@ -3555,6 +3692,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) int i, cpu; depth = curr->lockdep_depth; + /* + * Yay, we acquired ownership of this lock we didn't try to + * acquire, how the heck did that happen? + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return; @@ -3759,8 +3900,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) match |= class == lock->class_cache[j]; if (unlikely(match)) { - if (debug_locks_off_graph_unlock()) + if (debug_locks_off_graph_unlock()) { + /* + * We all just reset everything, how did it match? + */ WARN_ON(1); + } goto out_restore; } } @@ -3839,9 +3984,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; - printk("\n=========================\n"); - printk( "[ BUG: held lock freed! ]\n"); - printk( "-------------------------\n"); + printk("\n"); + printk("=========================\n"); + printk("[ BUG: held lock freed! ]\n"); + printk("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); @@ -3895,9 +4041,10 @@ static void print_held_locks_bug(struct task_struct *curr) if (debug_locks_silent) return; - printk("\n=====================================\n"); - printk( "[ BUG: lock held at task exit time! ]\n"); - printk( "-------------------------------------\n"); + printk("\n"); + printk("=====================================\n"); + printk("[ BUG: lock held at task exit time! ]\n"); + printk("-------------------------------------\n"); printk("%s/%d is exiting with locks still held!\n", curr->comm, task_pid_nr(curr)); lockdep_print_held_locks(curr); @@ -3991,16 +4138,17 @@ void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; - printk("\n================================================\n"); - printk( "[ BUG: lock held when returning to user space! ]\n"); - printk( "------------------------------------------------\n"); + printk("\n"); + printk("================================================\n"); + printk("[ BUG: lock held when returning to user space! ]\n"); + printk("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } } -void lockdep_rcu_dereference(const char *file, const int line) +void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; @@ -4009,15 +4157,15 @@ void lockdep_rcu_dereference(const char *file, const int line) return; #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ - printk("\n===================================================\n"); - printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); - printk( "---------------------------------------------------\n"); - printk("%s:%d invoked rcu_dereference_check() without protection!\n", - file, line); + printk("\n"); + printk("===============================\n"); + printk("[ INFO: suspicious RCU usage. ]\n"); + printk("-------------------------------\n"); + printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); } -EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); +EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/params.c b/kernel/params.c index 22df3e0d142a..821788947e40 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -67,20 +67,27 @@ static void maybe_kfree_parameter(void *param) } } -static inline char dash2underscore(char c) +static char dash2underscore(char c) { if (c == '-') return '_'; return c; } -static inline int parameq(const char *input, const char *paramname) +bool parameqn(const char *a, const char *b, size_t n) { - unsigned int i; - for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) - if (input[i] == '\0') - return 1; - return 0; + size_t i; + + for (i = 0; i < n; i++) { + if (dash2underscore(a[i]) != dash2underscore(b[i])) + return false; + } + return true; +} + +bool parameq(const char *a, const char *b) +{ + return parameqn(a, b, strlen(a)+1); } static int parse_one(char *param, diff --git a/kernel/pid.c b/kernel/pid.c index e432057f3b21..8cafe7e72ad2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -418,7 +418,9 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - rcu_lockdep_assert(rcu_read_lock_held()); + rcu_lockdep_assert(rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock()" + " protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 58f405b581e7..e7cb76dc18f5 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); - times->sum_exec_runtime += t->se.sum_exec_runtime; + times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: rcu_read_unlock(); @@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) struct task_cputime sum; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); if (!cputimer->running) { - cputimer->running = 1; /* * The POSIX timer interface allows for absolute time expiry * values through the TIMER_ABSTIME flag, therefore we have @@ -284,10 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) * it. */ thread_group_cputime(tsk, &sum); + raw_spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 1; update_gt_cputime(&cputimer->cputime, &sum); - } + } else + raw_spin_lock_irqsave(&cputimer->lock, flags); *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } /* @@ -312,7 +313,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock, cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = thread_group_sched_runtime(p); + thread_group_cputime(p, &cputime); + cpu->sched = cputime.sum_exec_runtime; break; } return 0; @@ -997,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig) struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } static u32 onecputick; @@ -1289,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (sig->cputimer.running) { struct task_cputime group_sample; - spin_lock(&sig->cputimer.lock); + raw_spin_lock(&sig->cputimer.lock); group_sample = sig->cputimer.cputime; - spin_unlock(&sig->cputimer.lock); + raw_spin_unlock(&sig->cputimer.lock); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3744c594b19b..cedd9982306a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,6 +27,7 @@ config HIBERNATION select HIBERNATE_CALLBACKS select LZO_COMPRESS select LZO_DECOMPRESS + select CRC32 ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -65,6 +66,9 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.txt>. +config ARCH_SAVE_PAGE_KEYS + bool + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c5ebc6a90643..07e0e28ffba7 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,8 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-$(CONFIG_PM) += main.o -obj-$(CONFIG_PM_SLEEP) += console.o +obj-$(CONFIG_PM) += main.o qos.o +obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 218e5af90156..b1dc456474b5 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -1,5 +1,5 @@ /* - * drivers/power/process.c - Functions for saving/restoring console. + * Functions for saving/restoring console. * * Originally from swsusp. */ @@ -10,7 +10,6 @@ #include <linux/module.h> #include "power.h" -#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; @@ -32,4 +31,3 @@ void pm_restore_console(void) vt_kmsg_redirect(orig_kmsg); } } -#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8f7b1db1ece1..1c53f7fad5f7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -14,6 +14,7 @@ #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> +#include <linux/async.h> #include <linux/kmod.h> #include <linux/delay.h> #include <linux/fs.h> @@ -29,12 +30,14 @@ #include "power.h" -static int nocompress = 0; -static int noresume = 0; +static int nocompress; +static int noresume; +static int resume_wait; +static int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; -int in_suspend __nosavedata = 0; +int in_suspend __nosavedata; enum { HIBERNATION_INVALID, @@ -334,13 +337,17 @@ int hibernation_snapshot(int platform_mode) if (error) goto Close; - error = dpm_prepare(PMSG_FREEZE); - if (error) - goto Complete_devices; - /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); if (error) + goto Close; + + error = freeze_kernel_threads(); + if (error) + goto Close; + + error = dpm_prepare(PMSG_FREEZE); + if (error) goto Complete_devices; suspend_console(); @@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode) * @platform_mode: If set, use platform driver to prepare for the transition. * * This routine must be called with pm_mutex held. If it is successful, control - * reappears in the restored target kernel in hibernation_snaphot(). + * reappears in the restored target kernel in hibernation_snapshot(). */ int hibernation_restore(int platform_mode) { @@ -650,6 +657,9 @@ int hibernate(void) flags |= SF_PLATFORM_MODE; if (nocompress) flags |= SF_NOCOMPRESS_MODE; + else + flags |= SF_CRC32_MODE; + pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); @@ -724,6 +734,12 @@ static int software_resume(void) pr_debug("PM: Checking hibernation image partition %s\n", resume_file); + if (resume_delay) { + printk(KERN_INFO "Waiting %dsec before reading resume device...\n", + resume_delay); + ssleep(resume_delay); + } + /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { @@ -732,6 +748,13 @@ static int software_resume(void) * to wait for this to finish. */ wait_for_device_probe(); + + if (resume_wait) { + while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) + msleep(10); + async_synchronize_full(); + } + /* * We can't depend on SCSI devices being available after loading * one of their modules until scsi_complete_async_scans() is @@ -1060,7 +1083,21 @@ static int __init noresume_setup(char *str) return 1; } +static int __init resumewait_setup(char *str) +{ + resume_wait = 1; + return 1; +} + +static int __init resumedelay_setup(char *str) +{ + resume_delay = simple_strtoul(str, NULL, 0); + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); +__setup("resumewait", resumewait_setup); +__setup("resumedelay=", resumedelay_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 6c601f871964..a52e88425a31 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -12,6 +12,8 @@ #include <linux/string.h> #include <linux/resume-trace.h> #include <linux/workqueue.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include "power.h" @@ -131,6 +133,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_test); #endif /* CONFIG_PM_DEBUG */ +#ifdef CONFIG_DEBUG_FS +static char *suspend_step_name(enum suspend_stat_step step) +{ + switch (step) { + case SUSPEND_FREEZE: + return "freeze"; + case SUSPEND_PREPARE: + return "prepare"; + case SUSPEND_SUSPEND: + return "suspend"; + case SUSPEND_SUSPEND_NOIRQ: + return "suspend_noirq"; + case SUSPEND_RESUME_NOIRQ: + return "resume_noirq"; + case SUSPEND_RESUME: + return "resume"; + default: + return ""; + } +} + +static int suspend_stats_show(struct seq_file *s, void *unused) +{ + int i, index, last_dev, last_errno, last_step; + + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; + last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + last_errno %= REC_FAILED_NUM; + last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + last_step %= REC_FAILED_NUM; + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + "success", suspend_stats.success, + "fail", suspend_stats.fail, + "failed_freeze", suspend_stats.failed_freeze, + "failed_prepare", suspend_stats.failed_prepare, + "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_noirq", + suspend_stats.failed_suspend_noirq, + "failed_resume", suspend_stats.failed_resume, + "failed_resume_noirq", + suspend_stats.failed_resume_noirq); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", + suspend_stats.failed_devs[last_dev]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_dev + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_stats.failed_devs[index]); + } + seq_printf(s, " last_failed_errno:\t%-d\n", + suspend_stats.errno[last_errno]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_errno + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-d\n", + suspend_stats.errno[index]); + } + seq_printf(s, " last_failed_step:\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[last_step])); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_step + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[index])); + } + + return 0; +} + +static int suspend_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, suspend_stats_show, NULL); +} + +static const struct file_operations suspend_stats_operations = { + .open = suspend_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init pm_debugfs_init(void) +{ + debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, + NULL, NULL, &suspend_stats_operations); + return 0; +} + +late_initcall(pm_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ + #endif /* CONFIG_PM_SLEEP */ struct kobject *power_kobj; @@ -194,6 +291,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } if (state < PM_SUSPEND_MAX && *s) error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else + suspend_stats.success++; #endif Exit: diff --git a/kernel/power/power.h b/kernel/power/power.h index 9a00a0a26280..23a2db1ec442 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -146,6 +146,7 @@ extern int swsusp_swap_in_use(void); */ #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 +#define SF_CRC32_MODE 4 /* kernel/power/hibernate.c */ extern int swsusp_check(void); @@ -228,7 +229,8 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - return freeze_processes(); + int error = freeze_processes(); + return error ? : freeze_kernel_threads(); } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index 0cf3a27a6c9d..addbbe5531bc 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -135,7 +135,7 @@ static int try_to_freeze_tasks(bool sig_only) } /** - * freeze_processes - tell processes to enter the refrigerator + * freeze_processes - Signal user space processes to enter the refrigerator. */ int freeze_processes(void) { @@ -143,20 +143,30 @@ int freeze_processes(void) printk("Freezing user space processes ... "); error = try_to_freeze_tasks(true); - if (error) - goto Exit; - printk("done.\n"); + if (!error) { + printk("done."); + oom_killer_disable(); + } + printk("\n"); + BUG_ON(in_atomic()); + + return error; +} + +/** + * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + */ +int freeze_kernel_threads(void) +{ + int error; printk("Freezing remaining freezable tasks ... "); error = try_to_freeze_tasks(false); - if (error) - goto Exit; - printk("done."); + if (!error) + printk("done."); - oom_killer_disable(); - Exit: - BUG_ON(in_atomic()); printk("\n"); + BUG_ON(in_atomic()); return error; } diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c index 37f05d0f0793..1c1797dd1d1d 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/power/qos.c @@ -29,7 +29,7 @@ /*#define DEBUG*/ -#include <linux/pm_qos_params.h> +#include <linux/pm_qos.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> @@ -45,62 +45,57 @@ #include <linux/uaccess.h> /* - * locking rule: all changes to requests or notifiers lists + * locking rule: all changes to constraints or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -enum pm_qos_type { - PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ -}; - -/* - * Note: The lockless read path depends on the CPU accessing - * target_value atomically. Atomic access is only guaranteed on all CPU - * types linux supports for 32 bit quantites - */ struct pm_qos_object { - struct plist_head requests; - struct blocking_notifier_head *notifiers; + struct pm_qos_constraints *constraints; struct miscdevice pm_qos_power_miscdev; char *name; - s32 target_value; /* Do not change to 64 bit */ - s32 default_value; - enum pm_qos_type type; }; static DEFINE_SPINLOCK(pm_qos_lock); static struct pm_qos_object null_pm_qos; + static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_object cpu_dma_pm_qos = { - .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), - .notifiers = &cpu_dma_lat_notifier, - .name = "cpu_dma_latency", +static struct pm_qos_constraints cpu_dma_constraints = { + .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, + .notifiers = &cpu_dma_lat_notifier, +}; +static struct pm_qos_object cpu_dma_pm_qos = { + .constraints = &cpu_dma_constraints, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -static struct pm_qos_object network_lat_pm_qos = { - .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), - .notifiers = &network_lat_notifier, - .name = "network_latency", +static struct pm_qos_constraints network_lat_constraints = { + .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN + .type = PM_QOS_MIN, + .notifiers = &network_lat_notifier, +}; +static struct pm_qos_object network_lat_pm_qos = { + .constraints = &network_lat_constraints, + .name = "network_latency", }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); -static struct pm_qos_object network_throughput_pm_qos = { - .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), - .notifiers = &network_throughput_notifier, - .name = "network_throughput", +static struct pm_qos_constraints network_tput_constraints = { + .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, + .notifiers = &network_throughput_notifier, +}; +static struct pm_qos_object network_throughput_pm_qos = { + .constraints = &network_tput_constraints, + .name = "network_throughput", }; @@ -127,17 +122,17 @@ static const struct file_operations pm_qos_power_fops = { }; /* unlocked internal variant */ -static inline int pm_qos_get_value(struct pm_qos_object *o) +static inline int pm_qos_get_value(struct pm_qos_constraints *c) { - if (plist_head_empty(&o->requests)) - return o->default_value; + if (plist_head_empty(&c->list)) + return c->default_value; - switch (o->type) { + switch (c->type) { case PM_QOS_MIN: - return plist_first(&o->requests)->prio; + return plist_first(&c->list)->prio; case PM_QOS_MAX: - return plist_last(&o->requests)->prio; + return plist_last(&c->list)->prio; default: /* runtime check for not using enum */ @@ -145,69 +140,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) } } -static inline s32 pm_qos_read_value(struct pm_qos_object *o) +s32 pm_qos_read_value(struct pm_qos_constraints *c) { - return o->target_value; + return c->target_value; } -static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) +static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { - o->target_value = value; + c->target_value = value; } -static void update_target(struct pm_qos_object *o, struct plist_node *node, - int del, int value) +/** + * pm_qos_update_target - manages the constraints list and calls the notifiers + * if needed + * @c: constraints data struct + * @node: request to add to the list, to update or to remove + * @action: action to take on the constraints list + * @value: value of the request to add or update + * + * This function returns 1 if the aggregated constraint value has changed, 0 + * otherwise. + */ +int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, + enum pm_qos_req_action action, int value) { unsigned long flags; - int prev_value, curr_value; + int prev_value, curr_value, new_value; spin_lock_irqsave(&pm_qos_lock, flags); - prev_value = pm_qos_get_value(o); - /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ - if (value != PM_QOS_DEFAULT_VALUE) { + prev_value = pm_qos_get_value(c); + if (value == PM_QOS_DEFAULT_VALUE) + new_value = c->default_value; + else + new_value = value; + + switch (action) { + case PM_QOS_REMOVE_REQ: + plist_del(node, &c->list); + break; + case PM_QOS_UPDATE_REQ: /* * to change the list, we atomically remove, reinit * with new value and add, then see if the extremal * changed */ - plist_del(node, &o->requests); - plist_node_init(node, value); - plist_add(node, &o->requests); - } else if (del) { - plist_del(node, &o->requests); - } else { - plist_add(node, &o->requests); + plist_del(node, &c->list); + case PM_QOS_ADD_REQ: + plist_node_init(node, new_value); + plist_add(node, &c->list); + break; + default: + /* no action */ + ; } - curr_value = pm_qos_get_value(o); - pm_qos_set_value(o, curr_value); + + curr_value = pm_qos_get_value(c); + pm_qos_set_value(c, curr_value); + spin_unlock_irqrestore(&pm_qos_lock, flags); - if (prev_value != curr_value) - blocking_notifier_call_chain(o->notifiers, + if (prev_value != curr_value) { + blocking_notifier_call_chain(c->notifiers, (unsigned long)curr_value, NULL); -} - -static int register_pm_qos_misc(struct pm_qos_object *qos) -{ - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - - return misc_register(&qos->pm_qos_power_miscdev); -} - -static int find_pm_qos_object_by_minor(int minor) -{ - int pm_qos_class; - - for (pm_qos_class = 0; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; + return 1; + } else { + return 0; } - return -1; } /** @@ -218,11 +217,11 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_request(int pm_qos_class) { - return pm_qos_read_value(pm_qos_array[pm_qos_class]); + return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); } EXPORT_SYMBOL_GPL(pm_qos_request); -int pm_qos_request_active(struct pm_qos_request_list *req) +int pm_qos_request_active(struct pm_qos_request *req) { return req->pm_qos_class != 0; } @@ -230,40 +229,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active); /** * pm_qos_add_request - inserts new qos request into the list - * @dep: pointer to a preallocated handle + * @req: pointer to a preallocated handle * @pm_qos_class: identifies which list of qos request to use * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters and initializes the pm_qos_request_list + * for the pm_qos_class of parameters and initializes the pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. */ -void pm_qos_add_request(struct pm_qos_request_list *dep, +void pm_qos_add_request(struct pm_qos_request *req, int pm_qos_class, s32 value) { - struct pm_qos_object *o = pm_qos_array[pm_qos_class]; - int new_value; + if (!req) /*guard against callers passing in null */ + return; - if (pm_qos_request_active(dep)) { + if (pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); return; } - if (value == PM_QOS_DEFAULT_VALUE) - new_value = o->default_value; - else - new_value = value; - plist_node_init(&dep->list, new_value); - dep->pm_qos_class = pm_qos_class; - update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); + req->pm_qos_class = pm_qos_class; + pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, + &req->node, PM_QOS_ADD_REQ, value); } EXPORT_SYMBOL_GPL(pm_qos_add_request); /** * pm_qos_update_request - modifies an existing qos request - * @pm_qos_req : handle to list element holding a pm_qos request to use + * @req : handle to list element holding a pm_qos request to use * @value: defines the qos request * * Updates an existing qos request for the pm_qos_class of parameters along @@ -271,56 +266,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); * * Attempts are made to make this code callable on hot code paths. */ -void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, +void pm_qos_update_request(struct pm_qos_request *req, s32 new_value) { - s32 temp; - struct pm_qos_object *o; - - if (!pm_qos_req) /*guard against callers passing in null */ + if (!req) /*guard against callers passing in null */ return; - if (!pm_qos_request_active(pm_qos_req)) { + if (!pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); return; } - o = pm_qos_array[pm_qos_req->pm_qos_class]; - - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = o->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->list.prio) - update_target(o, &pm_qos_req->list, 0, temp); + if (new_value != req->node.prio) + pm_qos_update_target( + pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); /** * pm_qos_remove_request - modifies an existing qos request - * @pm_qos_req: handle to request list element + * @req: handle to request list element * - * Will remove pm qos request from the list of requests and + * Will remove pm qos request from the list of constraints and * recompute the current target value for the pm_qos_class. Call this * on slow code paths. */ -void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) +void pm_qos_remove_request(struct pm_qos_request *req) { - struct pm_qos_object *o; - - if (pm_qos_req == NULL) + if (!req) /*guard against callers passing in null */ return; /* silent return to keep pcm code cleaner */ - if (!pm_qos_request_active(pm_qos_req)) { + if (!pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); return; } - o = pm_qos_array[pm_qos_req->pm_qos_class]; - update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); - memset(pm_qos_req, 0, sizeof(*pm_qos_req)); + pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_DEFAULT_VALUE); + memset(req, 0, sizeof(*req)); } EXPORT_SYMBOL_GPL(pm_qos_remove_request); @@ -337,7 +323,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) int retval; retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->notifiers, notifier); + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); return retval; } @@ -356,19 +343,43 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) int retval; retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->notifiers, notifier); + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); return retval; } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); +/* User space interface to PM QoS classes via misc devices */ +static int register_pm_qos_misc(struct pm_qos_object *qos) +{ + qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; + qos->pm_qos_power_miscdev.name = qos->name; + qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + + return misc_register(&qos->pm_qos_power_miscdev); +} + +static int find_pm_qos_object_by_minor(int minor) +{ + int pm_qos_class; + + for (pm_qos_class = 0; + pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { + if (minor == + pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) + return pm_qos_class; + } + return -1; +} + static int pm_qos_power_open(struct inode *inode, struct file *filp) { long pm_qos_class; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); + struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; @@ -383,7 +394,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) static int pm_qos_power_release(struct inode *inode, struct file *filp) { - struct pm_qos_request_list *req; + struct pm_qos_request *req; req = filp->private_data; pm_qos_remove_request(req); @@ -398,17 +409,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, { s32 value; unsigned long flags; - struct pm_qos_object *o; - struct pm_qos_request_list *pm_qos_req = filp->private_data; + struct pm_qos_request *req = filp->private_data; - if (!pm_qos_req) + if (!req) return -EINVAL; - if (!pm_qos_request_active(pm_qos_req)) + if (!pm_qos_request_active(req)) return -EINVAL; - o = pm_qos_array[pm_qos_req->pm_qos_class]; spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(o); + value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); @@ -418,7 +427,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - struct pm_qos_request_list *pm_qos_req; + struct pm_qos_request *req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) @@ -449,8 +458,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, return -EINVAL; } - pm_qos_req = filp->private_data; - pm_qos_update_request(pm_qos_req, value); + req = filp->private_data; + pm_qos_update_request(req, value); return count; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 06efa54f93d6..cbe2c1441392 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1339,6 +1339,9 @@ int hibernate_preallocate_memory(void) count += highmem; count -= totalreserve_pages; + /* Add number of pages required for page keys (s390 only). */ + size += page_key_additional_pages(saveable); + /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); @@ -1662,6 +1665,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; + /* Save page key for data page (s390 only). */ + page_key_read(buf + j); } } @@ -1821,6 +1826,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; + /* Extract and buffer page key for data page (s390 only). */ + page_key_memorize(buf + j); + if (memory_bm_pfn_present(bm, buf[j])) memory_bm_set_bit(bm, buf[j]); else @@ -2223,6 +2231,11 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + /* Allocate buffer for page keys. */ + error = page_key_alloc(nr_copy_pages); + if (error) + return error; + } else if (handle->cur <= nr_meta_pages + 1) { error = unpack_orig_pfns(buffer, ©_bm); if (error) @@ -2243,6 +2256,8 @@ int snapshot_write_next(struct snapshot_handle *handle) } } else { copy_last_highmem_page(); + /* Restore page key for data page (s390 only). */ + page_key_write(handle->buffer); handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); @@ -2264,6 +2279,9 @@ int snapshot_write_next(struct snapshot_handle *handle) void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); + /* Restore page key for data page (s390 only). */ + page_key_write(handle->buffer); + page_key_free(); /* Free only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b6b71ad2208f..fdd4263b995d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -104,7 +104,10 @@ static int suspend_prepare(void) goto Finish; error = suspend_freeze_processes(); - if (!error) + if (error) { + suspend_stats.failed_freeze++; + dpm_save_failed_step(SUSPEND_FREEZE); + } else return 0; suspend_thaw_processes(); @@ -315,8 +318,16 @@ int enter_state(suspend_state_t state) */ int pm_suspend(suspend_state_t state) { - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) - return enter_state(state); + int ret; + if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { + ret = enter_state(state); + if (ret) { + suspend_stats.fail++; + dpm_save_failed_errno(ret); + } else + suspend_stats.success++; + return ret; + } return -EINVAL; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c97c3a0eee3..11a594c4ba25 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -27,6 +27,10 @@ #include <linux/slab.h> #include <linux/lzo.h> #include <linux/vmalloc.h> +#include <linux/cpumask.h> +#include <linux/atomic.h> +#include <linux/kthread.h> +#include <linux/crc32.h> #include "power.h" @@ -43,8 +47,7 @@ * allocated and populated one at a time, so we only need one memory * page to set up the entire structure. * - * During resume we also only need to use one swap_map_page structure - * at a time. + * During resume we pick up all swap_map_page structures into a list. */ #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) @@ -54,6 +57,11 @@ struct swap_map_page { sector_t next_swap; }; +struct swap_map_page_list { + struct swap_map_page *map; + struct swap_map_page_list *next; +}; + /** * The swap_map_handle structure is used for handling swap in * a file-alike way @@ -61,13 +69,18 @@ struct swap_map_page { struct swap_map_handle { struct swap_map_page *cur; + struct swap_map_page_list *maps; sector_t cur_swap; sector_t first_sector; unsigned int k; + unsigned long nr_free_pages, written; + u32 crc32; }; struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - + sizeof(u32)]; + u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; @@ -199,6 +212,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; + if (flags & SF_CRC32_MODE) + swsusp_header->crc32 = handle->crc32; error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { @@ -245,6 +260,7 @@ static int swsusp_swap_check(void) static int write_page(void *buf, sector_t offset, struct bio **bio_chain) { void *src; + int ret; if (!offset) return -ENOSPC; @@ -254,9 +270,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (src) { copy_page(src, buf); } else { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - src = buf; + ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (src) { + copy_page(src, buf); + } else { + WARN_ON_ONCE(1); + bio_chain = NULL; /* Go synchronous */ + src = buf; + } } } else { src = buf; @@ -293,6 +317,8 @@ static int get_swap_writer(struct swap_map_handle *handle) goto err_rel; } handle->k = 0; + handle->nr_free_pages = nr_free_pages() >> 1; + handle->written = 0; handle->first_sector = handle->cur_swap; return 0; err_rel: @@ -316,20 +342,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = hib_wait_on_bio_chain(bio_chain); - if (error) - goto out; offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, NULL); + error = write_page(handle->cur, handle->cur_swap, bio_chain); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } + if (bio_chain && ++handle->written > handle->nr_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + handle->written = 0; + } out: return error; } @@ -372,6 +401,13 @@ static int swap_writer_finish(struct swap_map_handle *handle, LZO_HEADER, PAGE_SIZE) #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) +/* Maximum number of threads for compression/decompression. */ +#define LZO_THREADS 3 + +/* Maximum number of pages for read buffering. */ +#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) + + /** * save_image - save the suspend image data */ @@ -419,6 +455,92 @@ static int save_image(struct swap_map_handle *handle, return ret; } +/** + * Structure used for CRC32. + */ +struct crc_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + unsigned run_threads; /* nr current threads */ + wait_queue_head_t go; /* start crc update */ + wait_queue_head_t done; /* crc update done */ + u32 *crc32; /* points to handle's crc32 */ + size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ + unsigned char *unc[LZO_THREADS]; /* uncompressed data */ +}; + +/** + * CRC32 update function that runs in its own thread. + */ +static int crc32_threadfn(void *data) +{ + struct crc_data *d = data; + unsigned i; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + for (i = 0; i < d->run_threads; i++) + *d->crc32 = crc32_le(*d->crc32, + d->unc[i], *d->unc_len[i]); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} +/** + * Structure used for LZO data compression. + */ +struct cmp_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start compression */ + wait_queue_head_t done; /* compression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ +}; + +/** + * Compression function that runs in its own thread. + */ +static int lzo_compress_threadfn(void *data) +{ + struct cmp_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->ret = lzo1x_1_compress(d->unc, d->unc_len, + d->cmp + LZO_HEADER, &d->cmp_len, + d->wrk); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} /** * save_image_lzo - Save the suspend image data compressed with LZO. @@ -437,42 +559,93 @@ static int save_image_lzo(struct swap_map_handle *handle, struct bio *bio; struct timeval start; struct timeval stop; - size_t off, unc_len, cmp_len; - unsigned char *unc, *cmp, *wrk, *page; + size_t off; + unsigned thr, run_threads, nr_threads; + unsigned char *page = NULL; + struct cmp_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for compression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out_clean; } - wrk = vmalloc(LZO1X_1_MEM_COMPRESS); - if (!wrk) { - printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); - free_page((unsigned long)page); - return -ENOMEM; + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct cmp_data, go)); - unc = vmalloc(LZO_UNC_SIZE); - if (!unc) { - printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); - vfree(wrk); - free_page((unsigned long)page); - return -ENOMEM; + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the compression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_compress_threadfn, + &data[thr], + "image_compress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start compression threads\n"); + ret = -ENOMEM; + goto out_clean; + } } - cmp = vmalloc(LZO_CMP_SIZE); - if (!cmp) { - printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); - vfree(unc); - vfree(wrk); - free_page((unsigned long)page); - return -ENOMEM; + /* + * Adjust number of free pages after all allocations have been done. + * We don't want to run out of pages when writing. + */ + handle->nr_free_pages = nr_free_pages() >> 1; + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; } printk(KERN_INFO + "PM: Using %u thread(s) for compression.\n" "PM: Compressing and saving image data (%u pages) ... ", - nr_to_write); + nr_threads, nr_to_write); m = nr_to_write / 100; if (!m) m = 1; @@ -480,55 +653,83 @@ static int save_image_lzo(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for (;;) { - for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { - ret = snapshot_read_next(snapshot); - if (ret < 0) - goto out_finish; - - if (!ret) + for (thr = 0; thr < nr_threads; thr++) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(data[thr].unc + off, + data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", + nr_pages / m); + nr_pages++; + } + if (!off) break; - memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); + data[thr].unc_len = off; - if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); } - if (!off) + if (!thr) break; - unc_len = off; - ret = lzo1x_1_compress(unc, unc_len, - cmp + LZO_HEADER, &cmp_len, wrk); - if (ret < 0) { - printk(KERN_ERR "PM: LZO compression failed\n"); - break; - } + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); - if (unlikely(!cmp_len || - cmp_len > lzo1x_worst_compress(unc_len))) { - printk(KERN_ERR "PM: Invalid LZO compressed length\n"); - ret = -1; - break; - } + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); - *(size_t *)cmp = cmp_len; + ret = data[thr].ret; - /* - * Given we are writing one page at a time to disk, we copy - * that much from the buffer, although the last bit will likely - * be smaller than full page. This is OK - we saved the length - * of the compressed data, so any garbage at the end will be - * discarded when we read it. - */ - for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { - memcpy(page, cmp + off, PAGE_SIZE); + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + goto out_finish; + } - ret = swap_write_page(handle, page, &bio); - if (ret) + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(data[thr].unc_len))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; goto out_finish; + } + + *(size_t *)data[thr].cmp = data[thr].cmp_len; + + /* + * Given we are writing one page at a time to disk, we + * copy that much from the buffer, although the last + * bit will likely be smaller than full page. This is + * OK - we saved the length of the compressed data, so + * any garbage at the end will be discarded when we + * read it. + */ + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(page, data[thr].cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } } + + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); } out_finish: @@ -536,16 +737,25 @@ out_finish: do_gettimeofday(&stop); if (!ret) ret = err2; - if (!ret) + if (!ret) { printk(KERN_CONT "\b\b\b\bdone\n"); - else + } else { printk(KERN_CONT "\n"); + } swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - - vfree(cmp); - vfree(unc); - vfree(wrk); - free_page((unsigned long)page); +out_clean: + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) free_page((unsigned long)page); return ret; } @@ -625,8 +835,15 @@ out_finish: static void release_swap_reader(struct swap_map_handle *handle) { - if (handle->cur) - free_page((unsigned long)handle->cur); + struct swap_map_page_list *tmp; + + while (handle->maps) { + if (handle->maps->map) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + } handle->cur = NULL; } @@ -634,22 +851,46 @@ static int get_swap_reader(struct swap_map_handle *handle, unsigned int *flags_p) { int error; + struct swap_map_page_list *tmp, *last; + sector_t offset; *flags_p = swsusp_header->flags; if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); - if (!handle->cur) - return -ENOMEM; + handle->cur = NULL; + last = handle->maps = NULL; + offset = swsusp_header->image; + while (offset) { + tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + if (!tmp) { + release_swap_reader(handle); + return -ENOMEM; + } + memset(tmp, 0, sizeof(*tmp)); + if (!handle->maps) + handle->maps = tmp; + if (last) + last->next = tmp; + last = tmp; + + tmp->map = (struct swap_map_page *) + __get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!tmp->map) { + release_swap_reader(handle); + return -ENOMEM; + } - error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); - if (error) { - release_swap_reader(handle); - return error; + error = hib_bio_read_page(offset, tmp->map, NULL); + if (error) { + release_swap_reader(handle); + return error; + } + offset = tmp->map->next_swap; } handle->k = 0; + handle->cur = handle->maps->map; return 0; } @@ -658,6 +899,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, { sector_t offset; int error; + struct swap_map_page_list *tmp; if (!handle->cur) return -EINVAL; @@ -668,13 +910,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = hib_wait_on_bio_chain(bio_chain); handle->k = 0; - offset = handle->cur->next_swap; - if (!offset) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + if (!handle->maps) release_swap_reader(handle); - else if (!error) - error = hib_bio_read_page(offset, handle->cur, NULL); + else + handle->cur = handle->maps->map; } return error; } @@ -697,7 +941,7 @@ static int load_image(struct swap_map_handle *handle, unsigned int nr_to_read) { unsigned int m; - int error = 0; + int ret = 0; struct timeval start; struct timeval stop; struct bio *bio; @@ -713,15 +957,15 @@ static int load_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot); - if (error <= 0) + ret = snapshot_write_next(snapshot); + if (ret <= 0) break; - error = swap_read_page(handle, data_of(*snapshot), &bio); - if (error) + ret = swap_read_page(handle, data_of(*snapshot), &bio); + if (ret) break; if (snapshot->sync_read) - error = hib_wait_on_bio_chain(&bio); - if (error) + ret = hib_wait_on_bio_chain(&bio); + if (ret) break; if (!(nr_pages % m)) printk("\b\b\b\b%3d%%", nr_pages / m); @@ -729,17 +973,61 @@ static int load_image(struct swap_map_handle *handle, } err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) { + if (!ret) + ret = err2; + if (!ret) { printk("\b\b\b\bdone\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + ret = -ENODATA; } else printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - return error; + return ret; +} + +/** + * Structure used for LZO data decompression. + */ +struct dec_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start decompression */ + wait_queue_head_t done; /* decompression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ +}; + +/** + * Deompression function that runs in its own thread. + */ +static int lzo_decompress_threadfn(void *data) +{ + struct dec_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->unc_len = LZO_UNC_SIZE; + d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, + d->unc, &d->unc_len); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; } /** @@ -753,50 +1041,120 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned int nr_to_read) { unsigned int m; - int error = 0; + int ret = 0; + int eof = 0; struct bio *bio; struct timeval start; struct timeval stop; unsigned nr_pages; - size_t i, off, unc_len, cmp_len; - unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; - - for (i = 0; i < LZO_CMP_PAGES; i++) { - page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (!page[i]) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + size_t off; + unsigned i, thr, run_threads, nr_threads; + unsigned ring = 0, pg = 0, ring_size = 0, + have = 0, want, need, asked = 0; + unsigned long read_pages; + unsigned char **page = NULL; + struct dec_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for decompression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = vmalloc(sizeof(*page) * LZO_READ_PAGES); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } - while (i) - free_page((unsigned long)page[--i]); + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct dec_data, go)); - return -ENOMEM; + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the decompression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_decompress_threadfn, + &data[thr], + "image_decompress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start decompression threads\n"); + ret = -ENOMEM; + goto out_clean; } } - unc = vmalloc(LZO_UNC_SIZE); - if (!unc) { - printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); - - for (i = 0; i < LZO_CMP_PAGES; i++) - free_page((unsigned long)page[i]); - - return -ENOMEM; + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; } - cmp = vmalloc(LZO_CMP_SIZE); - if (!cmp) { - printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } - vfree(unc); - for (i = 0; i < LZO_CMP_PAGES; i++) - free_page((unsigned long)page[i]); + /* + * Adjust number of pages for read buffering, in case we are short. + */ + read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; + read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); - return -ENOMEM; + for (i = 0; i < read_pages; i++) { + page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + __GFP_WAIT | __GFP_HIGH : + __GFP_WAIT); + if (!page[i]) { + if (i < LZO_CMP_PAGES) { + ring_size = i; + printk(KERN_ERR + "PM: Failed to allocate LZO pages\n"); + ret = -ENOMEM; + goto out_clean; + } else { + break; + } + } } + want = ring_size = i; printk(KERN_INFO + "PM: Using %u thread(s) for decompression.\n" "PM: Loading and decompressing image data (%u pages) ... ", - nr_to_read); + nr_threads, nr_to_read); m = nr_to_read / 100; if (!m) m = 1; @@ -804,85 +1162,189 @@ static int load_image_lzo(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); - error = snapshot_write_next(snapshot); - if (error <= 0) + ret = snapshot_write_next(snapshot); + if (ret <= 0) goto out_finish; - for (;;) { - error = swap_read_page(handle, page[0], NULL); /* sync */ - if (error) - break; - - cmp_len = *(size_t *)page[0]; - if (unlikely(!cmp_len || - cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { - printk(KERN_ERR "PM: Invalid LZO compressed length\n"); - error = -1; - break; + for(;;) { + for (i = 0; !eof && i < want; i++) { + ret = swap_read_page(handle, page[ring], &bio); + if (ret) { + /* + * On real read error, finish. On end of data, + * set EOF flag and just exit the read loop. + */ + if (handle->cur && + handle->cur->entries[handle->k]) { + goto out_finish; + } else { + eof = 1; + break; + } + } + if (++ring >= ring_size) + ring = 0; } + asked += i; + want -= i; - for (off = PAGE_SIZE, i = 1; - off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { - error = swap_read_page(handle, page[i], &bio); - if (error) + /* + * We are out of data, wait for some more. + */ + if (!have) { + if (!asked) + break; + + ret = hib_wait_on_bio_chain(&bio); + if (ret) goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; } - error = hib_wait_on_bio_chain(&bio); /* need all data now */ - if (error) - goto out_finish; - - for (off = 0, i = 0; - off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { - memcpy(cmp + off, page[i], PAGE_SIZE); + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + crc->run_threads = 0; } - unc_len = LZO_UNC_SIZE; - error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, - unc, &unc_len); - if (error < 0) { - printk(KERN_ERR "PM: LZO decompression failed\n"); - break; + for (thr = 0; have && thr < nr_threads; thr++) { + data[thr].cmp_len = *(size_t *)page[pg]; + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + PAGE_SIZE); + if (need > have) { + if (eof > 1) { + ret = -1; + goto out_finish; + } + break; + } + + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(data[thr].cmp + off, + page[pg], PAGE_SIZE); + have--; + want++; + if (++pg >= ring_size) + pg = 0; + } + + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); } - if (unlikely(!unc_len || - unc_len > LZO_UNC_SIZE || - unc_len & (PAGE_SIZE - 1))) { - printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); - error = -1; - break; + /* + * Wait for more data while we are decompressing. + */ + if (have < LZO_CMP_PAGES && asked) { + ret = hib_wait_on_bio_chain(&bio); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; } - for (off = 0; off < unc_len; off += PAGE_SIZE) { - memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; + if (ret < 0) { + printk(KERN_ERR + "PM: LZO decompression failed\n"); + goto out_finish; + } - error = snapshot_write_next(snapshot); - if (error <= 0) + if (unlikely(!data[thr].unc_len || + data[thr].unc_len > LZO_UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR + "PM: Invalid LZO uncompressed length\n"); + ret = -1; goto out_finish; + } + + for (off = 0; + off < data[thr].unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), + data[thr].unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + + ret = snapshot_write_next(snapshot); + if (ret <= 0) { + crc->run_threads = thr + 1; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + goto out_finish; + } + } } + + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); } out_finish: + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + } do_gettimeofday(&stop); - if (!error) { + if (!ret) { printk("\b\b\b\bdone\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + ret = -ENODATA; + if (!ret) { + if (swsusp_header->flags & SF_CRC32_MODE) { + if(handle->crc32 != swsusp_header->crc32) { + printk(KERN_ERR + "PM: Invalid image CRC32!\n"); + ret = -ENODATA; + } + } + } } else printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - - vfree(cmp); - vfree(unc); - for (i = 0; i < LZO_CMP_PAGES; i++) +out_clean: + for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) vfree(page); - return error; + return ret; } /** diff --git a/kernel/printk.c b/kernel/printk.c index 28a40d8171b8..b7da18391c38 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -100,7 +100,7 @@ static int console_locked, console_suspended; * It is also used in interesting ways to provide interlocking in * console_unlock();. */ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -212,7 +212,7 @@ void __init setup_log_buf(int early) return; } - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; @@ -230,7 +230,7 @@ void __init setup_log_buf(int early) log_start -= offset; con_start -= offset; log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); pr_info("log_buf_len: %d\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (error) goto out; i = 0; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (!error && (log_start != log_end) && i < len) { c = LOG_BUF(log_start); log_start++; - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; i++; cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (!error) error = i; break; @@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) count = len; if (count > log_buf_len) count = log_buf_len; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (count > logged_chars) count = logged_chars; if (do_clear) @@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (j + log_buf_len < log_end) break; c = LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (error) break; error = i; @@ -689,7 +689,7 @@ static void zap_locks(void) oops_timestamp = jiffies; /* If a crash is occurring, make sure we can't deadlock */ - spin_lock_init(&logbuf_lock); + raw_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ sema_init(&console_sem, 1); } @@ -802,9 +802,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); if (wake) up(&console_sem); + raw_spin_unlock(&logbuf_lock); return retval; } static const char recursion_bug_msg [] = @@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) } lockdep_off(); - spin_lock(&logbuf_lock); + raw_spin_lock(&logbuf_lock); printk_cpu = this_cpu; if (recursion_bug) { @@ -1257,14 +1257,14 @@ void console_unlock(void) again: for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); start_critical_timings(); @@ -1276,7 +1276,7 @@ again: if (unlikely(exclusive_console)) exclusive_console = NULL; - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); up(&console_sem); @@ -1286,13 +1286,13 @@ again: * there's a new owner and the console_unlock() from them will do the * flush, no worries. */ - spin_lock(&logbuf_lock); + raw_spin_lock(&logbuf_lock); if (con_start != log_end) retry = 1; - spin_unlock_irqrestore(&logbuf_lock, flags); if (retry && console_trylock()) goto again; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) wake_up_klogd(); } @@ -1522,9 +1522,9 @@ void register_console(struct console *newcon) * console_unlock(); will print out the buffered messages * for us. */ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -1731,10 +1731,10 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); end = log_end & LOG_BUF_MASK; chars = logged_chars; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (chars > end) { s1 = log_buf + log_buf_len - chars + end; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9de3ecfd20f9..a70d2a5d8c7b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -744,20 +744,17 @@ int ptrace_request(struct task_struct *child, long request, break; si = child->last_siginfo; - if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) - break; - - child->jobctl |= JOBCTL_LISTENING; - - /* - * If NOTIFY is set, it means event happened between start - * of this trap and now. Trigger re-trap immediately. - */ - if (child->jobctl & JOBCTL_TRAP_NOTIFY) - signal_wake_up(child, true); - + if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) { + child->jobctl |= JOBCTL_LISTENING; + /* + * If NOTIFY is set, it means event happened between + * start of this trap and now. Trigger re-trap. + */ + if (child->jobctl & JOBCTL_TRAP_NOTIFY) + signal_wake_up(child, true); + ret = 0; + } unlock_task_sighand(child, &flags); - ret = 0; break; case PTRACE_DETACH: /* detach a process that was attached. */ diff --git a/kernel/rcu.h b/kernel/rcu.h new file mode 100644 index 000000000000..f600868d550d --- /dev/null +++ b/kernel/rcu.h @@ -0,0 +1,85 @@ +/* + * Read-Copy Update definitions shared among RCU implementations. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2011 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#ifndef __LINUX_RCU_H +#define __LINUX_RCU_H + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(stmt) stmt +#else /* #ifdef CONFIG_RCU_TRACE */ +#define RCU_TRACE(stmt) +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* + * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally + * by call_rcu() and rcu callback execution, and are therefore not part of the + * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. + */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +# define STATE_RCU_HEAD_READY 0 +# define STATE_RCU_HEAD_QUEUED 1 + +extern struct debug_obj_descr rcuhead_debug_descr; + +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ + WARN_ON_ONCE((unsigned long)head & 0x3); + debug_object_activate(head, &rcuhead_debug_descr); + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_READY, + STATE_RCU_HEAD_QUEUED); +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_QUEUED, + STATE_RCU_HEAD_READY); + debug_object_deactivate(head, &rcuhead_debug_descr); +} +#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ +} +#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +extern void kfree(const void *); + +static inline void __rcu_reclaim(char *rn, struct rcu_head *head) +{ + unsigned long offset = (unsigned long)head->func; + + if (__is_kfree_rcu_offset(offset)) { + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + kfree((void *)head - offset); + } else { + RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + head->func(head); + } +} + +#endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ddddb320be61..ca0d23b6b3e8 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,6 +46,11 @@ #include <linux/module.h> #include <linux/hardirq.h> +#define CREATE_TRACE_POINTS +#include <trace/events/rcu.h> + +#include "rcu.h" + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = @@ -94,11 +99,16 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; + /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. */ -void wakeme_after_rcu(struct rcu_head *head) +static void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -106,6 +116,20 @@ void wakeme_after_rcu(struct rcu_head *head) complete(&rcu->completion); } +void wait_rcu_gp(call_rcu_func_t crf) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + crf(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(wait_rcu_gp); + #ifdef CONFIG_PROVE_RCU /* * wrapper function to avoid #include problems. diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 7bbac7d0f5ab..da775c87f27f 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -37,16 +37,17 @@ #include <linux/cpu.h> #include <linux/prefetch.h> -/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ -static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); -static unsigned long have_rcu_kthread_work; +#ifdef CONFIG_RCU_TRACE +#include <trace/events/rcu.h> +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +#include "rcu.h" /* Forward declarations for rcutiny_plugin.h. */ struct rcu_ctrlblk; -static void invoke_rcu_kthread(void); -static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static int rcu_kthread(void *arg); +static void invoke_rcu_callbacks(void); +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); @@ -96,16 +97,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) } /* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. - */ -static void invoke_rcu_kthread(void) -{ - have_rcu_kthread_work = 1; - wake_up(&rcu_kthread_wq); -} - -/* * Record an rcu quiescent state. And an rcu_bh quiescent state while we * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. @@ -117,7 +108,7 @@ void rcu_sched_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); local_irq_restore(flags); } @@ -130,7 +121,7 @@ void rcu_bh_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); local_irq_restore(flags); } @@ -154,18 +145,23 @@ void rcu_check_callbacks(int cpu, int user) * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure * whose grace period has elapsed. */ -static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { + char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; RCU_TRACE(int cb_count = 0); /* If no RCU callbacks ready to invoke, just return. */ - if (&rcp->rcucblist == rcp->donetail) + if (&rcp->rcucblist == rcp->donetail) { + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); return; + } /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -176,49 +172,26 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ + RCU_TRACE(rn = rcp->name); while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim(list); + __rcu_reclaim(rn, list); local_bh_enable(); list = next; RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); } -/* - * This kthread invokes RCU callbacks whose grace periods have - * elapsed. It is awakened as needed, and takes the place of the - * RCU_SOFTIRQ that was used previously for this purpose. - * This is a kthread, but it is never stopped, at least not until - * the system goes down. - */ -static int rcu_kthread(void *arg) +static void rcu_process_callbacks(struct softirq_action *unused) { - unsigned long work; - unsigned long morework; - unsigned long flags; - - for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); - morework = rcu_boost(); - local_irq_save(flags); - work = have_rcu_kthread_work; - have_rcu_kthread_work = morework; - local_irq_restore(flags); - if (work) { - rcu_process_callbacks(&rcu_sched_ctrlblk); - rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); - } - schedule_timeout_interruptible(1); /* Leave CPU for others. */ - } - - return 0; /* Not reached, but needed to shut gcc up. */ + __rcu_process_callbacks(&rcu_sched_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -280,45 +253,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); - -void rcu_barrier_bh(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -void rcu_barrier_sched(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -/* - * Spawn the kthread that invokes RCU callbacks. - */ -static int __init rcu_spawn_kthreads(void) -{ - struct sched_param sp; - - rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); - sp.sched_priority = RCU_BOOST_PRIO; - sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); - return 0; -} -early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f259c676195f..02aa7139861c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -26,29 +26,26 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ RCU_TRACE(long qlen); /* Number of pending CBs. */ + RCU_TRACE(char *name); /* Name of RCU type. */ }; /* Definition for rcupdate control block. */ static struct rcu_ctrlblk rcu_sched_ctrlblk = { .donetail = &rcu_sched_ctrlblk.rcucblist, .curtail = &rcu_sched_ctrlblk.rcucblist, + RCU_TRACE(.name = "rcu_sched") }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .donetail = &rcu_bh_ctrlblk.rcucblist, .curtail = &rcu_bh_ctrlblk.rcucblist, + RCU_TRACE(.name = "rcu_bh") }; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -131,6 +128,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), + RCU_TRACE(.rcb.name = "rcu_preempt") }; static int rcu_preempted_readers_exp(void); @@ -247,6 +245,13 @@ static void show_tiny_preempt_stats(struct seq_file *m) #include "rtmutex_common.h" +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO + +/* Controls for rcu_kthread() kthread. */ +static struct task_struct *rcu_kthread_task; +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static unsigned long have_rcu_kthread_work; + /* * Carry out RCU priority boosting on the task indicated by ->boost_tasks, * and advance ->boost_tasks to the next task in the ->blkd_tasks list. @@ -334,7 +339,7 @@ static int rcu_initiate_boost(void) if (rcu_preempt_ctrlblk.exp_tasks == NULL) rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; - invoke_rcu_kthread(); + invoke_rcu_callbacks(); } else RCU_TRACE(rcu_initiate_boost_trace()); return 1; @@ -353,14 +358,6 @@ static void rcu_preempt_boost_start_gp(void) #else /* #ifdef CONFIG_RCU_BOOST */ /* - * If there is no RCU priority boosting, we don't boost. - */ -static int rcu_boost(void) -{ - return 0; -} - -/* * If there is no RCU priority boosting, we don't initiate boosting, * but we do indicate whether there are blocked readers blocking the * current grace period. @@ -427,7 +424,7 @@ static void rcu_preempt_cpu_qs(void) /* If there are done callbacks, cause them to be invoked. */ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); } /* @@ -648,7 +645,7 @@ static void rcu_preempt_check_callbacks(void) rcu_preempt_cpu_qs(); if (&rcu_preempt_ctrlblk.rcb.rcucblist != rcu_preempt_ctrlblk.rcb.donetail) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && rcu_preempt_running_reader()) @@ -674,7 +671,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_preempt_process_callbacks(void) { - rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); } /* @@ -697,20 +694,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - /* * synchronize_rcu - wait until a grace period has elapsed. * @@ -864,15 +847,6 @@ static void show_tiny_preempt_stats(struct seq_file *m) #endif /* #ifdef CONFIG_RCU_TRACE */ /* - * Because preemptible RCU does not exist, it is never necessary to - * boost preempted RCU readers. - */ -static int rcu_boost(void) -{ - return 0; -} - -/* * Because preemptible RCU does not exist, it never has any callbacks * to check. */ @@ -898,6 +872,78 @@ static void rcu_preempt_process_callbacks(void) #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + +/* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. + */ +static void invoke_rcu_callbacks(void) +{ + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); +} + +/* + * This kthread invokes RCU callbacks whose grace periods have + * elapsed. It is awakened as needed, and takes the place of the + * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. + * This is a kthread, but it is never stopped, at least not until + * the system goes down. + */ +static int rcu_kthread(void *arg) +{ + unsigned long work; + unsigned long morework; + unsigned long flags; + + for (;;) { + wait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); + morework = rcu_boost(); + local_irq_save(flags); + work = have_rcu_kthread_work; + have_rcu_kthread_work = morework; + local_irq_restore(flags); + if (work) + rcu_process_callbacks(NULL); + schedule_timeout_interruptible(1); /* Leave CPU for others. */ + } + + return 0; /* Not reached, but needed to shut gcc up. */ +} + +/* + * Spawn the kthread that invokes RCU callbacks. + */ +static int __init rcu_spawn_kthreads(void) +{ + struct sched_param sp; + + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); + sp.sched_priority = RCU_BOOST_PRIO; + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); + return 0; +} +early_initcall(rcu_spawn_kthreads); + +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Start up softirq processing of callbacks. + */ +void invoke_rcu_callbacks(void) +{ + raise_softirq(RCU_SOFTIRQ); +} + +void rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include <linux/kernel_stat.h> @@ -913,12 +959,6 @@ void __init rcu_scheduler_starting(void) #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#ifdef CONFIG_RCU_BOOST -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO -#else /* #ifdef CONFIG_RCU_BOOST */ -#define RCU_BOOST_PRIO 1 -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - #ifdef CONFIG_RCU_TRACE #ifdef CONFIG_RCU_BOOST diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 98f51b13bb7e..764825c2685c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -73,7 +73,7 @@ module_param(nreaders, int, 0444); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); module_param(nfakewriters, int, 0444); MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0444); +module_param(stat_interval, int, 0644); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); module_param(verbose, bool, 0444); MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); @@ -480,30 +480,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); } -struct rcu_bh_torture_synchronize { - struct rcu_head head; - struct completion completion; -}; - -static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) -{ - struct rcu_bh_torture_synchronize *rcu; - - rcu = container_of(head, struct rcu_bh_torture_synchronize, head); - complete(&rcu->completion); -} - -static void rcu_bh_torture_synchronize(void) -{ - struct rcu_bh_torture_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} - static struct rcu_torture_ops rcu_bh_ops = { .init = NULL, .cleanup = NULL, @@ -512,7 +488,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, + .sync = synchronize_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -528,7 +504,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, + .sync = synchronize_rcu_bh, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -536,6 +512,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .name = "rcu_bh_sync" }; +static struct rcu_torture_ops rcu_bh_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu_bh_expedited, + .cb_barrier = NULL, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh_expedited" +}; + /* * Definitions for srcu torture testing. */ @@ -659,11 +651,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); } -static void sched_torture_synchronize(void) -{ - synchronize_sched(); -} - static struct rcu_torture_ops sched_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, @@ -672,7 +659,7 @@ static struct rcu_torture_ops sched_ops = { .readunlock = sched_torture_read_unlock, .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, - .sync = sched_torture_synchronize, + .sync = synchronize_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -688,7 +675,7 @@ static struct rcu_torture_ops sched_sync_ops = { .readunlock = sched_torture_read_unlock, .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, - .sync = sched_torture_synchronize, + .sync = synchronize_sched, .cb_barrier = NULL, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -754,7 +741,7 @@ static int rcu_torture_boost(void *arg) do { /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (jiffies - oldstarttime > ULONG_MAX / 2) { + while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_uninterruptible(1); rcu_stutter_wait("rcu_torture_boost"); if (kthread_should_stop() || @@ -765,7 +752,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (jiffies - endtime > ULONG_MAX / 2) { + while (ULONG_CMP_LT(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!rbi.inflight) { smp_mb(); /* RCU core before ->inflight = 1. */ @@ -792,7 +779,8 @@ static int rcu_torture_boost(void *arg) * interval. Besides, we are running at RT priority, * so delays should be relatively rare. */ - while (oldstarttime == boost_starttime) { + while (oldstarttime == boost_starttime && + !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { boost_starttime = jiffies + test_boost_interval * HZ; @@ -809,11 +797,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. */ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - destroy_rcu_head_on_stack(&rbi.rcu); rcutorture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + destroy_rcu_head_on_stack(&rbi.rcu); return 0; } @@ -831,11 +819,13 @@ rcu_torture_fqs(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (jiffies - fqs_resume_time > LONG_MAX) { + while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + !kthread_should_stop()) { schedule_timeout_interruptible(1); } fqs_burst_remaining = fqs_duration; - while (fqs_burst_remaining > 0) { + while (fqs_burst_remaining > 0 && + !kthread_should_stop()) { cur_ops->fqs(); udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; @@ -1280,8 +1270,9 @@ static int rcutorture_booster_init(int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, + cpu_to_node(cpu), + "rcu_torture_boost"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); @@ -1424,7 +1415,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, - &rcu_bh_ops, &rcu_bh_sync_ops, + &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, &srcu_ops, &srcu_expedited_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ba06207b1dd3..e234eb92a177 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -52,13 +52,16 @@ #include <linux/prefetch.h> #include "rcutree.h" +#include <trace/events/rcu.h> + +#include "rcu.h" /* Data structures. */ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; #define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname.node[0] }, \ + .level = { &structname##_state.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. */ \ NUM_RCU_LVL_1, \ @@ -69,17 +72,17 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ .name = #structname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); +struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); -#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ - /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -156,33 +157,41 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) * Note a quiescent state. Because we do not need to know * how many quiescent states passed, just if there was at least * one since the start of the grace period, this just sets a flag. + * The caller must have disabled preemption. */ void rcu_sched_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; } void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. + * The caller must have disabled preemption. */ void rcu_note_context_switch(int cpu) { + trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); + trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -193,7 +202,7 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { }; #endif /* #ifdef CONFIG_NO_HZ */ -static int blimit = 10; /* Maximum callbacks per softirq. */ +static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -314,6 +323,7 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) * trust its state not to change because interrupts are disabled. */ if (cpu_is_offline(rdp->cpu)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); rdp->offline_fqs++; return 1; } @@ -354,19 +364,13 @@ void rcu_enter_nohz(void) local_irq_restore(flags); return; } + trace_rcu_dyntick("Start"); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ atomic_inc(&rdtp->dynticks); smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); local_irq_restore(flags); - - /* If the interrupt queued a callback, get out of dyntick mode. */ - if (in_irq() && - (__get_cpu_var(rcu_sched_data).nxtlist || - __get_cpu_var(rcu_bh_data).nxtlist || - rcu_preempt_needs_cpu(smp_processor_id()))) - set_need_resched(); } /* @@ -391,6 +395,7 @@ void rcu_exit_nohz(void) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + trace_rcu_dyntick("End"); local_irq_restore(flags); } @@ -481,11 +486,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { - unsigned long curr; - unsigned long snap; + unsigned int curr; + unsigned int snap; - curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); - snap = (unsigned long)rdp->dynticks_snap; + curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); + snap = (unsigned int)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -495,7 +500,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { + if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); rdp->dynticks_fqs++; return 1; } @@ -537,6 +543,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) int cpu; long delta; unsigned long flags; + int ndetected; struct rcu_node *rnp = rcu_get_root(rsp); /* Only let one CPU complain about others per time interval. */ @@ -553,7 +560,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * Now rat on any tasks that got kicked up to the root rcu_node * due to CPU offlining. */ - rcu_print_task_stall(rnp); + ndetected = rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -565,17 +572,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rsp->name); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); - rcu_print_task_stall(rnp); + ndetected += rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (rnp->qsmask == 0) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) + if (rnp->qsmask & (1UL << cpu)) { printk(" %d", rnp->grplo + cpu); + ndetected++; + } } printk("} (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); - trigger_all_cpu_backtrace(); + if (ndetected == 0) + printk(KERN_ERR "INFO: Stall ended before state dump start\n"); + else if (!trigger_all_cpu_backtrace()) + dump_stack(); /* If so configured, complain about tasks blocking the grace period. */ @@ -596,7 +608,8 @@ static void print_cpu_stall(struct rcu_state *rsp) */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); - trigger_all_cpu_backtrace(); + if (!trigger_all_cpu_backtrace()) + dump_stack(); raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) @@ -678,9 +691,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct * go looking for one. */ rdp->gpnum = rnp->gpnum; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); if (rnp->qsmask & rdp->grpmask) { rdp->qs_pending = 1; - rdp->passed_quiesc = 0; + rdp->passed_quiesce = 0; } else rdp->qs_pending = 0; } @@ -741,6 +755,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); /* * If we were in an extended quiescent state, we may have @@ -826,31 +841,31 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); - if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { - if (cpu_needs_another_gp(rsp, rdp)) - rsp->fqs_need_gp = 1; - if (rnp->completed == rsp->completed) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (!rcu_scheduler_fully_active || + !cpu_needs_another_gp(rsp, rdp)) { + /* + * Either the scheduler hasn't yet spawned the first + * non-idle task or this CPU does not need another + * grace period. Either way, don't start a new grace + * period. + */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rsp->fqs_active) { /* - * Propagate new ->completed value to rcu_node structures - * so that other CPUs don't have to wait until the start - * of the next grace period to process their callbacks. + * This CPU needs a grace period, but force_quiescent_state() + * is running. Tell it to start one on this CPU's behalf. */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->completed = rsp->completed; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - local_irq_restore(flags); + rsp->fqs_need_gp = 1; + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } /* Advance to a new grace period and initialize state. */ rsp->gpnum++; + trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; @@ -865,6 +880,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -901,6 +919,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } @@ -922,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { unsigned long gp_duration; + struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); @@ -933,7 +956,41 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; - rsp->completed = rsp->gpnum; + + /* + * We know the grace period is complete, but to everyone else + * it appears to still be ongoing. But it is also the case + * that to everyone else it looks like there is nothing that + * they can do to advance the grace period. It is therefore + * safe for us to drop the lock in order to mark the grace + * period as completed in all of the rcu_node structures. + * + * But if this CPU needs another grace period, it will take + * care of this while initializing the next grace period. + * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL + * because the callbacks have not yet been advanced: Those + * callbacks are waiting on the grace period that just now + * completed. + */ + if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node structures + * so that other CPUs don't have to wait until the start + * of the next grace period to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->gpnum; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + } + + rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ + trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -962,6 +1019,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } rnp->qsmask &= ~mask; + trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, + mask, rnp->qsmask, rnp->level, + rnp->grplo, rnp->grphi, + !!rnp->gp_tasks); if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ @@ -1000,7 +1061,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * based on quiescent states detected in an earlier grace period! */ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) { unsigned long flags; unsigned long mask; @@ -1008,17 +1069,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); - if (lastcomp != rnp->completed) { + if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { /* - * Someone beat us to it for this grace period, so leave. - * The race with GP start is resolved by the fact that we - * hold the leaf rcu_node lock, so that the per-CPU bits - * cannot yet be initialized -- so we would simply find our - * CPU's bit already cleared in rcu_report_qs_rnp() if this - * race occurred. + * The grace period in which this quiescent state was + * recorded has ended, so don't report it upwards. + * We will instead need a new quiescent state that lies + * within the current grace period. */ - rdp->passed_quiesc = 0; /* try again later! */ + rdp->passed_quiesce = 0; /* need qs for new gp. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -1062,14 +1121,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesc) + if (!rdp->passed_quiesce) return; /* * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); + rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); } #ifdef CONFIG_HOTPLUG_CPU @@ -1130,11 +1189,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + else + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - + !!(rnp->qsmask & mask), + "cpuofl"); break; } - if (rnp == rdp->mynode) + if (rnp == rdp->mynode) { + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - + !!(rnp->qsmask & mask), + "cpuofl"); need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else + } else raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; rnp = rnp->parent; @@ -1190,17 +1258,22 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int count; + int bl, count; /* If no callbacks are ready, just return.*/ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) + if (!cpu_has_callbacks_ready_to_invoke(rdp)) { + trace_rcu_batch_start(rsp->name, 0, 0); + trace_rcu_batch_end(rsp->name, 0); return; + } /* * Extract the list of ready callbacks, disabling to prevent * races with call_rcu() from interrupt handlers. */ local_irq_save(flags); + bl = rdp->blimit; + trace_rcu_batch_start(rsp->name, rdp->qlen, bl); list = rdp->nxtlist; rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1216,13 +1289,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - __rcu_reclaim(list); + __rcu_reclaim(rsp->name, list); list = next; - if (++count >= rdp->blimit) + if (++count >= bl) break; } local_irq_save(flags); + trace_rcu_batch_end(rsp->name, count); /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; @@ -1250,7 +1324,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); - /* Re-raise the RCU softirq if there are callbacks remaining. */ + /* Re-invoke RCU core processing if there are callbacks remaining. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) invoke_rcu_core(); } @@ -1258,7 +1332,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* * Check to see if this CPU is in a non-context-switch quiescent state * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule the RCU softirq handler. + * Also schedule RCU core processing. * * This function must be called with hardirqs disabled. It is normally * invoked from the scheduling-clock interrupt. If rcu_pending returns @@ -1266,6 +1340,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { + trace_rcu_utilization("Start scheduler-tick"); if (user || (idle_cpu(cpu) && rcu_scheduler_active && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -1299,6 +1374,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) invoke_rcu_core(); + trace_rcu_utilization("End scheduler-tick"); } #ifdef CONFIG_SMP @@ -1360,10 +1436,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - if (!rcu_gp_in_progress(rsp)) + trace_rcu_utilization("Start fqs"); + if (!rcu_gp_in_progress(rsp)) { + trace_rcu_utilization("End fqs"); return; /* No grace period in progress, nothing to force. */ + } if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ + trace_rcu_utilization("End fqs"); return; /* Someone else is already on the job. */ } if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) @@ -1412,11 +1492,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ rsp->fqs_need_gp = 0; rcu_start_gp(rsp, flags); /* releases rnp->lock */ + trace_rcu_utilization("End fqs"); return; } raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ unlock_fqs_ret: raw_spin_unlock_irqrestore(&rsp->fqslock, flags); + trace_rcu_utilization("End fqs"); } #else /* #ifdef CONFIG_SMP */ @@ -1429,9 +1511,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) #endif /* #else #ifdef CONFIG_SMP */ /* - * This does the RCU processing work from softirq context for the - * specified rcu_state and rcu_data structures. This may be called - * only from the CPU to whom the rdp belongs. + * This does the RCU core processing work for the specified rcu_state + * and rcu_data structures. This may be called only from the CPU to + * whom the rdp belongs. */ static void __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) @@ -1468,24 +1550,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Do softirq processing for the current CPU. + * Do RCU core processing for the current CPU. */ static void rcu_process_callbacks(struct softirq_action *unused) { + trace_rcu_utilization("Start RCU core"); __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); - - /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ - rcu_needs_cpu_flush(); + trace_rcu_utilization("End RCU core"); } /* - * Wake up the current CPU's kthread. This replaces raise_softirq() - * in earlier versions of RCU. Note that because we are running on - * the current CPU with interrupts disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. + * Schedule RCU callback invocation. If the specified type of RCU + * does not support RCU priority boosting, just do a direct call, + * otherwise wake up the per-CPU kernel kthread. Note that because we + * are running on the current CPU with interrupts disabled, the + * rcu_cpu_kthread_task cannot disappear out from under us. */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { @@ -1530,6 +1612,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; + if (__is_kfree_rcu_offset((unsigned long)func)) + trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, + rdp->qlen); + else + trace_rcu_callback(rsp->name, head, rdp->qlen); + /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { local_irq_restore(flags); @@ -1613,18 +1701,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ void synchronize_sched(void) { - struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu_sched); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -1639,18 +1718,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { - struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); @@ -1671,7 +1741,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending && !rdp->passed_quiesc) { + if (rcu_scheduler_fully_active && + rdp->qs_pending && !rdp->passed_quiesce) { /* * If force_quiescent_state() coming soon and this CPU @@ -1683,7 +1754,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, jiffies)) set_need_resched(); - } else if (rdp->qs_pending && rdp->passed_quiesc) { + } else if (rdp->qs_pending && rdp->passed_quiesce) { rdp->n_rp_report_qs++; return 1; } @@ -1846,6 +1917,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); #endif /* #ifdef CONFIG_NO_HZ */ rdp->cpu = cpu; + rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1865,8 +1937,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->passed_quiesc = 0; /* We could be racing with new GP, */ - rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; @@ -1891,9 +1961,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rnp->qsmaskinit |= mask; mask = rnp->grpmask; if (rnp == rdp->mynode) { - rdp->gpnum = rnp->completed; /* if GP in progress... */ + /* + * If there is a grace period in progress, we will + * set up to wait for it next time we run the + * RCU core code. + */ + rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; - rdp->passed_quiesc_completed = rnp->completed - 1; + rdp->passed_quiesce = 0; + rdp->qs_pending = 0; + rdp->passed_quiesce_gpnum = rnp->gpnum - 1; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; @@ -1919,6 +1997,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + trace_rcu_utilization("Start CPU hotplug"); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -1954,6 +2033,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, default: break; } + trace_rcu_utilization("End CPU hotplug"); return NOTIFY_OK; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 01b2ccda26fb..849ce9ec51fe 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -230,9 +230,9 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ - unsigned long passed_quiesc_completed; - /* Value of completed at time of qs. */ - bool passed_quiesc; /* User-mode/idle loop etc. */ + unsigned long passed_quiesce_gpnum; + /* gpnum at time of quiescent state. */ + bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool preemptible; /* Preemptible RCU? */ @@ -299,6 +299,7 @@ struct rcu_data { unsigned long n_rp_need_nothing; int cpu; + struct rcu_state *rsp; }; /* Values for signaled field in struct rcu_state. */ @@ -417,6 +418,13 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DECLARE_PER_CPU(char, rcu_cpu_has_work); +#endif /* #ifdef CONFIG_RCU_BOOST */ + #ifndef RCU_TREE_NONCORE /* Forward declarations for rcutree_plugin.h */ @@ -430,7 +438,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); -static void rcu_print_task_stall(struct rcu_node *rnp); +static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_stall_reset(void); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU @@ -450,7 +458,6 @@ static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); -static void rcu_needs_cpu_flush(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8aafbb80b8b0..4b9b9f8a4184 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -27,6 +27,14 @@ #include <linux/delay.h> #include <linux/stop_machine.h> +#define RCU_KTHREAD_PRIO 1 + +#ifdef CONFIG_RCU_BOOST +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO +#else +#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO +#endif + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. If you like #ifdef, you @@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); +struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } @@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu) if (rnp->qsmask & rdp->grpmask) rnp->gp_tasks = &t->rcu_node_entry; } + trace_rcu_preempt_task(rdp->rsp->name, + t->pid, + (rnp->qsmask & rdp->grpmask) + ? rnp->gpnum + : rnp->gpnum + 1); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special) { @@ -299,6 +314,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) int empty_exp; unsigned long flags; struct list_head *np; +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rbmp = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; int special; @@ -344,6 +362,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); + t->rcu_blocked_node = NULL; + trace_rcu_unlock_preempted_task("rcu_preempt", + rnp->gpnum, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) @@ -351,30 +372,34 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; - /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ - if (t->rcu_boosted) { - special |= RCU_READ_UNLOCK_BOOSTED; - t->rcu_boosted = 0; + /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ + if (t->rcu_boost_mutex) { + rbmp = t->rcu_boost_mutex; + t->rcu_boost_mutex = NULL; } #endif /* #ifdef CONFIG_RCU_BOOST */ - t->rcu_blocked_node = NULL; /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ - if (empty) - raw_spin_unlock_irqrestore(&rnp->lock, flags); - else + if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + trace_rcu_quiescent_state_report("preempt_rcu", + rnp->gpnum, + 0, rnp->qsmask, + rnp->level, + rnp->grplo, + rnp->grphi, + !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rnp, flags); + } else + raw_spin_unlock_irqrestore(&rnp->lock, flags); #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (special & RCU_READ_UNLOCK_BOOSTED) { - rt_mutex_unlock(t->rcu_boost_mutex); - t->rcu_boost_mutex = NULL; - } + if (rbmp) + rt_mutex_unlock(rbmp); #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -399,10 +424,10 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ if (t->rcu_read_lock_nesting != 1) --t->rcu_read_lock_nesting; else { + barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) @@ -466,16 +491,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ -static void rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp) { struct task_struct *t; + int ndetected = 0; if (!rcu_preempt_blocked_readers_cgp(rnp)) - return; + return 0; t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { printk(" P%d", t->pid); + ndetected++; + } + return ndetected; } /* @@ -656,18 +685,9 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { - struct rcu_synchronize rcu; - if (!rcu_scheduler_active) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -968,8 +988,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static void rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp) { + return 0; } /* @@ -1136,6 +1157,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static struct lock_class_key rcu_boost_class; + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1198,8 +1221,10 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); + /* Avoid lockdep false positives. This rt_mutex is its own thing. */ + lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, + "rcu_boost_mutex"); t->rcu_boost_mutex = &mtx; - t->rcu_boosted = 1; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -1228,9 +1253,12 @@ static int rcu_boost_kthread(void *arg) int spincnt = 0; int more2boost; + trace_rcu_utilization("Start boost kthread@init"); for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + trace_rcu_utilization("End boost kthread@rcu_wait"); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + trace_rcu_utilization("Start boost kthread@rcu_wait"); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) @@ -1238,11 +1266,14 @@ static int rcu_boost_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + trace_rcu_utilization("End boost kthread@rcu_yield"); rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + trace_rcu_utilization("Start boost kthread@rcu_yield"); spincnt = 0; } } /* NOTREACHED */ + trace_rcu_utilization("End boost kthread@notreached"); return 0; } @@ -1291,11 +1322,9 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_save(flags); __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { - local_irq_restore(flags); - return; - } - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && + current != __this_cpu_read(rcu_cpu_kthread_task)) + wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); local_irq_restore(flags); } @@ -1343,13 +1372,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (rnp->boost_kthread_task != NULL) return 0; t = kthread_create(rcu_boost_kthread, (void *)rnp, - "rcub%d", rnp_index); + "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = RCU_KTHREAD_PRIO; + sp.sched_priority = RCU_BOOST_PRIO; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ return 0; @@ -1444,6 +1473,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) { struct sched_param sp; struct timer_list yield_timer; + int prio = current->rt_priority; setup_timer_on_stack(&yield_timer, f, arg); mod_timer(&yield_timer, jiffies + 2); @@ -1451,7 +1481,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); set_user_nice(current, 19); schedule(); - sp.sched_priority = RCU_KTHREAD_PRIO; + set_user_nice(current, 0); + sp.sched_priority = prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); del_timer(&yield_timer); } @@ -1489,7 +1520,8 @@ static int rcu_cpu_kthread_should_stop(int cpu) /* * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * earlier RCU softirq. + * RCU softirq used in flavors and configurations of RCU that do not + * support RCU priority boosting. */ static int rcu_cpu_kthread(void *arg) { @@ -1500,9 +1532,12 @@ static int rcu_cpu_kthread(void *arg) char work; char *workp = &per_cpu(rcu_cpu_has_work, cpu); + trace_rcu_utilization("Start CPU kthread@init"); for (;;) { *statusp = RCU_KTHREAD_WAITING; + trace_rcu_utilization("End CPU kthread@rcu_wait"); rcu_wait(*workp != 0 || kthread_should_stop()); + trace_rcu_utilization("Start CPU kthread@rcu_wait"); local_bh_disable(); if (rcu_cpu_kthread_should_stop(cpu)) { local_bh_enable(); @@ -1523,11 +1558,14 @@ static int rcu_cpu_kthread(void *arg) spincnt = 0; if (spincnt > 10) { *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization("End CPU kthread@rcu_yield"); rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + trace_rcu_utilization("Start CPU kthread@rcu_yield"); spincnt = 0; } } *statusp = RCU_KTHREAD_STOPPED; + trace_rcu_utilization("End CPU kthread@term"); return 0; } @@ -1560,7 +1598,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) if (!rcu_scheduler_fully_active || per_cpu(rcu_cpu_kthread_task, cpu) != NULL) return 0; - t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + t = kthread_create_on_node(rcu_cpu_kthread, + (void *)(long)cpu, + cpu_to_node(cpu), + "rcuc/%d", cpu); if (IS_ERR(t)) return PTR_ERR(t); if (cpu_online(cpu)) @@ -1669,7 +1710,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, return 0; if (rnp->node_kthread_task == NULL) { t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun%d", rnp_index); + "rcun/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1907,15 +1948,6 @@ int rcu_needs_cpu(int cpu) return rcu_needs_cpu_quick_check(cpu); } -/* - * Check to see if we need to continue a callback-flush operations to - * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle - * entry is not configured, so we never do need to. - */ -static void rcu_needs_cpu_flush(void) -{ -} - #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #define RCU_NEEDS_CPU_FLUSHES 5 @@ -1991,20 +2023,4 @@ int rcu_needs_cpu(int cpu) return c; } -/* - * Check to see if we need to continue a callback-flush operations to - * allow the last CPU to enter dyntick-idle mode. - */ -static void rcu_needs_cpu_flush(void) -{ - int cpu = smp_processor_id(); - unsigned long flags; - - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) - return; - local_irq_save(flags); - (void)rcu_needs_cpu(cpu); - local_irq_restore(flags); -} - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3b0c0986afc0..9feffa4c0695 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -48,11 +48,6 @@ #ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); - static char convert_kthread_status(unsigned int kthread_status) { if (kthread_status > RCU_KTHREAD_MAX) @@ -66,11 +61,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, - rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, " dt=%d/%d/%d df=%lu", @@ -144,7 +139,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, - rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, ",%d,%d,%d,%lu", @@ -175,7 +170,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ diff --git a/kernel/resource.c b/kernel/resource.c index 3b3cedc52592..c8dc249da5ce 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old, else tmp.end = root->end; + if (tmp.end < tmp.start) + goto next; + resource_clip(&tmp, constraint->min, constraint->max); arch_remove_reservations(&tmp); @@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old, return 0; } } - if (!this) + +next: if (!this || this->end == root->end) break; + if (this != old) tmp.start = this->end + 1; this = this->sibling; diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 3c7cbc2c33be..a2e7e7210f3e 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -29,61 +29,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (raw_spin_is_locked(¤t->pi_lock)) \ - raw_spin_unlock(¤t->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag. We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; rcu_read_lock(); @@ -149,7 +94,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) { + rcu_read_unlock(); + return; + } printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); @@ -180,7 +128,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("[ turning off deadlock detection." "Please report this trace. ]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -189,7 +136,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock) void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -213,8 +160,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 255e1662acdb..5e8d9cce7470 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter *waiter) { int ret = 0; + int was_disabled; for (;;) { /* Try to acquire the lock: */ @@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, raw_spin_unlock(&lock->wait_lock); + was_disabled = irqs_disabled(); + if (was_disabled) + local_irq_enable(); + debug_rt_mutex_print_deadlock(waiter); schedule_rt_mutex(lock); + if (was_disabled) + local_irq_disable(); + raw_spin_lock(&lock->wait_lock); set_current_state(state); } diff --git a/kernel/sched.c b/kernel/sched.c index ec5f472bc5b9..03ad0113801a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1739,7 +1739,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #ifdef CONFIG_SMP /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of + * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); @@ -3725,30 +3725,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) } /* - * Return sum_exec_runtime for the thread group. - * In case the task is currently running, return the sum plus current's - * pending runtime that have not been accounted yet. - * - * Note that the thread group might have other running tasks as well, - * so the return value not includes other pending runtime that other - * running tasks might have. - */ -unsigned long long thread_group_sched_runtime(struct task_struct *p) -{ - struct task_cputime totals; - unsigned long flags; - struct rq *rq; - u64 ns; - - rq = task_rq_lock(p, &flags); - thread_group_cputime(p, &totals); - ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - -/* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update @@ -4237,6 +4213,7 @@ static inline void schedule_debug(struct task_struct *prev) */ if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) __schedule_bug(prev); + rcu_sleep_check(); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -4372,7 +4349,7 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } -asmlinkage void schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *tsk = current; @@ -5979,15 +5956,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) } /* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_BITS_NONE. - */ -cpumask_var_t nohz_cpu_mask; - -/* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible * to users decreases. But the relationship is not linear, @@ -8199,8 +8167,6 @@ void __init sched_init(void) */ current->sched_class = &fair_sched_class; - /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_NO_HZ @@ -8230,6 +8196,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || system_state != SYSTEM_RUNNING || oops_in_progress) return; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 97540f0c9e47..af1177858be3 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1050,7 +1050,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) */ if (curr && unlikely(rt_task(curr)) && (curr->rt.nr_cpus_allowed < 2 || - curr->prio < p->prio) && + curr->prio <= p->prio) && (p->rt.nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); @@ -1581,7 +1581,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) p->rt.nr_cpus_allowed > 1 && rt_task(rq->curr) && (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio < p->prio)) + rq->curr->prio <= p->prio)) push_rt_tasks(rq); } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 331e01bcd026..87f9e36ea56e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -282,10 +282,10 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.utime = cputime_add(cputimer->cputime.utime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -306,10 +306,10 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.stime = cputime_add(cputimer->cputime.stime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -330,7 +330,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.sum_exec_runtime += ns; - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 94a62c0d4ade..d831841e55a7 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -54,12 +54,12 @@ void down(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(down); @@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_interruptible(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_killable(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem) unsigned long flags; int count; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) sem->count = count; - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); } @@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_timeout(sem, jiffies); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -179,12 +179,12 @@ void up(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(up); @@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state, if (timeout <= 0) goto timed_out; __set_task_state(task, state); - spin_unlock_irq(&sem->lock); + raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); - spin_lock_irq(&sem->lock); + raw_spin_lock_irq(&sem->lock); if (waiter.up) return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 291c9700be75..d252be2d3de5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1344,13 +1344,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } +static int kill_as_cred_perm(const struct cred *cred, + struct task_struct *target) +{ + const struct cred *pcred = __task_cred(target); + if (cred->user_ns != pcred->user_ns) + return 0; + if (cred->euid != pcred->suid && cred->euid != pcred->uid && + cred->uid != pcred->suid && cred->uid != pcred->uid) + return 0; + return 1; +} + /* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, - uid_t uid, uid_t euid, u32 secid) +int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, + const struct cred *cred, u32 secid) { int ret = -EINVAL; struct task_struct *p; - const struct cred *pcred; unsigned long flags; if (!valid_signal(sig)) @@ -1362,10 +1373,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - pcred = __task_cred(p); - if (si_fromuser(info) && - euid != pcred->suid && euid != pcred->uid && - uid != pcred->suid && uid != pcred->uid) { + if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } @@ -1384,7 +1392,7 @@ out_unlock: rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); +EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); /* * kill_something_info() interprets pid in interesting ways just like kill(2). diff --git a/kernel/sys.c b/kernel/sys.c index 18ee1d2f6474..58459509b14c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1172,7 +1172,7 @@ DECLARE_RWSEM(uts_sem); static int override_release(char __user *release, int len) { int ret = 0; - char buf[len]; + char buf[65]; if (current->personality & UNAME26) { char *rest = UTS_RELEASE; @@ -1759,6 +1759,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, sizeof(me->comm) - 1) < 0) return -EFAULT; set_task_comm(me, comm); + proc_comm_connector(me); return 0; case PR_GET_NAME: get_task_comm(comm, me); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8bffbe2ba4b..6318b511afa1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = { { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, + /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e19ce1454ee1..e66046456f4f 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -655,6 +655,7 @@ static struct genl_ops taskstats_ops = { .cmd = TASKSTATS_CMD_GET, .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, + .flags = GENL_ADMIN_PERM, }; static struct genl_ops cgroupstats_ops = { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d5097c44b407..eb98e55196b9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - cpumask_clear_cpu(cpu, nohz_cpu_mask); ts->idle_waketime = now; local_irq_save(flags); @@ -389,9 +388,6 @@ void tick_nohz_stop_sched_tick(int inidle) else expires.tv64 = KTIME_MAX; - if (delta_jiffies > 1) - cpumask_set_cpu(cpu, nohz_cpu_mask); - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) goto out; @@ -441,7 +437,6 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@ -524,7 +519,6 @@ void tick_nohz_restart_sched_tick(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); - cpumask_clear_cpu(cpu, nohz_cpu_mask); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index a5d0a3a85dd8..0b537f27b559 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -81,7 +81,7 @@ struct entry { /* * Spinlock protecting the tables - not taken during lookup: */ -static DEFINE_SPINLOCK(table_lock); +static DEFINE_RAW_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: @@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) prev = NULL; curr = *head; - spin_lock(&table_lock); + raw_spin_lock(&table_lock); /* * Make sure we have not raced with another CPU: */ @@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) *head = curr; } out_unlock: - spin_unlock(&table_lock); + raw_spin_unlock(&table_lock); return curr; } diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c5..f49405f842f4 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,9 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o +ifeq ($(CONFIG_PM_RUNTIME),y) +obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o +endif ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..f2f821acc597 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -478,7 +478,7 @@ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -1062,7 +1062,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->reader_lock); + raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -1259,7 +1259,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1277,7 +1277,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -1288,7 +1288,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1303,7 +1303,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -2804,9 +2804,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -3265,12 +3265,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, again: local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ -3295,9 +3295,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) unsigned long flags; again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; @@ -3337,7 +3337,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { @@ -3346,7 +3346,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, } if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); out: @@ -3438,11 +3438,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -3477,7 +3477,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); again: event = rb_iter_peek(iter, ts); if (!event) @@ -3488,7 +3488,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) rb_advance_iter(iter); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -3557,7 +3557,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; @@ -3569,7 +3569,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); } @@ -3607,10 +3607,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (!ret) @@ -3641,10 +3641,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); return ret; @@ -3841,7 +3841,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (!bpage) goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -3964,7 +3964,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c new file mode 100644 index 000000000000..4b3b5eaf94d1 --- /dev/null +++ b/kernel/trace/rpm-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/usb.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rpm.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..0c8bdeeb358b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +static DEFINE_RAW_SPINLOCK(tracing_start_lock); static void wakeup_work_handler(struct work_struct *work) { @@ -960,7 +960,7 @@ void tracing_start(void) if (tracing_disabled) return; - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (--trace_stop_count) { if (trace_stop_count < 0) { /* Someone screwed up their debugging */ @@ -985,7 +985,7 @@ void tracing_start(void) ftrace_start(); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } /** @@ -1000,7 +1000,7 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (trace_stop_count++) goto out; @@ -1018,7 +1018,7 @@ void tracing_stop(void) arch_spin_unlock(&ftrace_max_lock); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } void trace_stop_cmdline_recording(void); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 667aa8cc0cfc..11186212068c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); -static DEFINE_SPINLOCK(max_trace_lock); +static DEFINE_RAW_SPINLOCK(max_trace_lock); enum { TRACER_IRQS_OFF = (1 << 1), @@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock_irqsave(&max_trace_lock, flags); + raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr, max_sequence++; out_unlock: - spin_unlock_irqrestore(&max_trace_lock, flags); + raw_spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 24dc60d9fa1f..5bbfac85866e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) #define KB 1024 #define MB (1024*KB) +#define KB_MASK (~(KB-1)) /* * fill in extended accounting fields */ @@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->ioac.rchar; - stats->write_char = p->ioac.wchar; - stats->read_syscalls = p->ioac.syscr; - stats->write_syscalls = p->ioac.syscw; + stats->read_char = p->ioac.rchar & KB_MASK; + stats->write_char = p->ioac.wchar & KB_MASK; + stats->read_syscalls = p->ioac.syscr & KB_MASK; + stats->write_syscalls = p->ioac.syscw & KB_MASK; #ifdef CONFIG_TASK_IO_ACCOUNTING - stats->read_bytes = p->ioac.read_bytes; - stats->write_bytes = p->ioac.write_bytes; - stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; + stats->read_bytes = p->ioac.read_bytes & KB_MASK; + stats->write_bytes = p->ioac.write_bytes & KB_MASK; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK; #else stats->read_bytes = 0; stats->write_bytes = 0; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 25fb1b0e53fa..1783aabc6128 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2412,8 +2412,13 @@ reflush: for_each_cwq_cpu(cpu, wq) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + bool drained; - if (!cwq->nr_active && list_empty(&cwq->delayed_works)) + spin_lock_irq(&cwq->gcwq->lock); + drained = !cwq->nr_active && list_empty(&cwq->delayed_works); + spin_unlock_irq(&cwq->gcwq->lock); + + if (drained) continue; if (++flush_cnt == 10 || |