diff options
Diffstat (limited to 'kernel/locking')
| -rw-r--r-- | kernel/locking/Makefile | 8 | ||||
| -rw-r--r-- | kernel/locking/lock_events_list.h | 33 | ||||
| -rw-r--r-- | kernel/locking/lockdep.c | 135 | ||||
| -rw-r--r-- | kernel/locking/lockdep_internals.h | 19 | ||||
| -rw-r--r-- | kernel/locking/lockdep_proc.c | 4 | ||||
| -rw-r--r-- | kernel/locking/locktorture.c | 92 | ||||
| -rw-r--r-- | kernel/locking/mcs_spinlock.h | 10 | ||||
| -rw-r--r-- | kernel/locking/mutex-debug.c | 24 | ||||
| -rw-r--r-- | kernel/locking/mutex.c | 202 | ||||
| -rw-r--r-- | kernel/locking/mutex.h | 15 | ||||
| -rw-r--r-- | kernel/locking/percpu-rwsem.c | 15 | ||||
| -rw-r--r-- | kernel/locking/qspinlock.c | 193 | ||||
| -rw-r--r-- | kernel/locking/qspinlock.h | 201 | ||||
| -rw-r--r-- | kernel/locking/rtmutex.c | 63 | ||||
| -rw-r--r-- | kernel/locking/rtmutex_api.c | 74 | ||||
| -rw-r--r-- | kernel/locking/rtmutex_common.h | 36 | ||||
| -rw-r--r-- | kernel/locking/rwbase_rt.c | 1 | ||||
| -rw-r--r-- | kernel/locking/rwsem.c | 148 | ||||
| -rw-r--r-- | kernel/locking/semaphore.c | 98 | ||||
| -rw-r--r-- | kernel/locking/spinlock.c | 12 | ||||
| -rw-r--r-- | kernel/locking/spinlock_debug.c | 4 | ||||
| -rw-r--r-- | kernel/locking/test-ww_mutex.c | 176 | ||||
| -rw-r--r-- | kernel/locking/ww_mutex.h | 103 | ||||
| -rw-r--r-- | kernel/locking/ww_rt_mutex.c | 1 |
24 files changed, 1133 insertions, 534 deletions
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d17b8..cee1901d4cff 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,9 +3,15 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +CONTEXT_ANALYSIS_mutex.o := y +CONTEXT_ANALYSIS_rtmutex_api.o := y +CONTEXT_ANALYSIS_ww_rt_mutex.o := y +CONTEXT_ANALYSIS_rwsem.o := y + obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. +KASAN_SANITIZE_lockdep.o := n KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 97fb6f3f840a..4e36258cc34f 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -50,6 +50,11 @@ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ #endif /* CONFIG_QUEUED_SPINLOCKS */ /* + * Locking events for Resilient Queued Spin Lock + */ +LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ + +/* * Locking events for rwsem */ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ @@ -67,3 +72,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ + +/* + * Locking events for rtlock_slowlock() + */ +LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */ +LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */ +LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */ + +/* + * Locking events for rt_mutex_slowlock() + */ +LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */ +LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */ +LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */ +LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ +LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ +LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ + +/* + * Locking events for lockdep + */ +LOCK_EVENT(lockdep_acquire) +LOCK_EVENT(lockdep_lock) +LOCK_EVENT(lockdep_nocheck) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4470680f0226..2d4c5bab5af8 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,12 @@ #include <linux/lockdep.h> #include <linux/context_tracking.h> #include <linux/console.h> +#include <linux/kasan.h> #include <asm/sections.h> #include "lockdep_internals.h" +#include "lock_events.h" #include <trace/events/lock.h> @@ -170,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { lockdep_lock(); + lockevent_inc(lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -216,6 +219,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; unsigned long nr_zapped_classes; +unsigned long nr_dynamic_keys; unsigned long max_lock_class_idx; struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); @@ -293,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) dst->nr += src->nr; } -struct lock_class_stats lock_stats(struct lock_class *class) +void lock_stats(struct lock_class *class, struct lock_class_stats *stats) { - struct lock_class_stats stats; int cpu, i; - memset(&stats, 0, sizeof(struct lock_class_stats)); + memset(stats, 0, sizeof(struct lock_class_stats)); for_each_possible_cpu(cpu) { struct lock_class_stats *pcs = &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) - stats.contention_point[i] += pcs->contention_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++) + stats->contention_point[i] += pcs->contention_point[i]; - for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) - stats.contending_point[i] += pcs->contending_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++) + stats->contending_point[i] += pcs->contending_point[i]; - lock_time_add(&pcs->read_waittime, &stats.read_waittime); - lock_time_add(&pcs->write_waittime, &stats.write_waittime); + lock_time_add(&pcs->read_waittime, &stats->read_waittime); + lock_time_add(&pcs->write_waittime, &stats->write_waittime); - lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); - lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + lock_time_add(&pcs->read_holdtime, &stats->read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats->write_holdtime); - for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) - stats.bounces[i] += pcs->bounces[i]; + for (i = 0; i < ARRAY_SIZE(stats->bounces); i++) + stats->bounces[i] += pcs->bounces[i]; } - - return stats; } void clear_lock_stats(struct lock_class *class) @@ -1235,6 +1236,7 @@ void lockdep_register_key(struct lock_class_key *key) goto out_unlock; } hlist_add_head_rcu(&key->hash_entry, hash_head); + nr_dynamic_keys++; out_unlock: graph_unlock(); restore_irqs: @@ -1974,41 +1976,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, } /* - * We are about to add A -> B into the dependency graph, and in __bfs() a - * strong dependency path A -> .. -> B is found: hlock_class equals - * entry->class. - * - * If A -> .. -> B can replace A -> B in any __bfs() search (means the former - * is _stronger_ than or equal to the latter), we consider A -> B as redundant. - * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A - * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the - * dependency graph, as any strong path ..-> A -> B ->.. we can get with - * having dependency A -> B, we could already get a equivalent path ..-> A -> - * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. - * - * We need to make sure both the start and the end of A -> .. -> B is not - * weaker than A -> B. For the start part, please see the comment in - * check_redundant(). For the end part, we need: - * - * Either - * - * a) A -> B is -(*R)-> (everything is not weaker than that) - * - * or - * - * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) - * - */ -static inline bool hlock_equal(struct lock_list *entry, void *data) -{ - struct held_lock *hlock = (struct held_lock *)data; - - return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ - (hlock->read == 2 || /* A -> B is -(*R)-> */ - !entry->only_xr); /* A -> .. -> B is -(*N)-> */ -} - -/* * We are about to add B -> A into the dependency graph, and in __bfs() a * strong dependency path A -> .. -> B is found: hlock_class equals * entry->class. @@ -2913,6 +2880,41 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) #ifdef CONFIG_LOCKDEP_SMALL /* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + +/* * Check that the dependency graph starting at <src> can lead to * <target> or not. If it can, <src> -> <target> dependency is already * in the graph. @@ -5091,8 +5093,15 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(lock->key == &__lockdep_no_track__)) return 0; - if (!prove_locking || lock->key == &__lockdep_no_validate__) + lockevent_inc(lockdep_acquire); + + if (!prove_locking || lock->key == &__lockdep_no_validate__) { check = 0; + lockevent_inc(lockdep_nocheck); + } + + if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES)) + return 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -5824,6 +5833,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!debug_locks) return; + /* + * As KASAN instrumentation is disabled and lock_acquire() is usually + * the first lockdep call when a task tries to acquire a lock, add + * kasan_check_byte() here to check for use-after-free and other + * memory errors. + */ + kasan_check_byte(lock); + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { @@ -6249,6 +6266,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) hlist_del_rcu(&class->hash_entry); WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); + /* Class allocated but not used, -1 in nr_unused_locks */ + if (class->usage_mask == 0) + debug_atomic_dec(nr_unused_locks); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); if (class - lock_classes == max_lock_class_idx) @@ -6588,6 +6608,7 @@ void lockdep_unregister_key(struct lock_class_key *key) pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); need_callback = prepare_call_rcu_zapped(pf); + nr_dynamic_keys--; } lockdep_unlock(); raw_local_irq_restore(flags); @@ -6595,8 +6616,16 @@ void lockdep_unregister_key(struct lock_class_key *key) if (need_callback) call_rcu(&delayed_free.rcu_head, free_zapped_rcu); - /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ - synchronize_rcu(); + /* + * Wait until is_dynamic_key() has finished accessing k->hash_entry. + * + * Some operations like __qdisc_destroy() will call this in a debug + * kernel, and the network traffic is disabled while waiting, hence + * the delay of the wait matters in debugging cases. Currently use a + * synchronize_rcu_expedited() to speed up the wait at the cost of + * system IPIs. TODO: Replace RCU with hazptr for this. + */ + synchronize_rcu_expedited(); } EXPORT_SYMBOL_GPL(lockdep_unregister_key); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 20f9ef58d3d0..0e5e6ffe91a3 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -47,29 +47,31 @@ enum { __LOCKF(USED_READ) }; +enum { #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | -static const unsigned long LOCKF_ENABLED_IRQ = + LOCKF_ENABLED_IRQ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | -static const unsigned long LOCKF_USED_IN_IRQ = + LOCKF_USED_IN_IRQ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | -static const unsigned long LOCKF_ENABLED_IRQ_READ = + LOCKF_ENABLED_IRQ_READ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | -static const unsigned long LOCKF_USED_IN_IRQ_READ = + LOCKF_USED_IN_IRQ_READ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE +}; #define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) #define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) @@ -138,6 +140,7 @@ extern unsigned long nr_lock_classes; extern unsigned long nr_zapped_classes; extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; +extern unsigned long nr_dynamic_keys; long lockdep_next_lockchain(long i); unsigned long lock_chain_count(void); extern unsigned long nr_stack_trace_entries; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6db0f43fc4df..1916db9aa46b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -286,6 +286,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); + seq_printf(m, " dynamic-keys: %11lu\n", + nr_dynamic_keys); seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", nr_list_entries, MAX_LOCKDEP_ENTRIES); seq_printf(m, " indirect dependencies: %11lu\n", @@ -655,7 +657,7 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!test_bit(idx, lock_classes_in_use)) continue; iter->class = class; - iter->stats = lock_stats(class); + lock_stats(class, &iter->stats); iter++; } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index cc33470f4de9..e618bcf75e2d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -103,8 +103,8 @@ static const struct kernel_param_ops lt_bind_ops = { .get = param_get_cpumask, }; -module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); -module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); +module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0444); +module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0444); long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); @@ -362,6 +362,60 @@ static struct lock_torture_ops raw_spin_lock_irq_ops = { .name = "raw_spin_lock_irq" }; +#ifdef CONFIG_BPF_SYSCALL + +#include <asm/rqspinlock.h> +static rqspinlock_t rqspinlock; + +static int torture_raw_res_spin_write_lock(int tid __maybe_unused) +{ + raw_res_spin_lock(&rqspinlock); + return 0; +} + +static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) +{ + raw_res_spin_unlock(&rqspinlock); +} + +static struct lock_torture_ops raw_res_spin_lock_ops = { + .writelock = torture_raw_res_spin_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock" +}; + +static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) +{ + unsigned long flags; + + raw_res_spin_lock_irqsave(&rqspinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) +{ + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_res_spin_lock_irq_ops = { + .writelock = torture_raw_res_spin_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock_irq" +}; + +#endif + static DEFINE_RWLOCK(torture_rwlock); static int torture_rwlock_write_lock(int tid __maybe_unused) @@ -556,9 +610,8 @@ static void torture_ww_mutex_init(void) ww_mutex_init(&torture_ww_mutex_1, &torture_ww_class); ww_mutex_init(&torture_ww_mutex_2, &torture_ww_class); - ww_acquire_ctxs = kmalloc_array(cxt.nrealwriters_stress, - sizeof(*ww_acquire_ctxs), - GFP_KERNEL); + ww_acquire_ctxs = kmalloc_objs(*ww_acquire_ctxs, + cxt.nrealwriters_stress); if (!ww_acquire_ctxs) VERBOSE_TOROUT_STRING("ww_acquire_ctx: Out of memory"); } @@ -1075,7 +1128,8 @@ static int call_rcu_chain_init(void) if (call_rcu_chains <= 0) return 0; - call_rcu_chain_list = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain_list), GFP_KERNEL); + call_rcu_chain_list = kzalloc_objs(*call_rcu_chain_list, + call_rcu_chains); if (!call_rcu_chain_list) return -ENOMEM; for (i = 0; i < call_rcu_chains; i++) { @@ -1157,6 +1211,10 @@ end: cxt.cur_ops->exit(); cxt.init_called = false; } + + free_cpumask_var(bind_readers); + free_cpumask_var(bind_writers); + torture_cleanup_end(); } @@ -1168,6 +1226,9 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, &raw_spin_lock_ops, &raw_spin_lock_irq_ops, +#ifdef CONFIG_BPF_SYSCALL + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, +#endif &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, @@ -1232,9 +1293,7 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { lock_is_write_held = false; - cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, - sizeof(*cxt.lwsa), - GFP_KERNEL); + cxt.lwsa = kmalloc_objs(*cxt.lwsa, cxt.nrealwriters_stress); if (cxt.lwsa == NULL) { VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); firsterr = -ENOMEM; @@ -1262,9 +1321,8 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, - sizeof(*cxt.lrsa), - GFP_KERNEL); + cxt.lrsa = kmalloc_objs(*cxt.lrsa, + cxt.nrealreaders_stress); if (cxt.lrsa == NULL) { VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; @@ -1311,9 +1369,8 @@ static int __init lock_torture_init(void) } if (nwriters_stress) { - writer_tasks = kcalloc(cxt.nrealwriters_stress, - sizeof(writer_tasks[0]), - GFP_KERNEL); + writer_tasks = kzalloc_objs(writer_tasks[0], + cxt.nrealwriters_stress); if (writer_tasks == NULL) { TOROUT_ERRSTRING("writer_tasks: Out of memory"); firsterr = -ENOMEM; @@ -1326,9 +1383,8 @@ static int __init lock_torture_init(void) nested_locks = MAX_NESTED_LOCKS; if (cxt.cur_ops->readlock) { - reader_tasks = kcalloc(cxt.nrealreaders_stress, - sizeof(reader_tasks[0]), - GFP_KERNEL); + reader_tasks = kzalloc_objs(reader_tasks[0], + cxt.nrealreaders_stress); if (reader_tasks == NULL) { TOROUT_ERRSTRING("reader_tasks: Out of memory"); kfree(writer_tasks); diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 85251d8771d9..5c92ba199b90 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -15,12 +15,6 @@ #include <asm/mcs_spinlock.h> -struct mcs_spinlock { - struct mcs_spinlock *next; - int locked; /* 1 if lock acquired */ - int count; /* nesting count, see qspinlock.c */ -}; - #ifndef arch_mcs_spin_lock_contended /* * Using smp_cond_load_acquire() provides the acquire semantics @@ -30,9 +24,7 @@ struct mcs_spinlock { * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ -do { \ - smp_cond_load_acquire(l, VAL); \ -} while (0) + smp_cond_load_acquire(l, VAL) #endif #ifndef arch_mcs_spin_unlock_contended diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 6e6f6071cfa2..785decd9d0c0 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -37,9 +37,8 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_LOCKS_WARN_ON(!lock->first_waiter); DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); } void debug_mutex_free_waiter(struct mutex_waiter *waiter) @@ -53,17 +52,17 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, { lockdep_assert_held(&lock->wait_lock); - /* Mark the current thread as blocked on the lock: */ - task->blocked_on = waiter; + /* Current thread can't be already blocked (since it's executing!) */ + DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task)); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); + struct mutex *blocked_on = get_task_blocked_on(task); + DEBUG_LOCKS_WARN_ON(waiter->task != task); - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); - task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; @@ -73,20 +72,11 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } } -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) +void debug_mutex_init(struct mutex *lock) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); -#endif lock->magic = lock; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b36f23de48f1..09534628dc01 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -29,6 +29,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#include <linux/hung_task.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> @@ -42,19 +43,17 @@ # define MUTEX_WARN_ON(cond) #endif -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); - raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); + scoped_guard (raw_spinlock_init, &lock->wait_lock) { + lock->first_waiter = NULL; + } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif - - debug_mutex_init(lock, name, key); + debug_mutex_init(lock); } -EXPORT_SYMBOL(__mutex_init); static inline struct task_struct *__owner_task(unsigned long owner) { @@ -72,6 +71,14 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } +/* Do not use the return value as a pointer directly. */ +unsigned long mutex_get_owner(struct mutex *lock) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + return (unsigned long)__owner_task(owner); +} + /* * Returns: __mutex_owner(lock) on failure or NULL on success. */ @@ -133,16 +140,24 @@ static inline bool __mutex_trylock(struct mutex *lock) * There is nothing that would stop spreading the lockdep annotations outwards * except more code. */ +void mutex_init_generic(struct mutex *lock) +{ + __mutex_init_generic(lock); +} +EXPORT_SYMBOL(mutex_init_generic); /* * Optimistic trylock that only works in the uncontended case. Make sure to * follow with a __mutex_trylock() before failing. */ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) + __cond_acquires(true, lock) { unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; + MUTEX_WARN_ON(lock->magic != lock); + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; @@ -150,12 +165,27 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) } static __always_inline bool __mutex_unlock_fast(struct mutex *lock) + __cond_releases(true, lock) { unsigned long curr = (unsigned long)current; return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } -#endif + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + __mutex_init_generic(lock); + + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_init_lockdep); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) { @@ -167,34 +197,61 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) atomic_long_andnot(flag, &lock->owner); } -static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter) -{ - return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; -} - /* - * Add @waiter to a given location in the lock wait_list and set the - * FLAG_WAITERS flag if it's the first waiter. + * Add @waiter to the @lock wait_list and set the FLAG_WAITERS flag if it's + * the first waiter. + * + * When @pos, @waiter is added before the waiter indicated by @pos. Otherwise + * @waiter will be added to the tail of the list. */ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct list_head *list) + struct mutex_waiter *pos) + __must_hold(&lock->wait_lock) { + struct mutex_waiter *first = lock->first_waiter; + + hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); - list_add_tail(&waiter->list, list); - if (__mutex_waiter_is_first(lock, waiter)) - __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + if (pos) { + /* + * Insert @waiter before @pos. + */ + list_add_tail(&waiter->list, &pos->list); + /* + * If @pos == @first, then @waiter will be the new first. + */ + if (pos == first) + lock->first_waiter = waiter; + return; + } + + if (first) { + list_add_tail(&waiter->list, &first->list); + return; + } + + INIT_LIST_HEAD(&waiter->list); + lock->first_waiter = waiter; + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); } static void __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { - list_del(&waiter->list); - if (likely(list_empty(&lock->wait_list))) + if (list_empty(&waiter->list)) { __mutex_clear_flag(lock, MUTEX_FLAGS); + lock->first_waiter = NULL; + } else { + if (lock->first_waiter == waiter) + lock->first_waiter = list_next_entry(waiter, list); + list_del(&waiter->list); + } debug_mutex_remove_waiter(lock, waiter, current); + hung_task_clear_blocker(); } /* @@ -230,7 +287,8 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static void __sched __mutex_lock_slowpath(struct mutex *lock); +static void __sched __mutex_lock_slowpath(struct mutex *lock) + __acquires(lock); /** * mutex_lock - acquire the mutex @@ -311,7 +369,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * Similarly, stop spinning if we are no longer the * first waiter. */ - if (waiter && !__mutex_waiter_is_first(lock, waiter)) + if (waiter && data_race(lock->first_waiter != waiter)) return false; return true; @@ -496,7 +554,8 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, } #endif -static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) + __releases(lock); /** * mutex_unlock - release the mutex @@ -536,6 +595,7 @@ EXPORT_SYMBOL(mutex_unlock); * of a unlocked mutex is not allowed. */ void __sched ww_mutex_unlock(struct ww_mutex *lock) + __no_context_analysis { __ww_mutex_unlock(lock); mutex_unlock(&lock->base); @@ -549,6 +609,7 @@ static __always_inline int __sched __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + __cond_acquires(0, lock) { DEFINE_WAKE_Q(wake_q); struct mutex_waiter waiter; @@ -616,7 +677,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - __mutex_add_waiter(lock, &waiter, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, NULL); } else { /* * Add in stamp order, waking up waiters that must kill @@ -627,6 +688,8 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } + raw_spin_lock(¤t->blocked_lock); + __set_task_blocked_on(current, lock); set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { @@ -639,8 +702,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * the handoff. */ if (__mutex_trylock(lock)) - goto acquired; + break; + raw_spin_unlock(¤t->blocked_lock); /* * Check for signals and kill conditions while holding * wait_lock. This ensures the lock cancellation is ordered @@ -661,8 +725,16 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas schedule_preempt_disabled(); - first = __mutex_waiter_is_first(lock, &waiter); + first = lock->first_waiter == &waiter; + raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(¤t->blocked_lock); + /* + * As we likely have been woken up by task + * that has cleared our blocked_on state, re-set + * it to the lock we are trying to acquire. + */ + __set_task_blocked_on(current, lock); set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -673,25 +745,40 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas break; if (first) { + bool opt_acquired; + + /* + * mutex_optimistic_spin() can call schedule(), so + * we need to release these locks before calling it, + * and clear blocked on so we don't become unselectable + * to run. + */ + __clear_task_blocked_on(current, lock); + raw_spin_unlock(¤t->blocked_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); - if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) + opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter); + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(¤t->blocked_lock); + __set_task_blocked_on(current, lock); + + if (opt_acquired) break; trace_contention_begin(lock, LCB_F_MUTEX); } - - raw_spin_lock_irqsave(&lock->wait_lock, flags); } - raw_spin_lock_irqsave(&lock->wait_lock, flags); -acquired: + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); + raw_spin_unlock(¤t->blocked_lock); if (ww_ctx) { /* * Wound-Wait; we stole the lock (!first_waiter), check the * waiters as anyone might want to wound us. */ - if (!ww_ctx->is_wait_die && - !__mutex_waiter_is_first(lock, &waiter)) + if (!ww_ctx->is_wait_die && lock->first_waiter != &waiter) __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); } @@ -712,9 +799,11 @@ skip_wait: return 0; err: + clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: + WARN_ON(get_task_blocked_on(current)); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -726,6 +815,7 @@ err_early_kill: static int __sched __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) + __cond_acquires(0, lock) { return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); } @@ -733,6 +823,7 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, static int __sched __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, unsigned long ip, struct ww_acquire_ctx *ww_ctx) + __cond_acquires(0, lock) { return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); } @@ -780,6 +871,7 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + __acquire(lock); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -788,15 +880,17 @@ void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); + __acquire(lock); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched -mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) +_mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest) { - return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) @@ -815,12 +909,14 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) token = io_schedule_prepare(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); + __acquire(lock); io_schedule_finish(token); } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); static inline int ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_releases(nonzero, lock) { #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH unsigned tmp; @@ -882,13 +978,16 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); * Release the lock, slowpath: */ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) + __releases(lock) { struct task_struct *next = NULL; + struct mutex_waiter *waiter; DEFINE_WAKE_Q(wake_q); unsigned long owner; unsigned long flags; mutex_release(&lock->dep_map, ip); + __release(lock); /* * Release the lock before (potentially) taking the spinlock such that @@ -915,15 +1014,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { - /* get the first entry from the wait-list: */ - struct mutex_waiter *waiter = - list_first_entry(&lock->wait_list, - struct mutex_waiter, list); - + waiter = lock->first_waiter; + if (waiter) { next = waiter->task; debug_mutex_wake_waiter(lock, waiter); + set_task_blocked_on_waking(next, lock); wake_q_add(&wake_q, next); } @@ -1013,24 +1109,29 @@ EXPORT_SYMBOL_GPL(mutex_lock_io); static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) + __acquires(lock) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + __acquire(lock); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) + __cond_acquires(0, lock) { return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) + __cond_acquires(0, lock) { return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_acquires(0, lock) { return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, _RET_IP_, ctx); @@ -1039,6 +1140,7 @@ __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_acquires(0, lock) { return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, _RET_IP_, ctx); @@ -1046,6 +1148,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -1062,17 +1165,24 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { + MUTEX_WARN_ON(lock->magic != lock); + return __mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock); +#else +int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) +{ bool locked; MUTEX_WARN_ON(lock->magic != lock); - locked = __mutex_trylock(lock); if (locked) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); return locked; } -EXPORT_SYMBOL(mutex_trylock); +EXPORT_SYMBOL(_mutex_trylock_nest_lock); +#endif #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index cbff35b9b7ae..3e263e98e5fc 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -6,7 +6,8 @@ * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ - +#ifndef CONFIG_PREEMPT_RT +#include <linux/mutex.h> /* * This is the control structure for tasks blocked on mutex, which resides * on the blocked task's kernel stack: @@ -47,6 +48,12 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock) return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); } +static inline struct mutex *get_task_blocked_on(struct task_struct *p) +{ + guard(raw_spinlock_irqsave)(&p->blocked_lock); + return __get_task_blocked_on(p); +} + #ifdef CONFIG_DEBUG_MUTEXES extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); @@ -59,8 +66,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock, extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +extern void debug_mutex_init(struct mutex *lock); #else /* CONFIG_DEBUG_MUTEXES */ # define debug_mutex_lock_common(lock, waiter) do { } while (0) # define debug_mutex_wake_waiter(lock, waiter) do { } while (0) @@ -68,5 +74,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_unlock(lock) do { } while (0) -# define debug_mutex_init(lock, name, key) do { } while (0) +# define debug_mutex_init(lock) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 6083883c4fe0..ef234469baac 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, return !reader; /* wake (readers until) 1 writer */ } -static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader, + bool freeze) { DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); bool wait; @@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) spin_unlock_irq(&sem->waiters.lock); while (wait) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE | + (freeze ? TASK_FREEZABLE : 0)); if (!smp_load_acquire(&wq_entry.private)) break; schedule(); @@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try, + bool freeze) { if (__percpu_down_read_trylock(sem)) return true; @@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); - percpu_rwsem_wait(sem, /* .reader = */ true); + percpu_rwsem_wait(sem, /* .reader = */ true, freeze); preempt_disable(); trace_contention_end(sem, 0); @@ -184,7 +187,7 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ - typeof(var) __sum = 0; \ + TYPEOF_UNQUAL(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ @@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) */ if (!__percpu_down_write_trylock(sem)) { trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); - percpu_rwsem_wait(sem, /* .reader = */ false); + percpu_rwsem_wait(sem, /* .reader = */ false, false); contended = true; } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 7d96bed718e4..af8d122bb649 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -25,8 +25,9 @@ #include <trace/events/lock.h> /* - * Include queued spinlock statistics code + * Include queued spinlock definitions and statistics code */ +#include "qspinlock.h" #include "qspinlock_stat.h" /* @@ -67,36 +68,6 @@ */ #include "mcs_spinlock.h" -#define MAX_NODES 4 - -/* - * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in - * size and four of them will fit nicely in one 64-byte cacheline. For - * pvqspinlock, however, we need more space for extra data. To accommodate - * that, we insert two more long words to pad it up to 32 bytes. IOW, only - * two of them can fit in a cacheline in this case. That is OK as it is rare - * to have more than 2 levels of slowpath nesting in actual use. We don't - * want to penalize pvqspinlocks to optimize for a rare case in native - * qspinlocks. - */ -struct qnode { - struct mcs_spinlock mcs; -#ifdef CONFIG_PARAVIRT_SPINLOCKS - long reserved[2]; -#endif -}; - -/* - * The pending bit spinning loop count. - * This heuristic is used to limit the number of lockword accesses - * made by atomic_cond_read_relaxed when waiting for the lock to - * transition out of the "== _Q_PENDING_VAL" state. We don't spin - * indefinitely because there's no guarantee that we'll make forward - * progress. - */ -#ifndef _Q_PENDING_LOOPS -#define _Q_PENDING_LOOPS 1 -#endif /* * Per-CPU queue node structures; we can never have more than 4 nested @@ -106,161 +77,7 @@ struct qnode { * * PV doubles the storage and uses the second cacheline for PV state. */ -static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); - -/* - * We must be able to distinguish between no-tail and the tail at 0:0, - * therefore increment the cpu number by one. - */ - -static inline __pure u32 encode_tail(int cpu, int idx) -{ - u32 tail; - - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline __pure struct mcs_spinlock *decode_tail(u32 tail) -{ - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return per_cpu_ptr(&qnodes[idx].mcs, cpu); -} - -static inline __pure -struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) -{ - return &((struct qnode *)base + idx)->mcs; -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -#if _Q_PENDING_BITS == 8 -/** - * clear_pending - clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - WRITE_ONCE(lock->pending, 0); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); -} - -/* - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail), which heads an address dependency - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - /* - * We can use relaxed semantics since the caller ensures that the - * MCS node is properly initialized before updating the tail. - */ - return (u32)xchg_relaxed(&lock->tail, - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; -} - -#else /* _Q_PENDING_BITS == 8 */ - -/** - * clear_pending - clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); -} - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = (old & _Q_LOCKED_PENDING_MASK) | tail; - /* - * We can use relaxed semantics since the caller ensures that - * the MCS node is properly initialized before updating the - * tail. - */ - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - - return old; -} -#endif /* _Q_PENDING_BITS == 8 */ - -/** - * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending - * @lock : Pointer to queued spinlock structure - * Return: The previous lock value - * - * *,*,* -> *,1,* - */ -#ifndef queued_fetch_set_pending_acquire -static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) -{ - return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); -} -#endif - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); -} - +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); /* * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for @@ -410,7 +227,7 @@ pv_queue: * any MCS node. This is not the most elegant solution, but is * simple enough. */ - if (unlikely(idx >= MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES)) { lockevent_inc(lock_no_node); while (!queued_spin_trylock(lock)) cpu_relax(); @@ -465,7 +282,7 @@ pv_queue: * head of the waitqueue. */ if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); + prev = decode_tail(old, qnodes); /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h new file mode 100644 index 000000000000..d69958a844f7 --- /dev/null +++ b/kernel/locking/qspinlock.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Queued spinlock defines + * + * This file contains macro definitions and functions shared between different + * qspinlock slow path implementations. + */ +#ifndef __LINUX_QSPINLOCK_H +#define __LINUX_QSPINLOCK_H + +#include <asm-generic/percpu.h> +#include <linux/percpu-defs.h> +#include <asm-generic/qspinlock.h> +#include <asm-generic/mcs_spinlock.h> + +#define _Q_MAX_NODES 4 + +/* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in + * size and four of them will fit nicely in one 64-byte cacheline. For + * pvqspinlock, however, we need more space for extra data. To accommodate + * that, we insert two more long words to pad it up to 32 bytes. IOW, only + * two of them can fit in a cacheline in this case. That is OK as it is rare + * to have more than 2 levels of slowpath nesting in actual use. We don't + * want to penalize pvqspinlocks to optimize for a rare case in native + * qspinlocks. + */ +struct qnode { + struct mcs_spinlock mcs; +#ifdef CONFIG_PARAVIRT_SPINLOCKS + long reserved[2]; +#endif +}; + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. + */ + +static inline __pure u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline __pure struct mcs_spinlock *decode_tail(u32 tail, + struct qnode __percpu *qnodes) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&qnodes[idx].mcs, cpu); +} + +static inline __pure +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) +{ + return &((struct qnode *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail), which heads an address dependency + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + /* + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. + */ + return (u32)xchg_relaxed(&lock->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. + */ + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); +} +#endif + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +#endif /* __LINUX_QSPINLOCK_H */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4a8df1800cbb..daeeeef973e2 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -27,6 +27,7 @@ #include <trace/events/lock.h> #include "rtmutex_common.h" +#include "lock_events.h" #ifndef WW_RT # define build_ww_mutex() (false) @@ -93,6 +94,7 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock, static __always_inline struct task_struct * rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) + __must_hold(&lock->wait_lock) { unsigned long val = (unsigned long)owner; @@ -104,6 +106,7 @@ rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) static __always_inline void rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) + __must_hold(&lock->wait_lock) { /* * lock->wait_lock is held but explicit acquire semantics are needed @@ -113,12 +116,14 @@ rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) } static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { /* lock->wait_lock is held so the unlock provides release semantics. */ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); } static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); @@ -126,6 +131,7 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) + __must_hold(&lock->wait_lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -327,6 +333,7 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, } static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -1205,6 +1212,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, enum rtmutex_chainwalk chwalk, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -1248,6 +1256,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); + __assume_ctx_lock(&rtm->rtmutex.wait_lock); res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); if (res) { raw_spin_lock(&task->pi_lock); @@ -1355,6 +1364,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, } static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1504,7 +1514,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * - the VCPU on which owner runs is preempted */ if (!owner_on_cpu(owner) || need_resched() || - !rt_mutex_waiter_is_top_waiter(lock, waiter)) { + !data_race(rt_mutex_waiter_is_top_waiter(lock, waiter))) { res = false; break; } @@ -1534,20 +1544,27 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * * Must be called with lock->wait_lock held and interrupts disabled. It must * have just failed to try_to_take_rt_mutex(). + * + * When invoked from rt_mutex_start_proxy_lock() waiter::task != current ! */ static void __sched remove_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); + struct task_struct *waiter_task = waiter->task; struct rt_mutex_base *next_lock; lockdep_assert_held(&lock->wait_lock); - raw_spin_lock(¤t->pi_lock); - rt_mutex_dequeue(lock, waiter); - current->pi_blocked_on = NULL; - raw_spin_unlock(¤t->pi_lock); + if (!waiter_task) /* never enqueued */ + return; + + scoped_guard(raw_spinlock, &waiter_task->pi_lock) { + rt_mutex_dequeue(lock, waiter); + waiter_task->pi_blocked_on = NULL; + } /* * Only update priority if the waiter was the highest priority @@ -1583,7 +1600,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, - next_lock, NULL, current); + next_lock, NULL, waiter_task); raw_spin_lock_irq(&lock->wait_lock); } @@ -1612,10 +1629,15 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + __assume_ctx_lock(&rtm->rtmutex.wait_lock); + + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) + if (try_to_take_rt_mutex(lock, current, waiter)) { + lockevent_inc(rtmutex_slow_acq3); break; + } if (timeout && !timeout->task) { ret = -ETIMEDOUT; @@ -1638,8 +1660,10 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { + lockevent_inc(rtmutex_slow_sleep); rt_mutex_schedule(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1652,6 +1676,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, struct rt_mutex_base *lock, struct rt_mutex_waiter *w) + __must_hold(&lock->wait_lock) { /* * If the result is not -EDEADLOCK or the caller requested @@ -1688,12 +1713,15 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, enum rtmutex_chainwalk chwalk, struct rt_mutex_waiter *waiter, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); struct ww_mutex *ww = ww_container_of(rtm); int ret; + __assume_ctx_lock(&rtm->rtmutex.wait_lock); lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtmutex_slowlock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -1701,6 +1729,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq1); return 0; } @@ -1719,10 +1748,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq2); } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); + lockevent_inc(rtmutex_deadlock); } /* @@ -1740,6 +1771,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct rt_mutex_waiter waiter; int ret; @@ -1751,6 +1783,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); + lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); return ret; } @@ -1823,9 +1856,12 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtlock_slowlock); - if (try_to_take_rt_mutex(lock, current, NULL)) + if (try_to_take_rt_mutex(lock, current, NULL)) { + lockevent_inc(rtlock_slow_acq1); return; + } rt_mutex_init_rtlock_waiter(&waiter); @@ -1838,8 +1874,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, for (;;) { /* Try to acquire the lock again */ - if (try_to_take_rt_mutex(lock, current, &waiter)) + if (try_to_take_rt_mutex(lock, current, &waiter)) { + lockevent_inc(rtlock_slow_acq2); break; + } if (&waiter == rt_mutex_top_waiter(lock)) owner = rt_mutex_owner(lock); @@ -1847,8 +1885,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { + lockevent_inc(rtlock_slow_sleep); schedule_rtlock(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_RTLOCK_WAIT); @@ -1865,6 +1905,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); + lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 191e4720e546..514fce7a4e0a 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -13,6 +13,24 @@ */ int max_lock_depth = 1024; +static const struct ctl_table rtmutex_sysctl_table[] = { + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_rtmutex_sysctl(void) +{ + register_sysctl_init("kernel", rtmutex_sysctl_table); + return 0; +} + +subsys_initcall(init_rtmutex_sysctl); + /* * Debug aware fast / slowpath lock,trylock,unlock * @@ -347,7 +365,7 @@ int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, raw_spin_lock_irq(&lock->wait_lock); ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q); - if (unlikely(ret)) + if (unlikely(ret < 0)) remove_waiter(lock, waiter); preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); @@ -497,19 +515,18 @@ void rt_mutex_debug_task_free(struct task_struct *task) #ifdef CONFIG_PREEMPT_RT /* Mutexes */ -void __mutex_rt_init(struct mutex *mutex, const char *name, - struct lock_class_key *key) +static void __mutex_rt_init_generic(struct mutex *mutex) { + rt_mutex_base_init(&mutex->rtmutex); debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); - lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(__mutex_rt_init); static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) + __acquires(lock) __no_context_analysis { int ret; @@ -524,6 +541,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock, } #ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key) +{ + __mutex_rt_init_generic(mutex); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_rt_init_lockdep); + void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); @@ -544,12 +568,12 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); -int __sched mutex_lock_killable_nested(struct mutex *lock, - unsigned int subclass) +int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest_lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) { @@ -563,8 +587,29 @@ void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); +int __sched _mutex_trylock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ +void mutex_rt_init_generic(struct mutex *mutex) +{ + __mutex_rt_init_generic(mutex); +} +EXPORT_SYMBOL(mutex_rt_init_generic); + void __sched mutex_lock(struct mutex *lock) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); @@ -591,24 +636,19 @@ void __sched mutex_lock_io(struct mutex *lock) io_schedule_finish(token); } EXPORT_SYMBOL(mutex_lock_io); -#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ int __sched mutex_trylock(struct mutex *lock) { - int ret; - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; - ret = __rt_mutex_trylock(&lock->rtmutex); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - - return ret; + return __rt_mutex_trylock(&lock->rtmutex); } EXPORT_SYMBOL(mutex_trylock); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_unlock(struct mutex *lock) + __releases(lock) __no_context_analysis { mutex_release(&lock->dep_map, _RET_IP_); __rt_mutex_unlock(&lock->rtmutex); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 78dd3d8c6554..c38b7bdea7b3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -79,12 +79,18 @@ struct rt_wake_q_head { * PI-futex support (proxy locking functions, etc.): */ extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); + struct task_struct *proxy_owner) + __must_hold(&lock->wait_lock); + +extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock); + extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - struct wake_q_head *); + struct wake_q_head *) + __must_hold(&lock->wait_lock); + extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); @@ -94,8 +100,9 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter); -extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); -extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); +extern int rt_mutex_futex_trylock(struct rt_mutex_base *lock); +extern int __rt_mutex_futex_trylock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock); extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, @@ -109,6 +116,7 @@ extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh); */ #ifdef CONFIG_RT_MUTEXES static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } @@ -120,6 +128,7 @@ static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) */ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); @@ -127,6 +136,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, } static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; @@ -153,15 +163,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) pi_tree.entry); } -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -{ - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); -} - /* * Constants for rt mutex functions which have a selectable deadlock * detection. @@ -179,9 +180,10 @@ enum rtmutex_chainwalk { static inline void __rt_mutex_base_init(struct rt_mutex_base *lock) { - raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT_CACHED; - lock->owner = NULL; + scoped_guard (raw_spinlock_init, &lock->wait_lock) { + lock->waiters = RB_ROOT_CACHED; + lock->owner = NULL; + } } /* Debug functions */ diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 9f4322c07486..82e078c0665a 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -186,6 +186,7 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, unsigned long flags) + __releases(&rwb->rtmutex.wait_lock) { struct rt_mutex_base *rtm = &rwb->rtmutex; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2ddb827e3bea..bf647097369c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <linux/hung_task.h> #include <trace/events/lock.h> #ifndef CONFIG_PREEMPT_RT @@ -71,7 +72,7 @@ #c, atomic_long_read(&(sem)->count), \ (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ - list_empty(&(sem)->wait_list) ? "" : "not ")) \ + rwsem_is_contended(sem) ? "" : "not ")) \ debug_locks_off(); \ } while (0) #else @@ -181,11 +182,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } -#ifdef CONFIG_DEBUG_RWSEMS +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +struct task_struct *rwsem_owner(struct rw_semaphore *sem) { return (struct task_struct *) (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); @@ -194,7 +195,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) /* * Return true if the rwsem is owned by a reader. */ -static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +bool is_rwsem_reader_owned(struct rw_semaphore *sem) { /* * Check the count to see if it is write-locked. @@ -207,10 +208,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) } /* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). + * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured, + * it will make sure that the owner field of a reader-owned rwsem either + * points to a real reader-owner(s) or gets cleared. The only exception is + * when the unlock is done by up_read_non_owner(). */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { @@ -319,9 +320,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->magic = sem; #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); atomic_long_set(&sem->owner, 0L); + scoped_guard (raw_spinlock_init, &sem->wait_lock) { + sem->first_waiter = NULL; + } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif @@ -340,8 +342,6 @@ struct rwsem_waiter { unsigned long timeout; bool handoff_set; }; -#define rwsem_first_waiter(sem) \ - list_first_entry(&sem->wait_list, struct rwsem_waiter, list) enum rwsem_wake_type { RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ @@ -364,12 +364,22 @@ enum rwsem_wake_type { */ #define MAX_READERS_WAKEUP 0x100 -static inline void -rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +static inline +bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { - lockdep_assert_held(&sem->wait_lock); - list_add_tail(&waiter->list, &sem->wait_list); - /* caller will set RWSEM_FLAG_WAITERS */ + if (list_empty(&waiter->list)) { + sem->first_waiter = NULL; + return false; + } + + if (sem->first_waiter == waiter) { + sem->first_waiter = list_first_entry(&waiter->list, + struct rwsem_waiter, list); + } + list_del(&waiter->list); + + return true; } /* @@ -384,14 +394,24 @@ static inline bool rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { lockdep_assert_held(&sem->wait_lock); - list_del(&waiter->list); - if (likely(!list_empty(&sem->wait_list))) + if (__rwsem_del_waiter(sem, waiter)) return true; - atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); return false; } +static inline +struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem, + const struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) +{ + struct rwsem_waiter *next = list_first_entry(&waiter->list, + struct rwsem_waiter, list); + if (next == sem->first_waiter) + return NULL; + return next; +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -410,7 +430,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter, *tmp; + struct rwsem_waiter *waiter, *next; long oldcount, woken = 0, adjustment = 0; struct list_head wlist; @@ -420,7 +440,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * Take a peek at the queue head waiter such that we can determine * the wakeup(s) to perform. */ - waiter = rwsem_first_waiter(sem); + waiter = sem->first_waiter; if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { @@ -505,25 +525,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * put them into wake_q to be woken up later. */ INIT_LIST_HEAD(&wlist); - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + do { + next = next_waiter(sem, waiter); if (waiter->type == RWSEM_WAITING_FOR_WRITE) continue; woken++; list_move_tail(&waiter->list, &wlist); + if (sem->first_waiter == waiter) + sem->first_waiter = next; /* * Limit # of readers that can be woken up per wakeup call. */ if (unlikely(woken >= MAX_READERS_WAKEUP)) break; - } + } while ((waiter = next) != NULL); adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); oldcount = atomic_long_read(&sem->count); - if (list_empty(&sem->wait_list)) { + if (!sem->first_waiter) { /* * Combined with list_move_tail() above, this implies * rwsem_del_waiter(). @@ -544,7 +567,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, atomic_long_add(adjustment, &sem->count); /* 2nd pass */ - list_for_each_entry_safe(waiter, tmp, &wlist, list) { + list_for_each_entry_safe(waiter, next, &wlist, list) { struct task_struct *tsk; tsk = waiter->task; @@ -576,7 +599,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, struct wake_q_head *wake_q) __releases(&sem->wait_lock) { - bool first = rwsem_first_waiter(sem) == waiter; + bool first = sem->first_waiter == waiter; wake_q_init(wake_q); @@ -601,8 +624,9 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { - struct rwsem_waiter *first = rwsem_first_waiter(sem); + struct rwsem_waiter *first = sem->first_waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -638,7 +662,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, new |= RWSEM_WRITER_LOCKED; new &= ~RWSEM_FLAG_HANDOFF; - if (list_is_singular(&sem->wait_list)) + if (list_empty(&first->list)) new &= ~RWSEM_FLAG_WAITERS; } } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); @@ -658,7 +682,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on * success. */ - list_del(&waiter->list); + __rwsem_del_waiter(sem, waiter); + rwsem_set_owner(sem); return true; } @@ -727,8 +752,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) return ret; } -#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) - static inline enum owner_state rwsem_owner_state(struct task_struct *owner, unsigned long flags) { @@ -835,7 +858,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) enum owner_state owner_state; owner_state = rwsem_spin_on_owner(sem); - if (!(owner_state & OWNER_SPINNABLE)) + if (owner_state == OWNER_NONSPINNABLE) break; /* @@ -995,7 +1018,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat { long adjustment = -RWSEM_READER_BIAS; long rcnt = (count >> RWSEM_READER_SHIFT); - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); /* @@ -1020,7 +1043,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat */ if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -1036,7 +1059,8 @@ queue: waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { + first = sem->first_waiter; + if (!first) { /* * In case the wait queue is empty and the lock isn't owned * by a writer, this reader can exit the slowpath and return @@ -1052,8 +1076,11 @@ queue: return sem; } adjustment += RWSEM_FLAG_WAITERS; + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; + } else { + list_add_tail(&waiter.list, &first->list); } - rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1065,10 +1092,13 @@ queue: wake_up_q(&wake_q); trace_contention_begin(sem, LCB_F_READ); + set_current_state(state); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER); /* wait to be given the lock */ for (;;) { - set_current_state(state); if (!smp_load_acquire(&waiter.task)) { /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; @@ -1083,8 +1113,12 @@ queue: } schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); + set_current_state(state); } + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); trace_contention_end(sem, 0); @@ -1104,7 +1138,7 @@ out_nolock: static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1123,10 +1157,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - rwsem_add_waiter(sem, &waiter); - /* we're now waiting on the lock */ - if (rwsem_first_waiter(sem) != &waiter) { + first = sem->first_waiter; + if (first) { + list_add_tail(&waiter.list, &first->list); rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), &wake_q); if (!wake_q_empty(&wake_q)) { @@ -1139,6 +1173,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) raw_spin_lock_irq(&sem->wait_lock); } } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } @@ -1146,6 +1182,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) set_current_state(state); trace_contention_begin(sem, LCB_F_WRITE); + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1179,6 +1218,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) trylock_again: raw_spin_lock_irq(&sem->wait_lock); } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); @@ -1205,7 +1248,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -1226,7 +1269,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -1519,6 +1562,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) * lock for reading */ void __sched down_read(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1528,6 +1572,7 @@ void __sched down_read(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read); int __sched down_read_interruptible(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1542,6 +1587,7 @@ int __sched down_read_interruptible(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read_interruptible); int __sched down_read_killable(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1559,6 +1605,7 @@ EXPORT_SYMBOL(down_read_killable); * trylock for reading -- returns 1 if successful, 0 if contention */ int down_read_trylock(struct rw_semaphore *sem) + __no_context_analysis { int ret = __down_read_trylock(sem); @@ -1572,6 +1619,7 @@ EXPORT_SYMBOL(down_read_trylock); * lock for writing */ void __sched down_write(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -1583,6 +1631,7 @@ EXPORT_SYMBOL(down_write); * lock for writing */ int __sched down_write_killable(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -1601,6 +1650,7 @@ EXPORT_SYMBOL(down_write_killable); * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct rw_semaphore *sem) + __no_context_analysis { int ret = __down_write_trylock(sem); @@ -1615,6 +1665,7 @@ EXPORT_SYMBOL(down_write_trylock); * release a read lock */ void up_read(struct rw_semaphore *sem) + __no_context_analysis { rwsem_release(&sem->dep_map, _RET_IP_); __up_read(sem); @@ -1625,6 +1676,7 @@ EXPORT_SYMBOL(up_read); * release a write lock */ void up_write(struct rw_semaphore *sem) + __no_context_analysis { rwsem_release(&sem->dep_map, _RET_IP_); __up_write(sem); @@ -1635,6 +1687,7 @@ EXPORT_SYMBOL(up_write); * downgrade write lock to read lock */ void downgrade_write(struct rw_semaphore *sem) + __no_context_analysis { lock_downgrade(&sem->dep_map, _RET_IP_); __downgrade_write(sem); @@ -1644,6 +1697,7 @@ EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC void down_read_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1652,6 +1706,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); int down_read_killable_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1666,6 +1721,7 @@ int down_read_killable_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_killable_nested); void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) + __no_context_analysis { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); @@ -1674,6 +1730,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); __down_read(sem); @@ -1688,6 +1745,7 @@ void down_read_non_owner(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1696,6 +1754,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1711,6 +1770,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) + __no_context_analysis { DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index de9117c0e671..74d41433ba13 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -21,7 +21,7 @@ * too. * * The ->count variable represents how many more tasks can acquire this - * semaphore. If it's zero, there may be tasks waiting on the wait_list. + * semaphore. If it's zero, there may be waiters. */ #include <linux/compiler.h> @@ -34,6 +34,7 @@ #include <linux/spinlock.h> #include <linux/ftrace.h> #include <trace/events/lock.h> +#include <linux/hung_task.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -41,6 +42,41 @@ static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ + WRITE_ONCE((sem)->last_holder, (unsigned long)current); +} + +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ + if (READ_ONCE((sem)->last_holder) == (unsigned long)current) + WRITE_ONCE((sem)->last_holder, 0UL); +} + +unsigned long sem_last_holder(struct semaphore *sem) +{ + return READ_ONCE(sem->last_holder); +} +#else +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ +} +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ +} +unsigned long sem_last_holder(struct semaphore *sem) +{ + return 0UL; +} +#endif + +static inline void __sem_acquire(struct semaphore *sem) +{ + sem->count--; + hung_task_sem_set_holder(sem); +} + /** * down - acquire the semaphore * @sem: the semaphore to be acquired @@ -59,7 +95,7 @@ void __sched down(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else __down(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -83,7 +119,7 @@ int __sched down_interruptible(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_interruptible(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -110,7 +146,7 @@ int __sched down_killable(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_killable(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -140,7 +176,7 @@ int __sched down_trylock(struct semaphore *sem) raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) - sem->count = count; + __sem_acquire(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); @@ -165,7 +201,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -187,7 +223,10 @@ void __sched up(struct semaphore *sem) DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(list_empty(&sem->wait_list))) + + hung_task_sem_clear_if_holder(sem); + + if (likely(!sem->first_waiter)) sem->count++; else __up(sem, &wake_q); @@ -205,6 +244,21 @@ struct semaphore_waiter { bool up; }; +static inline +void sem_del_waiter(struct semaphore *sem, struct semaphore_waiter *waiter) +{ + if (list_empty(&waiter->list)) { + sem->first_waiter = NULL; + return; + } + + if (sem->first_waiter == waiter) { + sem->first_waiter = list_first_entry(&waiter->list, + struct semaphore_waiter, list); + } + list_del(&waiter->list); +} + /* * Because this function is inlined, the 'state' parameter will be * constant, and thus optimised away by the compiler. Likewise the @@ -213,9 +267,15 @@ struct semaphore_waiter { static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { - struct semaphore_waiter waiter; - - list_add_tail(&waiter.list, &sem->wait_list); + struct semaphore_waiter waiter, *first; + + first = sem->first_waiter; + if (first) { + list_add_tail(&waiter.list, &first->list); + } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; + } waiter.task = current; waiter.up = false; @@ -228,16 +288,18 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); - if (waiter.up) + if (waiter.up) { + hung_task_sem_set_holder(sem); return 0; + } } timed_out: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -ETIME; interrupted: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -EINTR; } @@ -246,10 +308,14 @@ static inline int __sched __down_common(struct semaphore *sem, long state, { int ret; + hung_task_set_blocker(sem, BLOCKER_TYPE_SEM); + trace_contention_begin(sem, 0); ret = ___down_common(sem, state, timeout); trace_contention_end(sem, ret); + hung_task_clear_blocker(); + return ret; } @@ -276,9 +342,9 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) static noinline void __sched __up(struct semaphore *sem, struct wake_q_head *wake_q) { - struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); + struct semaphore_waiter *waiter = sem->first_waiter; + + sem_del_waiter(sem, waiter); waiter->up = true; wake_q_add(wake_q, waiter->task); } diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 7685defd7c52..b42d293da38b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -64,8 +64,9 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state); * time (making _this_ CPU preemptible if possible), and we also signal * towards that other CPU that it should break the lock ASAP. */ -#define BUILD_LOCK_OPS(op, locktype) \ +#define BUILD_LOCK_OPS(op, locktype, lock_ctx_op) \ static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ + lock_ctx_op(lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -78,6 +79,7 @@ static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ } \ \ static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ + lock_ctx_op(lock) \ { \ unsigned long flags; \ \ @@ -96,11 +98,13 @@ static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ } \ \ static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ + lock_ctx_op(lock) \ { \ _raw_##op##_lock_irqsave(lock); \ } \ \ static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ + lock_ctx_op(lock) \ { \ unsigned long flags; \ \ @@ -123,11 +127,11 @@ static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_irqsave() * __[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(spin, raw_spinlock, __acquires); #ifndef CONFIG_PREEMPT_RT -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); +BUILD_LOCK_OPS(read, rwlock, __acquires_shared); +BUILD_LOCK_OPS(write, rwlock, __acquires); #endif #endif diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 87b03d2e41db..2338b3adfb55 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -184,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock) static inline void debug_write_lock_before(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index bcb1b9fea588..838d631544ed 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -13,7 +13,8 @@ #include <linux/slab.h> #include <linux/ww_mutex.h> -static DEFINE_WD_CLASS(ww_class); +static DEFINE_WD_CLASS(wd_class); +static DEFINE_WW_CLASS(ww_class); struct workqueue_struct *wq; #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH @@ -54,16 +55,16 @@ static void test_mutex_work(struct work_struct *work) ww_mutex_unlock(&mtx->mutex); } -static int __test_mutex(unsigned int flags) +static int __test_mutex(struct ww_class *class, unsigned int flags) { #define TIMEOUT (HZ / 16) struct test_mutex mtx; struct ww_acquire_ctx ctx; int ret; - ww_mutex_init(&mtx.mutex, &ww_class); + ww_mutex_init(&mtx.mutex, class); if (flags & TEST_MTX_CTX) - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init(&ctx, class); INIT_WORK_ONSTACK(&mtx.work, test_mutex_work); init_completion(&mtx.ready); @@ -71,7 +72,7 @@ static int __test_mutex(unsigned int flags) init_completion(&mtx.done); mtx.flags = flags; - schedule_work(&mtx.work); + queue_work(wq, &mtx.work); wait_for_completion(&mtx.ready); ww_mutex_lock(&mtx.mutex, (flags & TEST_MTX_CTX) ? &ctx : NULL); @@ -106,13 +107,13 @@ static int __test_mutex(unsigned int flags) #undef TIMEOUT } -static int test_mutex(void) +static int test_mutex(struct ww_class *class) { int ret; int i; for (i = 0; i < __TEST_MTX_LAST; i++) { - ret = __test_mutex(i); + ret = __test_mutex(class, i); if (ret) return ret; } @@ -120,15 +121,15 @@ static int test_mutex(void) return 0; } -static int test_aa(bool trylock) +static int test_aa(struct ww_class *class, bool trylock) { struct ww_mutex mutex; struct ww_acquire_ctx ctx; int ret; const char *from = trylock ? "trylock" : "lock"; - ww_mutex_init(&mutex, &ww_class); - ww_acquire_init(&ctx, &ww_class); + ww_mutex_init(&mutex, class); + ww_acquire_init(&ctx, class); if (!trylock) { ret = ww_mutex_lock(&mutex, &ctx); @@ -177,6 +178,7 @@ out: struct test_abba { struct work_struct work; + struct ww_class *class; struct ww_mutex a_mutex; struct ww_mutex b_mutex; struct completion a_ready; @@ -191,7 +193,7 @@ static void test_abba_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err; - ww_acquire_init_noinject(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, abba->class); if (!abba->trylock) ww_mutex_lock(&abba->b_mutex, &ctx); else @@ -217,23 +219,24 @@ static void test_abba_work(struct work_struct *work) abba->result = err; } -static int test_abba(bool trylock, bool resolve) +static int test_abba(struct ww_class *class, bool trylock, bool resolve) { struct test_abba abba; struct ww_acquire_ctx ctx; int err, ret; - ww_mutex_init(&abba.a_mutex, &ww_class); - ww_mutex_init(&abba.b_mutex, &ww_class); + ww_mutex_init(&abba.a_mutex, class); + ww_mutex_init(&abba.b_mutex, class); INIT_WORK_ONSTACK(&abba.work, test_abba_work); init_completion(&abba.a_ready); init_completion(&abba.b_ready); + abba.class = class; abba.trylock = trylock; abba.resolve = resolve; - schedule_work(&abba.work); + queue_work(wq, &abba.work); - ww_acquire_init_noinject(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, class); if (!trylock) ww_mutex_lock(&abba.a_mutex, &ctx); else @@ -278,6 +281,7 @@ static int test_abba(bool trylock, bool resolve) struct test_cycle { struct work_struct work; + struct ww_class *class; struct ww_mutex a_mutex; struct ww_mutex *b_mutex; struct completion *a_signal; @@ -291,7 +295,7 @@ static void test_cycle_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err, erra = 0; - ww_acquire_init_noinject(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, cycle->class); ww_mutex_lock(&cycle->a_mutex, &ctx); complete(cycle->a_signal); @@ -314,20 +318,21 @@ static void test_cycle_work(struct work_struct *work) cycle->result = err ?: erra; } -static int __test_cycle(unsigned int nthreads) +static int __test_cycle(struct ww_class *class, unsigned int nthreads) { struct test_cycle *cycles; unsigned int n, last = nthreads - 1; int ret; - cycles = kmalloc_array(nthreads, sizeof(*cycles), GFP_KERNEL); + cycles = kmalloc_objs(*cycles, nthreads); if (!cycles) return -ENOMEM; for (n = 0; n < nthreads; n++) { struct test_cycle *cycle = &cycles[n]; - ww_mutex_init(&cycle->a_mutex, &ww_class); + cycle->class = class; + ww_mutex_init(&cycle->a_mutex, class); if (n == last) cycle->b_mutex = &cycles[0].a_mutex; else @@ -367,13 +372,13 @@ static int __test_cycle(unsigned int nthreads) return ret; } -static int test_cycle(unsigned int ncpus) +static int test_cycle(struct ww_class *class, unsigned int ncpus) { unsigned int n; int ret; for (n = 2; n <= ncpus + 1; n++) { - ret = __test_cycle(n); + ret = __test_cycle(class, n); if (ret) return ret; } @@ -384,6 +389,7 @@ static int test_cycle(unsigned int ncpus) struct stress { struct work_struct work; struct ww_mutex *locks; + struct ww_class *class; unsigned long timeout; int nlocks; }; @@ -406,7 +412,7 @@ static int *get_random_order(int count) int *order; int n, r; - order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); + order = kmalloc_objs(*order, count); if (!order) return order; @@ -443,7 +449,7 @@ static void stress_inorder_work(struct work_struct *work) int contended = -1; int n, err; - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init(&ctx, stress->class); retry: err = 0; for (n = 0; n < nlocks; n++) { @@ -500,7 +506,7 @@ static void stress_reorder_work(struct work_struct *work) return; for (n = 0; n < stress->nlocks; n++) { - ll = kmalloc(sizeof(*ll), GFP_KERNEL); + ll = kmalloc_obj(*ll); if (!ll) goto out; @@ -511,7 +517,7 @@ static void stress_reorder_work(struct work_struct *work) order = NULL; do { - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init(&ctx, stress->class); list_for_each_entry(ll, &locks, link) { err = ww_mutex_lock(ll->lock, &ctx); @@ -570,25 +576,24 @@ static void stress_one_work(struct work_struct *work) #define STRESS_ONE BIT(2) #define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) -static int stress(int nlocks, int nthreads, unsigned int flags) +static int stress(struct ww_class *class, int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; struct stress *stress_array; int n, count; - locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); + locks = kmalloc_objs(*locks, nlocks); if (!locks) return -ENOMEM; - stress_array = kmalloc_array(nthreads, sizeof(*stress_array), - GFP_KERNEL); + stress_array = kmalloc_objs(*stress_array, nthreads); if (!stress_array) { kfree(locks); return -ENOMEM; } for (n = 0; n < nlocks; n++) - ww_mutex_init(&locks[n], &ww_class); + ww_mutex_init(&locks[n], class); count = 0; for (n = 0; nthreads; n++) { @@ -617,6 +622,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) stress = &stress_array[count++]; INIT_WORK(&stress->work, fn); + stress->class = class; stress->locks = locks; stress->nlocks = nlocks; stress->timeout = jiffies + 2*HZ; @@ -635,59 +641,131 @@ static int stress(int nlocks, int nthreads, unsigned int flags) return 0; } -static int __init test_ww_mutex_init(void) +static int run_tests(struct ww_class *class) { int ncpus = num_online_cpus(); int ret, i; - printk(KERN_INFO "Beginning ww mutex selftests\n"); - - prandom_seed_state(&rng, get_random_u64()); - - wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); - if (!wq) - return -ENOMEM; - - ret = test_mutex(); + ret = test_mutex(class); if (ret) return ret; - ret = test_aa(false); + ret = test_aa(class, false); if (ret) return ret; - ret = test_aa(true); + ret = test_aa(class, true); if (ret) return ret; for (i = 0; i < 4; i++) { - ret = test_abba(i & 1, i & 2); + ret = test_abba(class, i & 1, i & 2); if (ret) return ret; } - ret = test_cycle(ncpus); + ret = test_cycle(class, ncpus); + if (ret) + return ret; + + ret = stress(class, 16, 2 * ncpus, STRESS_INORDER); + if (ret) + return ret; + + ret = stress(class, 16, 2 * ncpus, STRESS_REORDER); if (ret) return ret; - ret = stress(16, 2*ncpus, STRESS_INORDER); + ret = stress(class, 2046, hweight32(STRESS_ALL) * ncpus, STRESS_ALL); if (ret) return ret; - ret = stress(16, 2*ncpus, STRESS_REORDER); + return 0; +} + +static int run_test_classes(void) +{ + int ret; + + pr_info("Beginning ww (wound) mutex selftests\n"); + + ret = run_tests(&ww_class); if (ret) return ret; - ret = stress(2046, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); + pr_info("Beginning ww (die) mutex selftests\n"); + ret = run_tests(&wd_class); if (ret) return ret; - printk(KERN_INFO "All ww mutex selftests passed\n"); + pr_info("All ww mutex selftests passed\n"); return 0; } +static DEFINE_MUTEX(run_lock); + +static ssize_t run_tests_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!mutex_trylock(&run_lock)) { + pr_err("Test already running\n"); + return count; + } + + run_test_classes(); + mutex_unlock(&run_lock); + + return count; +} + +static struct kobj_attribute run_tests_attribute = + __ATTR(run_tests, 0664, NULL, run_tests_store); + +static struct attribute *attrs[] = { + &run_tests_attribute.attr, + NULL, /* need to NULL terminate the list of attributes */ +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *test_ww_mutex_kobj; + +static int __init test_ww_mutex_init(void) +{ + int ret; + + prandom_seed_state(&rng, get_random_u64()); + + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); + if (!wq) + return -ENOMEM; + + test_ww_mutex_kobj = kobject_create_and_add("test_ww_mutex", kernel_kobj); + if (!test_ww_mutex_kobj) { + destroy_workqueue(wq); + return -ENOMEM; + } + + /* Create the files associated with this kobject */ + ret = sysfs_create_group(test_ww_mutex_kobj, &attr_group); + if (ret) { + kobject_put(test_ww_mutex_kobj); + destroy_workqueue(wq); + return ret; + } + + mutex_lock(&run_lock); + ret = run_test_classes(); + mutex_unlock(&run_lock); + + return ret; +} + static void __exit test_ww_mutex_exit(void) { + kobject_put(test_ww_mutex_kobj); destroy_workqueue(wq); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 37f025a096c9..6c12452097e1 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -4,58 +4,83 @@ #define MUTEX mutex #define MUTEX_WAITER mutex_waiter +#define WAIT_LOCK wait_lock + +/* + * +--------+ + * | first | + * +--------+ + * | + * v + * +----+ +----+ +----+ + * | W3 | <-> | W1 | <-> | W2 | + * +----+ +----+ +----+ + * ^ ^ + * +---------------------+ + */ static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) + __must_hold(&lock->wait_lock) { - struct mutex_waiter *w; - - w = list_first_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; - - return w; + return lock->first_waiter; } +/* + * for (cur = __ww_waiter_first(); cur; cur = __ww_waiter_next()) + * + * Should iterate like: W1, W2, W3 + */ static inline struct mutex_waiter * __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) + __must_hold(&lock->wait_lock) { w = list_next_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + /* + * Terminate if the next entry is the first again, that has already + * been observed. + */ + if (lock->first_waiter == w) return NULL; return w; } +/* + * for (cur = __ww_waiter_last(); cur; cur = __ww_waiter_prev()) + * + * Should iterate like: W3, W2, W1 + */ static inline struct mutex_waiter * __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) + __must_hold(&lock->wait_lock) { - w = list_prev_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + /* + * Terminate at the first entry, the previous entry of first is the + * last and that has already been observed. + */ + if (lock->first_waiter == w) return NULL; - return w; + return list_prev_entry(w, list); } static inline struct mutex_waiter * __ww_waiter_last(struct mutex *lock) + __must_hold(&lock->wait_lock) { - struct mutex_waiter *w; - - w = list_last_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; + struct mutex_waiter *w = lock->first_waiter; + if (w) + w = list_prev_entry(w, list); return w; } static inline void __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) + __must_hold(&lock->wait_lock) { - struct list_head *p = &lock->wait_list; - if (pos) - p = &pos->list; - __mutex_add_waiter(lock, waiter, p); + __mutex_add_waiter(lock, waiter, pos); } static inline struct task_struct * @@ -71,16 +96,19 @@ __ww_mutex_has_waiters(struct mutex *lock) } static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags) + __acquires(&lock->wait_lock) { raw_spin_lock_irqsave(&lock->wait_lock, *flags); } static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags) + __releases(&lock->wait_lock) { raw_spin_unlock_irqrestore(&lock->wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct mutex *lock) + __must_hold(&lock->wait_lock) { lockdep_assert_held(&lock->wait_lock); } @@ -89,9 +117,11 @@ static inline void lockdep_assert_wait_lock_held(struct mutex *lock) #define MUTEX rt_mutex #define MUTEX_WAITER rt_mutex_waiter +#define WAIT_LOCK rtmutex.wait_lock static inline struct rt_mutex_waiter * __ww_waiter_first(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); if (!n) @@ -119,6 +149,7 @@ __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) static inline struct rt_mutex_waiter * __ww_waiter_last(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); if (!n) @@ -140,21 +171,25 @@ __ww_mutex_owner(struct rt_mutex *lock) static inline bool __ww_mutex_has_waiters(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { return rt_mutex_has_waiters(&lock->rtmutex); } static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags) + __acquires(&lock->rtmutex.wait_lock) { raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags); } static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags) + __releases(&lock->rtmutex.wait_lock) { raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { lockdep_assert_held(&lock->rtmutex.wait_lock); } @@ -284,6 +319,12 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); #endif + /* + * When waking up the task to die, be sure to set the + * blocked_on to PROXY_WAKING. Otherwise we can see + * circular blocked_on relationships that can't resolve. + */ + set_task_blocked_on_waking(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -301,6 +342,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct ww_acquire_ctx *hold_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->WAIT_LOCK) { struct task_struct *owner = __ww_mutex_owner(lock); @@ -331,9 +373,19 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ - if (owner != current) + if (owner != current) { + /* + * When waking up the task to wound, be sure to set the + * blocked_on to PROXY_WAKING. Otherwise we can see + * circular blocked_on relationships that can't resolve. + * + * NOTE: We pass NULL here instead of lock, because we + * are waking the mutex owner, who may be currently + * blocked on a different mutex. + */ + set_task_blocked_on_waking(owner, NULL); wake_q_add(wake_q, owner); - + } return true; } @@ -355,6 +407,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, static void __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->WAIT_LOCK) { struct MUTEX_WAITER *cur; @@ -381,6 +434,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { DEFINE_WAKE_Q(wake_q); unsigned long flags; + bool has_waiters; ww_mutex_lock_acquired(lock, ctx); @@ -402,7 +456,8 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx * and/or !empty list. */ - if (likely(!__ww_mutex_has_waiters(&lock->base))) + has_waiters = data_race(__ww_mutex_has_waiters(&lock->base)); + if (likely(!has_waiters)) return; /* @@ -448,6 +503,7 @@ __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) static inline int __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, struct ww_acquire_ctx *ctx) + __must_hold(&lock->WAIT_LOCK) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); @@ -498,6 +554,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->WAIT_LOCK) { struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index c7196de838ed..e07fb3b96bc3 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -90,6 +90,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) EXPORT_SYMBOL(ww_mutex_lock_interruptible); void __sched ww_mutex_unlock(struct ww_mutex *lock) + __no_context_analysis { struct rt_mutex *rtm = &lock->base; |
