author    | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-30 14:38:01 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-30 14:38:01 -0700
commit    | d42f323a7df0b298c07313db00b44b78555ca8e6 (patch)
tree      | e9ac2b9f20fed683ff78b294c3792acb157787e5 /kernel
parent    | 65ec0a7d24913b146cd1500d759b8c340319d55e (diff)
parent    | 4d75136be8bf3ae01b0bc3e725b2cdc921e103bd (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
"A few misc subsystems and some of MM.
175 patches.
Subsystems affected by this patch series: ia64, kbuild, scripts, sh,
ocfs2, kfifo, vfs, kernel/watchdog, and mm (slab-generic, slub,
kmemleak, debug, pagecache, msync, gup, memremap, memcg, pagemap,
mremap, dma, sparsemem, vmalloc, documentation, kasan, initialization,
pagealloc, and memory-failure)"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (175 commits)
mm/memory-failure: unnecessary amount of unmapping
mm/mmzone.h: fix existing kernel-doc comments and link them to core-api
mm: page_alloc: ignore init_on_free=1 for debug_pagealloc=1
net: page_pool: use alloc_pages_bulk in refill code path
net: page_pool: refactor dma_map into own function page_pool_dma_map
SUNRPC: refresh rq_pages using a bulk page allocator
SUNRPC: set rq_page_end differently
mm/page_alloc: inline __rmqueue_pcplist
mm/page_alloc: optimize code layout for __alloc_pages_bulk
mm/page_alloc: add an array-based interface to the bulk page allocator
mm/page_alloc: add a bulk page allocator
mm/page_alloc: rename alloced to allocated
mm/page_alloc: duplicate include linux/vmalloc.h
mm, page_alloc: avoid page_to_pfn() in move_freepages()
mm/Kconfig: remove default DISCONTIGMEM_MANUAL
mm: page_alloc: dump migrate-failed pages
mm/mempolicy: fix mpol_misplaced kernel-doc
mm/mempolicy: rewrite alloc_pages_vma documentation
mm/mempolicy: rewrite alloc_pages documentation
mm/mempolicy: rename alloc_pages_current to alloc_pages
...
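Several of the subjects above add a bulk page allocator and convert callers (page_pool, SUNRPC) to it. For orientation only, the array-based interface is typically used along these lines — a minimal sketch assuming the alloc_pages_bulk_array() wrapper introduced by this series; the refill_page_array() helper and its fallback policy are hypothetical and not taken from any caller in the tree:

```c
#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Hypothetical helper: populate the NULL slots of @pages with freshly
 * allocated pages. alloc_pages_bulk_array() only fills empty slots and
 * returns the number of populated entries in the array.
 */
static int refill_page_array(struct page **pages, unsigned long nr)
{
	unsigned long filled, i;

	filled = alloc_pages_bulk_array(GFP_KERNEL, nr, pages);
	if (filled == nr)
		return 0;

	/* Fall back to single-page allocations for whatever is left. */
	for (i = 0; i < nr; i++) {
		if (pages[i])
			continue;
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			return -ENOMEM;
	}
	return 0;
}
```

The point of the bulk path is that the allocator-side locking is taken once per batch rather than once per page, which is what the page_pool and SUNRPC conversions above are after.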
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup/cgroup.c | 34
-rw-r--r-- | kernel/cgroup/rstat.c  | 63
-rw-r--r-- | kernel/dma/remap.c     |  1
-rw-r--r-- | kernel/fork.c          | 13
-rw-r--r-- | kernel/irq_work.c      |  7
-rw-r--r-- | kernel/task_work.c     |  3
-rw-r--r-- | kernel/watchdog.c      | 88
7 files changed, 118 insertions, 91 deletions
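Most of the cgroup churn below makes rstat usable on cgroup1 hierarchies as well as cgroup2, and routes the root cgroup through the same init/update/flush bookkeeping, with root-level base stats sourced from system-wide counters instead. As a reminder of how a controller plugs into rstat — a sketch with hypothetical names (my_cnt, my_charge, my_css_rstat_flush); only cgroup_rstat_updated(), cgroup_rstat_flush() and the css_rstat_flush callback are real interfaces:

```c
#include <linux/cgroup.h>
#include <linux/percpu.h>
#include <linux/smp.h>

/* Hypothetical per-cpu counter a controller wants to aggregate lazily. */
static DEFINE_PER_CPU(u64, my_cnt);

/* Hot path: update the per-cpu value, then tell rstat that this
 * (cgroup, cpu) pair has pending deltas. */
static void my_charge(struct cgroup_subsys_state *css, u64 delta)
{
	int cpu = get_cpu();	/* pin the CPU across the update */

	this_cpu_add(my_cnt, delta);
	cgroup_rstat_updated(css->cgroup, cpu);
	put_cpu();
}

/* Flush callback, wired up as .css_rstat_flush in struct cgroup_subsys.
 * Called for every (cgroup, cpu) that reported updates; fold the per-cpu
 * delta into the css-wide totals here. */
static void my_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	/* ... consume per_cpu(my_cnt, cpu) and add it to css-wide state ... */
}

/* Read side: ask rstat to flush before reporting aggregated numbers. */
static void my_read(struct cgroup_subsys_state *css)
{
	cgroup_rstat_flush(css->cgroup);
	/* ... report ... */
}
```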
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 9153b20e5cc6..e049edd66776 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1339,6 +1339,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 
 	mutex_unlock(&cgroup_mutex);
 
+	cgroup_rstat_exit(cgrp);
 	kernfs_destroy_root(root->kf_root);
 	cgroup_free_root(root);
 }
@@ -1751,6 +1752,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 				       &dcgrp->e_csets[ss->id]);
 		spin_unlock_irq(&css_set_lock);
 
+		if (ss->css_rstat_flush) {
+			list_del_rcu(&css->rstat_css_node);
+			list_add_rcu(&css->rstat_css_node,
+				     &dcgrp->rstat_css_list);
+		}
+
 		/* default hierarchy doesn't enable controllers by default */
 		dst_root->subsys_mask |= 1 << ssid;
 		if (dst_root == &cgrp_dfl_root) {
@@ -1971,10 +1978,14 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	if (ret)
 		goto destroy_root;
 
-	ret = rebind_subsystems(root, ss_mask);
+	ret = cgroup_rstat_init(root_cgrp);
 	if (ret)
 		goto destroy_root;
 
+	ret = rebind_subsystems(root, ss_mask);
+	if (ret)
+		goto exit_stats;
+
 	ret = cgroup_bpf_inherit(root_cgrp);
 	WARN_ON_ONCE(ret);
@@ -2006,6 +2017,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	ret = 0;
 	goto out;
 
+exit_stats:
+	cgroup_rstat_exit(root_cgrp);
 destroy_root:
 	kernfs_destroy_root(root->kf_root);
 	root->kf_root = NULL;
@@ -4934,8 +4947,7 @@ static void css_free_rwork_fn(struct work_struct *work)
 			cgroup_put(cgroup_parent(cgrp));
 			kernfs_put(cgrp->kn);
 			psi_cgroup_free(cgrp);
-			if (cgroup_on_dfl(cgrp))
-				cgroup_rstat_exit(cgrp);
+			cgroup_rstat_exit(cgrp);
 			kfree(cgrp);
 		} else {
 			/*
@@ -4976,8 +4988,7 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		TRACE_CGROUP_PATH(release, cgrp);
 
-		if (cgroup_on_dfl(cgrp))
-			cgroup_rstat_flush(cgrp);
+		cgroup_rstat_flush(cgrp);
 
 		spin_lock_irq(&css_set_lock);
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5034,7 +5045,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 		css_get(css->parent);
 	}
 
-	if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+	if (ss->css_rstat_flush)
 		list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
 
 	BUG_ON(cgroup_css(cgrp, ss));
@@ -5159,11 +5170,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
 	if (ret)
 		goto out_free_cgrp;
 
-	if (cgroup_on_dfl(parent)) {
-		ret = cgroup_rstat_init(cgrp);
-		if (ret)
-			goto out_cancel_ref;
-	}
+	ret = cgroup_rstat_init(cgrp);
+	if (ret)
+		goto out_cancel_ref;
 
 	/* create the directory */
 	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
@@ -5250,8 +5259,7 @@ out_psi_free:
 out_kernfs_remove:
 	kernfs_remove(cgrp->kn);
 out_stat_exit:
-	if (cgroup_on_dfl(parent))
-		cgroup_rstat_exit(cgrp);
+	cgroup_rstat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d51175cedfca..3a3fd2993a65 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -25,13 +25,8 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 {
 	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
-	struct cgroup *parent;
 	unsigned long flags;
 
-	/* nothing to do for root */
-	if (!cgroup_parent(cgrp))
-		return;
-
 	/*
 	 * Speculative already-on-list test. This may race leading to
 	 * temporary inaccuracies, which is fine.
@@ -46,10 +41,10 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 	raw_spin_lock_irqsave(cpu_lock, flags);
 
 	/* put @cgrp and all ancestors on the corresponding updated lists */
-	for (parent = cgroup_parent(cgrp); parent;
-	     cgrp = parent, parent = cgroup_parent(cgrp)) {
+	while (true) {
 		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
-		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+		struct cgroup *parent = cgroup_parent(cgrp);
+		struct cgroup_rstat_cpu *prstatc;
 
 		/*
 		 * Both additions and removals are bottom-up. If a cgroup
@@ -58,8 +53,17 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 		if (rstatc->updated_next)
 			break;
 
+		/* Root has no parent to link it to, but mark it busy */
+		if (!parent) {
+			rstatc->updated_next = cgrp;
+			break;
+		}
+
+		prstatc = cgroup_rstat_cpu(parent, cpu);
 		rstatc->updated_next = prstatc->updated_children;
 		prstatc->updated_children = cgrp;
+
+		cgrp = parent;
 	}
 
 	raw_spin_unlock_irqrestore(cpu_lock, flags);
@@ -113,23 +117,26 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
 	 */
 	if (rstatc->updated_next) {
 		struct cgroup *parent = cgroup_parent(pos);
-		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
-		struct cgroup_rstat_cpu *nrstatc;
-		struct cgroup **nextp;
-
-		nextp = &prstatc->updated_children;
-		while (true) {
-			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
-			if (*nextp == pos)
-				break;
-
-			WARN_ON_ONCE(*nextp == parent);
-			nextp = &nrstatc->updated_next;
+
+		if (parent) {
+			struct cgroup_rstat_cpu *prstatc;
+			struct cgroup **nextp;
+
+			prstatc = cgroup_rstat_cpu(parent, cpu);
+			nextp = &prstatc->updated_children;
+			while (true) {
+				struct cgroup_rstat_cpu *nrstatc;
+
+				nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+				if (*nextp == pos)
+					break;
+				WARN_ON_ONCE(*nextp == parent);
+				nextp = &nrstatc->updated_next;
+			}
+			*nextp = rstatc->updated_next;
 		}
-		*nextp = rstatc->updated_next;
 		rstatc->updated_next = NULL;
-
 		return pos;
 	}
@@ -285,8 +292,6 @@ void __init cgroup_rstat_boot(void)
 
 	for_each_possible_cpu(cpu)
 		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
-
-	BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
 }
 
 /*
@@ -311,11 +316,15 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 {
-	struct cgroup *parent = cgroup_parent(cgrp);
 	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+	struct cgroup *parent = cgroup_parent(cgrp);
 	struct cgroup_base_stat cur, delta;
 	unsigned seq;
 
+	/* Root-level stats are sourced from system-wide CPU stats */
+	if (!parent)
+		return;
+
 	/* fetch the current per-cpu values */
 	do {
 		seq = __u64_stats_fetch_begin(&rstatc->bsync);
@@ -328,8 +337,8 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 	cgroup_base_stat_add(&cgrp->bstat, &delta);
 	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
 
-	/* propagate global delta to parent */
-	if (parent) {
+	/* propagate global delta to parent (unless that's root) */
+	if (cgroup_parent(parent)) {
 		delta = cgrp->bstat;
 		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
 		cgroup_base_stat_add(&parent->bstat, &delta);
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 905c3fa005f1..b4526668072e 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -66,6 +66,5 @@ void dma_common_free_remap(void *cpu_addr, size_t size)
 		return;
 	}
 
-	unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size));
 	vunmap(cpu_addr);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 0a5d28fe9990..771e0ea90499 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -380,14 +380,17 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
 	void *stack = task_stack_page(tsk);
 	struct vm_struct *vm = task_stack_vm_area(tsk);
 
+	if (vm) {
+		int i;
 
-	/* All stack pages are in the same node. */
-	if (vm)
-		mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
-				      account * (THREAD_SIZE / 1024));
-	else
+		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+			mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
+					      account * (PAGE_SIZE / 1024));
+	} else {
+		/* All stack pages are in the same node. */
 		mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
 				      account * (THREAD_SIZE / 1024));
+	}
 }
 
 static int memcg_charge_kernel_stack(struct task_struct *tsk)
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e8da1e71583a..23a7a0ba1388 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -19,7 +19,7 @@
 #include <linux/notifier.h>
 #include <linux/smp.h>
 #include <asm/processor.h>
-
+#include <linux/kasan.h>
 
 static DEFINE_PER_CPU(struct llist_head, raised_list);
 static DEFINE_PER_CPU(struct llist_head, lazy_list);
@@ -70,6 +70,9 @@ bool irq_work_queue(struct irq_work *work)
 	if (!irq_work_claim(work))
 		return false;
 
+	/*record irq_work call stack in order to print it in KASAN reports*/
+	kasan_record_aux_stack(work);
+
 	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
 	__irq_work_queue_local(work);
@@ -98,6 +101,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
 	if (!irq_work_claim(work))
 		return false;
 
+	kasan_record_aux_stack(work);
+
 	preempt_disable();
 	if (cpu != smp_processor_id()) {
 		/* Arch remote IPI send/receive backend aren't NMI safe */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index e9316198c64b..1698fbe6f0e1 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -34,6 +34,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 {
 	struct callback_head *head;
 
+	/* record the work call stack in order to print it in KASAN reports */
+	kasan_record_aux_stack(work);
+
 	do {
 		head = READ_ONCE(task->task_works);
 		if (unlikely(head == &work_exited))
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 107bc38b1945..7c397907d0e9 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -154,7 +154,11 @@ static void lockup_detector_update_enable(void)
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
-#define SOFTLOCKUP_RESET	ULONG_MAX
+/*
+ * Delay the soflockup report when running a known slow code.
+ * It does _not_ affect the timestamp of the last successdul reschedule.
+ */
+#define SOFTLOCKUP_DELAY_REPORT	ULONG_MAX
 
 #ifdef CONFIG_SMP
 int __read_mostly sysctl_softlockup_all_cpu_backtrace;
@@ -169,10 +173,12 @@ unsigned int __read_mostly softlockup_panic =
 static bool softlockup_initialized __read_mostly;
 static u64 __read_mostly sample_period;
 
+/* Timestamp taken after the last successful reschedule. */
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+/* Timestamp of the last softlockup report. */
+static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
-static DEFINE_PER_CPU(bool, soft_watchdog_warn);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static unsigned long soft_lockup_nmi_warn;
@@ -235,10 +241,16 @@ static void set_sample_period(void)
 	watchdog_update_hrtimer_threshold(sample_period);
 }
 
+static void update_report_ts(void)
+{
+	__this_cpu_write(watchdog_report_ts, get_timestamp());
+}
+
 /* Commands for resetting the watchdog */
-static void __touch_watchdog(void)
+static void update_touch_ts(void)
 {
 	__this_cpu_write(watchdog_touch_ts, get_timestamp());
+	update_report_ts();
 }
 
 /**
@@ -252,10 +264,10 @@ static void __touch_watchdog(void)
 notrace void touch_softlockup_watchdog_sched(void)
 {
 	/*
-	 * Preemption can be enabled. It doesn't matter which CPU's timestamp
-	 * gets zeroed here, so use the raw_ operation.
+	 * Preemption can be enabled. It doesn't matter which CPU's watchdog
+	 * report period gets restarted here, so use the raw_ operation.
 	 */
-	raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
+	raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
 }
 
 notrace void touch_softlockup_watchdog(void)
@@ -279,7 +291,7 @@ void touch_all_softlockup_watchdogs(void)
 	 * the softlockup check.
 	 */
 	for_each_cpu(cpu, &watchdog_allowed_mask) {
-		per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
+		per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT;
 		wq_watchdog_touch(cpu);
 	}
 }
@@ -287,16 +299,16 @@ void touch_all_softlockup_watchdogs(void)
 void touch_softlockup_watchdog_sync(void)
 {
 	__this_cpu_write(softlockup_touch_sync, true);
-	__this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
+	__this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT);
 }
 
-static int is_softlockup(unsigned long touch_ts)
+static int is_softlockup(unsigned long touch_ts, unsigned long period_ts)
 {
 	unsigned long now = get_timestamp();
 
 	if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
 		/* Warn about unreasonable delays. */
-		if (time_after(now, touch_ts + get_softlockup_thresh()))
+		if (time_after(now, period_ts + get_softlockup_thresh()))
 			return now - touch_ts;
 	}
 	return 0;
 }
@@ -332,7 +344,7 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
  */
 static int softlockup_fn(void *data)
 {
-	__touch_watchdog();
+	update_touch_ts();
 	complete(this_cpu_ptr(&softlockup_completion));
 	return 0;
 }
@@ -342,6 +354,7 @@ static int softlockup_fn(void *data)
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
 	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
+	unsigned long period_ts = __this_cpu_read(watchdog_report_ts);
 	struct pt_regs *regs = get_irq_regs();
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
@@ -363,7 +376,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	/* .. and repeat */
 	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
 
-	if (touch_ts == SOFTLOCKUP_RESET) {
+	/*
+	 * If a virtual machine is stopped by the host it can look to
+	 * the watchdog like a soft lockup. Check to see if the host
+	 * stopped the vm before we process the timestamps.
+	 */
+	kvm_check_and_clear_guest_paused();
+
+	/* Reset the interval when touched by known problematic code. */
+	if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
 		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
 			/*
 			 * If the time stamp was touched atomically
@@ -373,9 +394,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 			sched_clock_tick();
 		}
 
-		/* Clear the guest paused flag on watchdog reset */
-		kvm_check_and_clear_guest_paused();
-		__touch_watchdog();
+		update_report_ts();
 		return HRTIMER_RESTART;
 	}
@@ -385,31 +404,20 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	 * indicate it is getting cpu time. If it hasn't then
 	 * this is a good indication some task is hogging the cpu
 	 */
-	duration = is_softlockup(touch_ts);
+	duration = is_softlockup(touch_ts, period_ts);
 	if (unlikely(duration)) {
 		/*
-		 * If a virtual machine is stopped by the host it can look to
-		 * the watchdog like a soft lockup, check to see if the host
-		 * stopped the vm before we issue the warning
+		 * Prevent multiple soft-lockup reports if one cpu is already
+		 * engaged in dumping all cpu back traces.
 		 */
-		if (kvm_check_and_clear_guest_paused())
-			return HRTIMER_RESTART;
-
-		/* only warn once */
-		if (__this_cpu_read(soft_watchdog_warn) == true)
-			return HRTIMER_RESTART;
-
 		if (softlockup_all_cpu_backtrace) {
-			/* Prevent multiple soft-lockup reports if one cpu is already
-			 * engaged in dumping cpu back traces
-			 */
-			if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
-				/* Someone else will report us. Let's give up */
-				__this_cpu_write(soft_watchdog_warn, true);
+			if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn))
 				return HRTIMER_RESTART;
-			}
 		}
 
+		/* Start period for the next softlockup warning. */
+		update_report_ts();
+
 		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
@@ -421,22 +429,14 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 			dump_stack();
 
 		if (softlockup_all_cpu_backtrace) {
-			/* Avoid generating two back traces for current
-			 * given that one is already made above
-			 */
 			trigger_allbutself_cpu_backtrace();
-
-			clear_bit(0, &soft_lockup_nmi_warn);
-			/* Barrier to sync with other cpus */
-			smp_mb__after_atomic();
+			clear_bit_unlock(0, &soft_lockup_nmi_warn);
 		}
 
 		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
 		if (softlockup_panic)
 			panic("softlockup: hung tasks");
-		__this_cpu_write(soft_watchdog_warn, true);
-	} else
-		__this_cpu_write(soft_watchdog_warn, false);
+	}
 
 	return HRTIMER_RESTART;
 }
@@ -461,7 +461,7 @@ static void watchdog_enable(unsigned int cpu)
 		      HRTIMER_MODE_REL_PINNED_HARD);
 
 	/* Initialize timestamp */
-	__touch_watchdog();
+	update_touch_ts();
 	/* Enable the perf event */
 	if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
 		watchdog_nmi_enable(cpu);
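The irq_work and task_work hunks above record the queueing call stack so that a KASAN report about a deferred callback shows where the work was queued, not just where it ran. The same one-liner works for any hand-rolled deferral scheme — a sketch assuming a hypothetical slab-allocated struct my_deferred and queue; only kasan_record_aux_stack() is the real interface, and it compiles to a no-op unless generic KASAN is enabled:

```c
#include <linux/kasan.h>
#include <linux/list.h>
#include <linux/spinlock.h>

/* Hypothetical deferred-work item, kmalloc'd by the caller and run later. */
struct my_deferred {
	struct list_head node;
	void (*func)(struct my_deferred *d);
};

static LIST_HEAD(my_queue);
static DEFINE_SPINLOCK(my_queue_lock);

static void my_defer(struct my_deferred *d)
{
	unsigned long flags;

	/*
	 * Attach the current call stack to @d's slab metadata as an
	 * auxiliary stack. If @d is later involved in a use-after-free,
	 * the KASAN report will include this queueing stack as well.
	 */
	kasan_record_aux_stack(d);

	spin_lock_irqsave(&my_queue_lock, flags);
	list_add_tail(&d->node, &my_queue);
	spin_unlock_irqrestore(&my_queue_lock, flags);
}
```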