From 2fcd7bbae90a6d844da8660a9d27079281dfbba2 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 14 Oct 2022 19:05:51 +0800 Subject: sched/psi: Fix avgs_work re-arm in psi_avgs_work() Pavan reported a problem that PSI avgs_work idle shutoff is not working at all. Because PSI_NONIDLE condition would be observed in psi_avgs_work()->collect_percpu_times()->get_recent_times() even if only the kworker running avgs_work on the CPU. Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off") avoided the ping-pong wake problem when the worker sleep, psi_avgs_work() still will always re-arm the avgs_work, so shutoff is not working. This patch changes to use PSI_STATE_RESCHEDULE to flag whether to re-arm avgs_work in get_recent_times(). For the current CPU, we re-arm avgs_work only when (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs we can just check PSI_NONIDLE delta. The new flag is only used in psi_avgs_work(), so we check in get_recent_times() that current_work() is avgs_work. One potential problem is that the brief period of non-idle time incurred between the aggregation run and the kworker's dequeue will be stranded in the per-cpu buckets until avgs_work run next time. The buckets can hold 4s worth of time, and future activity will wake the avgs_work with a 2s delay, giving us 2s worth of data we can leave behind when shut off the avgs_work. If the kworker run other works after avgs_work shut off and doesn't have any scheduler activities for 2s, this maybe a problem. Reported-by: Pavan Kondeti Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Acked-by: Suren Baghdasaryan Tested-by: Chengming Zhou Link: https://lore.kernel.org/r/20221014110551.22695-1-zhouchengming@bytedance.com --- include/linux/psi_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 6e4372735068..325981833587 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -72,6 +72,9 @@ enum psi_states { /* Use one bit in the state mask to track TSK_ONCPU */ #define PSI_ONCPU (1 << NR_PSI_STATES) +/* Flag whether to re-arm avgs_work, see details in get_recent_times() */ +#define PSI_STATE_RESCHEDULE (1 << (NR_PSI_STATES + 1)) + enum psi_aggregators { PSI_AVGS = 0, PSI_POLL, -- cgit v1.2.3 From 710ffe671e014d5ccbcff225130a178b088ef090 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 28 Oct 2022 12:45:41 -0700 Subject: sched/psi: Stop relying on timer_pending() for poll_work rescheduling Psi polling mechanism is trying to minimize the number of wakeups to run psi_poll_work and is currently relying on timer_pending() to detect when this work is already scheduled. This provides a window of opportunity for psi_group_change to schedule an immediate psi_poll_work after poll_timer_fn got called but before psi_poll_work could reschedule itself. Below is the depiction of this entire window: poll_timer_fn wake_up_interruptible(&group->poll_wait); psi_poll_worker wait_event_interruptible(group->poll_wait, ...) psi_poll_work psi_schedule_poll_work if (timer_pending(&group->poll_timer)) return; ... mod_timer(&group->poll_timer, jiffies + delay); Prior to 461daba06bdc we used to rely on poll_scheduled atomic which was reset and set back inside psi_poll_work and therefore this race window was much smaller. The larger window causes increased number of wakeups and our partners report visible power regression of ~10mA after applying 461daba06bdc. Bring back the poll_scheduled atomic and make this race window even narrower by resetting poll_scheduled only when we reach polling expiration time. This does not completely eliminate the possibility of extra wakeups caused by a race with psi_group_change however it will limit it to the worst case scenario of one extra wakeup per every tracking window (0.5s in the worst case). This patch also ensures correct ordering between clearing poll_scheduled flag and obtaining changed_states using memory barrier. Correct ordering between updating changed_states and setting poll_scheduled is ensured by atomic_xchg operation. By tracing the number of immediate rescheduling attempts performed by psi_group_change and the number of these attempts being blocked due to psi monitor being already active, we can assess the effects of this change: Before the patch: Run#1 Run#2 Run#3 Immediate reschedules attempted: 684365 1385156 1261240 Immediate reschedules blocked: 682846 1381654 1258682 Immediate reschedules (delta): 1519 3502 2558 Immediate reschedules (% of attempted): 0.22% 0.25% 0.20% After the patch: Run#1 Run#2 Run#3 Immediate reschedules attempted: 882244 770298 426218 Immediate reschedules blocked: 881996 769796 426074 Immediate reschedules (delta): 248 502 144 Immediate reschedules (% of attempted): 0.03% 0.07% 0.03% The number of non-blocked immediate reschedules dropped from 0.22-0.25% to 0.03-0.07%. The drop is attributed to the decrease in the race window size and the fact that we allow this race only when psi monitors reach polling window expiration time. Fixes: 461daba06bdc ("psi: eliminate kthread_worker from psi trigger scheduling mechanism") Reported-by: Kathleen Chang Reported-by: Wenju Xu Reported-by: Jonathan Chen Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Tested-by: SH Chen Link: https://lore.kernel.org/r/20221028194541.813985-1-surenb@google.com --- include/linux/psi_types.h | 1 + kernel/sched/psi.c | 62 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 53 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 325981833587..1e0a0d7ace3a 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -180,6 +180,7 @@ struct psi_group { struct timer_list poll_timer; wait_queue_head_t poll_wait; atomic_t poll_wakeup; + atomic_t poll_scheduled; /* Protects data used by the monitor */ struct mutex trigger_lock; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a4348af10036..8ac8b81bfee6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -189,6 +189,7 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); group->poll_min_period = U32_MAX; @@ -589,18 +590,17 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* Schedule polling if it's not already scheduled. */ -static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +/* Schedule polling if it's not already scheduled or forced. */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay, + bool force) { struct task_struct *task; /* - * Do not reschedule if already scheduled. - * Possible race with a timer scheduled after this check but before - * mod_timer below can be tolerated because group->polling_next_update - * will keep updates on schedule. + * atomic_xchg should be called even when !force to provide a + * full memory barrier (see the comment inside psi_poll_work). */ - if (timer_pending(&group->poll_timer)) + if (atomic_xchg(&group->poll_scheduled, 1) && !force) return; rcu_read_lock(); @@ -612,12 +612,15 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) */ if (likely(task)) mod_timer(&group->poll_timer, jiffies + delay); + else + atomic_set(&group->poll_scheduled, 0); rcu_read_unlock(); } static void psi_poll_work(struct psi_group *group) { + bool force_reschedule = false; u32 changed_states; u64 now; @@ -625,6 +628,43 @@ static void psi_poll_work(struct psi_group *group) now = sched_clock(); + if (now > group->polling_until) { + /* + * We are either about to start or might stop polling if no + * state change was recorded. Resetting poll_scheduled leaves + * a small window for psi_group_change to sneak in and schedule + * an immediate poll_work before we get to rescheduling. One + * potential extra wakeup at the end of the polling window + * should be negligible and polling_next_update still keeps + * updates correctly on schedule. + */ + atomic_set(&group->poll_scheduled, 0); + /* + * A task change can race with the poll worker that is supposed to + * report on it. To avoid missing events, ensure ordering between + * poll_scheduled and the task state accesses, such that if the poll + * worker misses the state update, the task change is guaranteed to + * reschedule the poll worker: + * + * poll worker: + * atomic_set(poll_scheduled, 0) + * smp_mb() + * LOAD states + * + * task change: + * STORE states + * if atomic_xchg(poll_scheduled, 1) == 0: + * schedule poll worker + * + * The atomic_xchg() implies a full barrier. + */ + smp_mb(); + } else { + /* Polling window is not over, keep rescheduling */ + force_reschedule = true; + } + + collect_percpu_times(group, PSI_POLL, &changed_states); if (changed_states & group->poll_states) { @@ -650,7 +690,8 @@ static void psi_poll_work(struct psi_group *group) group->polling_next_update = update_triggers(group, now); psi_schedule_poll_work(group, - nsecs_to_jiffies(group->polling_next_update - now) + 1); + nsecs_to_jiffies(group->polling_next_update - now) + 1, + force_reschedule); out: mutex_unlock(&group->trigger_lock); @@ -811,7 +852,7 @@ static void psi_group_change(struct psi_group *group, int cpu, write_seqcount_end(&groupc->seq); if (state_mask & group->poll_states) - psi_schedule_poll_work(group, 1); + psi_schedule_poll_work(group, 1, false); if (wake_clock && !delayed_work_pending(&group->avgs_work)) schedule_delayed_work(&group->avgs_work, PSI_FREQ); @@ -965,7 +1006,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) write_seqcount_end(&groupc->seq); if (group->poll_states & (1 << PSI_IRQ_FULL)) - psi_schedule_poll_work(group, 1); + psi_schedule_poll_work(group, 1, false); } while ((group = group->parent)); } #endif @@ -1351,6 +1392,7 @@ void psi_trigger_destroy(struct psi_trigger *t) * can no longer be found through group->poll_task. */ kthread_stop(task_to_destroy); + atomic_set(&group->poll_scheduled, 0); } kfree(t); } -- cgit v1.2.3 From 52b33d87b9197c51e8ffdc61873739d90dd0a16f Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 26 Sep 2022 16:19:31 +0800 Subject: sched/psi: Use task->psi_flags to clear in CPU migration The commit d583d360a620 ("psi: Fix psi state corruption when schedule() races with cgroup move") fixed a race problem by making cgroup_move_task() use task->psi_flags instead of looking at the scheduler state. We can extend task->psi_flags usage to CPU migration, which should be a minor optimization for performance and code simplicity. Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220926081931.45420-1-zhouchengming@bytedance.com --- include/linux/sched.h | 3 --- kernel/sched/core.c | 2 +- kernel/sched/stats.h | 22 ++++------------------ 3 files changed, 5 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ffb6eb55cd13..23de7fe86cc4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -888,9 +888,6 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; -#ifdef CONFIG_PSI - unsigned sched_psi_wake_requeue:1; -#endif /* Force alignment to the next boundary: */ unsigned :0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 87c9cdf37a26..07ac08caf019 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2053,7 +2053,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_RESTORE)) { sched_info_enqueue(rq, p); - psi_enqueue(p, flags & ENQUEUE_WAKEUP); + psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); } uclamp_rq_inc(rq, p); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 84a188913cc9..38f3698f5e5b 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -128,11 +128,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) if (p->in_memstall) set |= TSK_MEMSTALL_RUNNING; - if (!wakeup || p->sched_psi_wake_requeue) { + if (!wakeup) { if (p->in_memstall) set |= TSK_MEMSTALL; - if (p->sched_psi_wake_requeue) - p->sched_psi_wake_requeue = 0; } else { if (p->in_iowait) clear |= TSK_IOWAIT; @@ -143,8 +141,6 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) static inline void psi_dequeue(struct task_struct *p, bool sleep) { - int clear = TSK_RUNNING; - if (static_branch_likely(&psi_disabled)) return; @@ -157,10 +153,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) if (sleep) return; - if (p->in_memstall) - clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); - - psi_task_change(p, clear, 0); + psi_task_change(p, p->psi_flags, 0); } static inline void psi_ttwu_dequeue(struct task_struct *p) @@ -172,19 +165,12 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) * deregister its sleep-persistent psi states from the old * queue, and let psi_enqueue() know it has to requeue. */ - if (unlikely(p->in_iowait || p->in_memstall)) { + if (unlikely(p->psi_flags)) { struct rq_flags rf; struct rq *rq; - int clear = 0; - - if (p->in_iowait) - clear |= TSK_IOWAIT; - if (p->in_memstall) - clear |= TSK_MEMSTALL; rq = __task_rq_lock(p, &rf); - psi_task_change(p, clear, 0); - p->sched_psi_wake_requeue = 1; + psi_task_change(p, p->psi_flags, 0); __task_rq_unlock(rq, &rf); } } -- cgit v1.2.3